import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt


car_data = pd.read_csv('CarDataset.csv')
car_data.head()


car_data.shape

(11914, 16)


car_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Make               11914 non-null  object 
 1   Model              11914 non-null  object 
 2   Year               11914 non-null  int64  
 3   Engine Fuel Type   11911 non-null  object 
 4   Engine HP          11845 non-null  float64
 5   Engine Cylinders   11884 non-null  float64
 6   Transmission Type  11914 non-null  object 
 7   Driven_Wheels      11914 non-null  object 
 8   Number of Doors    11908 non-null  float64
 9   Market Category    8172 non-null   object 
 10  Vehicle Size       11914 non-null  object 
 11  Vehicle Style      11914 non-null  object 
 12  highway MPG        11914 non-null  int64  
 13  city mpg           11914 non-null  int64  
 14  Popularity         11914 non-null  int64  
 15  MSRP               11914 non-null  int64  
dtypes: float64(3), int64(5), object(8)
memory usage: 1.5+ MB


car_data.describe()


car_data.rename(columns={'Make':'make', 'Model':'Model', 'Year':'Year', 'Engine Fuel Type':'Fuel Type', 
                         'Engine HP':'HP','Engine Cylinders':'Cylinders', 'Transmission Type':'Transmission', 'Driven_Wheels':'Driven_Wheels',
                         'Number of Doors':'Doors', 'Market Category':'Market Category', 'Vehicle Size':'Size', 'Vehicle Style':'Style',
                         'highway MPG':'h_MPG', 'city mpg':'c_MPG', 'Popularity':'Popularity', 'MSRP':'price'},inplace = True)


for col in car_data.columns:
    print(col)
    print(car_data[col].unique()[:5])
    print(car_data[col].nunique())
    print('\n')

make
['BMW' 'Audi' 'FIAT' 'Mercedes-Benz' 'Chrysler']
48


Model
['1 Series M' '1 Series' '100' '124 Spider' '190-Class']
915


Year
[2011 2012 2013 1992 1993]
28


Fuel Type
['premium unleaded (required)' 'regular unleaded'
 'premium unleaded (recommended)' 'flex-fuel (unleaded/E85)' 'diesel']
10


HP
[335. 300. 230. 320. 172.]
356


Cylinders
[ 6.  4.  5.  8. 12.]
9


Transmission
['MANUAL' 'AUTOMATIC' 'AUTOMATED_MANUAL' 'DIRECT_DRIVE' 'UNKNOWN']
5


Driven_Wheels
['rear wheel drive' 'front wheel drive' 'all wheel drive'
 'four wheel drive']
4


Doors
[ 2.  4.  3. nan]
3


Market Category
['Factory Tuner,Luxury,High-Performance' 'Luxury,Performance'
 'Luxury,High-Performance' 'Luxury' 'Performance']
71


Size
['Compact' 'Midsize' 'Large']
3


Style
['Coupe' 'Convertible' 'Sedan' 'Wagon' '4dr Hatchback']
16


h_MPG
[26 28 27 25 24]
59


c_MPG
[19 20 18 17 16]
69


Popularity
[3916 3105  819  617 1013]
48


price
[46135 40650 36350 29450 34500]
6049


missing_df = car_data.isnull().sum().to_frame().reset_index().rename({"index" : 'Variable', 0: 'Missing Values'}, axis =1)
print(missing_df)

           Variable  Missing Values
0              make               0
1             Model               0
2              Year               0
3         Fuel Type               3
4                HP              69
5         Cylinders              30
6      Transmission               0
7     Driven_Wheels               0
8             Doors               6
9   Market Category            3742
10             Size               0
11            Style               0
12            h_MPG               0
13            c_MPG               0
14       Popularity               0
15            price               0


car_data['Market Category'].fillna(car_data['Market Category'].mode()[0], inplace = True)
car_data['Fuel Type'].fillna(car_data['Fuel Type'].mode()[0], inplace = True)
car_data['HP'].fillna(car_data['HP'].mean(), inplace = True)
car_data['Cylinders'].fillna(car_data['Cylinders'].median(), inplace = True)
car_data['Doors'].fillna(car_data['Doors'].median(), inplace = True)


missing_df = (car_data.isnull().sum().to_frame().reset_index().rename({'index' : 'Variables', 0: 'Missing Values'},  axis =1))

print(missing_df)

          Variables  Missing Values
0              make               0
1             Model               0
2              Year               0
3         Fuel Type               0
4                HP               0
5         Cylinders               0
6      Transmission               0
7     Driven_Wheels               0
8             Doors               0
9   Market Category               0
10             Size               0
11            Style               0
12            h_MPG               0
13            c_MPG               0
14       Popularity               0
15            price               0


car_data.drop_duplicates(inplace = True)


categorical = list(car_data.dtypes[car_data.dtypes == 'object'].index)
numerical = [col for col in car_data.columns if col not in categorical]
car_data[numerical].corr()

	Year	Engine HP	Engine Cylinders	Number of Doors	highway MPG	city mpg	Popularity	MSRP
count	11914.000000	11845.00000	11884.000000	11908.000000	11914.000000	11914.000000	11914.000000	1.191400e+04
mean	2010.384338	249.38607	5.628829	3.436093	26.637485	19.733255	1554.911197	4.059474e+04
std	7.579740	109.19187	1.780559	0.881315	8.863001	8.987798	1441.855347	6.010910e+04
min	1990.000000	55.00000	0.000000	2.000000	12.000000	7.000000	2.000000	2.000000e+03
25%	2007.000000	170.00000	4.000000	2.000000	22.000000	16.000000	549.000000	2.100000e+04
50%	2015.000000	227.00000	6.000000	4.000000	26.000000	18.000000	1385.000000	2.999500e+04
75%	2016.000000	300.00000	6.000000	4.000000	30.000000	22.000000	2009.000000	4.223125e+04
max	2017.000000	1001.00000	16.000000	4.000000	354.000000	137.000000	5657.000000	2.065902e+06

	Year	HP	Cylinders	Doors	h_MPG	c_MPG	Popularity	price
Year	1.000000	0.335573	-0.034152	0.247739	0.244972	0.188417	0.085874	0.209635
HP	0.335573	1.000000	0.772510	-0.128917	-0.359993	-0.351397	0.042186	0.658789
Cylinders	-0.034152	0.772510	1.000000	-0.149798	-0.596246	-0.562599	0.038325	0.538531
Doors	0.247739	-0.128917	-0.149798	1.000000	0.115081	0.121013	-0.057213	-0.144353
h_MPG	0.244972	-0.359993	-0.596246	0.115081	1.000000	0.886299	-0.017159	-0.166631
c_MPG	0.188417	-0.351397	-0.562599	0.121013	0.886299	1.000000	-0.000549	-0.162343
Popularity	0.085874	0.042186	0.038325	-0.057213	-0.017159	-0.000549	1.000000	-0.048371
price	0.209635	0.658789	0.538531	-0.144353	-0.166631	-0.162343	-0.048371	1.000000

Car Analysis¶

I used this example to support my learning, making a correlation matrix between various categorial columns of a cars dataset: https://www.kaggle.com/rupindersinghrana ¶

Import Libraries, File and Inspect Data¶

Rename Columns¶

Check Common Values In All Columns¶

Check for Missing Data¶

Fill Missing Fields Using Appropriate Averages¶

Remove Duplicate Entries¶

See the Correlation Coefficients Between Columns¶

	Make	Model	Year	Engine Fuel Type	Engine HP	Engine Cylinders	Transmission Type	Driven_Wheels	Number of Doors	Market Category	Vehicle Size	Vehicle Style	highway MPG	city mpg	Popularity	MSRP
0	BMW	1 Series M	2011	premium unleaded (required)	335.0	6.0	MANUAL	rear wheel drive	2.0	Factory Tuner,Luxury,High-Performance	Compact	Coupe	26	19	3916	46135
1	BMW	1 Series	2011	premium unleaded (required)	300.0	6.0	MANUAL	rear wheel drive	2.0	Luxury,Performance	Compact	Convertible	28	19	3916	40650
2	BMW	1 Series	2011	premium unleaded (required)	300.0	6.0	MANUAL	rear wheel drive	2.0	Luxury,High-Performance	Compact	Coupe	28	20	3916	36350
3	BMW	1 Series	2011	premium unleaded (required)	230.0	6.0	MANUAL	rear wheel drive	2.0	Luxury,Performance	Compact	Coupe	28	18	3916	29450
4	BMW	1 Series	2011	premium unleaded (required)	230.0	6.0	MANUAL	rear wheel drive	2.0	Luxury	Compact	Convertible	28	18	3916	34500

Car Analysis¶

I used this example to support my learning, making a correlation matrix between various categorial columns of a cars dataset: https://www.kaggle.com/rupindersinghrana¶

Import Libraries, File and Inspect Data¶

Rename Columns¶

Check Common Values In All Columns¶

Check for Missing Data¶

Fill Missing Fields Using Appropriate Averages¶

Remove Duplicate Entries¶

See the Correlation Coefficients Between Columns¶

I used this example to support my learning, making a correlation matrix between various categorial columns of a cars dataset: https://www.kaggle.com/rupindersinghrana ¶