import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
car_data = pd.read_csv('CarDataset.csv')
car_data.head()
Make | Model | Year | Engine Fuel Type | Engine HP | Engine Cylinders | Transmission Type | Driven_Wheels | Number of Doors | Market Category | Vehicle Size | Vehicle Style | highway MPG | city mpg | Popularity | MSRP | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | BMW | 1 Series M | 2011 | premium unleaded (required) | 335.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Factory Tuner,Luxury,High-Performance | Compact | Coupe | 26 | 19 | 3916 | 46135 |
1 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Convertible | 28 | 19 | 3916 | 40650 |
2 | BMW | 1 Series | 2011 | premium unleaded (required) | 300.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,High-Performance | Compact | Coupe | 28 | 20 | 3916 | 36350 |
3 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury,Performance | Compact | Coupe | 28 | 18 | 3916 | 29450 |
4 | BMW | 1 Series | 2011 | premium unleaded (required) | 230.0 | 6.0 | MANUAL | rear wheel drive | 2.0 | Luxury | Compact | Convertible | 28 | 18 | 3916 | 34500 |
car_data.shape
(11914, 16)
car_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11914 entries, 0 to 11913 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Make 11914 non-null object 1 Model 11914 non-null object 2 Year 11914 non-null int64 3 Engine Fuel Type 11911 non-null object 4 Engine HP 11845 non-null float64 5 Engine Cylinders 11884 non-null float64 6 Transmission Type 11914 non-null object 7 Driven_Wheels 11914 non-null object 8 Number of Doors 11908 non-null float64 9 Market Category 8172 non-null object 10 Vehicle Size 11914 non-null object 11 Vehicle Style 11914 non-null object 12 highway MPG 11914 non-null int64 13 city mpg 11914 non-null int64 14 Popularity 11914 non-null int64 15 MSRP 11914 non-null int64 dtypes: float64(3), int64(5), object(8) memory usage: 1.5+ MB
car_data.describe()
Year | Engine HP | Engine Cylinders | Number of Doors | highway MPG | city mpg | Popularity | MSRP | |
---|---|---|---|---|---|---|---|---|
count | 11914.000000 | 11845.00000 | 11884.000000 | 11908.000000 | 11914.000000 | 11914.000000 | 11914.000000 | 1.191400e+04 |
mean | 2010.384338 | 249.38607 | 5.628829 | 3.436093 | 26.637485 | 19.733255 | 1554.911197 | 4.059474e+04 |
std | 7.579740 | 109.19187 | 1.780559 | 0.881315 | 8.863001 | 8.987798 | 1441.855347 | 6.010910e+04 |
min | 1990.000000 | 55.00000 | 0.000000 | 2.000000 | 12.000000 | 7.000000 | 2.000000 | 2.000000e+03 |
25% | 2007.000000 | 170.00000 | 4.000000 | 2.000000 | 22.000000 | 16.000000 | 549.000000 | 2.100000e+04 |
50% | 2015.000000 | 227.00000 | 6.000000 | 4.000000 | 26.000000 | 18.000000 | 1385.000000 | 2.999500e+04 |
75% | 2016.000000 | 300.00000 | 6.000000 | 4.000000 | 30.000000 | 22.000000 | 2009.000000 | 4.223125e+04 |
max | 2017.000000 | 1001.00000 | 16.000000 | 4.000000 | 354.000000 | 137.000000 | 5657.000000 | 2.065902e+06 |
car_data.rename(columns={'Make':'make', 'Model':'Model', 'Year':'Year', 'Engine Fuel Type':'Fuel Type',
'Engine HP':'HP','Engine Cylinders':'Cylinders', 'Transmission Type':'Transmission', 'Driven_Wheels':'Driven_Wheels',
'Number of Doors':'Doors', 'Market Category':'Market Category', 'Vehicle Size':'Size', 'Vehicle Style':'Style',
'highway MPG':'h_MPG', 'city mpg':'c_MPG', 'Popularity':'Popularity', 'MSRP':'price'},inplace = True)
for col in car_data.columns:
print(col)
print(car_data[col].unique()[:5])
print(car_data[col].nunique())
print('\n')
make ['BMW' 'Audi' 'FIAT' 'Mercedes-Benz' 'Chrysler'] 48 Model ['1 Series M' '1 Series' '100' '124 Spider' '190-Class'] 915 Year [2011 2012 2013 1992 1993] 28 Fuel Type ['premium unleaded (required)' 'regular unleaded' 'premium unleaded (recommended)' 'flex-fuel (unleaded/E85)' 'diesel'] 10 HP [335. 300. 230. 320. 172.] 356 Cylinders [ 6. 4. 5. 8. 12.] 9 Transmission ['MANUAL' 'AUTOMATIC' 'AUTOMATED_MANUAL' 'DIRECT_DRIVE' 'UNKNOWN'] 5 Driven_Wheels ['rear wheel drive' 'front wheel drive' 'all wheel drive' 'four wheel drive'] 4 Doors [ 2. 4. 3. nan] 3 Market Category ['Factory Tuner,Luxury,High-Performance' 'Luxury,Performance' 'Luxury,High-Performance' 'Luxury' 'Performance'] 71 Size ['Compact' 'Midsize' 'Large'] 3 Style ['Coupe' 'Convertible' 'Sedan' 'Wagon' '4dr Hatchback'] 16 h_MPG [26 28 27 25 24] 59 c_MPG [19 20 18 17 16] 69 Popularity [3916 3105 819 617 1013] 48 price [46135 40650 36350 29450 34500] 6049
missing_df = car_data.isnull().sum().to_frame().reset_index().rename({"index" : 'Variable', 0: 'Missing Values'}, axis =1)
print(missing_df)
Variable Missing Values 0 make 0 1 Model 0 2 Year 0 3 Fuel Type 3 4 HP 69 5 Cylinders 30 6 Transmission 0 7 Driven_Wheels 0 8 Doors 6 9 Market Category 3742 10 Size 0 11 Style 0 12 h_MPG 0 13 c_MPG 0 14 Popularity 0 15 price 0
car_data['Market Category'].fillna(car_data['Market Category'].mode()[0], inplace = True)
car_data['Fuel Type'].fillna(car_data['Fuel Type'].mode()[0], inplace = True)
car_data['HP'].fillna(car_data['HP'].mean(), inplace = True)
car_data['Cylinders'].fillna(car_data['Cylinders'].median(), inplace = True)
car_data['Doors'].fillna(car_data['Doors'].median(), inplace = True)
missing_df = (car_data.isnull().sum().to_frame().reset_index().rename({'index' : 'Variables', 0: 'Missing Values'}, axis =1))
print(missing_df)
Variables Missing Values 0 make 0 1 Model 0 2 Year 0 3 Fuel Type 0 4 HP 0 5 Cylinders 0 6 Transmission 0 7 Driven_Wheels 0 8 Doors 0 9 Market Category 0 10 Size 0 11 Style 0 12 h_MPG 0 13 c_MPG 0 14 Popularity 0 15 price 0
car_data.drop_duplicates(inplace = True)
categorical = list(car_data.dtypes[car_data.dtypes == 'object'].index)
numerical = [col for col in car_data.columns if col not in categorical]
car_data[numerical].corr()
Year | HP | Cylinders | Doors | h_MPG | c_MPG | Popularity | price | |
---|---|---|---|---|---|---|---|---|
Year | 1.000000 | 0.335573 | -0.034152 | 0.247739 | 0.244972 | 0.188417 | 0.085874 | 0.209635 |
HP | 0.335573 | 1.000000 | 0.772510 | -0.128917 | -0.359993 | -0.351397 | 0.042186 | 0.658789 |
Cylinders | -0.034152 | 0.772510 | 1.000000 | -0.149798 | -0.596246 | -0.562599 | 0.038325 | 0.538531 |
Doors | 0.247739 | -0.128917 | -0.149798 | 1.000000 | 0.115081 | 0.121013 | -0.057213 | -0.144353 |
h_MPG | 0.244972 | -0.359993 | -0.596246 | 0.115081 | 1.000000 | 0.886299 | -0.017159 | -0.166631 |
c_MPG | 0.188417 | -0.351397 | -0.562599 | 0.121013 | 0.886299 | 1.000000 | -0.000549 | -0.162343 |
Popularity | 0.085874 | 0.042186 | 0.038325 | -0.057213 | -0.017159 | -0.000549 | 1.000000 | -0.048371 |
price | 0.209635 | 0.658789 | 0.538531 | -0.144353 | -0.166631 | -0.162343 | -0.048371 | 1.000000 |