Import Libraries and File
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('GBBO.csv')
Initial Inspection of the dataset
print(df.head())
Airdate Season Episode Theme Signature \ 0 8/17/2010 1 1 Cake Cake 1 8/24/2010 1 2 Biscuit Personality Biscuits 2 8/31/2010 1 3 Bread Signature Bread 3 9/7/2010 1 4 Pudding Classic Pudding 4 9/14/2010 1 5 Pastry Savoury Pie Signature Time (min) Technical Technical Time (min) \ 0 180.0 Victoria Sandwich NaN 1 120.0 Scones 60.0 2 210.0 Cob 150.0 3 150.0 Mini Hot Lemon Soufflés 40.0 4 150.0 Cornish Pasties 90.0 Showstopper Showstopper Time (min) \ 0 Chocolate Celebration Cake NaN 1 Meringue, Choux, and Macaron Petits Fours 240.0 2 12 Sweet and 12 Savoury Rolls 360.0 3 Crumble, Bread, & Suet Puddings 300.0 4 Savory Canapés and Sweet Tartlets 300.0 MyRating (out of 10) MyViewership 0 8.0 8.57 1 7.8 11.14 2 7.9 11.47 3 7.7 11.03 4 7.8 11.93
print(df.info())
<class 'pandas.core.frame.DataFrame'> RangeIndex: 134 entries, 0 to 133 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Airdate 134 non-null object 1 Season 134 non-null int64 2 Episode 134 non-null int64 3 Theme 134 non-null object 4 Signature 134 non-null object 5 Signature Time (min) 124 non-null float64 6 Technical 133 non-null object 7 Technical Time (min) 122 non-null float64 8 Showstopper 134 non-null object 9 Showstopper Time (min) 117 non-null float64 10 MyRating (out of 10) 134 non-null float64 11 MyViewership 134 non-null float64 dtypes: float64(5), int64(2), object(5) memory usage: 12.7+ KB None
The frist row contains a NaN value, so let's explore further:
missing_df = df.isnull().sum().to_frame().reset_index().rename({"index" : 'Column', 0: 'Missing Values'}, axis =1)
display(missing_df)
# This could have also worked... showing the proportion rather than the counts
# missing_df = (df.isnull().sum() / (len(df.index)) * 100).to_frame().reset_index().rename({"index" : 'Column',0:'Proportion'}, axis = 1)
# display(missing_df)
Column | Missing Values | |
---|---|---|
0 | Airdate | 0 |
1 | Season | 0 |
2 | Episode | 0 |
3 | Theme | 0 |
4 | Signature | 0 |
5 | Signature Time (min) | 10 |
6 | Technical | 1 |
7 | Technical Time (min) | 12 |
8 | Showstopper | 0 |
9 | Showstopper Time (min) | 17 |
10 | MyRating (out of 10) | 0 |
11 | MyViewership | 0 |
The Technical field is just a text description so we can ignore the one missing value. Also, the technical time and Showstopper time are very specific to what was made, so we'll likely have to ignore them too.
However, the Signature time is likely related to the Theme. Let's aggregative the Signature Times with the Theme field and fill in blanks using these averages.
category_means = df.groupby('Theme')['Signature Time (min)'].mean()
for category, mean_value in category_means.items():
df.loc[(df['Theme'] == category) & (df['Signature Time (min)'].isnull()), 'Signature Time (min)'] = mean_value
# Another check:
missing_df = df.isnull().sum().to_frame().reset_index().rename({"index" : 'Column', 0: 'Missing Values'}, axis =1)
display(missing_df)
Column | Missing Values | |
---|---|---|
0 | Airdate | 0 |
1 | Season | 0 |
2 | Episode | 0 |
3 | Theme | 0 |
4 | Signature | 0 |
5 | Signature Time (min) | 4 |
6 | Technical | 1 |
7 | Technical Time (min) | 12 |
8 | Showstopper | 0 |
9 | Showstopper Time (min) | 17 |
10 | MyRating (out of 10) | 0 |
11 | MyViewership | 0 |
missing_values_rows = df[df['Signature Time (min)'].isnull()]
print(missing_values_rows['Theme'])
87 Dairy 90 Festival 100 The '80s 125 Biscuits Name: Theme, dtype: object
df.dropna(subset=['Signature Time (min)'], inplace=True)