import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


df = pd.read_csv('GBBO.csv')


print(df.head())

     Airdate  Season  Episode    Theme             Signature  \
0  8/17/2010       1        1     Cake                  Cake   
1  8/24/2010       1        2  Biscuit  Personality Biscuits   
2  8/31/2010       1        3    Bread       Signature Bread   
3   9/7/2010       1        4  Pudding       Classic Pudding   
4  9/14/2010       1        5   Pastry           Savoury Pie   

   Signature Time (min)                Technical  Technical Time (min)  \
0                 180.0        Victoria Sandwich                   NaN   
1                 120.0                   Scones                  60.0   
2                 210.0                      Cob                 150.0   
3                 150.0  Mini Hot Lemon Soufflés                  40.0   
4                 150.0          Cornish Pasties                  90.0   

                                 Showstopper  Showstopper Time (min)  \
0                 Chocolate Celebration Cake                     NaN   
1  Meringue, Choux, and Macaron Petits Fours                   240.0   
2              12 Sweet and 12 Savoury Rolls                   360.0   
3            Crumble, Bread, & Suet Puddings                   300.0   
4          Savory Canapés and Sweet Tartlets                   300.0   

   MyRating (out of 10)  MyViewership  
0                   8.0          8.57  
1                   7.8         11.14  
2                   7.9         11.47  
3                   7.7         11.03  
4                   7.8         11.93


print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Airdate                 134 non-null    object 
 1   Season                  134 non-null    int64  
 2   Episode                 134 non-null    int64  
 3   Theme                   134 non-null    object 
 4   Signature               134 non-null    object 
 5   Signature Time (min)    124 non-null    float64
 6   Technical               133 non-null    object 
 7   Technical Time (min)    122 non-null    float64
 8   Showstopper             134 non-null    object 
 9   Showstopper Time (min)  117 non-null    float64
 10  MyRating (out of 10)    134 non-null    float64
 11  MyViewership            134 non-null    float64
dtypes: float64(5), int64(2), object(5)
memory usage: 12.7+ KB
None


missing_df = df.isnull().sum().to_frame().reset_index().rename({"index" : 'Column', 0: 'Missing Values'}, axis =1)
display(missing_df)


# This could have also worked... showing the proportion rather than the counts
# missing_df = (df.isnull().sum() / (len(df.index)) * 100).to_frame().reset_index().rename({"index" : 'Column',0:'Proportion'}, axis = 1)
# display(missing_df)


category_means = df.groupby('Theme')['Signature Time (min)'].mean()


for category, mean_value in category_means.items():
    df.loc[(df['Theme'] == category) & (df['Signature Time (min)'].isnull()), 'Signature Time (min)'] = mean_value


# Another check:

missing_df = df.isnull().sum().to_frame().reset_index().rename({"index" : 'Column', 0: 'Missing Values'}, axis =1)
display(missing_df)


missing_values_rows = df[df['Signature Time (min)'].isnull()]

print(missing_values_rows['Theme'])

87        Dairy
90     Festival
100    The '80s
125    Biscuits
Name: Theme, dtype: object


df.dropna(subset=['Signature Time (min)'], inplace=True)

	Column	Missing Values
0	Airdate	0
1	Season	0
2	Episode	0
3	Theme	0
4	Signature	0
5	Signature Time (min)	10
6	Technical	1
7	Technical Time (min)	12
8	Showstopper	0
9	Showstopper Time (min)	17
10	MyRating (out of 10)	0
11	MyViewership	0