import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob
files = glob.glob('marathon/*.csv')
df_list = []
for f in files:
temp_df = pd.read_csv(f)
temp_df['Year'] = f
df_list.append(temp_df)
df = pd.concat(df_list)
df.head()
Overall Place | Gender Place | Category Place | Name | Club | Runner Number | Gender | Category | Event | Half Time | Finish Time | Year | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 1.0 | Hehir, Sean (IRL) | Rathfarn... | 1000 | M | 18-39 | Mass | 01:06:35 | 02:17:20 | marathon/London_2016_mass_results.csv |
1 | 2.0 | 2.0 | 2.0 | Martelletti, Paul (GBR) | Run Fast | 1288 | M | 18-39 | Mass | 01:08:29 | 02:17:26 | marathon/London_2016_mass_results.csv |
2 | 3.0 | 3.0 | 3.0 | Frazer, Thomas (IRL) | St Mala... | 1316 | M | 18-39 | Mass | 01:06:54 | 02:19:17 | marathon/London_2016_mass_results.csv |
3 | 4.0 | 4.0 | 4.0 | Scott, Aaron (GBR) | Notts AC | 1374 | M | 18-39 | Mass | 01:09:02 | 02:19:18 | marathon/London_2016_mass_results.csv |
4 | 5.0 | 5.0 | 1.0 | Way, Steven (GBR) | Bournem... | 1158 | M | 40-44 | Mass | 01:09:02 | 02:20:50 | marathon/London_2016_mass_results.csv |
df.shape
(268303, 12)
df['Year'] = df['Year'].str.extract(r'_(\d{4})_')
df['Year'] = pd.to_numeric(df['Year'])
df[['Name', 'Country Abbreviation']] = df['Name'].str.extract(r'(.+?) \((\w{3})\)')
df = df.drop(['Runner Number', 'Event','Club'], axis=1)
newCols = ['Year','Country Abbreviation','Name','Category','Half Time','Finish Time', 'Name','Gender','Overall Place','Gender Place','Category Place']
df = df.reindex(columns = newCols)
df.head()
Year | Country Abbreviation | Name | Category | Half Time | Finish Time | Name | Gender | Overall Place | Gender Place | Category Place | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016 | IRL | Hehir, Sean | 18-39 | 01:06:35 | 02:17:20 | Hehir, Sean | M | 1.0 | 1.0 | 1.0 |
1 | 2016 | GBR | Martelletti, Paul | 18-39 | 01:08:29 | 02:17:26 | Martelletti, Paul | M | 2.0 | 2.0 | 2.0 |
2 | 2016 | IRL | Frazer, Thomas | 18-39 | 01:06:54 | 02:19:17 | Frazer, Thomas | M | 3.0 | 3.0 | 3.0 |
3 | 2016 | GBR | Scott, Aaron | 18-39 | 01:09:02 | 02:19:18 | Scott, Aaron | M | 4.0 | 4.0 | 4.0 |
4 | 2016 | GBR | Way, Steven | 40-44 | 01:09:02 | 02:20:50 | Way, Steven | M | 5.0 | 5.0 | 1.0 |
df.dropna(subset=['Half Time'], inplace=True)
df.dropna(subset=['Finish Time'], inplace=True)
df_temp = df['Finish Time'] != 'DSQ'
df = df[df_temp]
df.shape
(267703, 11)
df['Seconds_Finish'] = pd.to_timedelta(df['Finish Time']).dt.total_seconds()
df['Seconds_FirstHalf'] = pd.to_timedelta(df['Half Time']).dt.total_seconds()
df['Seconds_SecondHalf'] = df['Seconds_Finish'] - df['Seconds_FirstHalf']
df.head()
Year | Country Abbreviation | Name | Category | Half Time | Finish Time | Name | Gender | Overall Place | Gender Place | Category Place | Seconds_Finish | Seconds_FirstHalf | Seconds_SecondHalf | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2016 | IRL | Hehir, Sean | 18-39 | 01:06:35 | 02:17:20 | Hehir, Sean | M | 1.0 | 1.0 | 1.0 | 8240.0 | 3995.0 | 4245.0 |
1 | 2016 | GBR | Martelletti, Paul | 18-39 | 01:08:29 | 02:17:26 | Martelletti, Paul | M | 2.0 | 2.0 | 2.0 | 8246.0 | 4109.0 | 4137.0 |
2 | 2016 | IRL | Frazer, Thomas | 18-39 | 01:06:54 | 02:19:17 | Frazer, Thomas | M | 3.0 | 3.0 | 3.0 | 8357.0 | 4014.0 | 4343.0 |
3 | 2016 | GBR | Scott, Aaron | 18-39 | 01:09:02 | 02:19:18 | Scott, Aaron | M | 4.0 | 4.0 | 4.0 | 8358.0 | 4142.0 | 4216.0 |
4 | 2016 | GBR | Way, Steven | 40-44 | 01:09:02 | 02:20:50 | Way, Steven | M | 5.0 | 5.0 | 1.0 | 8450.0 | 4142.0 | 4308.0 |
(2018 seemed a slower year)
plt.figure(figsize=(10, 6))
sns.boxplot(x='Year', y='Seconds_Finish', hue='Gender', data= df)
plt.title('Finish Times (Seconds) by Year and Gender')
plt.xlabel('Year')
plt.ylabel('Seconds')
plt.show()
# Create a new column for the time in hours
df['Finish Time - Hours'] = df['Finish Time'].apply(lambda x: pd.to_timedelta(x).total_seconds() / 3600)
time_bins = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
df['Time_Category'] = pd.cut(df['Finish Time - Hours'], bins=time_bins, right=False)
result_table = df.groupby(['Year', 'Time_Category']).size().unstack().apply(lambda x: x / x.sum(), axis=1)
result_table.columns = [f'Prop {bins}-{bins+1} hrs' for bins in time_bins[:-1]]
result_table = result_table.reset_index()
result_table.head()
Year | Prop 2-3 hrs | Prop 3-4 hrs | Prop 4-5 hrs | Prop 5-6 hrs | Prop 6-7 hrs | Prop 7-8 hrs | Prop 8-9 hrs | Prop 9-10 hrs | Prop 10-11 hrs | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 2014 | 0.038808 | 0.290679 | 0.388690 | 0.208203 | 0.061690 | 0.011707 | 0.000224 | 0.000000 | 0.0 |
1 | 2015 | 0.050369 | 0.323830 | 0.393117 | 0.177132 | 0.048712 | 0.006787 | 0.000053 | 0.000000 | 0.0 |
2 | 2016 | 0.053628 | 0.312841 | 0.380365 | 0.187133 | 0.054658 | 0.011091 | 0.000283 | 0.000000 | 0.0 |
3 | 2017 | 0.049221 | 0.291525 | 0.368519 | 0.210120 | 0.065517 | 0.014358 | 0.000714 | 0.000026 | 0.0 |
4 | 2018 | 0.027728 | 0.214229 | 0.333358 | 0.280800 | 0.114908 | 0.027328 | 0.001599 | 0.000050 | 0.0 |