import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob


files = glob.glob('marathon/*.csv')

df_list = []

for f in files:
    temp_df = pd.read_csv(f)
    temp_df['Year'] = f
    df_list.append(temp_df)
   
df = pd.concat(df_list)


df.head()


df.shape

(268303, 12)


df['Year'] = df['Year'].str.extract(r'_(\d{4})_')


df['Year'] = pd.to_numeric(df['Year'])


df[['Name', 'Country Abbreviation']] = df['Name'].str.extract(r'(.+?) \((\w{3})\)')


df = df.drop(['Runner Number', 'Event','Club'], axis=1)


newCols = ['Year','Country Abbreviation','Name','Category','Half Time','Finish Time', 'Name','Gender','Overall Place','Gender Place','Category Place']


df = df.reindex(columns = newCols)


df.head()


df.dropna(subset=['Half Time'], inplace=True)


df.dropna(subset=['Finish Time'], inplace=True)


df_temp = df['Finish Time'] != 'DSQ'


df = df[df_temp]


df.shape

(267703, 11)


df['Seconds_Finish'] = pd.to_timedelta(df['Finish Time']).dt.total_seconds()

df['Seconds_FirstHalf'] = pd.to_timedelta(df['Half Time']).dt.total_seconds()

df['Seconds_SecondHalf'] = df['Seconds_Finish'] - df['Seconds_FirstHalf']


df.head()


plt.figure(figsize=(10, 6))
sns.boxplot(x='Year', y='Seconds_Finish', hue='Gender', data= df)
plt.title('Finish Times (Seconds) by Year and Gender')
plt.xlabel('Year')
plt.ylabel('Seconds')
plt.show()


# Create a new column for the time in hours
df['Finish Time - Hours'] = df['Finish Time'].apply(lambda x: pd.to_timedelta(x).total_seconds() / 3600)


time_bins = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

df['Time_Category'] = pd.cut(df['Finish Time - Hours'], bins=time_bins, right=False)


result_table = df.groupby(['Year', 'Time_Category']).size().unstack().apply(lambda x: x / x.sum(), axis=1)

result_table.columns = [f'Prop {bins}-{bins+1} hrs' for bins in time_bins[:-1]]


result_table = result_table.reset_index()

result_table.head()

	Overall Place	Gender Place	Category Place	Name	Club	Runner Number	Gender	Category	Event	Half Time	Finish Time	Year
0	1.0	1.0	1.0	Hehir, Sean (IRL)	Rathfarn...	1000	M	18-39	Mass	01:06:35	02:17:20	marathon/London_2016_mass_results.csv
1	2.0	2.0	2.0	Martelletti, Paul (GBR)	Run Fast	1288	M	18-39	Mass	01:08:29	02:17:26	marathon/London_2016_mass_results.csv
2	3.0	3.0	3.0	Frazer, Thomas (IRL)	St Mala...	1316	M	18-39	Mass	01:06:54	02:19:17	marathon/London_2016_mass_results.csv
3	4.0	4.0	4.0	Scott, Aaron (GBR)	Notts AC	1374	M	18-39	Mass	01:09:02	02:19:18	marathon/London_2016_mass_results.csv
4	5.0	5.0	1.0	Way, Steven (GBR)	Bournem...	1158	M	40-44	Mass	01:09:02	02:20:50	marathon/London_2016_mass_results.csv

	Year	Country Abbreviation	Name	Category	Half Time	Finish Time	Name	Gender	Overall Place	Gender Place	Category Place
0	2016	IRL	Hehir, Sean	18-39	01:06:35	02:17:20	Hehir, Sean	M	1.0	1.0	1.0
1	2016	GBR	Martelletti, Paul	18-39	01:08:29	02:17:26	Martelletti, Paul	M	2.0	2.0	2.0
2	2016	IRL	Frazer, Thomas	18-39	01:06:54	02:19:17	Frazer, Thomas	M	3.0	3.0	3.0
3	2016	GBR	Scott, Aaron	18-39	01:09:02	02:19:18	Scott, Aaron	M	4.0	4.0	4.0
4	2016	GBR	Way, Steven	40-44	01:09:02	02:20:50	Way, Steven	M	5.0	5.0	1.0

	Year	Country Abbreviation	Name	Category	Half Time	Finish Time	Name	Gender	Overall Place	Gender Place	Category Place	Seconds_Finish	Seconds_FirstHalf	Seconds_SecondHalf
0	2016	IRL	Hehir, Sean	18-39	01:06:35	02:17:20	Hehir, Sean	M	1.0	1.0	1.0	8240.0	3995.0	4245.0
1	2016	GBR	Martelletti, Paul	18-39	01:08:29	02:17:26	Martelletti, Paul	M	2.0	2.0	2.0	8246.0	4109.0	4137.0
2	2016	IRL	Frazer, Thomas	18-39	01:06:54	02:19:17	Frazer, Thomas	M	3.0	3.0	3.0	8357.0	4014.0	4343.0
3	2016	GBR	Scott, Aaron	18-39	01:09:02	02:19:18	Scott, Aaron	M	4.0	4.0	4.0	8358.0	4142.0	4216.0
4	2016	GBR	Way, Steven	40-44	01:09:02	02:20:50	Way, Steven	M	5.0	5.0	1.0	8450.0	4142.0	4308.0

	Year	Prop 2-3 hrs	Prop 3-4 hrs	Prop 4-5 hrs	Prop 5-6 hrs	Prop 6-7 hrs	Prop 7-8 hrs	Prop 8-9 hrs	Prop 9-10 hrs
0	2014	0.038808	0.290679	0.388690	0.208203	0.061690	0.011707	0.000224	0.000000
1	2015	0.050369	0.323830	0.393117	0.177132	0.048712	0.006787	0.000053	0.000000
2	2016	0.053628	0.312841	0.380365	0.187133	0.054658	0.011091	0.000283	0.000000
3	2017	0.049221	0.291525	0.368519	0.210120	0.065517	0.014358	0.000714	0.000026
4	2018	0.027728	0.214229	0.333358	0.280800	0.114908	0.027328	0.001599	0.000050

London Marathon Finishers (Over the Years)¶

A project to practice concatenating multiple csv files¶

Import Libraries¶

Data Loading: Concatenating tables with the past 7 years of London Marathon finishers.¶

Ammending the data in the Year column to contain the 4 numbers only.¶

Extracting Name and Country Abbreviation from the data in the Name column.¶

Dropping and Reording Columns to make it easier to work with.¶

Taking another look at the data¶

Removing rows with runner who didn't complete the races or got disqualified¶

Converting First Half and Finish Time data into seconds, and adding a column giving the time for the 2nd half of the races¶

Comparing times over the years¶

Converting Finish Times back into Hours, and comparing the proportions of runners who complete the race between 2-3 hours, 3-4 hours, etc¶