import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt


# Load the apartment listings (features plus monthly rent) from a CSV file
# located relative to the current working directory.
csv_path = 'ManhattanHousing.csv'
df = pd.read_csv(csv_path)

# Preview the first rows. As a bare expression this only displays something
# in an interactive session; in a plain script the result is discarded.
df.head()


# Predictor columns used to explain rent: size/location attributes followed
# by the amenity indicator columns.
feature_columns = [
    'bedrooms',
    'bathrooms',
    'size_sqft',
    'min_to_subway',
    'floor',
    'building_age_yrs',
    'no_fee',
    'has_roofdeck',
    'has_washer_dryer',
    'has_doorman',
    'has_elevator',
    'has_dishwasher',
    'has_patio',
    'has_gym',
]
x = df[feature_columns]

# Target column. The double brackets select a one-column DataFrame rather
# than a Series.
y = df[['rent']]

# Hold out 20% of the rows for testing. random_state is pinned (42 is an
# arbitrary seed) so the shuffle picks the same rows on every run; without it
# each execution trains on a different subset and the numbers downstream
# cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)


# Build a classification tree that chooses splits by Gini impurity.
# random_state is pinned (42 is an arbitrary seed) because scikit-learn
# randomly permutes the candidate features at every split; without a seed,
# equally good splits may be picked differently between runs on the same data.
# NOTE(review): 'rent' appears to be a numeric amount, so every distinct rent
# value becomes its own class here — confirm that a classifier (rather than a
# regression tree) is what this exercise intends.
clf = DecisionTreeClassifier(criterion='gini', random_state=42)

# fit() trains in place and returns the estimator itself, so no rebinding is
# needed.
clf.fit(x_train, y_train)


# Importance score of each feature as reported by the fitted tree, paired
# below with the training columns in the same order.
feature_importances = clf.feature_importances_
column_names = x_train.columns

# Map each feature name to its importance score.
feature_importance_dict = {
    name: score for name, score in zip(column_names, feature_importances)
}

# Rank the (feature, importance) pairs from most to least important.
# The sort is stable, so tied scores keep their original column order.
sorted_feature_importance = list(feature_importance_dict.items())
sorted_feature_importance.sort(key=lambda pair: pair[1], reverse=True)

print(sorted_feature_importance)

# Output recorded from an earlier run (exact values depend on the random train/test split): [('size_sqft', 0.19370695155593112), ('floor', 0.19106434380874543), ('building_age_yrs', 0.18732609467033248), ('min_to_subway', 0.15155380021152945), ('bedrooms', 0.06072393803989258), ('has_doorman', 0.03185215681807589), ('has_elevator', 0.030288746802205425), ('no_fee', 0.026845148178826852), ('has_washer_dryer', 0.026581151325924222), ('has_dishwasher', 0.023933827353835042), ('has_gym', 0.020814343880004305), ('has_roofdeck', 0.020672676860949684), ('bathrooms', 0.020298547272458292), ('has_patio', 0.014338273221289421)]


# Split the ranked (feature, importance) pairs into parallel lists for plotting.
features = [name for name, _ in sorted_feature_importance]
importances = [score for _, score in sorted_feature_importance]

# One horizontal bar per feature, positioned by its rank in the sorted list.
y_positions = range(len(features))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(y_positions, importances, align='center')
ax.set_yticks(y_positions)
ax.set_yticklabels(features)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')

# Flip the y-axis so the first (most important) feature is drawn at the top.
ax.invert_yaxis()
plt.show()

	rental_id	rent	bedrooms	bathrooms	size_sqft	min_to_subway	floor	building_age_yrs	no_fee	has_roofdeck	has_doorman	has_elevator	has_dishwasher	has_gym	neighborhood	borough
0	1545	2550	0.0	1	480	9	2.0	17	1	1	0	1	1	1	Upper East Side	Manhattan
1	2472	11500	2.0	2	2000	4	1.0	96	0	0	0	0	0	0	Greenwich Village	Manhattan
2	2919	4500	1.0	1	916	2	51.0	29	0	1	1	1	1	0	Midtown	Manhattan
3	2790	4795	1.0	1	975	3	8.0	31	0	0	1	1	1	1	Greenwich Village	Manhattan
4	3946	17500	2.0	2	4800	3	4.0	136	0	0	1	1	1	1	Soho	Manhattan

Feature Importance Practice¶

Practice assessing the feature importance of a dataset, ranking features by their Gini gain (highest first)¶

A dataset with apartment features and monthly rent was used.¶

Import Libraries, File and Inspect Head¶

Split the Data¶

Fit a Decision Tree to the Training Data¶

Create a Dictionary of Features and their Importance¶

Assess Feature Importance¶

Visualise Feature Importance¶