import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
# Load the rental listings CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='ManhattanHousing.csv')
df.head(n=5)
| | rental_id | rent | bedrooms | bathrooms | size_sqft | min_to_subway | floor | building_age_yrs | no_fee | has_roofdeck | has_washer_dryer | has_doorman | has_elevator | has_dishwasher | has_patio | has_gym | neighborhood | borough |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1545 | 2550 | 0.0 | 1 | 480 | 9 | 2.0 | 17 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | Upper East Side | Manhattan |
1 | 2472 | 11500 | 2.0 | 2 | 2000 | 4 | 1.0 | 96 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | Greenwich Village | Manhattan |
2 | 2919 | 4500 | 1.0 | 1 | 916 | 2 | 51.0 | 29 | 0 | 1 | 0 | 1 | 1 | 1 | 0 | 0 | Midtown | Manhattan |
3 | 2790 | 4795 | 1.0 | 1 | 975 | 3 | 8.0 | 31 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | Greenwich Village | Manhattan |
4 | 3946 | 17500 | 2.0 | 2 | 4800 | 3 | 4.0 | 136 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 1 | Soho | Manhattan |
# Feature matrix: the listing attributes and amenity indicator columns used
# as predictors.
x = df[[
    'bedrooms', 'bathrooms', 'size_sqft', 'min_to_subway', 'floor',
    'building_age_yrs', 'no_fee', 'has_roofdeck', 'has_washer_dryer',
    'has_doorman', 'has_elevator', 'has_dishwasher', 'has_patio', 'has_gym',
]]
# Target: the `rent` column, kept as a one-column DataFrame.
y = df[['rent']]

# Fixed seed so the split, the fitted tree and the resulting importance
# ranking are the same on every run.
RANDOM_STATE = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=RANDOM_STATE
)

# `rent` is a continuous amount, so fit a regression tree. A classifier would
# treat every distinct rent value as its own class label, which makes the
# impurity-based importances hard to interpret for this target.
# The name `clf` is kept so any later code that refers to it keeps working.
clf = DecisionTreeRegressor(random_state=RANDOM_STATE)
clf.fit(x_train, y_train)

# Pair each feature name with its impurity-based importance score.
feature_importances = clf.feature_importances_
column_names = x_train.columns
feature_importance_dict = dict(zip(column_names, feature_importances))

# List of (feature, importance) tuples, most important feature first.
sorted_feature_importance = sorted(
    feature_importance_dict.items(), key=lambda pair: pair[1], reverse=True
)
print(sorted_feature_importance)
[('size_sqft', 0.19370695155593112), ('floor', 0.19106434380874543), ('building_age_yrs', 0.18732609467033248), ('min_to_subway', 0.15155380021152945), ('bedrooms', 0.06072393803989258), ('has_doorman', 0.03185215681807589), ('has_elevator', 0.030288746802205425), ('no_fee', 0.026845148178826852), ('has_washer_dryer', 0.026581151325924222), ('has_dishwasher', 0.023933827353835042), ('has_gym', 0.020814343880004305), ('has_roofdeck', 0.020672676860949684), ('bathrooms', 0.020298547272458292), ('has_patio', 0.014338273221289421)]
# Split the ranked (feature, importance) pairs into two parallel lists.
features = [name for name, _ in sorted_feature_importance]
importances = [score for _, score in sorted_feature_importance]

# Horizontal bar chart with one bar per feature.
fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = range(len(features))
ax.barh(bar_positions, importances, align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels(features)
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Feature Importances')
# Flip the y-axis so the first entry of the ranking is drawn at the top.
ax.invert_yaxis()
plt.show()