I followed a Codecademy project to consolidate my understanding of hyperparameter tuning. The project classifies types of raisins, using the Kaggle dataset here: https://www.kaggle.com/datasets/muratkokludataset/raisin-dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
# Load the raisin dataset from a CSV file expected in the working directory.
raisins = pd.read_csv('Raisin_Dataset.csv')
# Preview the first rows to check that the columns were read as expected.
raisins.head()
Area | MajorAxisLength | MinorAxisLength | Eccentricity | ConvexArea | Extent | Perimeter | Class | |
---|---|---|---|---|---|---|---|---|
0 | 87524 | 442.246011 | 253.291155 | 0.819738 | 90546 | 0.758651 | 1184.040 | Kecimen |
1 | 75166 | 406.690687 | 243.032436 | 0.801805 | 78789 | 0.684130 | 1121.786 | Kecimen |
2 | 90856 | 442.267048 | 266.328318 | 0.798354 | 93717 | 0.637613 | 1208.575 | Kecimen |
3 | 45928 | 286.540559 | 208.760042 | 0.684989 | 47336 | 0.699599 | 844.162 | Kecimen |
4 | 79408 | 352.190770 | 290.827533 | 0.564011 | 81463 | 0.792772 | 1073.251 | Kecimen |
# List the distinct labels in the target column before they are encoded.
raisins['Class'].unique()
array(['Kecimen', 'Besni'], dtype=object)
# Encode the target as integers: Kecimen -> 0, Besni -> 1.
# An explicit mapping is used instead of Series.replace: replacing strings
# with ints on an object column relies on implicit downcasting, which newer
# pandas versions deprecate, and can leave the column as object dtype.
# Labels that are not in the mapping are passed through unchanged so that
# re-running this cell on already-encoded data is harmless; astype(int) then
# guarantees an integer target (and fails loudly on any unexpected label).
class_codes = {'Kecimen': 0, 'Besni': 1}
raisins['Class'] = (
    raisins['Class']
    .map(lambda label: class_codes.get(label, label))
    .astype(int)
)
raisins.head()
Area | MajorAxisLength | MinorAxisLength | Eccentricity | ConvexArea | Extent | Perimeter | Class | |
---|---|---|---|---|---|---|---|---|
0 | 87524 | 442.246011 | 253.291155 | 0.819738 | 90546 | 0.758651 | 1184.040 | 0 |
1 | 75166 | 406.690687 | 243.032436 | 0.801805 | 78789 | 0.684130 | 1121.786 | 0 |
2 | 90856 | 442.267048 | 266.328318 | 0.798354 | 93717 | 0.637613 | 1208.575 | 0 |
3 | 45928 | 286.540559 | 208.760042 | 0.684989 | 47336 | 0.699599 | 844.162 | 0 |
4 | 79408 | 352.190770 | 290.827533 | 0.564011 | 81463 | 0.792772 | 1073.251 | 0 |
# Separate the encoded target from the feature columns.
y = raisins['Class']
X = raisins.drop(columns='Class')

# Hold out a test split; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=20)

# Grid search over every combination of tree depth and minimum split size.
tree = DecisionTreeClassifier()
parameters = {
    'min_samples_split': [2, 3, 4],
    'max_depth': [3, 5, 7],
}
grid = GridSearchCV(estimator=tree, param_grid=parameters)
grid.fit(X_train, y_train)
GridSearchCV(estimator=DecisionTreeClassifier(), param_grid={'max_depth': [3, 5, 7], 'min_samples_split': [2, 3, 4]})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
GridSearchCV(estimator=DecisionTreeClassifier(), param_grid={'max_depth': [3, 5, 7], 'min_samples_split': [2, 3, 4]})
DecisionTreeClassifier()
DecisionTreeClassifier()
# Report, one per line: the best estimator found by the search, its best
# cross-validation score, and its score on the held-out test split.
print(
    grid.best_estimator_,
    grid.best_score_,
    grid.score(X_test, y_test),
    sep='\n',
)
DecisionTreeClassifier(max_depth=3) 0.8562962962962963 0.8488888888888889
# Tabulate each tried parameter combination next to its mean test score
# taken from the grid search's cv_results_.
grid_params = pd.DataFrame(grid.cv_results_['params'])
grid_scores = pd.DataFrame({'Score': grid.cv_results_['mean_test_score']})
df = pd.concat([grid_params, grid_scores], axis='columns')
df
max_depth | min_samples_split | Score | |
---|---|---|---|
0 | 3 | 2 | 0.856296 |
1 | 3 | 3 | 0.854815 |
2 | 3 | 4 | 0.856296 |
3 | 5 | 2 | 0.851852 |
4 | 5 | 3 | 0.844444 |
5 | 5 | 4 | 0.840000 |
6 | 7 | 2 | 0.826667 |
7 | 7 | 3 | 0.820741 |
8 | 7 | 4 | 0.819259 |
# Logistic regression model whose penalty type and C are tuned below.
lr = LogisticRegression(max_iter=1000, solver='liblinear')

# Search space: the penalty is picked from a list, C is drawn from a
# uniform distribution with loc=0 and scale=100.
distributions = {
    'penalty': ['l1', 'l2'],
    'C': uniform(loc=0, scale=100),
}

# Randomized search that evaluates 8 sampled combinations.
clf = RandomizedSearchCV(
    estimator=lr,
    param_distributions=distributions,
    n_iter=8,
)
clf.fit(X_train, y_train)
RandomizedSearchCV(estimator=LogisticRegression(max_iter=1000, solver='liblinear'), n_iter=8, param_distributions={'C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x150b8a350>, 'penalty': ['l1', 'l2']})In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(estimator=LogisticRegression(max_iter=1000, solver='liblinear'), n_iter=8, param_distributions={'C': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x150b8a350>, 'penalty': ['l1', 'l2']})
LogisticRegression(max_iter=1000, solver='liblinear')
LogisticRegression(max_iter=1000, solver='liblinear')
# Tabulate each sampled parameter combination next to its mean test score
# taken from the randomized search's cv_results_.
df = pd.concat(
    [
        pd.DataFrame(clf.cv_results_['params']),
        pd.DataFrame(clf.cv_results_['mean_test_score'], columns=['Accuracy']),
    ],
    axis=1,
)
# sort_values returns a new DataFrame rather than sorting in place, so the
# result has to be assigned back for the best-scoring rows to come first.
df = df.sort_values('Accuracy', ascending=False)
# Give the columns presentation-friendly names in a single rename.
# NOTE(review): in scikit-learn, C is documented as the *inverse* of the
# regularization strength - consider a more precise column label.
df = df.rename(
    columns={
        'C': 'Regularization Strength',
        'penalty': 'Penalty',
        'Accuracy': 'Score',
    }
)
df
Regularization Strength | Penalty | Score | |
---|---|---|---|
0 | 54.151434 | l1 | 0.853333 |
1 | 97.077638 | l2 | 0.872593 |
2 | 35.393114 | l2 | 0.874074 |
3 | 71.363318 | l1 | 0.862222 |
4 | 9.010911 | l1 | 0.859259 |
5 | 87.305201 | l2 | 0.872593 |
6 | 26.673168 | l2 | 0.874074 |
7 | 47.934787 | l2 | 0.872593 |