Dataset: https://www.kaggle.com/datasets/geomack/spotifyclassification/data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

warnings.filterwarnings('ignore')
df = pd.read_csv("music.csv")
df.head()
| | Unnamed: 0 | acousticness | danceability | duration_ms | energy | instrumentalness | key | liveness | loudness | mode | speechiness | tempo | time_signature | valence | target | song_title | artist |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.0102 | 0.833 | 204600 | 0.434 | 0.021900 | 2 | 0.1650 | -8.795 | 1 | 0.4310 | 150.062 | 4.0 | 0.286 | 1 | Mask Off | Future |
| 1 | 1 | 0.1990 | 0.743 | 326933 | 0.359 | 0.006110 | 1 | 0.1370 | -10.401 | 1 | 0.0794 | 160.083 | 4.0 | 0.588 | 1 | Redbone | Childish Gambino |
| 2 | 2 | 0.0344 | 0.838 | 185707 | 0.412 | 0.000234 | 2 | 0.1590 | -7.148 | 1 | 0.2890 | 75.044 | 4.0 | 0.173 | 1 | Xanny Family | Future |
| 3 | 3 | 0.6040 | 0.494 | 199413 | 0.338 | 0.510000 | 5 | 0.0922 | -15.236 | 1 | 0.0261 | 86.468 | 4.0 | 0.230 | 1 | Master Of None | Beach House |
| 4 | 4 | 0.1800 | 0.678 | 392893 | 0.561 | 0.512000 | 5 | 0.4390 | -11.648 | 0 | 0.0694 | 174.004 | 4.0 | 0.904 | 1 | Parallel Lines | Junior Boys |
df.rename(columns={'Unnamed: 0': 'Index', 'target': 'LikedSong'}, inplace=True)
df.drop(columns='mode', inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2017 entries, 0 to 2016
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Index             2017 non-null   int64
 1   acousticness      2017 non-null   float64
 2   danceability      2017 non-null   float64
 3   duration_ms       2017 non-null   int64
 4   energy            2017 non-null   float64
 5   instrumentalness  2017 non-null   float64
 6   key               2017 non-null   int64
 7   liveness          2017 non-null   float64
 8   loudness          2017 non-null   float64
 9   speechiness       2017 non-null   float64
 10  tempo             2017 non-null   float64
 11  time_signature    2017 non-null   float64
 12  valence           2017 non-null   float64
 13  LikedSong         2017 non-null   int64
 14  song_title        2017 non-null   object
 15  artist            2017 non-null   object
dtypes: float64(10), int64(4), object(2)
memory usage: 252.3+ KB
# Columns 1–12 are the twelve numeric audio features (acousticness through valence);
# Index, LikedSong, song_title and artist are excluded from the feature matrix.
X = df[df.columns[1:13]]
y = df['LikedSong']
# Baseline: logistic regression on all twelve features, scored on the training data.
lr = LogisticRegression(max_iter=1000)
lr.fit(X, y)
print(lr.score(X, y))
0.5057015369360436
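The baseline fit on all twelve raw features barely beats chance (about 0.51 training accuracy). A minimal sanity-check sketch, assuming the same `X`, `y`, and classifier setup as above, is to standardize the features inside a pipeline and see how much of that gap is simply feature scaling (duration_ms, tempo, and loudness live on very different scales from the 0–1 audio features):

from sklearn.pipeline import make_pipeline

# Hypothetical check: the same classifier, but with the features standardized
# first so that large-scale columns do not dominate the solver.
scaled_lr = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
scaled_lr.fit(X, y)
print(scaled_lr.score(X, y))  # expected to land well above the unscaled ~0.51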
# Forward sequential feature selection: start from no features and add the best one at each step.
sfs = SFS(lr,
          k_features=7,
          forward=True,
          floating=False,
          scoring='accuracy',
          cv=0)
sfs.fit(X, y)
SequentialFeatureSelector(cv=0, estimator=LogisticRegression(max_iter=1000), k_features=(7, 7), scoring='accuracy')
print(sfs.subsets_[7]['feature_names'])
print(sfs.subsets_[7]['avg_score'])
('acousticness', 'danceability', 'instrumentalness', 'key', 'loudness', 'speechiness', 'time_signature')
0.6668319286068418
plot_sfs(sfs.get_metric_dict())
plt.show()
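Once forward selection has settled on its seven features, the fitted selector exposes them as `sfs.k_feature_names_`, so the reduced model can be refit directly. A short sketch, assuming the fitted `sfs` from above:

# Keep only the columns chosen by forward selection and refit on them.
X_sfs = X[list(sfs.k_feature_names_)]
lr_sfs = LogisticRegression(max_iter=1000)
lr_sfs.fit(X_sfs, y)
# With cv=0 the avg_score above is training accuracy, so this should match it.
print(lr_sfs.score(X_sfs, y))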
# Backward sequential feature selection: start from all features and remove the weakest at each step.
sbs = SFS(lr,
          k_features=7,
          forward=False,
          floating=False,
          scoring='accuracy',
          cv=0)
sbs.fit(X, y)
SequentialFeatureSelector(cv=0, estimator=LogisticRegression(max_iter=1000), forward=False, k_features=(7, 7), scoring='accuracy')
print(sbs.subsets_[7]['feature_names'])
print(sbs.subsets_[7]['avg_score'])
('acousticness', 'danceability', 'instrumentalness', 'key', 'loudness', 'speechiness', 'tempo')
0.6698066435299951
plot_sfs(sbs.get_metric_dict())
plt.show()
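Forward and backward selection agree on six of the seven features here; backward selection swaps `time_signature` for `tempo`. A quick sketch, using the two fitted selectors above, makes the overlap explicit:

forward_set = set(sfs.k_feature_names_)
backward_set = set(sbs.k_feature_names_)

print("selected by both:", sorted(forward_set & backward_set))
print("forward only:", sorted(forward_set - backward_set))
print("backward only:", sorted(backward_set - forward_set))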
# RFE ranks features by the magnitude of the logistic regression coefficients,
# so the features are standardized first to put them on a comparable scale.
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)

rfe = RFE(estimator=lr, n_features_to_select=7)
rfe.fit(X_normalized, y)
RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=7)
print(rfe.score(X_normalized, y))
0.6678235002478929
# Map the boolean support mask back to column names to see which features RFE kept.
feature_list = list(X.columns)
rfe_features = [f for (f, support) in zip(feature_list, rfe.support_) if support]
print(rfe_features)
['acousticness', 'danceability', 'duration_ms', 'instrumentalness', 'loudness', 'speechiness', 'valence']
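With three candidate subsets in hand, one reasonable closing step is to look at where the methods agree and how RFE ranked the columns it dropped. A sketch, assuming the fitted `sfs`, `sbs`, and `rfe` objects from above:

# Features chosen by forward SFS, backward SFS, and RFE alike.
common = set(sfs.k_feature_names_) & set(sbs.k_feature_names_) & set(rfe_features)
print(sorted(common))

# RFE also ranks eliminated features: rank 1 = kept, larger ranks were dropped earlier.
for rank, name in sorted(zip(rfe.ranking_, feature_list)):
    print(rank, name)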