Wrapper Method Practice: Songs¶

Using a song-attributes dataset from Kaggle to practice feature selection with different wrapper methods¶

The dataset contains audio attributes for each song and a target indicating whether the listener liked it¶

https://www.kaggle.com/datasets/geomack/spotifyclassification/data

Import Libraries, Load the File, and Inspect the Data¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

import warnings
warnings.filterwarnings('ignore')
In [2]:
df = pd.read_csv("music.csv")

df.head()
Out[2]:
Unnamed: 0 acousticness danceability duration_ms energy instrumentalness key liveness loudness mode speechiness tempo time_signature valence target song_title artist
0 0 0.0102 0.833 204600 0.434 0.021900 2 0.1650 -8.795 1 0.4310 150.062 4.0 0.286 1 Mask Off Future
1 1 0.1990 0.743 326933 0.359 0.006110 1 0.1370 -10.401 1 0.0794 160.083 4.0 0.588 1 Redbone Childish Gambino
2 2 0.0344 0.838 185707 0.412 0.000234 2 0.1590 -7.148 1 0.2890 75.044 4.0 0.173 1 Xanny Family Future
3 3 0.6040 0.494 199413 0.338 0.510000 5 0.0922 -15.236 1 0.0261 86.468 4.0 0.230 1 Master Of None Beach House
4 4 0.1800 0.678 392893 0.561 0.512000 5 0.4390 -11.648 0 0.0694 174.004 4.0 0.904 1 Parallel Lines Junior Boys

Tidy the Data¶

In [3]:
df.rename(columns={'Unnamed: 0': 'Index', 'target': 'LikedSong'}, inplace=True)
df.drop(columns='mode', inplace=True)
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2017 entries, 0 to 2016
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Index             2017 non-null   int64  
 1   acousticness      2017 non-null   float64
 2   danceability      2017 non-null   float64
 3   duration_ms       2017 non-null   int64  
 4   energy            2017 non-null   float64
 5   instrumentalness  2017 non-null   float64
 6   key               2017 non-null   int64  
 7   liveness          2017 non-null   float64
 8   loudness          2017 non-null   float64
 9   speechiness       2017 non-null   float64
 10  tempo             2017 non-null   float64
 11  time_signature    2017 non-null   float64
 12  valence           2017 non-null   float64
 13  LikedSong         2017 non-null   int64  
 14  song_title        2017 non-null   object 
 15  artist            2017 non-null   object 
dtypes: float64(10), int64(4), object(2)
memory usage: 252.3+ KB

Separate the features and target, then score a baseline logistic regression model¶

In [5]:
X = df[df.columns[1:13]]  # the 12 audio features, acousticness through valence
y = df['LikedSong']
In [6]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X, y)
print(lr.score(X, y))  # training accuracy using all 12 features
0.5057015369360436
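For context, the classes here are roughly balanced, and 0.5057 matches the majority-class proportion, which suggests the baseline model is doing little better than always predicting the majority class. A quick sanity check (a minimal sketch, assuming y is the LikedSong column defined above):

In [ ]:
# Class balance: with a roughly 50/50 split, 0.5057 accuracy is near chance
print(y.value_counts(normalize=True))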

Sequential Forward Selection¶

In [7]:
sfs = SFS(lr,
          k_features=7,        # stop once 7 features are selected
          forward=True,        # forward selection: add one feature at a time
          floating=False,      # no floating step (no conditional removals)
          scoring='accuracy',
          cv=0)                # no cross-validation: scores are on the training data
In [8]:
sfs.fit(X, y)
Out[8]:
SequentialFeatureSelector(cv=0, estimator=LogisticRegression(max_iter=1000),
                          k_features=(7, 7), scoring='accuracy')
In [9]:
print(sfs.subsets_[7]['feature_names'])

print(sfs.subsets_[7]['avg_score'])
('acousticness', 'danceability', 'instrumentalness', 'key', 'loudness', 'speechiness', 'time_signature')
0.6668319286068418
Note that the accuracy of the model improves when using a subset of the features (0.6668 > 0.5057)¶
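To use the selected subset downstream, the fitted selector can reduce X to those seven columns. A minimal sketch using mlxtend's transform method:

In [ ]:
# Reduce X to the 7 selected features (returns a NumPy array)
X_sfs = sfs.transform(X)
print(X_sfs.shape)  # expected: (2017, 7)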

Plot the model accuracy as a function of the number of features used¶

In [10]:
plot_sfs(sfs.get_metric_dict())
plt.show()
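plot_sfs draws on the current matplotlib figure, so a title and grid can be added before plt.show() (a sketch following the usual mlxtend pattern; kind='std_dev' shades the score standard deviation, which is zero here since cv=0):

In [ ]:
plot_sfs(sfs.get_metric_dict(), kind='std_dev')
plt.title('Sequential Forward Selection')
plt.grid()
plt.show()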

Sequential Backward Selection¶

In [11]:
sbs = SFS(lr,
          k_features=7,
          forward=False,       # backward: start with all features, remove one at a time
          floating=False,
          scoring='accuracy',
          cv=0)
In [12]:
sbs.fit(X, y)
Out[12]:
SequentialFeatureSelector(cv=0, estimator=LogisticRegression(max_iter=1000),
                          forward=False, k_features=(7, 7), scoring='accuracy')
In [13]:
print(sbs.subsets_[7]['feature_names'])

print(sbs.subsets_[7]['avg_score'])
('acousticness', 'danceability', 'instrumentalness', 'key', 'loudness', 'speechiness', 'tempo')
0.6698066435299951
Note that SBS finds a subset that includes the tempo feature, while SFS found a similar subset with time_signature in place of tempo¶

The accuracy of the model is slightly improved by using backward selection (0.6698 > 0.6668)¶
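The two subsets can also be compared programmatically. A small sketch using the subsets_ dictionaries fitted above:

In [ ]:
# Symmetric difference of the two 7-feature subsets; given the outputs above,
# this should be {'tempo', 'time_signature'}
fwd = set(sfs.subsets_[7]['feature_names'])
bwd = set(sbs.subsets_[7]['feature_names'])
print(fwd ^ bwd)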

In [14]:
plot_sfs(sbs.get_metric_dict())
plt.show()

Recursive Feature Elimination¶

The features have to be standardized for this method, since RFE ranks them by the magnitude of the model coefficients¶

In [15]:
rfe = RFE(estimator=lr, n_features_to_select=7)
In [16]:
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)
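Note that StandardScaler.fit_transform returns a NumPy array, so the feature names are lost (hence the feature_list workaround below). If keeping the names is preferred, one alternative (a sketch) is to wrap the scaled values back into a DataFrame:

In [ ]:
# Optional alternative: preserve column names after scaling
X_normalized = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)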
In [17]:
rfe.fit(X_normalized, y)
Out[17]:
RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=7)
In [18]:
print(rfe.score(X_normalized, y))
0.6678235002478929
In [19]:
feature_list = list(X.columns)
In [20]:
# support_ is a boolean mask aligned with the original feature order
rfe_features = [f for (f, support) in zip(feature_list, rfe.support_) if support]
In [21]:
print(rfe_features)
['acousticness', 'danceability', 'duration_ms', 'instrumentalness', 'loudness', 'speechiness', 'valence']
Note that the RFE accuracy (0.6678) falls between the SFS (0.6668) and SBS (0.6698) results¶
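RFE also records the order in which features were eliminated via its ranking_ attribute (selected features get rank 1). A sketch for inspecting it:

In [ ]:
# ranking_ assigns 1 to kept features; larger numbers were eliminated earlier
for feature, rank in sorted(zip(feature_list, rfe.ranking_), key=lambda t: t[1]):
    print(rank, feature)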