import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('diabetes.csv')
df.head()
|   | Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
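These columns match the well-known Pima Indians Diabetes dataset, where Outcome = 1 indicates a diabetes diagnosis.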
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   Pregnancies               768 non-null    int64
 1   Glucose                   768 non-null    int64
 2   BloodPressure             768 non-null    int64
 3   SkinThickness             768 non-null    int64
 4   Insulin                   768 non-null    int64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64
 8   Outcome                   768 non-null    int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
df['Outcome'].value_counts()
Outcome
0    500
1    268
Name: count, dtype: int64
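One caveat worth flagging before modeling (an added check, not part of the original run): in this dataset a value of 0 in columns such as Glucose, BloodPressure, SkinThickness, Insulin, and BMI typically stands in for a missing measurement rather than a true zero, as the Insulin and SkinThickness zeros in the head() output above suggest. A minimal sketch to count them:

# Added check: count zeros in columns where 0 is not a plausible physical value
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
print((df[zero_cols] == 0).sum())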
features = df[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
label = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3)
scaler = StandardScaler()
# Fit the scaler on the training set only, then reuse it on the test set to avoid data leakage
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
model = LogisticRegression()
model.fit(X_train, y_train)
print(model.score(X_train, y_train))  # accuracy on the training set
print(model.score(X_test, y_test))    # accuracy on the held-out test set
0.7690875232774674
0.8008658008658008
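Because train_test_split above was called without a random_state, the split (and hence both scores) will change on every run. If reproducibility matters, a variant like the following could be used instead; the seed value 42 is arbitrary, and stratify=label keeps the 500/268 class ratio consistent across the two splits:

# Assumed variant: fixed seed for a reproducible split, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.3, random_state=42, stratify=label)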
y_pred = model.predict(X_test)
ComparisonDF = pd.DataFrame({'TestOutcomes': y_test, 'PredictedOutcomes': y_pred})
ComparisonDF.head()
|     | TestOutcomes | PredictedOutcomes |
|---|---|---|
| 157 | 0 | 0 |
| 425 | 1 | 1 |
| 605 | 0 | 0 |
| 127 | 0 | 0 |
| 700 | 0 | 0 |
# Label each row with the confusion-matrix cell it falls into
def assign_value(row):
    if row['TestOutcomes'] == 1 and row['PredictedOutcomes'] == 1:
        return 'True Positive'
    elif row['TestOutcomes'] == 0 and row['PredictedOutcomes'] == 0:
        return 'True Negative'
    elif row['TestOutcomes'] == 1 and row['PredictedOutcomes'] == 0:
        return 'False Negative'
    elif row['TestOutcomes'] == 0 and row['PredictedOutcomes'] == 1:
        return 'False Positive'
ComparisonDF['Confusion'] = ComparisonDF.apply(assign_value, axis=1)
ComparisonDF.head()
|     | TestOutcomes | PredictedOutcomes | Confusion |
|---|---|---|---|
| 157 | 0 | 0 | True Negative |
| 425 | 1 | 1 | True Positive |
| 605 | 0 | 0 | True Negative |
| 127 | 0 | 0 | True Negative |
| 700 | 0 | 0 | True Negative |
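As a quick cross-check (an added step), the counts of the four labels in the Confusion column should match the four cells of the confusion matrix computed next:

# Tally the four categories; totals should agree with confusion_matrix(y_test, y_pred)
print(ComparisonDF['Confusion'].value_counts())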
print('Confusion matrix: ')
test_conf_matrix = pd.DataFrame(
confusion_matrix(y_test, y_pred),
index=['Actual 0:', 'Actual 1:'],
columns=['Predicted 0:', 'Predicted 1:']
)
print(test_conf_matrix)
Confusion matrix: 
           Predicted 0:  Predicted 1:
Actual 0:           138            16
Actual 1:            30            47
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       154
           1       0.75      0.61      0.67        77

    accuracy                           0.80       231
   macro avg       0.78      0.75      0.76       231
weighted avg       0.80      0.80      0.80       231
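To tie the report back to the confusion matrix, the class-1 numbers can be re-derived by hand: precision = TP / (TP + FP) = 47 / (47 + 16) ≈ 0.75, and recall = TP / (TP + FN) = 47 / (47 + 30) ≈ 0.61, matching the report. The same check in code (an added verification; for binary labels, ravel() flattens the 2x2 matrix in tn, fp, fn, tp order):

# Recompute class-1 precision and recall from the raw matrix cells
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'precision_1 = {tp / (tp + fp):.2f}')  # 47 / 63 -> 0.75
print(f'recall_1    = {tp / (tp + fn):.2f}')  # 47 / 77 -> 0.61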