Logistic Regression Practice: Diabetes¶

A practice exercise using logistic regression to predict diabetes from a set of patient measurements¶

Import Libraries, File and Inspect Data¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')
In [2]:
df = pd.read_csv('diabetes.csv')
In [3]:
df.head()
Out[3]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB

Count Outcomes (0 = Non-Diabetic, 1 = Diabetic)¶

In [5]:
df['Outcome'].value_counts()
Out[5]:
Outcome
0    500
1    268
Name: count, dtype: int64
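The classes are imbalanced: 500 non-diabetic rows against 268 diabetic ones, so a model that always predicted 0 would already be right about 65% of the time. A minimal sketch of that majority-class baseline, computed from the counts above, gives a floor to compare the later accuracy scores against:

# Majority-class baseline: accuracy of always predicting the most common outcome
counts = df['Outcome'].value_counts()
baseline_accuracy = counts.max() / counts.sum()
print(f'Majority-class baseline accuracy: {baseline_accuracy:.3f}')  # roughly 0.651 for this data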

Set Up Training and Test Data¶

In [6]:
features = df[['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']]
label = df['Outcome']
In [7]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.3)  # hold out 30% of the rows for testing
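Because train_test_split shuffles the rows randomly, the scores below will vary slightly from run to run. One optional refinement, sketched here but not used for the results in this notebook, is to fix the random seed and stratify on the label so both splits keep the same class balance (the *_alt names are purely illustrative):

# Optional: a reproducible, stratified split (not the split used for the results below)
X_train_alt, X_test_alt, y_train_alt, y_test_alt = train_test_split(
    features, label, test_size=0.3, random_state=42, stratify=label
)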

Normalise the Data, Fit the Model and Score It (accuracy above 0.7 suggests a reasonable fit)¶

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # fit the scaler on the training data and transform it
X_test = scaler.transform(X_test)        # apply the same (training) scaling to the test data


model = LogisticRegression()
model.fit(X_train, y_train)


# Mean accuracy on the training data
print(model.score(X_train, y_train))


# Mean accuracy on the held-out test data
print(model.score(X_test, y_test))
0.7690875232774674
0.8008658008658008
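A single train/test split can come out optimistic or pessimistic by chance. As a sanity check on the "above 0.7" rule of thumb, a cross-validated score averages accuracy over several splits; this is a sketch and not part of the results above. Putting the scaler inside a pipeline ensures each fold is scaled using only its own training portion:

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# 5-fold cross-validated accuracy for the same scale-then-fit procedure
pipeline = make_pipeline(StandardScaler(), LogisticRegression())
cv_scores = cross_val_score(pipeline, features, label, cv=5, scoring='accuracy')
print(cv_scores.mean(), cv_scores.std())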

Comparing Outcomes of Test Data to Predicted Outcomes¶

In [9]:
y_pred = model.predict(X_test)
In [10]:
ComparisonDF = pd.DataFrame({'TestOutcomes': y_test, 'PredictedOutcomes': y_pred})
In [11]:
ComparisonDF.head()
Out[11]:
TestOutcomes PredictedOutcomes
157 0 0
425 1 1
605 0 0
127 0 0
700 0 0
In [12]:
def assign_value(row):
    if row['TestOutcomes'] == 1 and row['PredictedOutcomes'] == 1:
        return 'True Positive'
    elif row['TestOutcomes'] == 0 and row['PredictedOutcomes'] == 0:
        return 'True Negative'
    elif row['TestOutcomes'] == 1 and row['PredictedOutcomes'] == 0:
        return 'False Negative'
    elif row['TestOutcomes'] == 0 and row['PredictedOutcomes'] == 1:
        return 'False Positive'

ComparisonDF['Confusion'] = ComparisonDF.apply(assign_value, axis=1)

ComparisonDF.head()
Out[12]:
TestOutcomes PredictedOutcomes Confusion
157 0 0 True Negative
425 1 1 True Positive
605 0 0 True Negative
127 0 0 True Negative
700 0 0 True Negative
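Tallying the labelled rows gives the same four cell counts that the confusion matrix in the next section reports, and the share of matching rows reproduces the test accuracy; a quick check:

# Count how many rows fall into each confusion category
print(ComparisonDF['Confusion'].value_counts())

# Accuracy recomputed directly: fraction of rows where test and predicted outcomes agree
print((ComparisonDF['TestOutcomes'] == ComparisonDF['PredictedOutcomes']).mean())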

Accuracy Summary¶

In [13]:
print('Confusion matrix: ')

test_conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred), 
    index=['Actual 0:', 'Actual 1:'], 
    columns=['Predicted 0:', 'Predicted 1:']
)

print(test_conf_matrix)
Confusion matrix: 
           Predicted 0:  Predicted 1:
Actual 0:           138            16
Actual 1:            30            47
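Reading the matrix with true negatives = 138, false positives = 16, false negatives = 30 and true positives = 47, the headline metrics in the classification report below can be recomputed by hand:

tn, fp, fn, tp = 138, 16, 30, 47                            # values from the confusion matrix above
accuracy  = (tp + tn) / (tp + tn + fp + fn)                 # (47 + 138) / 231 ≈ 0.80
precision = tp / (tp + fp)                                  # 47 / 63 ≈ 0.75
recall    = tp / (tp + fn)                                  # 47 / 77 ≈ 0.61 (sensitivity for the diabetic class)
f1        = 2 * precision * recall / (precision + recall)   # ≈ 0.67
print(accuracy, precision, recall, f1)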
In [14]:
print(classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           0       0.82      0.90      0.86       154
           1       0.75      0.61      0.67        77

    accuracy                           0.80       231
   macro avg       0.78      0.75      0.76       231
weighted avg       0.80      0.80      0.80       231
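Since recall for the diabetic class is only 0.61, a common follow-up, sketched here but not applied above, is to inspect the predicted probabilities and lower the default 0.5 decision threshold, trading some precision for fewer missed diabetics. The 0.3 threshold below is an arbitrary illustrative value:

# Probability of class 1 (diabetic) for each test row
proba = model.predict_proba(X_test)[:, 1]

# Classify as diabetic when the probability exceeds a lower, hypothetical threshold of 0.3
y_pred_lower = (proba >= 0.3).astype(int)
print(classification_report(y_test, y_pred_lower))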