import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings('ignore')


# Load the diabetes dataset from a CSV file expected in the working directory.
df = pd.read_csv('diabetes.csv')


# Preview the first rows; as a bare expression this only renders output
# when it is the last statement of a notebook cell.
df.head()


# Use a larger base font so the axis labels of the small multiples stay legible.
plt.rcParams['font.size'] = 12

# Draw one histogram per column of the dataframe on a shared figure.
df.hist(figsize=(12, 8))
plt.show()


# Predictors are the first eight columns; the ninth column is the target.
X = df.iloc[:, :8]
y = df.iloc[:, 8]


# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


# Baseline model: a decision tree with no depth limit specified, seeded for
# repeatable results, fitted on the training split only.
classifier = tree.DecisionTreeClassifier(random_state=0)
classifier.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=0)

DecisionTreeClassifier(random_state=0)


# Render the fitted tree on a large canvas so individual nodes remain readable.
plt.figure(figsize=(24, 16))
tree.plot_tree(decision_tree=classifier)
plt.show()


# Candidate values for max_depth, shared by the training loop and the plot
# so the x-axis can never drift out of sync with the scores.
k_list = range(1, 8)

# Fit one tree per candidate depth and record its accuracy on the held-out split.
# NOTE(review): the depth is being compared on the test split itself, so the
# best score is an optimistic estimate; consider a validation split or
# cross-validation if the chosen depth is reported as final performance.
accuracies = []
for k in k_list:
    classifier = tree.DecisionTreeClassifier(random_state=0, max_depth=k)
    classifier.fit(X_train, y_train)
    accuracies.append(classifier.score(X_test, y_test))


# Plot accuracy against depth. The title now matches what is measured:
# a classifier's accuracy as max_depth varies (not regression, not width).
plt.plot(k_list, accuracies)
plt.xlabel('Max Depth of Tree')
plt.ylabel('Accuracy Score')
plt.title('Classification Accuracy using Different Decision Tree Depths')
plt.show()


# Refit the final model with the tree depth capped at 7, using the same seed
# and the same training split as before.
classifier = tree.DecisionTreeClassifier(max_depth=7, random_state=0)
classifier.fit(X=X_train, y=y_train)

DecisionTreeClassifier(max_depth=7, random_state=0)

DecisionTreeClassifier(max_depth=7, random_state=0)


# Render the depth-limited tree on a large canvas for inspection.
plt.figure(figsize=(24, 16))
tree.plot_tree(decision_tree=classifier)
plt.show()


# Evaluate on the held-out test split only. Predicting on the full dataset
# would mix in the rows the tree was trained on and overstate performance.
y_pred = classifier.predict(X_test)

print('Confusion matrix: ')

# Rows are the true classes, columns the predicted classes. The labels are
# pinned to [0, 1] so the matrix is always 2x2 and lines up with the row and
# column captions below, even if a class were absent from the test split.
test_conf_matrix = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=['Actual 0:', 'Actual 1:'],
    columns=['Predicted 0:', 'Predicted 1:'],
)

print(test_conf_matrix)

Confusion matrix: 
           Predicted 0:  Predicted 1:
Actual 0:           471            29
Actual 1:            73           195


# Display the column labels of the dataframe (renders only as the last
# expression of a notebook cell).
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


# Feature order expected by the model:
# 'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
# 'BMI', 'DiabetesPedigreeFunction', 'Age'   ('Outcome' is the target)

# A single hypothetical patient, one value per feature in the order above.
NewPatient = [[2, 135, 70, 20, 0, 30.0, 0.48, 35]]

# The classifier was fitted on a DataFrame with named columns, so wrap the
# raw values in a frame carrying the same column labels. This keeps the
# input explicitly aligned with the training features instead of relying on
# a bare nested list (which scikit-learn flags with a feature-name warning).
new_patient_frame = pd.DataFrame(NewPatient, columns=X.columns)

predict = classifier.predict(new_patient_frame)

# predict is a one-element array; test its single entry rather than relying
# on the truthiness of an array comparison.
if predict[0] == 0:
    print("The model predicts you may not have diabetes.")
else:
    print("The model predicts you may have diabetes.")

The model predicts you may not have diabetes.

	Pregnancies	Glucose	BloodPressure	SkinThickness	Insulin	BMI	DiabetesPedigreeFunction	Age	Outcome
0	6	148	72	35	0	33.6	0.627	50	1
1	1	85	66	29	0	26.6	0.351	31	0
2	8	183	64	0	0	23.3	0.672	32	1
3	1	89	66	23	94	28.1	0.167	21	0
4	0	137	40	35	168	43.1	2.288	33	1

Decision Tree Classification Practice: Diabetes

Practice using a Decision Tree to predict diabetes from a set of patient data

Import Libraries, Load the File and Inspect the Data

Histograms for each variable

Separate data into predictor and target datasets, then split into training and test sets

Build the Decision Tree

Checking the accuracy of the model as the max depth varies

Rebuild the tree with a max depth of 7

Observe the Confusion Matrix

Test the Model with a new patient