K-Means Clustering Practice¶

I used a dataset containing seed features and seed types (found here: https://data.world/databeats/seeds) to practice K-Means Clustering¶

Import Libraries¶

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.cluster import KMeans 
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
In [2]:
import warnings

warnings.filterwarnings('ignore')

Import and Inspect File¶

In [3]:
seeds = pd.read_csv('seeds.csv')
In [4]:
seeds.head()
Out[4]:
ID area perimeter compactness lengthOfKernel widthOfKernel asymmetryCoefficient lengthOfKernelGroove seedType
0 1 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 1
1 2 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 1
2 3 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 1
3 4 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 1
4 5 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 1
In [5]:
pd.crosstab(index=seeds["seedType"],columns="count")
Out[5]:
col_0 count
seedType
1 70
2 70
3 70

The dataset contains 210 seed samples: seventy of each of the three seed types.

Feature List¶

In [6]:
features = ['lengthOfKernel','asymmetryCoefficient','lengthOfKernelGroove']

I wanted to use only 3 features to make visualisation easy (more could obviously be used, and Principal Component Analysis could help determine which). Creating a features list makes it easy to swap which features the model uses.
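For reference, here is a minimal sketch of how PCA could inform that choice, assuming the column names shown in seeds.head() above: the explained-variance ratios and component loadings indicate which measurements dominate the directions of greatest variance.

from sklearn.decomposition import PCA

# Standardise all seven measurements, then inspect the principal components
measurements = seeds.drop(columns=['ID', 'seedType'])
pca = PCA().fit(StandardScaler().fit_transform(measurements))

print(pca.explained_variance_ratio_.round(3))  # share of variance per component
print(pd.DataFrame(pca.components_, columns=measurements.columns).round(2))  # loadings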

Scatter Plot of the Seeds Given The Feature List¶

In [7]:
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

for seed_type, color in zip(seeds['seedType'].unique(), ['r', 'g', 'b']):
    x = seeds.loc[seeds['seedType'] == seed_type, features[0]]
    y = seeds.loc[seeds['seedType'] == seed_type, features[1]]
    z = seeds.loc[seeds['seedType'] == seed_type, features[2]]
    ax.scatter(x, y, z, c=color, label=f'Seed Type: {seed_type}')

ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel(features[2])
ax.legend()

plt.show()

Prepare the Data, and Run the K-Means Model¶

In [8]:
X = seeds[features]
y = seeds['seedType']
In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
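As a quick sanity check, each scaled column should now have mean ≈ 0 and standard deviation ≈ 1:

print(X_scaled.mean(axis=0).round(3))  # expect values near 0
print(X_scaled.std(axis=0).round(3))   # expect values near 1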
In [10]:
kmeans = KMeans(n_clusters=3, random_state=42)
In [11]:
kmeans.fit(X_scaled)
Out[11]:
KMeans(n_clusters=3, random_state=42)
In [12]:
cluster_centers = kmeans.cluster_centers_
print(cluster_centers)
[[-0.41346401 -0.78767024 -0.64725959]
 [ 1.21053806 -0.03869657  1.27322065]
 [-0.83969739  1.00570427 -0.62359456]]
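These centres are in standardised units. To read them in the original measurement units, they can be mapped back through the scaler (a minimal sketch):

centers_original = scaler.inverse_transform(cluster_centers)
print(pd.DataFrame(centers_original, columns=features).round(3))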

Another Scatter Plot Allows Observation of the Centroids¶

In [13]:
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

# Plot the standardised data points, coloured by their true seed type
for seed_type, color in zip(y.unique(), ['r', 'g', 'b']):
    x = X_scaled[y == seed_type, 0]
    y_data = X_scaled[y == seed_type, 1]
    z = X_scaled[y == seed_type, 2]
    ax.scatter(x, y_data, z, c=color, label=f'Seed Type: {seed_type}')

ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel(features[2])

# Mark the fitted cluster centres, then draw the legend so they appear in it
ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], cluster_centers[:, 2],
           c='black', marker='x', s=300, label='Cluster Centers')
ax.legend()

plt.show()

Determine the Accuracy of the Model (Comparing Predicted Seed Types to Actual Data)¶

Note: 1 has been added to the predicted labels below, as the model labels clusters 0, 1, or 2, while the actual seed types are 1, 2, or 3.

In [14]:
predicted_labels = kmeans.labels_
predicted_seed_types = predicted_labels + 1
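Adding 1 only works because, with random_state=42, clusters 0, 1 and 2 happen to line up with seed types 1, 2 and 3; K-Means cluster numbers are otherwise arbitrary. A more robust sketch maps each cluster to the most common true seed type among its members:

# Map each cluster label to the majority seed type within that cluster
mapping = {c: y[predicted_labels == c].mode().iloc[0]
           for c in np.unique(predicted_labels)}
mapped_seed_types = np.array([mapping[c] for c in predicted_labels])
# Here mapped_seed_types matches predicted_labels + 1, but the mapping
# also works when the cluster numbering doesn't line up by chance.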
In [15]:
print(predicted_seed_types)
[1 1 1 1 1 1 1 1 2 1 3 1 3 1 1 3 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 3 1 1 1 2 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 3 3 3 3 3 3 1 3
 1 3 3 3 3 1 3 3 1 3 3 3 1 3 3 3 3 1 3 1 3 1 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3
 3 3 3 3 3 3 3 1 3 3 3 3 3 1 1 3 1 3 3 3 1 3 3 1 3]
In [16]:
print(y)
0      1
1      1
2      1
3      1
4      1
      ..
205    3
206    3
207    3
208    3
209    3
Name: seedType, Length: 210, dtype: int64
In [17]:
accuracy = accuracy_score(y, predicted_seed_types)
In [18]:
print("Accuracy:", accuracy)
Accuracy: 0.8714285714285714
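Since accuracy depends on the cluster-to-label alignment discussed above, the adjusted Rand index is a useful complementary check: it scores the agreement between the clustering and the true labels regardless of how the clusters are numbered (a minimal sketch):

from sklearn.metrics import adjusted_rand_score

# 1.0 means perfect agreement; values near 0 mean chance-level agreement
print("ARI:", adjusted_rand_score(y, predicted_labels))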

Making a Prediction for a New Seed¶

In [19]:
newSeedFeatures = [6, 2, 6]  # ['lengthOfKernel','asymmetryCoefficient','lengthOfKernelGroove']

# Wrap in a DataFrame with the same column names the scaler was fitted on,
# then scale with that same fitted scaler
newSeedFeaturesScaled = scaler.transform(pd.DataFrame([newSeedFeatures], columns=features))

predictedCluster = kmeans.predict(newSeedFeaturesScaled)

# +1 converts the cluster label back to a seed type (see the note above)
seedType = predictedCluster + 1

print("Predicted seed type:", seedType)
Predicted seed type: [2]

If We Hadn't Known There Were 3 Clusters, We Could Have Estimated This By Finding the Elbow on an Inertia Plot¶

In [20]:
num_clusters = list(range(1, 9))
inertias = []
In [21]:
for k in num_clusters:
    # Fit K-Means for each candidate k and record its inertia
    # (the within-cluster sum of squared distances)
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(X_scaled)
    inertias.append(model.inertia_)
In [22]:
plt.plot(num_clusters, inertias, '-o')

plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')

plt.show()
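The elbow can be ambiguous, so a complementary check is the silhouette score, which peaks at the k whose clusters are most cohesive and well separated (a minimal sketch; the score is only defined for k ≥ 2):

from sklearn.metrics import silhouette_score

for k in range(2, 9):
    labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled)
    print(k, round(silhouette_score(X_scaled, labels), 3))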