import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import warnings
# Silence every warning category for the rest of the session so that library
# notices do not clutter the cell outputs.
warnings.simplefilter('ignore')
# Load the seeds dataset from the CSV file in the working directory.
seeds = pd.read_csv(filepath_or_buffer='seeds.csv')
# Preview the first five rows (five is also the default for head()).
seeds.head(n=5)
ID | area | perimeter | compactness | lengthOfKernel | widthOfKernel | asymmetryCoefficient | lengthOfKernelGroove | seedType | |
---|---|---|---|---|---|---|---|---|---|
0 | 1 | 15.26 | 14.84 | 0.8710 | 5.763 | 3.312 | 2.221 | 5.220 | 1 |
1 | 2 | 14.88 | 14.57 | 0.8811 | 5.554 | 3.333 | 1.018 | 4.956 | 1 |
2 | 3 | 14.29 | 14.09 | 0.9050 | 5.291 | 3.337 | 2.699 | 4.825 | 1 |
3 | 4 | 13.84 | 13.94 | 0.8955 | 5.324 | 3.379 | 2.259 | 4.805 | 1 |
4 | 5 | 16.14 | 14.99 | 0.9034 | 5.658 | 3.562 | 1.355 | 5.175 | 1 |
# Tabulate how many samples there are of each seed type.
seed_type_column = seeds["seedType"]
pd.crosstab(index=seed_type_column, columns="count")
col_0 | count |
---|---|
seedType | |
1 | 70 |
2 | 70 |
3 | 70 |
The dataset contains 210 seed samples, 70 of each seed type.
# Columns fed to the model; edit this list to try a different feature set.
# Exactly three are used so the data can be drawn in a 3D scatter plot.
features = [
    'lengthOfKernel',
    'asymmetryCoefficient',
    'lengthOfKernelGroove',
]
I wanted to use only 3 features to make visualisation easy (more could obviously be used, and Principal Component Analysis could have helped decide which). Keeping the feature names in a single list lets me change the features used in the model in one place.
# 3D scatter plot of the three selected features, one colour per seed type.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

for seed_type, color in zip(seeds['seedType'].unique(), ['r', 'g', 'b']):
    # Select this seed type's rows once instead of re-filtering for each axis.
    subset = seeds.loc[seeds['seedType'] == seed_type, features]
    x = subset[features[0]]
    # Named y_data (not y) so that re-running this cell cannot overwrite the
    # seed-type labels that are stored in `y` further down.
    y_data = subset[features[1]]
    z = subset[features[2]]
    ax.scatter(x, y_data, z, c=color, label=f'Seed Type: {seed_type}')

ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel(features[2])
ax.legend()
plt.show()
# Feature matrix (the selected columns) and the known seed-type labels.
X = seeds.loc[:, features]
y = seeds.loc[:, 'seedType']

# Standardise the features so each one contributes comparably to the
# distances K-means works with.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Three clusters, one per seed type; the fixed seed keeps the run repeatable.
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_scaled)
KMeans(n_clusters=3, random_state=42)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
KMeans(n_clusters=3, random_state=42)
# Centroids found by K-means. The model was fitted on X_scaled, so these
# coordinates are in standardised units, with columns in `features` order.
cluster_centers = kmeans.cluster_centers_
print(cluster_centers)
[[-0.41346401 -0.78767024 -0.64725959] [ 1.21053806 -0.03869657 1.27322065] [-0.83969739 1.00570427 -0.62359456]]
# 3D scatter plot of the standardised features, coloured by true seed type,
# with the K-means cluster centres overlaid.
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')

for seed_type, color in zip(y.unique(), ['r', 'g', 'b']):
    # Boolean row mask for this seed type, computed once per iteration.
    mask = (y == seed_type).to_numpy()
    x = X_scaled[mask, 0]
    y_data = X_scaled[mask, 1]  # y_data, so the label Series `y` is untouched
    z = X_scaled[mask, 2]
    ax.scatter(x, y_data, z, c=color, label=f'Seed Type: {seed_type}')

ax.scatter(cluster_centers[:, 0], cluster_centers[:, 1], cluster_centers[:, 2],
           c='black', marker='x', s=300, label='Cluster Centers')

ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel(features[2])
# Build the legend only after every artist has been added; calling it before
# the centre markers were drawn left 'Cluster Centers' out of the legend.
ax.legend()

# Show the plot
plt.show()
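Note: the model produces cluster labels (0, 1 or 2) but the seed types are 1, 2 or 3, so the cluster labels are converted to seed types below (in this run that amounts to adding 1). Bear in mind that K-means numbers its clusters arbitrarily, so cluster 0 lining up with seed type 1 is a property of this particular fit and is not guaranteed in general.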
# Cluster index assigned to each training sample.
predicted_labels = kmeans.labels_

# K-means numbers its clusters arbitrarily, so "cluster i is seed type i + 1"
# only holds by coincidence. Instead, map every cluster to the seed type that
# occurs most often among its members (rows: cluster, columns: seed type).
# NOTE: with a poor fit two clusters could share the same majority seed type.
cluster_to_seed_type = pd.crosstab(predicted_labels, y.to_numpy()).idxmax(axis=1)

predicted_seed_types = cluster_to_seed_type.loc[predicted_labels].to_numpy()
print(predicted_seed_types)
[1 1 1 1 1 1 1 1 2 1 3 1 3 1 1 3 3 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 2 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 3 3 3 3 3 3 1 3 1 3 3 3 3 1 3 3 1 3 3 3 1 3 3 3 3 1 3 1 3 1 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 3 3 3 1 1 3 1 3 3 3 1 3 3 1 3]
# Display the true seed-type labels (the seedType column of the dataset).
print(y)
0 1 1 1 2 1 3 1 4 1 .. 205 3 206 3 207 3 208 3 209 3 Name: seedType, Length: 210, dtype: int64
# Fraction of samples whose predicted seed type equals the true seed type.
accuracy = accuracy_score(y_true=y, y_pred=predicted_seed_types)
print("Accuracy:", accuracy)
Accuracy: 0.8714285714285714
# Measurements of an unseen seed, given in the same order as `features`.
newSeedFeatures = [6,2,6] #['lengthOfKernel','asymmetryCoefficient','lengthOfKernelGroove']

# The scaler was fitted on a DataFrame with named columns, so pass the new
# sample in the same shape rather than as a bare list.
newSeedFeaturesScaled = scaler.transform(
    pd.DataFrame([newSeedFeatures], columns=features)
)
predictedCluster = kmeans.predict(newSeedFeaturesScaled)

# Cluster numbering is arbitrary, so translate the cluster into a seed type via
# the majority seed type of the training samples in that cluster instead of
# assuming cluster i corresponds to seed type i + 1.
cluster_to_seed_type = pd.crosstab(kmeans.labels_, y.to_numpy()).idxmax(axis=1)
seedType = cluster_to_seed_type.loc[predictedCluster].to_numpy()
print("Predicted seed type:", seedType)
Predicted seed type: [2]
# Elbow plot: fit K-means for k = 1..8 and record the inertia (within-cluster
# sum of squared distances) of each fit to judge a sensible cluster count.
num_clusters = list(range(1, 9))
inertias = []

for k in num_clusters:
    # Same seed as the main model so the curve is reproducible between runs.
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(X_scaled)
    inertias.append(model.inertia_)

plt.plot(num_clusters, inertias, '-o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.show()