# importing libraries
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.cluster import DBSCAN
from numpy import unique
from numpy import where
from matplotlib import pyplot
# Load the CSV export, then keep only the rows whose mean_reaction_time
# is different from zero.
dataset = pd.read_csv('All.csv')
dataset = dataset.loc[dataset['mean_reaction_time'] != 0]
dataset.head()
# Summary statistics of the filtered data
dataset.describe()
id | child_gender | total_correct_responses | correct_responses | commission_errors | omission_errors | child_age | mean_reaction_time | total_duration | percentage_no_of_correct_responses | CER | OER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 107.000000 | 107.000000 | 107.000000 | 107.000000 | 107.000000 | 107.000000 | 107.000000 | 107.000000 | 107.000000 | 107.000000 | 107.000000 | 107.000000 |
mean | 86.850467 | 1.579439 | 12.654206 | 11.542056 | 1.046729 | 1.112150 | 4.476636 | 1339.093458 | 75985.299065 | 90.083684 | 0.196427 | 0.420396 |
std | 60.863566 | 0.495972 | 5.005851 | 5.053077 | 2.071292 | 1.475121 | 0.501804 | 258.155523 | 25388.761189 | 14.347553 | 0.339745 | 0.449506 |
min | 1.000000 | 1.000000 | 4.000000 | 1.000000 | 0.000000 | 0.000000 | 4.000000 | 823.000000 | 25196.000000 | 12.500000 | 0.000000 | 0.000000 |
25% | 27.500000 | 1.000000 | 10.000000 | 8.000000 | 0.000000 | 0.000000 | 4.000000 | 1154.500000 | 60000.000000 | 85.438596 | 0.000000 | 0.000000 |
50% | 109.000000 | 2.000000 | 12.000000 | 10.000000 | 0.000000 | 1.000000 | 4.000000 | 1303.000000 | 72500.000000 | 94.736842 | 0.000000 | 0.250000 |
75% | 135.500000 | 2.000000 | 17.000000 | 13.000000 | 2.000000 | 2.000000 | 5.000000 | 1499.500000 | 86764.500000 | 100.000000 | 0.400000 | 1.000000 |
max | 216.000000 | 2.000000 | 34.000000 | 33.000000 | 11.000000 | 7.000000 | 5.000000 | 2032.000000 | 220070.000000 | 100.000000 | 1.000000 | 1.000000 |
# Feature matrix used for clustering: one row per record, with the child's
# age in column 0 and the mean reaction time in column 1.
# The columns are picked by name (they sit at positions 12 and 13 of the
# CSV) so the selection does not silently change if the column order does.
x = dataset[['child_age', 'mean_reaction_time']].values
display(x)
array([[ 4, 1479], [ 4, 1605], [ 4, 1404], [ 4, 1782], [ 4, 1258], [ 4, 1043], [ 4, 1267], [ 4, 1439], [ 4, 1614], [ 4, 1540], [ 4, 1076], [ 4, 1303], [ 4, 1384], [ 4, 1191], [ 4, 1335], [ 4, 1253], [ 4, 1239], [ 4, 1109], [ 4, 952], [ 4, 928], [ 4, 1428], [ 4, 1115], [ 4, 1448], [ 4, 1331], [ 4, 1426], [ 4, 1632], [ 4, 1340], [ 4, 1564], [ 4, 1366], [ 4, 1291], [ 4, 2032], [ 4, 1789], [ 4, 1680], [ 4, 1317], [ 4, 1040], [ 4, 1142], [ 4, 1168], [ 4, 1150], [ 4, 1270], [ 4, 1457], [ 4, 1180], [ 4, 1261], [ 4, 1234], [ 4, 1165], [ 4, 1238], [ 4, 1830], [ 4, 1657], [ 4, 1817], [ 4, 1500], [ 4, 1472], [ 4, 1523], [ 4, 1267], [ 4, 1501], [ 4, 1369], [ 4, 998], [ 4, 1655], [ 5, 1144], [ 5, 1366], [ 5, 1525], [ 5, 1146], [ 5, 1135], [ 5, 1185], [ 5, 1375], [ 5, 912], [ 5, 823], [ 5, 1506], [ 5, 1489], [ 5, 1157], [ 5, 1097], [ 5, 1160], [ 5, 1053], [ 5, 953], [ 5, 1303], [ 5, 1257], [ 5, 1008], [ 5, 1188], [ 5, 1600], [ 5, 1396], [ 5, 1380], [ 5, 1350], [ 5, 1310], [ 5, 1462], [ 5, 1069], [ 5, 1221], [ 5, 1775], [ 5, 1852], [ 5, 1598], [ 5, 1785], [ 5, 1628], [ 5, 1758], [ 5, 1215], [ 5, 1134], [ 5, 1364], [ 5, 1499], [ 5, 1998], [ 5, 1916], [ 5, 1152], [ 5, 1086], [ 5, 1207], [ 5, 1047], [ 5, 1162], [ 5, 1278], [ 5, 1296], [ 5, 1173], [ 5, 851], [ 5, 1029], [ 5, 1056]], dtype=int64)
# Standardize both features so that each column has zero mean and unit
# variance; the fitted scaler is kept in `scaler`.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x)
x = scaler.transform(x)
# Summary statistics of the scaled features.
# NOTE: StandardScaler divides by the population standard deviation while
# describe() reports the sample standard deviation, so the "std" row comes
# out slightly above 1.
pd.DataFrame(x).describe()
0 | 1 | |
---|---|---|
count | 1.070000e+02 | 1.070000e+02 |
mean | 1.079095e-16 | 3.029768e-16 |
std | 1.004706e+00 | 1.004706e+00 |
min | -9.543135e-01 | -2.008565e+00 |
25% | -9.543135e-01 | -7.184124e-01 |
50% | -9.543135e-01 | -1.404708e-01 |
75% | 1.047874e+00 | 6.242803e-01 |
max | 1.047874e+00 | 2.696697e+00 |
# Find a suitable number of clusters using the elbow method
from sklearn.cluster import KMeans

wcss_list = []  # within-cluster sum of squares (inertia_) for each k

# Fit K-means for k = 1..10 and record the inertia of every fit.
for i in range(1, 11):
    # n_init is set explicitly: its default differs between scikit-learn
    # releases, which would otherwise change the curve between environments.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

# Plot inertia against k; the "elbow" of the curve suggests the cluster count.
mtp.plot(range(1, 11), wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('wcss_list')
mtp.show()
# Train the final K-means model (4 clusters) on the standardized features.
# n_init is set explicitly because its default differs between scikit-learn
# releases; random_state keeps the labelling reproducible.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
y_predict = kmeans.fit_predict(x)
print(y_predict)

# Visualize the clusters. Column 0 of x is the (standardized) child age and
# column 1 is the (standardized) mean reaction time, so age goes on the
# x-axis and reaction time on the y-axis.
cluster_colors = ['blue', 'green', 'red', 'deeppink']
for cluster_id, color in enumerate(cluster_colors):
    members = x[y_predict == cluster_id]
    mtp.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label='Cluster {}'.format(cluster_id + 1))

# Overlay the fitted cluster centres.
centroids = kmeans.cluster_centers_
mtp.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroid')
mtp.title('Clusters of children')
mtp.xlabel('Age')
mtp.ylabel('Mean Reaction Time')
mtp.legend()
mtp.show()
[1 1 1 1 3 3 3 1 1 1 3 3 3 3 3 3 3 3 3 3 1 3 1 3 1 1 3 1 3 3 1 1 1 3 3 3 3 3 3 1 3 3 3 3 3 1 1 1 1 1 1 3 1 3 3 1 0 0 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 2 0 0 2 2 2 2 2 2 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0]
# Attach the predicted cluster label to the unscaled age / reaction-time
# columns. The columns are selected by name (positions 12 and 13 of the CSV)
# and copied so that adding 'clusters' does not touch `dataset`.
new_df = dataset[['child_age', 'mean_reaction_time']].copy()
new_df['clusters'] = y_predict
display(new_df)
child_age | mean_reaction_time | clusters | |
---|---|---|---|
0 | 4 | 1479 | 1 |
1 | 4 | 1605 | 1 |
2 | 4 | 1404 | 1 |
3 | 4 | 1782 | 1 |
4 | 4 | 1258 | 3 |
... | ... | ... | ... |
211 | 5 | 1296 | 0 |
212 | 5 | 1173 | 0 |
213 | 5 | 851 | 0 |
214 | 5 | 1029 | 0 |
215 | 5 | 1056 | 0 |
107 rows × 3 columns
# Number of rows labelled as cluster 0
new_df.loc[new_df["clusters"] == 0].shape[0]
37
# Rows labelled 0 by K-means, and the range of their mean reaction times
cluster_1 = new_df.loc[new_df["clusters"] == 0]
minVal, maxVal = (
    cluster_1['mean_reaction_time'].min(),
    cluster_1['mean_reaction_time'].max(),
)
print(minVal)
print(maxVal)
823 1396
# Show the members of cluster_1, then draw a box plot of their
# mean reaction times (grid lines switched off).
display(cluster_1)
cluster_1.boxplot(grid=False, column=['mean_reaction_time'])
child_age | mean_reaction_time | clusters | |
---|---|---|---|
111 | 5 | 1144 | 0 |
112 | 5 | 1366 | 0 |
114 | 5 | 1146 | 0 |
115 | 5 | 1135 | 0 |
116 | 5 | 1185 | 0 |
117 | 5 | 1375 | 0 |
118 | 5 | 912 | 0 |
119 | 5 | 823 | 0 |
122 | 5 | 1157 | 0 |
123 | 5 | 1097 | 0 |
124 | 5 | 1160 | 0 |
125 | 5 | 1053 | 0 |
126 | 5 | 953 | 0 |
127 | 5 | 1303 | 0 |
128 | 5 | 1257 | 0 |
129 | 5 | 1008 | 0 |
130 | 5 | 1188 | 0 |
132 | 5 | 1396 | 0 |
133 | 5 | 1380 | 0 |
134 | 5 | 1350 | 0 |
135 | 5 | 1310 | 0 |
137 | 5 | 1069 | 0 |
138 | 5 | 1221 | 0 |
145 | 5 | 1215 | 0 |
146 | 5 | 1134 | 0 |
147 | 5 | 1364 | 0 |
151 | 5 | 1152 | 0 |
152 | 5 | 1086 | 0 |
153 | 5 | 1207 | 0 |
154 | 5 | 1047 | 0 |
155 | 5 | 1162 | 0 |
156 | 5 | 1278 | 0 |
211 | 5 | 1296 | 0 |
212 | 5 | 1173 | 0 |
213 | 5 | 851 | 0 |
214 | 5 | 1029 | 0 |
215 | 5 | 1056 | 0 |
<AxesSubplot:>
# Number of rows labelled as cluster 1
new_df.loc[new_df["clusters"] == 1].shape[0]
24
# Rows labelled 1 by K-means, and the range of their mean reaction times
cluster_2 = new_df.loc[new_df["clusters"] == 1]
minVal, maxVal = (
    cluster_2['mean_reaction_time'].min(),
    cluster_2['mean_reaction_time'].max(),
)
print(minVal)
print(maxVal)
1404 2032
# Show the members of cluster_2, then draw a box plot of their
# mean reaction times (grid lines switched off).
# The former bare `cluster_2.sort_values('mean_reaction_time')` call was
# dropped: sort_values returns a new frame instead of sorting in place, so
# its discarded result had no effect. For a sorted view use
# display(cluster_2.sort_values('mean_reaction_time')).
display(cluster_2)
cluster_2.boxplot(column=['mean_reaction_time'], grid=False)
child_age | mean_reaction_time | clusters | |
---|---|---|---|
0 | 4 | 1479 | 1 |
1 | 4 | 1605 | 1 |
2 | 4 | 1404 | 1 |
3 | 4 | 1782 | 1 |
7 | 4 | 1439 | 1 |
8 | 4 | 1614 | 1 |
9 | 4 | 1540 | 1 |
20 | 4 | 1428 | 1 |
22 | 4 | 1448 | 1 |
24 | 4 | 1426 | 1 |
25 | 4 | 1632 | 1 |
27 | 4 | 1564 | 1 |
30 | 4 | 2032 | 1 |
31 | 4 | 1789 | 1 |
32 | 4 | 1680 | 1 |
39 | 4 | 1457 | 1 |
45 | 4 | 1830 | 1 |
46 | 4 | 1657 | 1 |
47 | 4 | 1817 | 1 |
103 | 4 | 1500 | 1 |
104 | 4 | 1472 | 1 |
105 | 4 | 1523 | 1 |
107 | 4 | 1501 | 1 |
110 | 4 | 1655 | 1 |
<AxesSubplot:>
# Number of rows labelled as cluster 2
new_df.loc[new_df["clusters"] == 2].shape[0]
14
# Rows labelled 2 by K-means, and the range of their mean reaction times
cluster_3 = new_df.loc[new_df["clusters"] == 2]
minVal, maxVal = (
    cluster_3['mean_reaction_time'].min(),
    cluster_3['mean_reaction_time'].max(),
)
print(minVal)
print(maxVal)
1462 1998
# Show the members of cluster_3, then draw a box plot of their
# mean reaction times (grid lines switched off).
# The former bare `cluster_3.sort_values('mean_reaction_time')` call was
# dropped: sort_values returns a new frame instead of sorting in place, so
# its discarded result had no effect. For a sorted view use
# display(cluster_3.sort_values('mean_reaction_time')).
display(cluster_3)
cluster_3.boxplot(column=['mean_reaction_time'], grid=False)
child_age | mean_reaction_time | clusters | |
---|---|---|---|
113 | 5 | 1525 | 2 |
120 | 5 | 1506 | 2 |
121 | 5 | 1489 | 2 |
131 | 5 | 1600 | 2 |
136 | 5 | 1462 | 2 |
139 | 5 | 1775 | 2 |
140 | 5 | 1852 | 2 |
141 | 5 | 1598 | 2 |
142 | 5 | 1785 | 2 |
143 | 5 | 1628 | 2 |
144 | 5 | 1758 | 2 |
148 | 5 | 1499 | 2 |
149 | 5 | 1998 | 2 |
150 | 5 | 1916 | 2 |
<AxesSubplot:>
# Number of rows labelled as cluster 3
new_df.loc[new_df["clusters"] == 3].shape[0]
32
# Rows labelled 3 by K-means, and the range of their mean reaction times
cluster_4 = new_df.loc[new_df["clusters"] == 3]
minVal, maxVal = (
    cluster_4['mean_reaction_time'].min(),
    cluster_4['mean_reaction_time'].max(),
)
print(minVal)
print(maxVal)
928 1384
# Show the members of cluster_4, then draw a box plot of their
# mean reaction times (grid lines switched off).
# The former bare `cluster_4.sort_values('mean_reaction_time')` call was
# dropped: sort_values returns a new frame instead of sorting in place, so
# its discarded result had no effect. For a sorted view use
# display(cluster_4.sort_values('mean_reaction_time')).
display(cluster_4)
cluster_4.boxplot(column=['mean_reaction_time'], grid=False)
child_age | mean_reaction_time | clusters | |
---|---|---|---|
4 | 4 | 1258 | 3 |
5 | 4 | 1043 | 3 |
6 | 4 | 1267 | 3 |
10 | 4 | 1076 | 3 |
11 | 4 | 1303 | 3 |
12 | 4 | 1384 | 3 |
13 | 4 | 1191 | 3 |
14 | 4 | 1335 | 3 |
15 | 4 | 1253 | 3 |
16 | 4 | 1239 | 3 |
17 | 4 | 1109 | 3 |
18 | 4 | 952 | 3 |
19 | 4 | 928 | 3 |
21 | 4 | 1115 | 3 |
23 | 4 | 1331 | 3 |
26 | 4 | 1340 | 3 |
28 | 4 | 1366 | 3 |
29 | 4 | 1291 | 3 |
33 | 4 | 1317 | 3 |
34 | 4 | 1040 | 3 |
35 | 4 | 1142 | 3 |
36 | 4 | 1168 | 3 |
37 | 4 | 1150 | 3 |
38 | 4 | 1270 | 3 |
40 | 4 | 1180 | 3 |
41 | 4 | 1261 | 3 |
42 | 4 | 1234 | 3 |
43 | 4 | 1165 | 3 |
44 | 4 | 1238 | 3 |
106 | 4 | 1267 | 3 |
108 | 4 | 1369 | 3 |
109 | 4 | 998 | 3 |
<AxesSubplot:>
from matplotlib import pyplot as plt

# Put the mean reaction times of the four clusters side by side, one column
# per cluster. Each Series keeps its original row index, so pandas aligns
# them on the union of the indices and fills the gaps with NaN.
# (The dict previously listed the "Cluster2" key twice; the duplicate entry
# was redundant and has been removed.)
data = pd.DataFrame({
    "Cluster1": cluster_1['mean_reaction_time'],
    "Cluster2": cluster_2['mean_reaction_time'],
    "Cluster3": cluster_3['mean_reaction_time'],
    "Cluster4": cluster_4['mean_reaction_time'],
})

# One box per cluster; `data` already holds exactly these four columns in
# this order, so no column re-selection is needed.
ax = data.plot(kind='box', title='boxplot')

# Display the plot
plt.show()