In [1]:
# importing libraries    
import numpy as nm    
import matplotlib.pyplot as mtp    
import pandas as pd   
from sklearn.cluster import DBSCAN
from numpy import unique
from numpy import where
from matplotlib import pyplot
In [2]:
# Load the raw records; the path is relative to the notebook's working directory.
dataset = pd.read_csv('All.csv')
# Keep only rows with a non-zero mean_reaction_time.
# NOTE(review): presumably 0 marks a session with no recorded response - confirm.
dataset = dataset[dataset['mean_reaction_time'] != 0]
# Summary statistics of the filtered data. As the last expression of the cell
# this is what gets rendered as the cell output.
dataset.describe()
Out[2]:
id child_gender total_correct_responses correct_responses commission_errors omission_errors child_age mean_reaction_time total_duration percentage_no_of_correct_responses CER OER
count 107.000000 107.000000 107.000000 107.000000 107.000000 107.000000 107.000000 107.000000 107.000000 107.000000 107.000000 107.000000
mean 86.850467 1.579439 12.654206 11.542056 1.046729 1.112150 4.476636 1339.093458 75985.299065 90.083684 0.196427 0.420396
std 60.863566 0.495972 5.005851 5.053077 2.071292 1.475121 0.501804 258.155523 25388.761189 14.347553 0.339745 0.449506
min 1.000000 1.000000 4.000000 1.000000 0.000000 0.000000 4.000000 823.000000 25196.000000 12.500000 0.000000 0.000000
25% 27.500000 1.000000 10.000000 8.000000 0.000000 0.000000 4.000000 1154.500000 60000.000000 85.438596 0.000000 0.000000
50% 109.000000 2.000000 12.000000 10.000000 0.000000 1.000000 4.000000 1303.000000 72500.000000 94.736842 0.000000 0.250000
75% 135.500000 2.000000 17.000000 13.000000 2.000000 2.000000 5.000000 1499.500000 86764.500000 100.000000 0.400000 1.000000
max 216.000000 2.000000 34.000000 33.000000 11.000000 7.000000 5.000000 2032.000000 220070.000000 100.000000 1.000000 1.000000
In [3]:
# Feature matrix used for clustering: column 0 is child_age, column 1 is
# mean_reaction_time. The columns are selected by name instead of by position
# so the selection does not silently change if the CSV layout changes.
x = dataset[['child_age', 'mean_reaction_time']].values
display(x)
array([[   4, 1479],
       [   4, 1605],
       [   4, 1404],
       [   4, 1782],
       [   4, 1258],
       [   4, 1043],
       [   4, 1267],
       [   4, 1439],
       [   4, 1614],
       [   4, 1540],
       [   4, 1076],
       [   4, 1303],
       [   4, 1384],
       [   4, 1191],
       [   4, 1335],
       [   4, 1253],
       [   4, 1239],
       [   4, 1109],
       [   4,  952],
       [   4,  928],
       [   4, 1428],
       [   4, 1115],
       [   4, 1448],
       [   4, 1331],
       [   4, 1426],
       [   4, 1632],
       [   4, 1340],
       [   4, 1564],
       [   4, 1366],
       [   4, 1291],
       [   4, 2032],
       [   4, 1789],
       [   4, 1680],
       [   4, 1317],
       [   4, 1040],
       [   4, 1142],
       [   4, 1168],
       [   4, 1150],
       [   4, 1270],
       [   4, 1457],
       [   4, 1180],
       [   4, 1261],
       [   4, 1234],
       [   4, 1165],
       [   4, 1238],
       [   4, 1830],
       [   4, 1657],
       [   4, 1817],
       [   4, 1500],
       [   4, 1472],
       [   4, 1523],
       [   4, 1267],
       [   4, 1501],
       [   4, 1369],
       [   4,  998],
       [   4, 1655],
       [   5, 1144],
       [   5, 1366],
       [   5, 1525],
       [   5, 1146],
       [   5, 1135],
       [   5, 1185],
       [   5, 1375],
       [   5,  912],
       [   5,  823],
       [   5, 1506],
       [   5, 1489],
       [   5, 1157],
       [   5, 1097],
       [   5, 1160],
       [   5, 1053],
       [   5,  953],
       [   5, 1303],
       [   5, 1257],
       [   5, 1008],
       [   5, 1188],
       [   5, 1600],
       [   5, 1396],
       [   5, 1380],
       [   5, 1350],
       [   5, 1310],
       [   5, 1462],
       [   5, 1069],
       [   5, 1221],
       [   5, 1775],
       [   5, 1852],
       [   5, 1598],
       [   5, 1785],
       [   5, 1628],
       [   5, 1758],
       [   5, 1215],
       [   5, 1134],
       [   5, 1364],
       [   5, 1499],
       [   5, 1998],
       [   5, 1916],
       [   5, 1152],
       [   5, 1086],
       [   5, 1207],
       [   5, 1047],
       [   5, 1162],
       [   5, 1278],
       [   5, 1296],
       [   5, 1173],
       [   5,  851],
       [   5, 1029],
       [   5, 1056]], dtype=int64)
In [4]:
from sklearn.preprocessing import StandardScaler

# Standardize both features (zero mean, unit variance) so that the reaction
# time, whose raw values are far larger than the age values, does not dominate
# the Euclidean distances used by K-means.
scaler = StandardScaler()
scaler.fit(x)
x = scaler.transform(x)

# Summary statistics of the scaled features (rendered as the cell output).
pd.DataFrame(x).describe()
Out[4]:
0 1
count 1.070000e+02 1.070000e+02
mean 1.079095e-16 3.029768e-16
std 1.004706e+00 1.004706e+00
min -9.543135e-01 -2.008565e+00
25% -9.543135e-01 -7.184124e-01
50% -9.543135e-01 -1.404708e-01
75% 1.047874e+00 6.242803e-01
max 1.047874e+00 2.696697e+00
In [5]:
# Finding the optimal number of clusters using the elbow method
from sklearn.cluster import KMeans

# Within-cluster sum of squares (KMeans.inertia_) for each candidate k.
wcss_list = []

# Fit one model per k from 1 to 10.
for i in range(1, 11):
    # n_init is pinned explicitly: the library default changed between
    # scikit-learn releases, so relying on it makes the curve version-dependent.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

mtp.plot(range(1, 11), wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('wcss_list')
mtp.show()
In [6]:
# Train the K-means model with k = 4 on the standardized features.
# n_init is pinned explicitly: the library default changed between
# scikit-learn releases, so relying on it makes the labels version-dependent.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=42)
y_predict = kmeans.fit_predict(x)
print(y_predict)

# Visualize the clusters. Column 0 of x is the (standardized) age and column 1
# the (standardized) mean reaction time, so age goes on the x-axis.
cluster_colors = ['blue', 'green', 'red', 'deeppink']
for label, color in enumerate(cluster_colors):
    members = x[y_predict == label]
    mtp.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label=f'Cluster {label + 1}')
mtp.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='yellow', label='Centroid')
mtp.title('Clusters of children')
mtp.xlabel('Age')
mtp.ylabel('Mean Reaction Time')
mtp.legend()
mtp.show()
[1 1 1 1 3 3 3 1 1 1 3 3 3 3 3 3 3 3 3 3 1 3 1 3 1 1 3 1 3 3 1 1 1 3 3 3 3
 3 3 1 3 3 3 3 3 1 1 1 1 1 1 3 1 3 3 1 0 0 2 0 0 0 0 0 0 2 2 0 0 0 0 0 0 0
 0 0 2 0 0 0 0 2 0 0 2 2 2 2 2 2 0 0 0 2 2 2 0 0 0 0 0 0 0 0 0 0 0]
In [7]:
# Attach each row's K-means label to the two unscaled clustering features.
# y_predict was computed from x, which was built from `dataset` row by row, so
# the positional assignment lines up with the frame's rows.
new_df = dataset[['child_age', 'mean_reaction_time']].copy()
new_df['clusters'] = y_predict
display(new_df)
child_age mean_reaction_time clusters
0 4 1479 1
1 4 1605 1
2 4 1404 1
3 4 1782 1
4 4 1258 3
... ... ... ...
211 5 1296 0
212 5 1173 0
213 5 851 0
214 5 1029 0
215 5 1056 0

107 rows × 3 columns

Cluster Analysis¶

Cluster 1¶

In [8]:
# Number of rows assigned K-means label 0 (reported as "Cluster 1").
new_df.loc[new_df["clusters"] == 0].shape[0]
Out[8]:
37
In [9]:
# Rows carrying K-means label 0, i.e. the group reported as "Cluster 1".
cluster_1 = new_df.loc[new_df["clusters"] == 0]

# Smallest and largest mean reaction time inside this cluster.
minVal = cluster_1['mean_reaction_time'].min()
maxVal = cluster_1['mean_reaction_time'].max()

print(minVal, maxVal, sep='\n')
823
1396
In [10]:
# List the members of Cluster 1, then draw a box plot of their reaction times.
# The Axes returned by boxplot is the cell's last expression.
display(cluster_1)
cluster_1.boxplot(grid=False, column=['mean_reaction_time'])
child_age mean_reaction_time clusters
111 5 1144 0
112 5 1366 0
114 5 1146 0
115 5 1135 0
116 5 1185 0
117 5 1375 0
118 5 912 0
119 5 823 0
122 5 1157 0
123 5 1097 0
124 5 1160 0
125 5 1053 0
126 5 953 0
127 5 1303 0
128 5 1257 0
129 5 1008 0
130 5 1188 0
132 5 1396 0
133 5 1380 0
134 5 1350 0
135 5 1310 0
137 5 1069 0
138 5 1221 0
145 5 1215 0
146 5 1134 0
147 5 1364 0
151 5 1152 0
152 5 1086 0
153 5 1207 0
154 5 1047 0
155 5 1162 0
156 5 1278 0
211 5 1296 0
212 5 1173 0
213 5 851 0
214 5 1029 0
215 5 1056 0
Out[10]:
<AxesSubplot:>

Cluster 2¶

In [11]:
# Number of rows assigned K-means label 1 (reported as "Cluster 2").
new_df.loc[new_df["clusters"] == 1].shape[0]
Out[11]:
24
In [12]:
# Rows carrying K-means label 1, i.e. the group reported as "Cluster 2".
cluster_2 = new_df.loc[new_df["clusters"] == 1]

# Smallest and largest mean reaction time inside this cluster.
minVal = cluster_2['mean_reaction_time'].min()
maxVal = cluster_2['mean_reaction_time'].max()

print(minVal, maxVal, sep='\n')
1404
2032
In [13]:
# List the members of Cluster 2, then draw a box plot of their reaction times.
# The Axes returned by boxplot is the cell's last expression.
display(cluster_2)
cluster_2.boxplot(column=['mean_reaction_time'], grid=False)
child_age mean_reaction_time clusters
0 4 1479 1
1 4 1605 1
2 4 1404 1
3 4 1782 1
7 4 1439 1
8 4 1614 1
9 4 1540 1
20 4 1428 1
22 4 1448 1
24 4 1426 1
25 4 1632 1
27 4 1564 1
30 4 2032 1
31 4 1789 1
32 4 1680 1
39 4 1457 1
45 4 1830 1
46 4 1657 1
47 4 1817 1
103 4 1500 1
104 4 1472 1
105 4 1523 1
107 4 1501 1
110 4 1655 1
Out[13]:
<AxesSubplot:>

Cluster 3¶

In [14]:
# Number of rows assigned K-means label 2 (reported as "Cluster 3").
new_df.loc[new_df["clusters"] == 2].shape[0]
Out[14]:
14
In [15]:
# Rows carrying K-means label 2, i.e. the group reported as "Cluster 3".
cluster_3 = new_df.loc[new_df["clusters"] == 2]

# Smallest and largest mean reaction time inside this cluster.
minVal = cluster_3['mean_reaction_time'].min()
maxVal = cluster_3['mean_reaction_time'].max()

print(minVal, maxVal, sep='\n')
1462
1998
In [16]:
# List the members of Cluster 3, then draw a box plot of their reaction times.
# The Axes returned by boxplot is the cell's last expression.
display(cluster_3)
cluster_3.boxplot(column=['mean_reaction_time'], grid=False)
child_age mean_reaction_time clusters
113 5 1525 2
120 5 1506 2
121 5 1489 2
131 5 1600 2
136 5 1462 2
139 5 1775 2
140 5 1852 2
141 5 1598 2
142 5 1785 2
143 5 1628 2
144 5 1758 2
148 5 1499 2
149 5 1998 2
150 5 1916 2
Out[16]:
<AxesSubplot:>

Cluster 4¶

In [17]:
# Number of rows assigned K-means label 3 (reported as "Cluster 4").
new_df.loc[new_df["clusters"] == 3].shape[0]
Out[17]:
32
In [18]:
# Rows carrying K-means label 3, i.e. the group reported as "Cluster 4".
cluster_4 = new_df.loc[new_df["clusters"] == 3]

# Smallest and largest mean reaction time inside this cluster.
minVal = cluster_4['mean_reaction_time'].min()
maxVal = cluster_4['mean_reaction_time'].max()

print(minVal, maxVal, sep='\n')
928
1384
In [19]:
# List the members of Cluster 4, then draw a box plot of their reaction times.
# The Axes returned by boxplot is the cell's last expression.
display(cluster_4)
cluster_4.boxplot(column=['mean_reaction_time'], grid=False)
child_age mean_reaction_time clusters
4 4 1258 3
5 4 1043 3
6 4 1267 3
10 4 1076 3
11 4 1303 3
12 4 1384 3
13 4 1191 3
14 4 1335 3
15 4 1253 3
16 4 1239 3
17 4 1109 3
18 4 952 3
19 4 928 3
21 4 1115 3
23 4 1331 3
26 4 1340 3
28 4 1366 3
29 4 1291 3
33 4 1317 3
34 4 1040 3
35 4 1142 3
36 4 1168 3
37 4 1150 3
38 4 1270 3
40 4 1180 3
41 4 1261 3
42 4 1234 3
43 4 1165 3
44 4 1238 3
106 4 1267 3
108 4 1369 3
109 4 998 3
Out[19]:
<AxesSubplot:>
In [21]:
from matplotlib import pyplot as plt

# One column of mean reaction times per cluster. Each Series keeps the row
# labels it had in new_df, so the frame is aligned on that index and holds NaN
# wherever a row belongs to a different cluster.
data = pd.DataFrame({
    "Cluster1": cluster_1['mean_reaction_time'],
    "Cluster2": cluster_2['mean_reaction_time'],
    "Cluster3": cluster_3['mean_reaction_time'],
    "Cluster4": cluster_4['mean_reaction_time'],
})

# Side-by-side box plots comparing the four clusters.
ax = data[['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4']].plot(kind='box', title='boxplot')

# Display the plot
plt.show()
In [ ]: