In [1]:
# importing libraries    
import numpy as nm    
import matplotlib.pyplot as mtp    
import pandas as pd   
from sklearn.cluster import DBSCAN
from numpy import unique
from numpy import where
from matplotlib import pyplot
In [2]:
# Load the raw results, then discard every row whose 'game' is 'Alternating',
# 'Divided' or 'Focused' in a single filtering pass.
dataset = pd.read_csv('All.csv')
dataset = dataset.loc[
    ~dataset['game'].isin(['Alternating', 'Divided', 'Focused'])
].copy()  # independent frame, not a view-like slice of the raw data
display(dataset)
# Summary statistics of the data (left disabled)
# dataset.describe()
id child_gender sequence_of_responses sequence_of_stimuli colour order_of_selection sequence_of_sides no_of_clicks total_correct_responses correct_responses commission_errors omission_errors child_age mean_reaction_time total_duration diagnosis percentage_no_of_correct_responses game CER OER
48 49 1 [C, C, C, C, C, C, C, C] ladybird red red_ladybird, red_ladybird, red_ladybird, red_... NaN 8 8 8 0 0 4 0 3459 No 100.000000 Selective 0.0 0.0
49 50 2 [C, C, C, C, C, C] butterfly blue blue_buterfly, blue_buterfly, blue_buterfly, b... NaN 6 6 6 0 0 4 0 3000 No 100.000000 Selective 0.0 0.0
50 51 2 [C, C, C, C, C, C] butterfly blue blue_buterfly, blue_buterfly, blue_buterfly, b... NaN 6 6 6 0 0 4 0 10888 No 100.000000 Selective 0.0 0.0
51 52 2 [C, C, C, C, C, C] butterfly blue blue_buterfly, blue_buterfly, blue_buterfly, b... NaN 6 6 6 0 0 4 0 7081 No 100.000000 Selective 0.0 0.0
52 53 2 [C, C, C, C, C, C] ladybird red blue_buterfly, blue_buterfly, blue_buterfly, b... NaN 6 7 6 0 1 4 0 9953 No 85.714286 Selective 0.0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
211 212 2 NaN [C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ... bird pink NaN NaN 34 33 0 1 5 1296 220070 No 97.058824 Sustained 0.0 1.0
212 213 2 NaN [C, C, C, C, C, M] bird red NaN NaN 7 5 0 2 5 1173 44919 No 71.428571 Sustained 0.0 1.0
213 214 1 NaN [C, C, C, C, C, C, C, C, C, C, C, C, C] bird pink NaN NaN 14 13 0 1 5 851 86529 No 92.857143 Sustained 0.0 1.0
214 215 1 NaN [C, C, C, M, C] bird pink NaN NaN 5 4 0 1 5 1029 33134 No 80.000000 Sustained 0.0 1.0
215 216 2 NaN [C, C, C, C, C, C, C, C, C, C, C, M, C, C, C, C] bird red NaN NaN 17 15 0 2 5 1056 106353 No 88.235294 Sustained 0.0 1.0

122 rows × 20 columns

In [3]:
# Feature matrix for clustering: one row per child, columns = (child_age,
# total_duration). The columns are picked by name instead of by position
# (12 and 14) so the selection cannot silently shift if the CSV layout changes.
x = dataset[['child_age', 'total_duration']].values
display(x)
array([[     4,   3459],
       [     4,   3000],
       [     4,  10888],
       [     4,   7081],
       [     4,   9953],
       [     4,  14207],
       [     4,  14036],
       [     4,  12646],
       [     4,   7251],
       [     4,  10026],
       [     4,  11482],
       [     4,  12086],
       [     4,  10816],
       [     4,   6345],
       [     4,  11201],
       [     4,  10236],
       [     4,  10830],
       [     4,  10348],
       [     4,  16222],
       [     4,  10595],
       [     4,  13987],
       [     4,  15440],
       [     4,  22296],
       [     4,  11724],
       [     4,  12248],
       [     4,  11101],
       [     4,  11191],
       [     4,  13694],
       [     4,   3231],
       [     4,   9907],
       [     4,  11292],
       [     4,  17495],
       [     4,  11627],
       [     4,  23274],
       [     4,   6276],
       [     4,   7853],
       [     4,   8574],
       [     4,   7801],
       [     4,   8413],
       [     4,   8300],
       [     4,  15645],
       [     4,  14987],
       [     4,  18248],
       [     4,  13130],
       [     4,   9934],
       [     4,  30387],
       [     4,  17464],
       [     4,  23988],
       [     4,  12807],
       [     4,  25561],
       [     4,  22375],
       [     4,  11325],
       [     4,  14820],
       [     4,  16869],
       [     4,  14130],
       [     4, 132928],
       [     4, 133598],
       [     4, 203243],
       [     4,  86043],
       [     4,  46708],
       [     4, 114319],
       [     4,  95887],
       [     4,  25196],
       [     5,  12367],
       [     5,  20664],
       [     5,  23106],
       [     5,  11978],
       [     5,  10358],
       [     5,  11941],
       [     5,  13415],
       [     5,   9074],
       [     5,   8005],
       [     5,  15879],
       [     5,  12677],
       [     5,   7744],
       [     5,  10849],
       [     5,  10577],
       [     5,  10628],
       [     5,  10764],
       [     5,  12553],
       [     5,  12413],
       [     5,   7879],
       [     5,   9200],
       [     5,   7375],
       [     5,   5437],
       [     5,  14770],
       [     5,   8935],
       [     5,  13979],
       [     5,  16502],
       [     5,  12426],
       [     5,  12219],
       [     5,   8014],
       [     5,   9373],
       [     5,   9838],
       [     5,   9299],
       [     5,  15905],
       [     5,  11342],
       [     5,  10124],
       [     5,   9889],
       [     5,  13156],
       [     5,  13786],
       [     5,  14397],
       [     5,  10058],
       [     5,  10874],
       [     5,  10874],
       [     5,   5112],
       [     5,  12410],
       [     5,  13305],
       [     5,  11075],
       [     5,   9059],
       [     5,  10000],
       [     5,  12889],
       [     5,  25719],
       [     5,  50461],
       [     5,  19886],
       [     5,  42199],
       [     5,  10381],
       [     5, 220070],
       [     5,  44919],
       [     5,  86529],
       [     5,  33134],
       [     5, 106353]], dtype=int64)
In [4]:
# Standardize both features. The raw total_duration values are orders of
# magnitude larger than child_age, so without scaling they would dominate.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)

# Statistics of the scaled data. describe() has to be passed to display():
# a bare expression in the middle of a cell is evaluated but never rendered.
display(pd.DataFrame(x).describe())

display(x)
array([[-9.67733402e-01, -5.59247666e-01],
       [-9.67733402e-01, -5.72745277e-01],
       [-9.67733402e-01, -3.40786342e-01],
       [-9.67733402e-01, -4.52737111e-01],
       [-9.67733402e-01, -3.68281474e-01],
       [-9.67733402e-01, -2.43185973e-01],
       [-9.67733402e-01, -2.48214495e-01],
       [-9.67733402e-01, -2.89089612e-01],
       [-9.67733402e-01, -4.47737996e-01],
       [-9.67733402e-01, -3.66134795e-01],
       [-9.67733402e-01, -3.23318846e-01],
       [-9.67733402e-01, -3.05557284e-01],
       [-9.67733402e-01, -3.42903614e-01],
       [-9.67733402e-01, -4.74380338e-01],
       [-9.67733402e-01, -3.31582089e-01],
       [-9.67733402e-01, -3.59959418e-01],
       [-9.67733402e-01, -3.42491922e-01],
       [-9.67733402e-01, -3.56665883e-01],
       [-9.67733402e-01, -1.83931758e-01],
       [-9.67733402e-01, -3.49402463e-01],
       [-9.67733402e-01, -2.49655416e-01],
       [-9.67733402e-01, -2.06927687e-01],
       [-9.67733402e-01, -5.31632015e-03],
       [-9.67733402e-01, -3.16202459e-01],
       [-9.67733402e-01, -3.00793422e-01],
       [-9.67733402e-01, -3.34522745e-01],
       [-9.67733402e-01, -3.31876154e-01],
       [-9.67733402e-01, -2.58271538e-01],
       [-9.67733402e-01, -5.65952362e-01],
       [-9.67733402e-01, -3.69634176e-01],
       [-9.67733402e-01, -3.28906092e-01],
       [-9.67733402e-01, -1.46497208e-01],
       [-9.67733402e-01, -3.19054895e-01],
       [-9.67733402e-01,  2.34432943e-02],
       [-9.67733402e-01, -4.76409390e-01],
       [-9.67733402e-01, -4.30035247e-01],
       [-9.67733402e-01, -4.08833119e-01],
       [-9.67733402e-01, -4.31564388e-01],
       [-9.67733402e-01, -4.13567575e-01],
       [-9.67733402e-01, -4.16890516e-01],
       [-9.67733402e-01, -2.00899342e-01],
       [-9.67733402e-01, -2.20248858e-01],
       [-9.67733402e-01, -1.24354070e-01],
       [-9.67733402e-01, -2.74856837e-01],
       [-9.67733402e-01, -3.68840199e-01],
       [-9.67733402e-01,  2.32612146e-01],
       [-9.67733402e-01, -1.47408812e-01],
       [-9.67733402e-01,  4.44395772e-02],
       [-9.67733402e-01, -2.84355156e-01],
       [-9.67733402e-01,  9.06960940e-02],
       [-9.67733402e-01, -2.99320202e-03],
       [-9.67733402e-01, -3.27935676e-01],
       [-9.67733402e-01, -2.25159753e-01],
       [-9.67733402e-01, -1.64905714e-01],
       [-9.67733402e-01, -2.45450278e-01],
       [-9.67733402e-01,  3.24799008e+00],
       [-9.67733402e-01,  3.26769248e+00],
       [-9.67733402e-01,  5.31571226e+00],
       [-9.67733402e-01,  1.86926358e+00],
       [-9.67733402e-01,  7.12556591e-01],
       [-9.67733402e-01,  2.70076343e+00],
       [-9.67733402e-01,  2.15874174e+00],
       [-9.67733402e-01,  7.99627001e-02],
       [ 1.03334245e+00, -2.97294041e-01],
       [ 1.03334245e+00, -5.33078239e-02],
       [ 1.03334245e+00,  1.85029924e-02],
       [ 1.03334245e+00, -3.08733193e-01],
       [ 1.03334245e+00, -3.56371818e-01],
       [ 1.03334245e+00, -3.09821235e-01],
       [ 1.03334245e+00, -2.66475968e-01],
       [ 1.03334245e+00, -3.94129839e-01],
       [ 1.03334245e+00, -4.25565451e-01],
       [ 1.03334245e+00, -1.94018207e-01],
       [ 1.03334245e+00, -2.88178008e-01],
       [ 1.03334245e+00, -4.33240562e-01],
       [ 1.03334245e+00, -3.41933198e-01],
       [ 1.03334245e+00, -3.49931782e-01],
       [ 1.03334245e+00, -3.48432047e-01],
       [ 1.03334245e+00, -3.44432755e-01],
       [ 1.03334245e+00, -2.91824422e-01],
       [ 1.03334245e+00, -2.95941340e-01],
       [ 1.03334245e+00, -4.29270677e-01],
       [ 1.03334245e+00, -3.90424613e-01],
       [ 1.03334245e+00, -4.44091582e-01],
       [ 1.03334245e+00, -5.01081493e-01],
       [ 1.03334245e+00, -2.26630081e-01],
       [ 1.03334245e+00, -3.98217351e-01],
       [ 1.03334245e+00, -2.49890669e-01],
       [ 1.03334245e+00, -1.75697921e-01],
       [ 1.03334245e+00, -2.95559055e-01],
       [ 1.03334245e+00, -3.01646212e-01],
       [ 1.03334245e+00, -4.25300791e-01],
       [ 1.03334245e+00, -3.85337278e-01],
       [ 1.03334245e+00, -3.71663228e-01],
       [ 1.03334245e+00, -3.87513364e-01],
       [ 1.03334245e+00, -1.93253637e-01],
       [ 1.03334245e+00, -3.27435764e-01],
       [ 1.03334245e+00, -3.63252953e-01],
       [ 1.03334245e+00, -3.70163494e-01],
       [ 1.03334245e+00, -2.74092267e-01],
       [ 1.03334245e+00, -2.55566135e-01],
       [ 1.03334245e+00, -2.37598727e-01],
       [ 1.03334245e+00, -3.65193785e-01],
       [ 1.03334245e+00, -3.41198034e-01],
       [ 1.03334245e+00, -3.41198034e-01],
       [ 1.03334245e+00, -5.10638625e-01],
       [ 1.03334245e+00, -2.96029559e-01],
       [ 1.03334245e+00, -2.69710689e-01],
       [ 1.03334245e+00, -3.35287315e-01],
       [ 1.03334245e+00, -3.94570938e-01],
       [ 1.03334245e+00, -3.66899366e-01],
       [ 1.03334245e+00, -2.81943818e-01],
       [ 1.03334245e+00,  9.53423303e-02],
       [ 1.03334245e+00,  8.22919406e-01],
       [ 1.03334245e+00, -7.61861266e-02],
       [ 1.03334245e+00,  5.79962418e-01],
       [ 1.03334245e+00, -3.55695467e-01],
       [ 1.03334245e+00,  5.81053642e+00],
       [ 1.03334245e+00,  6.59948257e-01],
       [ 1.03334245e+00,  1.88355516e+00],
       [ 1.03334245e+00,  3.13391963e-01],
       [ 1.03334245e+00,  2.46651078e+00]])
In [5]:
# Elbow method: fit K-means for k = 1..10 and plot the within-cluster sum of
# squares (WCSS, exposed by scikit-learn as `inertia_`) against k.
from sklearn.cluster import KMeans

wcss_list = []  # one WCSS value per candidate k

for i in range(1, 11):
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto', so leaving it implicit makes results depend on the installed
    # version (and emits a FutureWarning on intermediate releases).
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

mtp.plot(range(1, 11), wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('WCSS')
mtp.show()
In [6]:
# Train the final K-means model with k = 3 (presumably read off the elbow
# plot -- confirm). n_init is pinned so the fit does not depend on the
# scikit-learn version's default.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
y_predict = kmeans.fit_predict(x)
print(y_predict)

# Visualize the clusters. Column 0 of x is the standardized child age and
# column 1 the standardized total duration, so age goes on the x-axis and
# duration on the y-axis.
for label, colour in enumerate(('blue', 'green', 'red')):
    members = y_predict == label  # boolean mask of the points in this cluster
    mtp.scatter(x[members, 0], x[members, 1], s=100, c=colour,
                label=f'Cluster {label + 1}')
mtp.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='yellow', label='Centroid')
mtp.title('Clusters of children')
mtp.xlabel('Age (standardized)')
mtp.ylabel('Total Duration (standardized)')
mtp.legend()
mtp.show()
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 2 2 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 2 0 2 0 2]
In [7]:
# Attach each row's K-means label to the unscaled features so the clusters can
# be inspected in the original units. Columns are selected by name rather than
# by position (12 and 14); .copy() keeps the assignment below from touching
# `dataset`.
new_df = dataset[['child_age', 'total_duration']].copy()
new_df['clusters'] = y_predict
display(new_df)
child_age total_duration clusters
48 4 3459 1
49 4 3000 1
50 4 10888 1
51 4 7081 1
52 4 9953 1
... ... ... ...
211 5 220070 2
212 5 44919 0
213 5 86529 2
214 5 33134 0
215 5 106353 2

122 rows × 3 columns

Cluster Analysis¶

Cluster 1 (K-Means label 0)¶

In [8]:
# Number of rows assigned to K-means label 0
new_df.loc[new_df["clusters"] == 0].shape[0]
Out[8]:
56
In [9]:
# Rows assigned to K-means label 0, and the range of their total_duration.
cluster_1 = new_df.loc[new_df["clusters"] == 0]

minVal = cluster_1['total_duration'].min()
maxVal = cluster_1['total_duration'].max()

# Smallest duration first, largest second, one per line
print(minVal, maxVal, sep='\n')
5112
50461
In [10]:
# Show the rows of cluster_1, then draw a box plot of their total_duration.
display(cluster_1)
cluster_1.boxplot(
    grid=False,
    column=['total_duration'],
)
child_age total_duration clusters
157 5 12367 0
158 5 20664 0
159 5 23106 0
160 5 11978 0
161 5 10358 0
162 5 11941 0
163 5 13415 0
164 5 9074 0
165 5 8005 0
166 5 15879 0
167 5 12677 0
168 5 7744 0
169 5 10849 0
170 5 10577 0
171 5 10628 0
172 5 10764 0
173 5 12553 0
174 5 12413 0
175 5 7879 0
176 5 9200 0
177 5 7375 0
178 5 5437 0
179 5 14770 0
180 5 8935 0
181 5 13979 0
182 5 16502 0
183 5 12426 0
184 5 12219 0
185 5 8014 0
186 5 9373 0
187 5 9838 0
188 5 9299 0
189 5 15905 0
190 5 11342 0
191 5 10124 0
192 5 9889 0
193 5 13156 0
194 5 13786 0
195 5 14397 0
196 5 10058 0
197 5 10874 0
198 5 10874 0
199 5 5112 0
200 5 12410 0
201 5 13305 0
202 5 11075 0
203 5 9059 0
204 5 10000 0
205 5 12889 0
206 5 25719 0
207 5 50461 0
208 5 19886 0
209 5 42199 0
210 5 10381 0
212 5 44919 0
214 5 33134 0
Out[10]:
<AxesSubplot:>

Cluster 2 (K-Means label 1)¶

In [11]:
# Number of rows assigned to K-means label 1
new_df.loc[new_df["clusters"] == 1].shape[0]
Out[11]:
57
In [12]:
# Rows assigned to K-means label 1, and the range of their total_duration.
cluster_2 = new_df.loc[new_df["clusters"] == 1]

minVal = cluster_2['total_duration'].min()
maxVal = cluster_2['total_duration'].max()

# Smallest duration first, largest second, one per line
print(minVal, maxVal, sep='\n')
3000
46708
In [13]:
# Show the rows of cluster_2, then draw a box plot of their total_duration.
# The former bare `cluster_2.sort_values('total_duration')` call was dropped:
# sort_values() returns a new frame and that result was never used or shown.
# For a sorted view, use display(cluster_2.sort_values('total_duration')).
display(cluster_2)
cluster_2.boxplot(column=['total_duration'], grid=False)
child_age total_duration clusters
48 4 3459 1
49 4 3000 1
50 4 10888 1
51 4 7081 1
52 4 9953 1
53 4 14207 1
54 4 14036 1
55 4 12646 1
56 4 7251 1
57 4 10026 1
58 4 11482 1
59 4 12086 1
60 4 10816 1
61 4 6345 1
62 4 11201 1
63 4 10236 1
64 4 10830 1
65 4 10348 1
66 4 16222 1
67 4 10595 1
68 4 13987 1
69 4 15440 1
70 4 22296 1
71 4 11724 1
72 4 12248 1
73 4 11101 1
74 4 11191 1
75 4 13694 1
76 4 3231 1
77 4 9907 1
78 4 11292 1
79 4 17495 1
80 4 11627 1
81 4 23274 1
82 4 6276 1
83 4 7853 1
84 4 8574 1
85 4 7801 1
86 4 8413 1
87 4 8300 1
88 4 15645 1
89 4 14987 1
90 4 18248 1
91 4 13130 1
92 4 9934 1
93 4 30387 1
94 4 17464 1
95 4 23988 1
96 4 12807 1
97 4 25561 1
98 4 22375 1
99 4 11325 1
100 4 14820 1
101 4 16869 1
102 4 14130 1
107 4 46708 1
110 4 25196 1
Out[13]:
<AxesSubplot:>

Cluster 3 (K-Means label 2)¶

In [14]:
# Number of rows assigned to K-means label 2
new_df.loc[new_df["clusters"] == 2].shape[0]
Out[14]:
9
In [15]:
# Rows assigned to K-means label 2, and the range of their total_duration.
cluster_3 = new_df.loc[new_df["clusters"] == 2]

minVal = cluster_3['total_duration'].min()
maxVal = cluster_3['total_duration'].max()

# Smallest duration first, largest second, one per line
print(minVal, maxVal, sep='\n')
86043
220070
In [16]:
# Show the rows of cluster_3, then draw a box plot of their total_duration.
# The former bare `cluster_3.sort_values('total_duration')` call was dropped:
# sort_values() returns a new frame and that result was never used or shown.
# For a sorted view, use display(cluster_3.sort_values('total_duration')).
display(cluster_3)
cluster_3.boxplot(column=['total_duration'], grid=False)
child_age total_duration clusters
103 4 132928 2
104 4 133598 2
105 4 203243 2
106 4 86043 2
108 4 114319 2
109 4 95887 2
211 5 220070 2
213 5 86529 2
215 5 106353 2
Out[16]:
<AxesSubplot:>
In [17]:
from matplotlib import pyplot as plt

# Gather the total_duration of each cluster into one frame, one column per
# cluster, so the three distributions can be compared side by side.
data = pd.DataFrame(
    {
        "Cluster1": cluster_1['total_duration'],
        "Cluster2": cluster_2['total_duration'],
        "Cluster3": cluster_3['total_duration'],
    }
)

# One box per cluster column
ax = data.loc[:, ['Cluster1', 'Cluster2', 'Cluster3']].plot(
    kind='box',
    title='boxplot',
)

# Render the figure
plt.show()
In [ ]: