# importing libraries
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.cluster import DBSCAN
from numpy import unique
from numpy import where
from matplotlib import pyplot
# Load the raw results and remove the games excluded from this analysis.
EXCLUDED_GAMES = ['Alternating', 'Divided', 'Focused']
dataset = pd.read_csv('All.csv')
# One in-place drop of every row whose 'game' is in the excluded list;
# the surviving rows keep their original index labels.
dataset.drop(index=dataset.index[dataset['game'].isin(EXCLUDED_GAMES)], inplace=True)
display(dataset)
# Summary statistics (left disabled):
# dataset.describe()
id | child_gender | sequence_of_responses | sequence_of_stimuli | colour | order_of_selection | sequence_of_sides | no_of_clicks | total_correct_responses | correct_responses | commission_errors | omission_errors | child_age | mean_reaction_time | total_duration | diagnosis | percentage_no_of_correct_responses | game | CER | OER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
48 | 49 | 1 | [C, C, C, C, C, C, C, C] | ladybird | red | red_ladybird, red_ladybird, red_ladybird, red_... | NaN | 8 | 8 | 8 | 0 | 0 | 4 | 0 | 3459 | No | 100.000000 | Selective | 0.0 | 0.0 |
49 | 50 | 2 | [C, C, C, C, C, C] | butterfly | blue | blue_buterfly, blue_buterfly, blue_buterfly, b... | NaN | 6 | 6 | 6 | 0 | 0 | 4 | 0 | 3000 | No | 100.000000 | Selective | 0.0 | 0.0 |
50 | 51 | 2 | [C, C, C, C, C, C] | butterfly | blue | blue_buterfly, blue_buterfly, blue_buterfly, b... | NaN | 6 | 6 | 6 | 0 | 0 | 4 | 0 | 10888 | No | 100.000000 | Selective | 0.0 | 0.0 |
51 | 52 | 2 | [C, C, C, C, C, C] | butterfly | blue | blue_buterfly, blue_buterfly, blue_buterfly, b... | NaN | 6 | 6 | 6 | 0 | 0 | 4 | 0 | 7081 | No | 100.000000 | Selective | 0.0 | 0.0 |
52 | 53 | 2 | [C, C, C, C, C, C] | ladybird | red | blue_buterfly, blue_buterfly, blue_buterfly, b... | NaN | 6 | 7 | 6 | 0 | 1 | 4 | 0 | 9953 | No | 85.714286 | Selective | 0.0 | 1.0 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
211 | 212 | 2 | NaN | [C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ... | bird | pink | NaN | NaN | 34 | 33 | 0 | 1 | 5 | 1296 | 220070 | No | 97.058824 | Sustained | 0.0 | 1.0 |
212 | 213 | 2 | NaN | [C, C, C, C, C, M] | bird | red | NaN | NaN | 7 | 5 | 0 | 2 | 5 | 1173 | 44919 | No | 71.428571 | Sustained | 0.0 | 1.0 |
213 | 214 | 1 | NaN | [C, C, C, C, C, C, C, C, C, C, C, C, C] | bird | pink | NaN | NaN | 14 | 13 | 0 | 1 | 5 | 851 | 86529 | No | 92.857143 | Sustained | 0.0 | 1.0 |
214 | 215 | 1 | NaN | [C, C, C, M, C] | bird | pink | NaN | NaN | 5 | 4 | 0 | 1 | 5 | 1029 | 33134 | No | 80.000000 | Sustained | 0.0 | 1.0 |
215 | 216 | 2 | NaN | [C, C, C, C, C, C, C, C, C, C, C, M, C, C, C, C] | bird | red | NaN | NaN | 17 | 15 | 0 | 2 | 5 | 1056 | 106353 | No | 88.235294 | Sustained | 0.0 | 1.0 |
122 rows × 20 columns
# Feature matrix: the columns at positions 12 and 14 as a NumPy array.
feature_positions = [12, 14]
x = dataset.iloc[:, feature_positions].to_numpy()
display(x)
array([[ 4, 3459], [ 4, 3000], [ 4, 10888], [ 4, 7081], [ 4, 9953], [ 4, 14207], [ 4, 14036], [ 4, 12646], [ 4, 7251], [ 4, 10026], [ 4, 11482], [ 4, 12086], [ 4, 10816], [ 4, 6345], [ 4, 11201], [ 4, 10236], [ 4, 10830], [ 4, 10348], [ 4, 16222], [ 4, 10595], [ 4, 13987], [ 4, 15440], [ 4, 22296], [ 4, 11724], [ 4, 12248], [ 4, 11101], [ 4, 11191], [ 4, 13694], [ 4, 3231], [ 4, 9907], [ 4, 11292], [ 4, 17495], [ 4, 11627], [ 4, 23274], [ 4, 6276], [ 4, 7853], [ 4, 8574], [ 4, 7801], [ 4, 8413], [ 4, 8300], [ 4, 15645], [ 4, 14987], [ 4, 18248], [ 4, 13130], [ 4, 9934], [ 4, 30387], [ 4, 17464], [ 4, 23988], [ 4, 12807], [ 4, 25561], [ 4, 22375], [ 4, 11325], [ 4, 14820], [ 4, 16869], [ 4, 14130], [ 4, 132928], [ 4, 133598], [ 4, 203243], [ 4, 86043], [ 4, 46708], [ 4, 114319], [ 4, 95887], [ 4, 25196], [ 5, 12367], [ 5, 20664], [ 5, 23106], [ 5, 11978], [ 5, 10358], [ 5, 11941], [ 5, 13415], [ 5, 9074], [ 5, 8005], [ 5, 15879], [ 5, 12677], [ 5, 7744], [ 5, 10849], [ 5, 10577], [ 5, 10628], [ 5, 10764], [ 5, 12553], [ 5, 12413], [ 5, 7879], [ 5, 9200], [ 5, 7375], [ 5, 5437], [ 5, 14770], [ 5, 8935], [ 5, 13979], [ 5, 16502], [ 5, 12426], [ 5, 12219], [ 5, 8014], [ 5, 9373], [ 5, 9838], [ 5, 9299], [ 5, 15905], [ 5, 11342], [ 5, 10124], [ 5, 9889], [ 5, 13156], [ 5, 13786], [ 5, 14397], [ 5, 10058], [ 5, 10874], [ 5, 10874], [ 5, 5112], [ 5, 12410], [ 5, 13305], [ 5, 11075], [ 5, 9059], [ 5, 10000], [ 5, 12889], [ 5, 25719], [ 5, 50461], [ 5, 19886], [ 5, 42199], [ 5, 10381], [ 5, 220070], [ 5, 44919], [ 5, 86529], [ 5, 33134], [ 5, 106353]], dtype=int64)
# Standardise each feature column (StandardScaler with default settings).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x = scaler.fit_transform(x)
# Summary statistics of the scaled data. describe() only returns the table,
# so it must be handed to display() to actually appear in the output.
display(pd.DataFrame(x).describe())
display(x)
array([[-9.67733402e-01, -5.59247666e-01], [-9.67733402e-01, -5.72745277e-01], [-9.67733402e-01, -3.40786342e-01], [-9.67733402e-01, -4.52737111e-01], [-9.67733402e-01, -3.68281474e-01], [-9.67733402e-01, -2.43185973e-01], [-9.67733402e-01, -2.48214495e-01], [-9.67733402e-01, -2.89089612e-01], [-9.67733402e-01, -4.47737996e-01], [-9.67733402e-01, -3.66134795e-01], [-9.67733402e-01, -3.23318846e-01], [-9.67733402e-01, -3.05557284e-01], [-9.67733402e-01, -3.42903614e-01], [-9.67733402e-01, -4.74380338e-01], [-9.67733402e-01, -3.31582089e-01], [-9.67733402e-01, -3.59959418e-01], [-9.67733402e-01, -3.42491922e-01], [-9.67733402e-01, -3.56665883e-01], [-9.67733402e-01, -1.83931758e-01], [-9.67733402e-01, -3.49402463e-01], [-9.67733402e-01, -2.49655416e-01], [-9.67733402e-01, -2.06927687e-01], [-9.67733402e-01, -5.31632015e-03], [-9.67733402e-01, -3.16202459e-01], [-9.67733402e-01, -3.00793422e-01], [-9.67733402e-01, -3.34522745e-01], [-9.67733402e-01, -3.31876154e-01], [-9.67733402e-01, -2.58271538e-01], [-9.67733402e-01, -5.65952362e-01], [-9.67733402e-01, -3.69634176e-01], [-9.67733402e-01, -3.28906092e-01], [-9.67733402e-01, -1.46497208e-01], [-9.67733402e-01, -3.19054895e-01], [-9.67733402e-01, 2.34432943e-02], [-9.67733402e-01, -4.76409390e-01], [-9.67733402e-01, -4.30035247e-01], [-9.67733402e-01, -4.08833119e-01], [-9.67733402e-01, -4.31564388e-01], [-9.67733402e-01, -4.13567575e-01], [-9.67733402e-01, -4.16890516e-01], [-9.67733402e-01, -2.00899342e-01], [-9.67733402e-01, -2.20248858e-01], [-9.67733402e-01, -1.24354070e-01], [-9.67733402e-01, -2.74856837e-01], [-9.67733402e-01, -3.68840199e-01], [-9.67733402e-01, 2.32612146e-01], [-9.67733402e-01, -1.47408812e-01], [-9.67733402e-01, 4.44395772e-02], [-9.67733402e-01, -2.84355156e-01], [-9.67733402e-01, 9.06960940e-02], [-9.67733402e-01, -2.99320202e-03], [-9.67733402e-01, -3.27935676e-01], [-9.67733402e-01, -2.25159753e-01], [-9.67733402e-01, -1.64905714e-01], [-9.67733402e-01, -2.45450278e-01], 
[-9.67733402e-01, 3.24799008e+00], [-9.67733402e-01, 3.26769248e+00], [-9.67733402e-01, 5.31571226e+00], [-9.67733402e-01, 1.86926358e+00], [-9.67733402e-01, 7.12556591e-01], [-9.67733402e-01, 2.70076343e+00], [-9.67733402e-01, 2.15874174e+00], [-9.67733402e-01, 7.99627001e-02], [ 1.03334245e+00, -2.97294041e-01], [ 1.03334245e+00, -5.33078239e-02], [ 1.03334245e+00, 1.85029924e-02], [ 1.03334245e+00, -3.08733193e-01], [ 1.03334245e+00, -3.56371818e-01], [ 1.03334245e+00, -3.09821235e-01], [ 1.03334245e+00, -2.66475968e-01], [ 1.03334245e+00, -3.94129839e-01], [ 1.03334245e+00, -4.25565451e-01], [ 1.03334245e+00, -1.94018207e-01], [ 1.03334245e+00, -2.88178008e-01], [ 1.03334245e+00, -4.33240562e-01], [ 1.03334245e+00, -3.41933198e-01], [ 1.03334245e+00, -3.49931782e-01], [ 1.03334245e+00, -3.48432047e-01], [ 1.03334245e+00, -3.44432755e-01], [ 1.03334245e+00, -2.91824422e-01], [ 1.03334245e+00, -2.95941340e-01], [ 1.03334245e+00, -4.29270677e-01], [ 1.03334245e+00, -3.90424613e-01], [ 1.03334245e+00, -4.44091582e-01], [ 1.03334245e+00, -5.01081493e-01], [ 1.03334245e+00, -2.26630081e-01], [ 1.03334245e+00, -3.98217351e-01], [ 1.03334245e+00, -2.49890669e-01], [ 1.03334245e+00, -1.75697921e-01], [ 1.03334245e+00, -2.95559055e-01], [ 1.03334245e+00, -3.01646212e-01], [ 1.03334245e+00, -4.25300791e-01], [ 1.03334245e+00, -3.85337278e-01], [ 1.03334245e+00, -3.71663228e-01], [ 1.03334245e+00, -3.87513364e-01], [ 1.03334245e+00, -1.93253637e-01], [ 1.03334245e+00, -3.27435764e-01], [ 1.03334245e+00, -3.63252953e-01], [ 1.03334245e+00, -3.70163494e-01], [ 1.03334245e+00, -2.74092267e-01], [ 1.03334245e+00, -2.55566135e-01], [ 1.03334245e+00, -2.37598727e-01], [ 1.03334245e+00, -3.65193785e-01], [ 1.03334245e+00, -3.41198034e-01], [ 1.03334245e+00, -3.41198034e-01], [ 1.03334245e+00, -5.10638625e-01], [ 1.03334245e+00, -2.96029559e-01], [ 1.03334245e+00, -2.69710689e-01], [ 1.03334245e+00, -3.35287315e-01], [ 1.03334245e+00, -3.94570938e-01], [ 1.03334245e+00, 
-3.66899366e-01], [ 1.03334245e+00, -2.81943818e-01], [ 1.03334245e+00, 9.53423303e-02], [ 1.03334245e+00, 8.22919406e-01], [ 1.03334245e+00, -7.61861266e-02], [ 1.03334245e+00, 5.79962418e-01], [ 1.03334245e+00, -3.55695467e-01], [ 1.03334245e+00, 5.81053642e+00], [ 1.03334245e+00, 6.59948257e-01], [ 1.03334245e+00, 1.88355516e+00], [ 1.03334245e+00, 3.13391963e-01], [ 1.03334245e+00, 2.46651078e+00]])
# Finding the optimal number of clusters using the elbow method
from sklearn.cluster import KMeans

# Within-cluster sum of squares (KMeans inertia_) for k = 1..10.
k_values = range(1, 11)
wcss_list = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

# Plot WCSS against k; the "elbow" of this curve suggests a cluster count.
mtp.plot(k_values, wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('wcss_list')
mtp.show()
# Train the K-means model with k = 3 (presumably read off the elbow plot)
# and get one cluster label per row of x.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
y_predict = kmeans.fit_predict(x)
print(y_predict)

# Visualising the clusters. Column 0 of x is the scaled child_age and
# column 1 the scaled total_duration, so age belongs on the horizontal axis
# and duration on the vertical one.
cluster_styles = [
    (0, 'blue', 'Cluster 1'),
    (1, 'green', 'Cluster 2'),
    (2, 'red', 'Cluster 3'),
]
for cluster_id, colour, legend_label in cluster_styles:
    members = y_predict == cluster_id
    mtp.scatter(x[members, 0], x[members, 1], s=100, c=colour, label=legend_label)

centroids = kmeans.cluster_centers_
mtp.scatter(centroids[:, 0], centroids[:, 1], s=300, c='yellow', label='Centroid')
mtp.title('Clusters of children')
mtp.xlabel('Age')
mtp.ylabel('Total Duration')
mtp.legend()
mtp.show()
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 2 2 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 0 2]
# Pair the two unscaled feature columns with each row's K-means label.
new_df = dataset.iloc[:, [12, 14]].copy()
new_df['clusters'] = y_predict
# The whole frame is displayed; a bare new_df.head() whose result was
# discarded has been removed because it had no effect.
display(new_df)
child_age | total_duration | clusters | |
---|---|---|---|
48 | 4 | 3459 | 1 |
49 | 4 | 3000 | 1 |
50 | 4 | 10888 | 1 |
51 | 4 | 7081 | 1 |
52 | 4 | 9953 | 1 |
... | ... | ... | ... |
211 | 5 | 220070 | 2 |
212 | 5 | 44919 | 0 |
213 | 5 | 86529 | 2 |
214 | 5 | 33134 | 0 |
215 | 5 | 106353 | 2 |
122 rows × 3 columns
# Number of rows assigned to cluster label 0.
new_df[new_df["clusters"] == 0].shape[0]
56
# cluster_1 holds the rows labelled 0; report their total_duration range.
cluster_1 = new_df.loc[new_df["clusters"] == 0]
durations = cluster_1['total_duration']
minVal, maxVal = durations.min(), durations.max()
print(minVal)
print(maxVal)
5112 50461
# Show the label-0 rows, then a box plot of their total_duration.
display(cluster_1)
cluster_1.boxplot(grid=False, column=['total_duration'])
child_age | total_duration | clusters | |
---|---|---|---|
157 | 5 | 12367 | 0 |
158 | 5 | 20664 | 0 |
159 | 5 | 23106 | 0 |
160 | 5 | 11978 | 0 |
161 | 5 | 10358 | 0 |
162 | 5 | 11941 | 0 |
163 | 5 | 13415 | 0 |
164 | 5 | 9074 | 0 |
165 | 5 | 8005 | 0 |
166 | 5 | 15879 | 0 |
167 | 5 | 12677 | 0 |
168 | 5 | 7744 | 0 |
169 | 5 | 10849 | 0 |
170 | 5 | 10577 | 0 |
171 | 5 | 10628 | 0 |
172 | 5 | 10764 | 0 |
173 | 5 | 12553 | 0 |
174 | 5 | 12413 | 0 |
175 | 5 | 7879 | 0 |
176 | 5 | 9200 | 0 |
177 | 5 | 7375 | 0 |
178 | 5 | 5437 | 0 |
179 | 5 | 14770 | 0 |
180 | 5 | 8935 | 0 |
181 | 5 | 13979 | 0 |
182 | 5 | 16502 | 0 |
183 | 5 | 12426 | 0 |
184 | 5 | 12219 | 0 |
185 | 5 | 8014 | 0 |
186 | 5 | 9373 | 0 |
187 | 5 | 9838 | 0 |
188 | 5 | 9299 | 0 |
189 | 5 | 15905 | 0 |
190 | 5 | 11342 | 0 |
191 | 5 | 10124 | 0 |
192 | 5 | 9889 | 0 |
193 | 5 | 13156 | 0 |
194 | 5 | 13786 | 0 |
195 | 5 | 14397 | 0 |
196 | 5 | 10058 | 0 |
197 | 5 | 10874 | 0 |
198 | 5 | 10874 | 0 |
199 | 5 | 5112 | 0 |
200 | 5 | 12410 | 0 |
201 | 5 | 13305 | 0 |
202 | 5 | 11075 | 0 |
203 | 5 | 9059 | 0 |
204 | 5 | 10000 | 0 |
205 | 5 | 12889 | 0 |
206 | 5 | 25719 | 0 |
207 | 5 | 50461 | 0 |
208 | 5 | 19886 | 0 |
209 | 5 | 42199 | 0 |
210 | 5 | 10381 | 0 |
212 | 5 | 44919 | 0 |
214 | 5 | 33134 | 0 |
<AxesSubplot:>
# Number of rows assigned to cluster label 1.
new_df[new_df["clusters"] == 1].shape[0]
57
# cluster_2 holds the rows labelled 1; report their total_duration range.
cluster_2 = new_df.loc[new_df["clusters"] == 1]
durations = cluster_2['total_duration']
minVal, maxVal = durations.min(), durations.max()
print(minVal)
print(maxVal)
3000 46708
# Show the label-1 rows, then a box plot of their total_duration.
# A bare cluster_2.sort_values('total_duration') was removed here: it
# returned a sorted copy that was discarded, so it never changed cluster_2,
# and the box plot does not depend on row order.
display(cluster_2)
cluster_2.boxplot(column=['total_duration'], grid=False)
child_age | total_duration | clusters | |
---|---|---|---|
48 | 4 | 3459 | 1 |
49 | 4 | 3000 | 1 |
50 | 4 | 10888 | 1 |
51 | 4 | 7081 | 1 |
52 | 4 | 9953 | 1 |
53 | 4 | 14207 | 1 |
54 | 4 | 14036 | 1 |
55 | 4 | 12646 | 1 |
56 | 4 | 7251 | 1 |
57 | 4 | 10026 | 1 |
58 | 4 | 11482 | 1 |
59 | 4 | 12086 | 1 |
60 | 4 | 10816 | 1 |
61 | 4 | 6345 | 1 |
62 | 4 | 11201 | 1 |
63 | 4 | 10236 | 1 |
64 | 4 | 10830 | 1 |
65 | 4 | 10348 | 1 |
66 | 4 | 16222 | 1 |
67 | 4 | 10595 | 1 |
68 | 4 | 13987 | 1 |
69 | 4 | 15440 | 1 |
70 | 4 | 22296 | 1 |
71 | 4 | 11724 | 1 |
72 | 4 | 12248 | 1 |
73 | 4 | 11101 | 1 |
74 | 4 | 11191 | 1 |
75 | 4 | 13694 | 1 |
76 | 4 | 3231 | 1 |
77 | 4 | 9907 | 1 |
78 | 4 | 11292 | 1 |
79 | 4 | 17495 | 1 |
80 | 4 | 11627 | 1 |
81 | 4 | 23274 | 1 |
82 | 4 | 6276 | 1 |
83 | 4 | 7853 | 1 |
84 | 4 | 8574 | 1 |
85 | 4 | 7801 | 1 |
86 | 4 | 8413 | 1 |
87 | 4 | 8300 | 1 |
88 | 4 | 15645 | 1 |
89 | 4 | 14987 | 1 |
90 | 4 | 18248 | 1 |
91 | 4 | 13130 | 1 |
92 | 4 | 9934 | 1 |
93 | 4 | 30387 | 1 |
94 | 4 | 17464 | 1 |
95 | 4 | 23988 | 1 |
96 | 4 | 12807 | 1 |
97 | 4 | 25561 | 1 |
98 | 4 | 22375 | 1 |
99 | 4 | 11325 | 1 |
100 | 4 | 14820 | 1 |
101 | 4 | 16869 | 1 |
102 | 4 | 14130 | 1 |
107 | 4 | 46708 | 1 |
110 | 4 | 25196 | 1 |
<AxesSubplot:>
# Number of rows assigned to cluster label 2.
new_df[new_df["clusters"] == 2].shape[0]
9
# cluster_3 holds the rows labelled 2; report their total_duration range.
cluster_3 = new_df.loc[new_df["clusters"] == 2]
durations = cluster_3['total_duration']
minVal, maxVal = durations.min(), durations.max()
print(minVal)
print(maxVal)
86043 220070
# Show the label-2 rows, then a box plot of their total_duration.
# A bare cluster_3.sort_values('total_duration') was removed here: it
# returned a sorted copy that was discarded, so it never changed cluster_3,
# and the box plot does not depend on row order.
display(cluster_3)
cluster_3.boxplot(column=['total_duration'], grid=False)
child_age | total_duration | clusters | |
---|---|---|---|
103 | 4 | 132928 | 2 |
104 | 4 | 133598 | 2 |
105 | 4 | 203243 | 2 |
106 | 4 | 86043 | 2 |
108 | 4 | 114319 | 2 |
109 | 4 | 95887 | 2 |
211 | 5 | 220070 | 2 |
213 | 5 | 86529 | 2 |
215 | 5 | 106353 | 2 |
<AxesSubplot:>
from matplotlib import pyplot as plt

# One column of total_duration per cluster. Each Series keeps its own row
# index, so pandas aligns them on the union of those indices when building
# the frame.
duration_by_cluster = {
    "Cluster1": cluster_1['total_duration'],
    "Cluster2": cluster_2['total_duration'],
    "Cluster3": cluster_3['total_duration'],
}
data = pd.DataFrame(duration_by_cluster)

# Side-by-side box plots of the three clusters' durations.
box_columns = ['Cluster1', 'Cluster2', 'Cluster3']
ax = data[box_columns].plot(kind='box', title='boxplot')

# Render the figure.
plt.show()