In [1]:
# importing libraries    
import numpy as nm    
import matplotlib.pyplot as mtp    
import pandas as pd   
from sklearn.cluster import DBSCAN
from numpy import unique
from numpy import where
from matplotlib import pyplot
In [2]:
# Read the raw records, then discard every row whose percentage of
# correct responses is exactly zero.
dataset = pd.read_csv('All.csv')
nonzero_score = dataset['percentage_no_of_correct_responses'] != 0
dataset = dataset.loc[nonzero_score]

# Preview the first rows of the filtered frame
# (dataset.describe() would give summary statistics instead).
dataset.head()
Out[2]:
id child_gender sequence_of_responses sequence_of_stimuli colour order_of_selection sequence_of_sides no_of_clicks total_correct_responses correct_responses commission_errors omission_errors child_age mean_reaction_time total_duration diagnosis percentage_no_of_correct_responses game CER OER
0 1 2 [M, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ... red star, red crab, white bunny, pink pig, bro... NaN NaN right, right, left, left, left, left, right, l... NaN 19 18 0 1 4 1479 57000 Yes 94.736842 Alternating 0.00 1.00
1 2 1 [C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ... red star, red crab, white bunny, red fish, blu... NaN NaN right, right, left, right, right, left, left, ... NaN 19 19 0 0 4 1605 57000 No 100.000000 Alternating 0.00 0.00
2 3 2 [M, C, C, C, W, C, C, C, C, W, C, W, C, C, C, ... red star, white bunny, pink pig, brown dog, re... NaN NaN right, left, left, left, right, left, left, le... NaN 19 18 3 1 4 1404 57000 No 94.736842 Alternating 0.75 0.25
3 4 2 [C, C, C, W, C, C, C, C, W, C, C, C, C, C, W, ... white bunny, pink pig, brown dog, red star, pa... NaN NaN left, left, left, right, left, left, right, le... NaN 19 19 4 0 4 1782 57000 No 100.000000 Alternating 1.00 0.00
4 5 2 [C, C, C, W, C, C, C, W, C, W, C, W, C, C, C, ... red star, red crab, white bunny, red fish, blu... NaN NaN right, right, left, right, right, right, left,... NaN 19 19 6 0 4 1258 57000 No 100.000000 Alternating 1.00 0.00
In [3]:
# Feature matrix for clustering: child age in column 0 and percentage of
# correct responses in column 1. The columns are selected by name instead of
# by position (previously 12 and 16) so the selection does not silently pick
# the wrong fields if the CSV column order ever changes.
feature_columns = ['child_age', 'percentage_no_of_correct_responses']
x = dataset[feature_columns].values
display(x)
array([[  4.        ,  94.73684211],
       [  4.        , 100.        ],
       [  4.        ,  94.73684211],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  78.94736842],
       [  4.        , 100.        ],
       [  4.        ,  78.94736842],
       [  4.        ,  94.73684211],
       [  4.        ,  68.42105263],
       [  4.        , 100.        ],
       [  4.        ,  62.5       ],
       [  4.        ,  62.5       ],
       [  4.        ,  87.5       ],
       [  4.        ,  50.        ],
       [  4.        ,  75.        ],
       [  4.        , 100.        ],
       [  4.        ,  87.5       ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  87.5       ],
       [  4.        ,  62.5       ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  90.        ],
       [  4.        , 100.        ],
       [  4.        ,  80.        ],
       [  4.        , 100.        ],
       [  4.        ,  80.        ],
       [  4.        ,  90.        ],
       [  4.        , 100.        ],
       [  4.        ,  90.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  90.        ],
       [  4.        , 100.        ],
       [  4.        ,  90.        ],
       [  4.        ,  70.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  90.        ],
       [  4.        ,  80.        ],
       [  4.        ,  70.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  85.71428571],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  87.5       ],
       [  4.        , 100.        ],
       [  4.        ,  12.5       ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  75.        ],
       [  4.        ,  85.71428571],
       [  4.        , 100.        ],
       [  4.        ,  33.33333333],
       [  4.        , 100.        ],
       [  4.        ,  85.71428571],
       [  4.        ,  83.33333333],
       [  4.        , 100.        ],
       [  4.        ,  87.5       ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  16.66666667],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        , 100.        ],
       [  4.        ,  87.5       ],
       [  4.        , 100.        ],
       [  4.        ,  80.        ],
       [  4.        ,  90.        ],
       [  4.        ,  90.32258065],
       [  4.        ,  92.30769231],
       [  4.        , 100.        ],
       [  4.        ,  88.23529412],
       [  4.        ,  86.66666667],
       [  4.        ,  75.        ],
       [  5.        , 100.        ],
       [  5.        ,  94.73684211],
       [  5.        ,  89.47368421],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  94.73684211],
       [  5.        ,  94.73684211],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  84.21052632],
       [  5.        ,  68.42105263],
       [  5.        , 100.        ],
       [  5.        ,  75.        ],
       [  5.        , 100.        ],
       [  5.        ,  87.5       ],
       [  5.        ,  87.5       ],
       [  5.        ,  75.        ],
       [  5.        ,  37.5       ],
       [  5.        , 100.        ],
       [  5.        ,  12.5       ],
       [  5.        ,  91.66666667],
       [  5.        ,  91.66666667],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  83.33333333],
       [  5.        ,  83.33333333],
       [  5.        ,  91.66666667],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  91.66666667],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  83.33333333],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  83.33333333],
       [  5.        , 100.        ],
       [  5.        ,  71.42857143],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  66.66666667],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  83.33333333],
       [  5.        , 100.        ],
       [  5.        ,  85.71428571],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  83.33333333],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  33.33333333],
       [  5.        , 100.        ],
       [  5.        ,  83.33333333],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        , 100.        ],
       [  5.        ,  50.        ],
       [  5.        ,  97.05882353],
       [  5.        ,  71.42857143],
       [  5.        ,  92.85714286],
       [  5.        ,  80.        ],
       [  5.        ,  88.23529412]])
In [4]:
# Standardize both feature columns with scikit-learn's StandardScaler.
# The scaler is fitted first and then applied, which is what fit_transform does.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(x)
x = scaler.transform(x)

# Summary statistics of the scaled features.
pd.DataFrame(x).describe()
Out[4]:
0 1
count 2.160000e+02 2.160000e+02
mean 7.237009e-16 -2.055969e-17
std 1.002323e+00 1.002323e+00
min -9.725975e-01 -5.220439e+00
25% -9.725975e-01 -2.050824e-01
50% -9.725975e-01 5.038019e-01
75% 1.028175e+00 5.038019e-01
max 1.028175e+00 5.038019e-01
In [5]:
# Elbow method: fit K-means for k = 1..10 and record, for each k, the
# within-cluster sum of squares (WCSS), which scikit-learn exposes as
# `inertia_` on the fitted model.
from sklearn.cluster import KMeans

k_values = range(1, 11)
wcss_list = []  # one WCSS value per candidate k, in the order of k_values

for i in k_values:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

# Plot WCSS against k; the point where the curve flattens (the "elbow")
# is the usual heuristic for choosing the number of clusters.
mtp.plot(k_values, wcss_list)
mtp.title('The Elbow Method Graph')  # fixed typo: was 'Elobw'
mtp.xlabel('Number of clusters (k)')
mtp.ylabel('WCSS')  # label the quantity, not the variable name
mtp.show()
In [6]:
# Train the K-means model with k = 3 and obtain one cluster label per row of x.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
y_predict = kmeans.fit_predict(x)
print(y_predict)

# Visualize the clusters.
# Column 0 of x holds the standardized age and column 1 the standardized
# percentage of correct responses, so age belongs on the horizontal axis
# (the axis labels were previously swapped).
cluster_styles = [
    (0, 'blue', 'Cluster 1'),
    (1, 'green', 'Cluster 2'),
    (2, 'red', 'Cluster 3'),
]
for cluster_id, colour, cluster_label in cluster_styles:
    members = y_predict == cluster_id  # boolean mask of rows in this cluster
    mtp.scatter(x[members, 0], x[members, 1], s=100, c=colour, label=cluster_label)

# Overlay the fitted cluster centres.
mtp.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='yellow', label='Centroid')
mtp.title('Clusters of children')
mtp.xlabel('Age (standardized)')
mtp.ylabel('Percentage of Correct Responses (standardized)')
mtp.legend()
mtp.show()
[0 0 0 0 0 0 0 0 0 0 0 2 2 0 2 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0
 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1]
In [7]:
# Pair the two clustering features (columns 12 and 16 of `dataset`) with the
# label K-means assigned to each row. `assign` returns a new frame, so
# `dataset` itself is not modified.
new_df = dataset.iloc[:, [12, 16]].assign(clusters=y_predict)
display(new_df)
child_age percentage_no_of_correct_responses clusters
0 4 94.736842 0
1 4 100.000000 0
2 4 94.736842 0
3 4 100.000000 0
4 4 100.000000 0
... ... ... ...
211 5 97.058824 1
212 5 71.428571 1
213 5 92.857143 1
214 5 80.000000 1
215 5 88.235294 1

216 rows × 3 columns

Cluster Analysis¶

Cluster 1¶

In [8]:
# Number of rows carrying cluster label 0.
new_df.loc[new_df["clusters"] == 0].shape[0]
Out[8]:
104
In [9]:
# Rows labelled 0, and the range of their percentage of correct responses.
cluster_1 = new_df.loc[new_df["clusters"] == 0]

cluster_1_scores = cluster_1['percentage_no_of_correct_responses']
minVal, maxVal = cluster_1_scores.min(), cluster_1_scores.max()

# Minimum on the first line, maximum on the second.
print(minVal, maxVal, sep='\n')
68.42105263
100.0
In [10]:
# Show the rows of the first cluster, then a box plot of their
# percentage of correct responses (grid lines disabled).
display(cluster_1)
cluster_1.boxplot(column='percentage_no_of_correct_responses', grid=False)
child_age percentage_no_of_correct_responses clusters
0 4 94.736842 0
1 4 100.000000 0
2 4 94.736842 0
3 4 100.000000 0
4 4 100.000000 0
... ... ... ...
106 4 92.307692 0
107 4 100.000000 0
108 4 88.235294 0
109 4 86.666667 0
110 4 75.000000 0

104 rows × 3 columns

Out[10]:
<AxesSubplot:>

Cluster 2¶

In [11]:
# Number of rows carrying cluster label 1.
new_df.loc[new_df["clusters"] == 1].shape[0]
Out[11]:
101
In [12]:
# Rows labelled 1, and the range of their percentage of correct responses.
cluster_2 = new_df.loc[new_df["clusters"] == 1]

cluster_2_scores = cluster_2['percentage_no_of_correct_responses']
minVal, maxVal = cluster_2_scores.min(), cluster_2_scores.max()

# Minimum on the first line, maximum on the second.
print(minVal, maxVal, sep='\n')
66.66666667
100.0
In [13]:
# Show the rows of the second cluster, then a box plot of their
# percentage of correct responses (grid lines disabled).
# The former bare `cluster_2.sort_values(...)` call was removed: sort_values
# returns a new frame and its result was discarded, so it had no effect on
# either the displayed table or the plot.
display(cluster_2)
cluster_2.boxplot(column=['percentage_no_of_correct_responses'], grid=False)
child_age percentage_no_of_correct_responses clusters
111 5 100.000000 1
112 5 94.736842 1
113 5 89.473684 1
114 5 100.000000 1
115 5 100.000000 1
... ... ... ...
211 5 97.058824 1
212 5 71.428571 1
213 5 92.857143 1
214 5 80.000000 1
215 5 88.235294 1

101 rows × 3 columns

Out[13]:
<AxesSubplot:>

Cluster 3¶

In [14]:
# Number of rows carrying cluster label 2.
new_df.loc[new_df["clusters"] == 2].shape[0]
Out[14]:
11
In [15]:
# Rows labelled 2, and the range of their percentage of correct responses.
cluster_3 = new_df.loc[new_df["clusters"] == 2]

cluster_3_scores = cluster_3['percentage_no_of_correct_responses']
minVal, maxVal = cluster_3_scores.min(), cluster_3_scores.max()

# Minimum on the first line, maximum on the second.
print(minVal, maxVal, sep='\n')
12.5
62.5
In [16]:
# Show the rows of the third cluster, then a box plot of their
# percentage of correct responses (grid lines disabled).
# The former bare `cluster_3.sort_values(...)` call was removed: sort_values
# returns a new frame and its result was discarded, so it had no effect on
# either the displayed table or the plot.
display(cluster_3)
cluster_3.boxplot(column=['percentage_no_of_correct_responses'], grid=False)
child_age percentage_no_of_correct_responses clusters
11 4 62.500000 2
12 4 62.500000 2
14 4 50.000000 2
21 4 62.500000 2
65 4 12.500000 2
71 4 33.333333 2
82 4 16.666667 2
128 5 37.500000 2
130 5 12.500000 2
199 5 33.333333 2
210 5 50.000000 2
Out[16]:
<AxesSubplot:>
In [17]:
from matplotlib import pyplot as plt

# Collect the percentage-of-correct-responses series of each cluster as one
# column per cluster. The series are aligned on their row index, so a row
# holds a value only in the column of the cluster it belongs to.
score_column = 'percentage_no_of_correct_responses'
clusters_by_label = (
    ("Cluster1", cluster_1),
    ("Cluster2", cluster_2),
    ("Cluster3", cluster_3),
)
data = pd.DataFrame({label: frame[score_column] for label, frame in clusters_by_label})

# Draw the three distributions side by side as box plots.
ax = data.plot.box(title='boxplot')

# Render the figure.
plt.show()
In [ ]: