# importing libraries
import numpy as nm
import matplotlib.pyplot as mtp
import pandas as pd
from sklearn.cluster import DBSCAN
from numpy import unique
from numpy import where
from matplotlib import pyplot
# Load the results file and keep only the rows whose percentage of
# correct responses is non-zero.
dataset = pd.read_csv('All.csv')
dataset = dataset.loc[dataset['percentage_no_of_correct_responses'] != 0]
dataset.head()
# Summary statistics of the data (left disabled)
# dataset.describe()
id | child_gender | sequence_of_responses | sequence_of_stimuli | colour | order_of_selection | sequence_of_sides | no_of_clicks | total_correct_responses | correct_responses | commission_errors | omission_errors | child_age | mean_reaction_time | total_duration | diagnosis | percentage_no_of_correct_responses | game | CER | OER | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2 | [M, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ... | red star, red crab, white bunny, pink pig, bro... | NaN | NaN | right, right, left, left, left, left, right, l... | NaN | 19 | 18 | 0 | 1 | 4 | 1479 | 57000 | Yes | 94.736842 | Alternating | 0.00 | 1.00 |
1 | 2 | 1 | [C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ... | red star, red crab, white bunny, red fish, blu... | NaN | NaN | right, right, left, right, right, left, left, ... | NaN | 19 | 19 | 0 | 0 | 4 | 1605 | 57000 | No | 100.000000 | Alternating | 0.00 | 0.00 |
2 | 3 | 2 | [M, C, C, C, W, C, C, C, C, W, C, W, C, C, C, ... | red star, white bunny, pink pig, brown dog, re... | NaN | NaN | right, left, left, left, right, left, left, le... | NaN | 19 | 18 | 3 | 1 | 4 | 1404 | 57000 | No | 94.736842 | Alternating | 0.75 | 0.25 |
3 | 4 | 2 | [C, C, C, W, C, C, C, C, W, C, C, C, C, C, W, ... | white bunny, pink pig, brown dog, red star, pa... | NaN | NaN | left, left, left, right, left, left, right, le... | NaN | 19 | 19 | 4 | 0 | 4 | 1782 | 57000 | No | 100.000000 | Alternating | 1.00 | 0.00 |
4 | 5 | 2 | [C, C, C, W, C, C, C, W, C, W, C, W, C, C, C, ... | red star, red crab, white bunny, red fish, blu... | NaN | NaN | right, right, left, right, right, right, left,... | NaN | 19 | 19 | 6 | 0 | 4 | 1258 | 57000 | No | 100.000000 | Alternating | 1.00 | 0.00 |
# Feature matrix for clustering: positional columns 12 and 16
# (child_age and percentage_no_of_correct_responses) as a NumPy array.
x = dataset.iloc[:, [12, 16]].to_numpy()
display(x)
array([[ 4. , 94.73684211], [ 4. , 100. ], [ 4. , 94.73684211], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 78.94736842], [ 4. , 100. ], [ 4. , 78.94736842], [ 4. , 94.73684211], [ 4. , 68.42105263], [ 4. , 100. ], [ 4. , 62.5 ], [ 4. , 62.5 ], [ 4. , 87.5 ], [ 4. , 50. ], [ 4. , 75. ], [ 4. , 100. ], [ 4. , 87.5 ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 87.5 ], [ 4. , 62.5 ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 90. ], [ 4. , 100. ], [ 4. , 80. ], [ 4. , 100. ], [ 4. , 80. ], [ 4. , 90. ], [ 4. , 100. ], [ 4. , 90. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 90. ], [ 4. , 100. ], [ 4. , 90. ], [ 4. , 70. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 90. ], [ 4. , 80. ], [ 4. , 70. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 85.71428571], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 87.5 ], [ 4. , 100. ], [ 4. , 12.5 ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 75. ], [ 4. , 85.71428571], [ 4. , 100. ], [ 4. , 33.33333333], [ 4. , 100. ], [ 4. , 85.71428571], [ 4. , 83.33333333], [ 4. , 100. ], [ 4. , 87.5 ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 16.66666667], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 100. ], [ 4. , 87.5 ], [ 4. , 100. ], [ 4. , 80. ], [ 4. , 90. ], [ 4. , 90.32258065], [ 4. , 92.30769231], [ 4. , 100. ], [ 4. , 88.23529412], [ 4. , 86.66666667], [ 4. , 75. ], [ 5. , 100. ], [ 5. , 94.73684211], [ 5. , 89.47368421], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 94.73684211], [ 5. , 94.73684211], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 84.21052632], [ 5. , 68.42105263], [ 5. , 100. ], [ 5. , 75. ], [ 5. , 100. 
], [ 5. , 87.5 ], [ 5. , 87.5 ], [ 5. , 75. ], [ 5. , 37.5 ], [ 5. , 100. ], [ 5. , 12.5 ], [ 5. , 91.66666667], [ 5. , 91.66666667], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 83.33333333], [ 5. , 83.33333333], [ 5. , 91.66666667], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 91.66666667], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 83.33333333], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 83.33333333], [ 5. , 100. ], [ 5. , 71.42857143], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 66.66666667], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 83.33333333], [ 5. , 100. ], [ 5. , 85.71428571], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 83.33333333], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 33.33333333], [ 5. , 100. ], [ 5. , 83.33333333], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 100. ], [ 5. , 50. ], [ 5. , 97.05882353], [ 5. , 71.42857143], [ 5. , 92.85714286], [ 5. , 80. ], [ 5. , 88.23529412]])
# Standardize both features (zero mean, unit variance) so that neither
# dominates the Euclidean distances used by K-means.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(x)
x = scaler.transform(x)
# Summary statistics of the scaled features
pd.DataFrame(x).describe()
0 | 1 | |
---|---|---|
count | 2.160000e+02 | 2.160000e+02 |
mean | 7.237009e-16 | -2.055969e-17 |
std | 1.002323e+00 | 1.002323e+00 |
min | -9.725975e-01 | -5.220439e+00 |
25% | -9.725975e-01 | -2.050824e-01 |
50% | -9.725975e-01 | 5.038019e-01 |
75% | 1.028175e+00 | 5.038019e-01 |
max | 1.028175e+00 | 5.038019e-01 |
# Find a suitable number of clusters with the elbow method: fit K-means for
# k = 1..10 and record the within-cluster sum of squares (WCSS) of each fit.
from sklearn.cluster import KMeans

wcss_list = []  # WCSS (KMeans.inertia_) for each k, in order
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(x)
    wcss_list.append(kmeans.inertia_)

# Plot WCSS against k; the "elbow" of the curve suggests the cluster count.
mtp.plot(range(1, 11), wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('wcss_list')
mtp.show()
# Train the final K-means model with k=3 and get one cluster label per row of x.
kmeans = KMeans(n_clusters=3, init='k-means++', random_state=42)
y_predict = kmeans.fit_predict(x)
print(y_predict)

# Visualize the clusters. Column 0 of x is the scaled age and column 1 the
# scaled percentage of correct responses, so age belongs on the x-axis.
for label, colour in enumerate(['blue', 'green', 'red']):
    members = y_predict == label  # boolean mask of the rows in this cluster
    mtp.scatter(x[members, 0], x[members, 1], s=100, c=colour,
                label='Cluster %d' % (label + 1))
mtp.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=300, c='yellow', label='Centroid')
mtp.title('Clusters of children')
mtp.xlabel('Age')
mtp.ylabel('Percentage of Correct Responses')
mtp.legend()
mtp.show()
[0 0 0 0 0 0 0 0 0 0 0 2 2 0 2 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1]
# Keep the two clustering features in their original (unscaled) units and
# attach the K-means label of each row.
new_df = dataset.iloc[:, [12, 16]].copy()
# y_predict is a plain array, so labels are assigned by position; it has one
# entry per row because x was built from these same dataset rows.
new_df['clusters'] = y_predict
display(new_df)
child_age | percentage_no_of_correct_responses | clusters | |
---|---|---|---|
0 | 4 | 94.736842 | 0 |
1 | 4 | 100.000000 | 0 |
2 | 4 | 94.736842 | 0 |
3 | 4 | 100.000000 | 0 |
4 | 4 | 100.000000 | 0 |
... | ... | ... | ... |
211 | 5 | 97.058824 | 1 |
212 | 5 | 71.428571 | 1 |
213 | 5 | 92.857143 | 1 |
214 | 5 | 80.000000 | 1 |
215 | 5 | 88.235294 | 1 |
216 rows × 3 columns
# Number of rows assigned to K-means label 0
new_df[new_df["clusters"] == 0].shape[0]
104
# Rows with K-means label 0, and the lowest/highest score among them
cluster_1 = new_df.loc[new_df["clusters"] == 0]
minVal = cluster_1['percentage_no_of_correct_responses'].min()
maxVal = cluster_1['percentage_no_of_correct_responses'].max()
print(minVal, maxVal, sep='\n')
68.42105263 100.0
# Show the label-0 rows, then a box plot of their scores (grid lines off)
display(cluster_1)
cluster_1.boxplot(grid=False, column=['percentage_no_of_correct_responses'])
child_age | percentage_no_of_correct_responses | clusters | |
---|---|---|---|
0 | 4 | 94.736842 | 0 |
1 | 4 | 100.000000 | 0 |
2 | 4 | 94.736842 | 0 |
3 | 4 | 100.000000 | 0 |
4 | 4 | 100.000000 | 0 |
... | ... | ... | ... |
106 | 4 | 92.307692 | 0 |
107 | 4 | 100.000000 | 0 |
108 | 4 | 88.235294 | 0 |
109 | 4 | 86.666667 | 0 |
110 | 4 | 75.000000 | 0 |
104 rows × 3 columns
<AxesSubplot:>
# Number of rows assigned to K-means label 1
new_df[new_df["clusters"] == 1].shape[0]
101
# Rows with K-means label 1, and the lowest/highest score among them
cluster_2 = new_df.loc[new_df["clusters"] == 1]
minVal = cluster_2['percentage_no_of_correct_responses'].min()
maxVal = cluster_2['percentage_no_of_correct_responses'].max()
print(minVal, maxVal, sep='\n')
66.66666667 100.0
# Show the label-1 rows, then a box plot of their scores (grid lines off).
# No sorting is done here: a bare sort_values() call returns a new frame
# instead of sorting in place, and the box plot does not depend on row order.
display(cluster_2)
cluster_2.boxplot(column=['percentage_no_of_correct_responses'], grid=False)
child_age | percentage_no_of_correct_responses | clusters | |
---|---|---|---|
111 | 5 | 100.000000 | 1 |
112 | 5 | 94.736842 | 1 |
113 | 5 | 89.473684 | 1 |
114 | 5 | 100.000000 | 1 |
115 | 5 | 100.000000 | 1 |
... | ... | ... | ... |
211 | 5 | 97.058824 | 1 |
212 | 5 | 71.428571 | 1 |
213 | 5 | 92.857143 | 1 |
214 | 5 | 80.000000 | 1 |
215 | 5 | 88.235294 | 1 |
101 rows × 3 columns
<AxesSubplot:>
# Number of rows assigned to K-means label 2
new_df[new_df["clusters"] == 2].shape[0]
11
# Rows with K-means label 2, and the lowest/highest score among them
cluster_3 = new_df.loc[new_df["clusters"] == 2]
minVal = cluster_3['percentage_no_of_correct_responses'].min()
maxVal = cluster_3['percentage_no_of_correct_responses'].max()
print(minVal, maxVal, sep='\n')
12.5 62.5
# Show the label-2 rows, then a box plot of their scores (grid lines off).
# No sorting is done here: a bare sort_values() call returns a new frame
# instead of sorting in place, and the box plot does not depend on row order.
display(cluster_3)
cluster_3.boxplot(column=['percentage_no_of_correct_responses'], grid=False)
child_age | percentage_no_of_correct_responses | clusters | |
---|---|---|---|
11 | 4 | 62.500000 | 2 |
12 | 4 | 62.500000 | 2 |
14 | 4 | 50.000000 | 2 |
21 | 4 | 62.500000 | 2 |
65 | 4 | 12.500000 | 2 |
71 | 4 | 33.333333 | 2 |
82 | 4 | 16.666667 | 2 |
128 | 5 | 37.500000 | 2 |
130 | 5 | 12.500000 | 2 |
199 | 5 | 33.333333 | 2 |
210 | 5 | 50.000000 | 2 |
<AxesSubplot:>
import matplotlib.pyplot as plt

# Put the three clusters' scores side by side, one column per cluster.
# The Series are aligned on their row index, so a row holds a value only in
# the column of the cluster it belongs to.
data = pd.DataFrame({
    "Cluster1": cluster_1['percentage_no_of_correct_responses'],
    "Cluster2": cluster_2['percentage_no_of_correct_responses'],
    "Cluster3": cluster_3['percentage_no_of_correct_responses'],
})
# One box per cluster on a shared axis
ax = data[['Cluster1', 'Cluster2', 'Cluster3']].plot.box(title='boxplot')
# Render the figure
plt.show()