In [1]:
# importing libraries    
import numpy as nm    
import matplotlib.pyplot as mtp    
import pandas as pd   
from sklearn.cluster import DBSCAN
from numpy import unique
from numpy import where
from matplotlib import pyplot
In [2]:
# Load the game results and discard every row whose `game` is 'Focused' or
# 'Sustained'; the rows that remain keep their original index labels.
dataset = (
    pd.read_csv('A4.csv')
    .loc[lambda frame: ~frame['game'].isin(['Focused', 'Sustained'])]
    .copy()
)
display(dataset)
# Summary statistics of the numeric columns
dataset.describe()
id child_gender child_age sequence_of_responses sequence_of_stimuli colour order_of_selection sequence_of_sides no_of_clicks total_correct_responses correct_responses commission_errors omission_errors mean_reaction_time total_duration diagnosis percentage_no_of_correct_responses game CER OER
0 1 2 4 [M, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ... red star, red crab, white bunny, pink pig, bro... NaN NaN right, right, left, left, left, left, right, l... NaN 19 18 0 1 1479 57000 Yes 94.736842 Alternating 0.00 1.00
1 2 1 4 [C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ... red star, red crab, white bunny, red fish, blu... NaN NaN right, right, left, right, right, left, left, ... NaN 19 19 0 0 1605 57000 No 100.000000 Alternating 0.00 0.00
2 3 2 4 [M, C, C, C, W, C, C, C, C, W, C, W, C, C, C, ... red star, white bunny, pink pig, brown dog, re... NaN NaN right, left, left, left, right, left, left, le... NaN 19 18 3 1 1404 57000 No 94.736842 Alternating 0.75 0.25
3 4 2 4 [C, C, C, W, C, C, C, C, W, C, C, C, C, C, W, ... white bunny, pink pig, brown dog, red star, pa... NaN NaN left, left, left, right, left, left, right, le... NaN 19 19 4 0 1782 57000 No 100.000000 Alternating 1.00 0.00
4 5 2 4 [C, C, C, W, C, C, C, W, C, W, C, W, C, C, C, ... red star, red crab, white bunny, red fish, blu... NaN NaN right, right, left, right, right, right, left,... NaN 19 19 6 0 1258 57000 No 100.000000 Alternating 1.00 0.00
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
98 99 1 4 [W, W, C, C, C, C, W, W, C, C, C] ladybird red red_ladybird, red_ladybird, red_ladybird, red_... NaN 11.0 6 6 4 0 0 22375 No 100.000000 Selective 1.00 0.00
99 100 1 4 [C, C, C, C, C, C] ladybird red red_ladybird, red_ladybird, red_ladybird, red_... NaN 6.0 6 6 0 0 0 11325 No 100.000000 Selective 0.00 0.00
100 101 1 4 [C, C, C, C, C, C] butterfly blue red_ladybird, red_ladybird, red_ladybird, red_... NaN 6.0 6 6 0 0 0 14820 No 100.000000 Selective 0.00 0.00
101 102 1 4 [C, C, C, C, C, C, C] bird green red_ladybird, red_ladybird, red_ladybird, red_... NaN 7.0 8 7 0 1 0 16869 No 87.500000 Selective 0.00 1.00
102 103 1 4 [C, C, C, C, C, C] bird green red_ladybird, red_ladybird, red_ladybird, red_... NaN 6.0 6 6 0 0 0 14130 No 100.000000 Selective 0.00 0.00

77 rows × 20 columns

Out[2]:
id child_gender child_age no_of_clicks total_correct_responses correct_responses commission_errors omission_errors mean_reaction_time total_duration percentage_no_of_correct_responses CER OER
count 77.000000 77.00000 77.0 55.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000
mean 57.571429 1.38961 4.0 6.781818 8.688312 7.909091 0.896104 0.779221 373.298701 27235.090909 91.246786 0.237734 0.242785
std 32.468782 0.49086 0.0 2.087803 4.347594 4.240102 1.602501 1.518512 605.455267 23804.601968 18.103695 0.391351 0.394733
min 1.000000 1.00000 4.0 1.000000 6.000000 1.000000 0.000000 0.000000 0.000000 3000.000000 12.500000 0.000000 0.000000
25% 20.000000 1.00000 4.0 6.000000 6.000000 6.000000 0.000000 0.000000 0.000000 10816.000000 87.500000 0.000000 0.000000
50% 65.000000 1.00000 4.0 6.000000 7.000000 6.000000 0.000000 0.000000 0.000000 14130.000000 100.000000 0.000000 0.000000
75% 84.000000 2.00000 4.0 7.000000 8.000000 8.000000 2.000000 1.000000 1043.000000 57000.000000 100.000000 0.500000 0.444444
max 103.000000 2.00000 4.0 13.000000 19.000000 19.000000 6.000000 7.000000 1782.000000 70000.000000 100.000000 1.000000 1.000000
In [3]:
# Feature matrix for clustering: the CER and OER columns (the last two columns
# of `dataset`, i.e. positions 18 and 19). Selecting them by label keeps the
# cell correct even if the column order of the CSV changes.
x = dataset[['CER', 'OER']].to_numpy()
display(x)
array([[0.        , 1.        ],
       [0.        , 0.        ],
       [0.75      , 0.25      ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.55555556, 0.44444444],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.25      , 0.75      ],
       [0.        , 0.        ],
       [0.5       , 0.5       ],
       [0.4       , 0.6       ],
       [0.8       , 0.2       ],
       [0.33333333, 0.66666667],
       [0.5       , 0.5       ],
       [1.        , 0.        ],
       [0.8       , 0.2       ],
       [1.        , 0.        ],
       [0.        , 0.        ],
       [0.75      , 0.25      ],
       [0.66666667, 0.33333333],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 0.        ]])
In [4]:
# Standardize both features to zero mean and unit variance
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Fit on the raw CER/OER columns taken from `dataset` rather than on `x`
# itself: re-running this cell then never refits the scaler on values that
# were already scaled, so `scaler` always holds the original mean and scale.
x = scaler.fit_transform(dataset[['CER', 'OER']].to_numpy())

# Summary statistics of the scaled data (column 0 = CER, column 1 = OER)
pd.DataFrame(x).describe()
Out[4]:
0 1
count 7.700000e+01 7.700000e+01
mean -6.920871e-17 6.776686e-17
std 1.006557e+00 1.006557e+00
min -6.114545e-01 -6.190950e-01
25% -6.114545e-01 -6.190950e-01
50% -6.114545e-01 -6.190950e-01
75% 6.745484e-01 5.142260e-01
max 1.960551e+00 1.930877e+00
In [5]:
# Elbow method: fit K-means for k = 1..10 on the scaled features and plot the
# within-cluster sum of squares (WCSS, exposed by scikit-learn as `inertia_`)
# against k.
from sklearn.cluster import KMeans

cluster_counts = range(1, 11)

# n_init is pinned explicitly because scikit-learn's default for it differs
# between releases; a fixed value keeps the curve reproducible.
wcss_list = [
    KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(x).inertia_
    for k in cluster_counts
]

mtp.plot(cluster_counts, wcss_list)
mtp.title('The Elbow Method Graph')
mtp.xlabel('Number of clusters(k)')
mtp.ylabel('WCSS')
mtp.show()
In [6]:
# Train the final K-means model with k = 3 on the scaled features
# (k is presumably the value read off the elbow plot -- confirm).
# n_init is pinned explicitly because scikit-learn's default for it differs
# between releases; a fixed value keeps the labels reproducible.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
y_predict = kmeans.fit_predict(x)
print(y_predict)

# Visualize the clusters: one scatter series per label, then the centroids.
# Legend names are 1-based while the K-means labels are 0-based.
cluster_styles = [
    (0, 'blue', 'Cluster 1'),
    (1, 'green', 'Cluster 2'),
    (2, 'red', 'Cluster 3'),
]
for cluster_label, colour, legend_name in cluster_styles:
    members = x[y_predict == cluster_label]
    mtp.scatter(members[:, 0], members[:, 1], s=100, c=colour, label=legend_name)

mtp.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroid')
mtp.title('Clusters of children')
# The plotted values are the standardized features, not the raw 0..1 values
mtp.xlabel('Commission Errors (CER, standardized)')
mtp.ylabel('Omission Errors (OER, standardized)')
mtp.legend()
mtp.show()
[1 0 2 2 2 2 2 1 1 1 0 2 1 2 1 2 2 2 2 0 2 2 0 0 0 0 1 2 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 1 1 2 1 0 1 1 0 1 0 0 2 0 0 1 0 0 0 0 0 0 0 2 0 0 2 2 0 0 0 2 0
 0 1 0]
In [7]:
# Pair the unscaled CER/OER values with the cluster label K-means gave each row.
# `assign` returns a new frame, so `dataset` itself is left untouched.
new_df = dataset[['CER', 'OER']].assign(clusters=y_predict)
display(new_df)
CER OER clusters
0 0.00 1.00 1
1 0.00 0.00 0
2 0.75 0.25 2
3 1.00 0.00 2
4 1.00 0.00 2
... ... ... ...
98 1.00 0.00 2
99 0.00 0.00 0
100 0.00 0.00 0
101 0.00 1.00 1
102 0.00 0.00 0

77 rows × 3 columns

Cluster Analysis¶

Cluster 1¶

In [8]:
# Number of rows assigned to cluster label 0
len(new_df.loc[new_df["clusters"].eq(0)])
Out[8]:
40
In [9]:
# Value range of each error measure among the rows with cluster label 0
cluster_0 = new_df.loc[new_df["clusters"].eq(0)]

cer_values = cluster_0['CER']
print("CER min - ", cer_values.min())
print("CER max - ", cer_values.max())
print()

oer_values = cluster_0['OER']
print("OER min - ", oer_values.min())
print("OER max - ", oer_values.max())
CER min -  0.0
CER max -  0.0

OER min -  0.0
OER max -  0.0
In [10]:
# Rows with cluster label 0, followed by a box plot of their CER values
cluster_0 = new_df.loc[new_df["clusters"].eq(0)]
display(cluster_0)
cluster_0[['CER']].boxplot(grid=False)
CER OER clusters
1 0.0 0.0 0
10 0.0 0.0 0
19 0.0 0.0 0
48 0.0 0.0 0
49 0.0 0.0 0
50 0.0 0.0 0
51 0.0 0.0 0
54 0.0 0.0 0
55 0.0 0.0 0
56 0.0 0.0 0
57 0.0 0.0 0
58 0.0 0.0 0
59 0.0 0.0 0
60 0.0 0.0 0
61 0.0 0.0 0
62 0.0 0.0 0
64 0.0 0.0 0
66 0.0 0.0 0
67 0.0 0.0 0
72 0.0 0.0 0
75 0.0 0.0 0
77 0.0 0.0 0
78 0.0 0.0 0
80 0.0 0.0 0
81 0.0 0.0 0
83 0.0 0.0 0
84 0.0 0.0 0
85 0.0 0.0 0
86 0.0 0.0 0
87 0.0 0.0 0
88 0.0 0.0 0
89 0.0 0.0 0
91 0.0 0.0 0
92 0.0 0.0 0
95 0.0 0.0 0
96 0.0 0.0 0
97 0.0 0.0 0
99 0.0 0.0 0
100 0.0 0.0 0
102 0.0 0.0 0
Out[10]:
<AxesSubplot:>
In [11]:
# Box plot of the OER values of the rows with cluster label 0
cluster_0[['OER']].boxplot(grid=False)
Out[11]:
<AxesSubplot:>

Cluster 2¶

In [12]:
# Number of rows assigned to cluster label 1
len(new_df.loc[new_df["clusters"].eq(1)])
Out[12]:
17
In [13]:
# Value range of each error measure among the rows with cluster label 1
cluster_1 = new_df.loc[new_df["clusters"].eq(1)]

cer_values = cluster_1['CER']
print("CER min - ", cer_values.min())
print("CER max - ", cer_values.max())
print()

oer_values = cluster_1['OER']
print("OER min - ", oer_values.min())
print("OER max - ", oer_values.max())
CER min -  0.0
CER max -  0.4

OER min -  0.6
OER max -  1.0
In [14]:
# Rows with cluster label 1, followed by a box plot of their CER values
cluster_1 = new_df.loc[new_df["clusters"].eq(1)]
display(cluster_1)
cluster_1[['CER']].boxplot(grid=False)
CER OER clusters
0 0.000000 1.000000 1
7 0.000000 1.000000 1
8 0.000000 1.000000 1
9 0.250000 0.750000 1
12 0.400000 0.600000 1
14 0.333333 0.666667 1
52 0.000000 1.000000 1
63 0.000000 1.000000 1
65 0.000000 1.000000 1
68 0.000000 1.000000 1
69 0.000000 1.000000 1
71 0.000000 1.000000 1
73 0.000000 1.000000 1
74 0.000000 1.000000 1
76 0.000000 1.000000 1
82 0.000000 1.000000 1
101 0.000000 1.000000 1
Out[14]:
<AxesSubplot:>
In [15]:
# Box plot of the OER values of the rows with cluster label 1
cluster_1[['OER']].boxplot(grid=False)
Out[15]:
<AxesSubplot:>

Cluster 3¶

In [16]:
# Number of rows assigned to cluster label 2
len(new_df.loc[new_df["clusters"].eq(2)])
Out[16]:
20
In [17]:
# Value range of each error measure among the rows with cluster label 2
cluster_2 = new_df.loc[new_df["clusters"].eq(2)]

cer_values = cluster_2['CER']
print("CER min - ", cer_values.min())
print("CER max - ", cer_values.max())
print()

oer_values = cluster_2['OER']
print("OER min - ", oer_values.min())
print("OER max - ", oer_values.max())
CER min -  0.5
CER max -  1.0

OER min -  0.0
OER max -  0.5
In [18]:
# Rows with cluster label 2, followed by a box plot of their CER values
cluster_2 = new_df.loc[new_df["clusters"].eq(2)]
display(cluster_2)
cluster_2[['CER']].boxplot(grid=False)
CER OER clusters
2 0.750000 0.250000 2
3 1.000000 0.000000 2
4 1.000000 0.000000 2
5 0.555556 0.444444 2
6 1.000000 0.000000 2
11 0.500000 0.500000 2
13 0.800000 0.200000 2
15 0.500000 0.500000 2
16 1.000000 0.000000 2
17 0.800000 0.200000 2
18 1.000000 0.000000 2
20 0.750000 0.250000 2
21 0.666667 0.333333 2
53 1.000000 0.000000 2
70 1.000000 0.000000 2
79 1.000000 0.000000 2
90 1.000000 0.000000 2
93 1.000000 0.000000 2
94 1.000000 0.000000 2
98 1.000000 0.000000 2
Out[18]:
<AxesSubplot:>
In [19]:
# Box plot of the OER values of the rows with cluster label 2
cluster_2[['OER']].boxplot(grid=False)
Out[19]:
<AxesSubplot:>
In [20]:
from matplotlib import pyplot as plt

# Side-by-side comparison of the CER values of the three clusters.
# Each column holds one cluster's CER Series; pandas aligns them on the row
# index, so a row is NaN in the columns of the clusters it does not belong to.
data = pd.DataFrame({"Cluster1": cluster_0['CER'], "Cluster2": cluster_1['CER'], "Cluster3": cluster_2['CER']})

# One box per cluster, with a title and axis labels so the figure stands alone
ax = data.plot(kind='box', title='CER by cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('CER')

# Display the plot
plt.show()
In [21]:
from matplotlib import pyplot as plt

# Side-by-side comparison of the OER values of the three clusters.
# Each column holds one cluster's OER Series; pandas aligns them on the row
# index, so a row is NaN in the columns of the clusters it does not belong to.
data = pd.DataFrame({"Cluster1": cluster_0['OER'], "Cluster2": cluster_1['OER'], "Cluster3": cluster_2['OER']})

# One box per cluster, with a title and axis labels so the figure stands alone
ax = data.plot(kind='box', title='OER by cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('OER')

# Display the plot
plt.show()