Add new file

parent 808f5f54
# Explore how the values of each feature are distributed (per-column histograms)
df.hist(figsize=(20, 20), bins=30, color='b')
# Irrelevant feature removal
# Print the value counts of each column suspected to carry no information.
for col in ("StandardHours", "EmployeeCount", "Over18"):
    print(f"Value distribution of {col} column")
    print(df[col].value_counts().sort_values(ascending=False))
# The columns ['Over18', 'StandardHours', 'EmployeeCount'] have only one value and it is
# the same for all the entries, so these three columns can be dropped from df.
# 'EmployeeNumber' can also be dropped, as it is just an ID number.
# Drop the columns in place and check the shape of df.
# `columns=` already identifies the axis, so no separate `axis=1` argument is needed.
df.drop(columns=['Over18', 'StandardHours', 'EmployeeCount', 'EmployeeNumber'], inplace=True)
print("Dataset shape after irrelevant feature removal", df.shape)
df.head(10)
# EDA
# Statistical info about the numeric columns
df.describe()
# Correlation matrix.
# Restrict the frame to numeric/bool columns before correlating: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises instead.
# NOTE(review): presumably df still holds text columns (e.g. 'MaritalStatus', 'JobRole');
# confirm against the loading code.
corr_matrix = df.select_dtypes(include=["number", "bool"]).corr()
corr_matrix
# Show the correlation matrix visually as an annotated seaborn heat map
plt.figure(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True)
# Attrition counts by age and by marital status: first two slots of a 4-row, 1-column grid
plt.figure(figsize=(20, 20))
plt.subplot(4, 1, 1)
sns.countplot(data=df, x='Age', hue='Attrition')
plt.subplot(4, 1, 2)
sns.countplot(data=df, x='MaritalStatus', hue='Attrition')
# Attrition counts per job role, drawn as horizontal bars in a figure of their own
plt.figure(figsize=(15, 5))
sns.countplot(data=df, y='JobRole', hue='Attrition')
# Attrition counts by job involvement and job level: last two slots of a 4-row grid
# in a new figure
plt.figure(figsize=(15, 10))
plt.subplot(4, 1, 3)
sns.countplot(data=df, y='JobInvolvement', hue='Attrition')
plt.subplot(4, 1, 4)
sns.countplot(data=df, y='JobLevel', hue='Attrition')
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment