Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
2020_21 J-25
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
3
Merge Requests
3
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
2020_21 J-25
2020_21 J-25
Commits
bfbfadc6
Commit
bfbfadc6
authored
Jul 09, 2021
by
Amuthini
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
source files updated
parent
9606323e
Changes
24
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
24 changed files
with
2131 additions
and
211 deletions
+2131
-211
it17173100/.idea/inspectionProfiles/profiles_settings.xml
it17173100/.idea/inspectionProfiles/profiles_settings.xml
+0
-6
it17173100/.idea/misc.xml
it17173100/.idea/misc.xml
+0
-4
it17173100/.idea/modules.xml
it17173100/.idea/modules.xml
+0
-8
it17173100/.idea/workspace.xml
it17173100/.idea/workspace.xml
+0
-193
it17173100/Personality_prediction/.gitignore
it17173100/Personality_prediction/.gitignore
+9
-0
it17173100/Personality_prediction/.pre-commit-config.yaml
it17173100/Personality_prediction/.pre-commit-config.yaml
+5
-0
it17173100/Personality_prediction/Adaboost parameter tunning.py
...3100/Personality_prediction/Adaboost parameter tunning.py
+96
-0
it17173100/Personality_prediction/Adaboost.py
it17173100/Personality_prediction/Adaboost.py
+76
-0
it17173100/Personality_prediction/AdaboostClassifier.txt.txt
it17173100/Personality_prediction/AdaboostClassifier.txt.txt
+28
-0
it17173100/Personality_prediction/AdaboostClassifier_tunning_result.txt
...sonality_prediction/AdaboostClassifier_tunning_result.txt
+28
-0
it17173100/Personality_prediction/SGDClassifier.txt.txt
it17173100/Personality_prediction/SGDClassifier.txt.txt
+29
-0
it17173100/Personality_prediction/SGDClassifier_tunning_result.txt
...0/Personality_prediction/SGDClassifier_tunning_result.txt
+33
-0
it17173100/Personality_prediction/XGBoost classifier.py
it17173100/Personality_prediction/XGBoost classifier.py
+74
-0
it17173100/Personality_prediction/XGBoostClassifier.txt.txt
it17173100/Personality_prediction/XGBoostClassifier.txt.txt
+29
-0
it17173100/Personality_prediction/XGBoostClassifier_tunning_results.txt
...sonality_prediction/XGBoostClassifier_tunning_results.txt
+29
-0
it17173100/Personality_prediction/get_tweets.py
it17173100/Personality_prediction/get_tweets.py
+66
-0
it17173100/Personality_prediction/personality_predictor.py
it17173100/Personality_prediction/personality_predictor.py
+49
-0
it17173100/Personality_prediction/pipe.pickle
it17173100/Personality_prediction/pipe.pickle
+0
-0
it17173100/Personality_prediction/poetry.lock
it17173100/Personality_prediction/poetry.lock
+1134
-0
it17173100/Personality_prediction/preprocessor.py
it17173100/Personality_prediction/preprocessor.py
+134
-0
it17173100/Personality_prediction/pyproject.toml
it17173100/Personality_prediction/pyproject.toml
+21
-0
it17173100/Personality_prediction/sgd parameter tuning.py
it17173100/Personality_prediction/sgd parameter tuning.py
+110
-0
it17173100/Personality_prediction/sgdClassifier.py
it17173100/Personality_prediction/sgdClassifier.py
+72
-0
it17173100/Personality_prediction/xgb parameter tunning.py
it17173100/Personality_prediction/xgb parameter tunning.py
+109
-0
No files found.
it17173100/.idea/inspectionProfiles/profiles_settings.xml
deleted
100644 → 0
View file @
9606323e
<component
name=
"InspectionProjectProfileManager"
>
<settings>
<option
name=
"USE_PROJECT_PROFILE"
value=
"false"
/>
<version
value=
"1.0"
/>
</settings>
</component>
\ No newline at end of file
it17173100/.idea/misc.xml
deleted
100644 → 0
View file @
9606323e
<?xml version="1.0" encoding="UTF-8"?>
<project
version=
"4"
>
<component
name=
"ProjectRootManager"
version=
"2"
project-jdk-name=
"Python 3.7 (Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master)"
project-jdk-type=
"Python SDK"
/>
</project>
\ No newline at end of file
it17173100/.idea/modules.xml
deleted
100644 → 0
View file @
9606323e
<?xml version="1.0" encoding="UTF-8"?>
<project
version=
"4"
>
<component
name=
"ProjectModuleManager"
>
<modules>
<module
fileurl=
"file://$PROJECT_DIR$/.idea/Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master.iml"
filepath=
"$PROJECT_DIR$/.idea/Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master.iml"
/>
</modules>
</component>
</project>
\ No newline at end of file
it17173100/.idea/workspace.xml
deleted
100644 → 0
View file @
9606323e
This diff is collapsed.
Click to expand it.
it17173100/Personality_prediction/.gitignore
0 → 100644
View file @
bfbfadc6
.DS_Store
!data/.gitkeep
!models/.gitkeep
/__pycache__
/data
/models
/venv
/pipe
/.idea
\ No newline at end of file
it17173100/Personality_prediction/.pre-commit-config.yaml
0 → 100644
View file @
bfbfadc6
-
repo
:
https://github.com/psf/black
rev
:
20.8b1
# Replace by any tag/version: https://github.com/psf/black/tags
hooks
:
-
id
:
black
language_version
:
python3
# Should be a command that runs python3.6+
\ No newline at end of file
it17173100/Personality_prediction/Adaboost parameter tunning.py
0 → 100644
View file @
bfbfadc6
# AdaBoost classifier hyperparameter tuning
import
pandas
as
pd
from
sklearn.ensemble
import
AdaBoostClassifier
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.metrics
import
accuracy_score
from
sklearn.metrics
import
make_scorer
,
confusion_matrix
from
sklearn.model_selection
import
train_test_split
,
RandomizedSearchCV
,
KFold
from
sklearn.pipeline
import
Pipeline
from
preprocessor
import
pre_process_data
#from preprocessor import get_types
# Script-level settings (same names as in the sibling training scripts).
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]

# Display label for each MBTI axis, in the same order as the label columns.
type_indicators = [
    "IE: Introversion (I) - Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

data = pd.read_csv("data/mbti_personality.csv")
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)

# TF-IDF vectorizer used as the first pipeline step for every axis.
Tfidf = TfidfVectorizer(
    analyzer="word",
    max_features=1500,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_df=0.7,
    min_df=0.1,
)

X = list_posts

# Hold-out split settings, identical for every axis.
seed = 7
test_size = 0.33

# Candidate values for the two AdaBoost hyperparameters being searched.
search_space = [
    {
        "abc__n_estimators": [50, 100, 300, 500],
        "abc__learning_rate": [0.0001, 0.001, 0.01, 0.1, 1.0],
    }
]

# 10-fold shuffled cross validation with a fixed seed.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

# Report both AUC and accuracy; the search refits on AUC below.
scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

# Tune one binary classifier per MBTI axis.
for l, indicator_label in enumerate(type_indicators):
    print("%s ..." % (indicator_label))

    # Binary labels for the current axis.
    Y = list_personality[:, l]

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    # AdaBoost with library defaults; the search overrides n_estimators
    # and learning_rate through the "abc__" parameter prefix.
    abc = AdaBoostClassifier()
    pipe = Pipeline([("Tfidf", Tfidf), ("abc", abc)])

    grid = RandomizedSearchCV(
        pipe,
        param_distributions=search_space,
        cv=kfold,
        scoring=scoring,
        refit="AUC",
        verbose=1,
        n_jobs=-1,
    )

    # Run the search, then evaluate the refitted best model on the hold-out set.
    model = grid.fit(X_train, y_train)
    predict = model.predict(X_test)

    print("Best AUC Score: {}".format(model.best_score_))
    print("Accuracy: {}".format(accuracy_score(y_test, predict)))
    print(confusion_matrix(y_test, predict))

    # Best hyperparameter combination found for this axis.
    print(model.best_params_)
it17173100/Personality_prediction/Adaboost.py
0 → 100644
View file @
bfbfadc6
# Train and test the dataset on the AdaBoost classifier
import os

from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
from future.moves import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back

# NOTE: `get_types` is intentionally not imported. It only exists as
# commented-out code in preprocessor.py, so importing it raised ImportError
# before any training could run. pre_process_data() reads only the `type`
# and `posts` columns, so the extra per-axis columns were not needed.

SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]

data = pd.read_csv("data/mbti_personality.csv")

list_posts, list_personality = pre_process_data(data, remove_stop_words=True)

# TF-IDF vectorizer used as the first pipeline step for every axis.
Tfidf = TfidfVectorizer(
    analyzer="word",
    max_features=1500,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_df=0.7,
    min_df=0.1,
)

# Display label for each MBTI axis, in the same order as DIMENSIONS.
type_indicators = [
    "IE: Introversion (I) - Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

X = list_posts

# Hold-out split settings, identical for every axis.
seed = 7
test_size = 0.33

if SAVE_MODEL:
    # Make sure the output directory exists before the first pickle is written.
    os.makedirs(MODELS_DIR, exist_ok=True)

# Train one binary classifier per MBTI axis.
for l, (dimension, indicator_label) in enumerate(zip(DIMENSIONS, type_indicators)):
    print("%s ..." % (indicator_label))

    # Binary labels for the current axis.
    Y = list_personality[:, l]

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    # n_estimators: number of boosting rounds;
    # learning_rate: contribution of each round to the ensemble weights.
    abc = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)

    pipe = Pipeline([("Tfidf", Tfidf), ("abc", abc)])
    pipe.fit(X_train, y_train)

    if SAVE_MODEL:
        model_path = os.path.join(
            MODELS_DIR, "Adaboost_pipeline_{}.pkl".format(dimension)
        )
        with open(model_path, "wb") as picklefile:
            pickle.dump(pipe, picklefile)

    # The classifier already returns class labels, so no rounding is needed.
    y_pred = pipe.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("* %s Accuracy: %.2f%%" % (indicator_label, accuracy * 100.0))
it17173100/Personality_prediction/AdaboostClassifier.txt.txt
0 → 100644
View file @
bfbfadc6
Introversion - extroversion
Best AUC Score: 0.803667
Accuracy: 0.7285539643730353
[[2229 0]
[ 634 0]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 500}
NS: Intuition (N) – Sensing (S) ...
Best AUC Score: 0.6727666369367796
Accuracy: 0.8046946929265
[[2431 32]
[ 384 16]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 300}
FT: Feeling (F) - Thinking (T) ...
Best AUC Score: 0.75395340936081
Accuracy: 0.72895568376202319
[[1199 355]
[ 421 888]]
{'abc__learning_rate': 0.01, 'abc__n_estimators': 500}
JP: Judging (J) – Perceiving (P) ...
Best AUC Score: 0.6638994402640133
Accuracy: 0.6521131680055885
[[ 252 867]
[ 129 1615]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 500}
it17173100/Personality_prediction/AdaboostClassifier_tunning_result.txt
0 → 100644
View file @
bfbfadc6
Introversion - extroversion
Best AUC Score: 0.803667
Accuracy: 0.7285539643730353
[[2229 0]
[ 634 0]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 500}
NS: Intuition (N) – Sensing (S) ...
Best AUC Score: 0.6727666369367796
Accuracy: 0.8046946929265
[[2431 32]
[ 384 16]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 300}
FT: Feeling (F) - Thinking (T) ...
Best AUC Score: 0.75395340936081
Accuracy: 0.72895568376202319
[[1199 355]
[ 421 888]]
{'abc__learning_rate': 0.01, 'abc__n_estimators': 500}
JP: Judging (J) – Perceiving (P) ...
Best AUC Score: 0.6638994402640133
Accuracy: 0.6521131680055885
[[ 252 867]
[ 129 1615]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 500}
it17173100/Personality_prediction/SGDClassifier.txt.txt
0 → 100644
View file @
bfbfadc6
IE: Introversion (I) - Extroversion (E) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5142753030133241
Accuracy: 0.762486901851205
[[2179 10]
[ 670 4]]
{'sgd__alpha': 0.0003238897879211981, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
NS: Intuition (N) – Sensing (S) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.516264861572238
Accuracy: 0.8620328326929794
[[2468 0]
[ 395 0]]
{'sgd__alpha': 0.0006842577234824017, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
FT: Feeling (F) - Thinking (T) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5050005967000348
Accuracy: 0.5036674816625917
[[858 695]
[726 584]]
{'sgd__alpha': 0.0015350240019106492, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
JP: Judging (J) – Perceiving (P) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5209168542372503
Accuracy: 0.5277680754453371
[[ 335 785]
[ 567 1176]]
{'sgd__alpha': 0.0008934896440956289, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
it17173100/Personality_prediction/SGDClassifier_tunning_result.txt
0 → 100644
View file @
bfbfadc6
sgd
IE: Introversion (I) - Extroversion (E) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5292548647534668
Accuracy: 0.7740132727907789
[[2216 0]
[ 647 0]]
{'sgd__alpha': 0.0009265019438562898, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
NS: Intuition (N) – Sensing (S) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5426211797685075
Accuracy: 0.857492141110723
[[2455 1]
[ 407 0]]
{'sgd__alpha': 0.0011441798336083461, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l2'}
FT: Feeling (F) - Thinking (T) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5
Accuracy: 0.5312609151239958
[[1521 0]
[1342 0]]
{'sgd__alpha': 0.0019410296620838965, 'sgd__loss': 'hinge', 'sgd__penalty': 'l1'}
JP: Judging (J) – Perceiving (P) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.4989554047081358
Accuracy: 0.5302130632203982
[[ 336 814]
[ 531 1182]]
{'sgd__alpha': 0.0004231316730058021, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
it17173100/Personality_prediction/XGBoost classifier.py
0 → 100644
View file @
bfbfadc6
# Train and test the dataset on the XGBoost classifier
import
os
import
re
import
numpy
as
np
import
pandas
as
pd
from
future.moves
import
pickle
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.metrics
import
accuracy_score
from
sklearn.model_selection
import
train_test_split
from
sklearn.pipeline
import
Pipeline
from
xgboost
import
XGBClassifier
from
preprocessor
import
pre_process_data
from
preprocessor
import
translate_personality
from
preprocessor
import
translate_back
#from preprocessor import get_types
# Script-level settings (same names as in the sibling training scripts).
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]

# Display label for each MBTI axis, in the same order as DIMENSIONS.
type_indicators = [
    "IE: Introversion (I) - Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

data = pd.read_csv("data/mbti_personality.csv")
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)

# Progress message emitted before the vectorizer is configured
# (the step that follows is a TF-IDF vectorizer).
print("CountVectorizer...")
Tfidf = TfidfVectorizer(
    analyzer="word",
    max_features=1500,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_df=0.7,
    min_df=0.1,
)

X = list_posts

# Keyword arguments handed to every XGBClassifier instance.
param = {
    "n_estimators": 150,
    "max_depth": 3,
    "learning_rate": 0.01,
    "gamma": 0.1,
}

# Hold-out split settings, identical for every axis.
seed = 7
test_size = 0.33

# Train one binary classifier per MBTI axis.
for l, indicator_label in enumerate(type_indicators):
    print("%s ..." % (indicator_label))

    # Binary labels for the current axis.
    Y = list_personality[:, l]

    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    xgb = XGBClassifier(**param)
    pipe = Pipeline([("Tfidf", Tfidf), ("xgb", xgb)])
    pipe.fit(X_train, y_train)

    # Persist the fitted pipeline for this axis.
    model_file = "xgb_pipeline_{}.pkl".format(DIMENSIONS[l])
    with open(os.path.join(MODELS_DIR, model_file), "wb") as picklefile:
        pickle.dump(pipe, picklefile)

    # Predict on the hold-out set and round each value as the original did.
    y_pred = pipe.predict(X_test)
    predictions = list(map(round, y_pred))

    accuracy = accuracy_score(y_test, predictions)
    print("* %s Accuracy: %.2f%%" % (indicator_label, accuracy * 100.0))
it17173100/Personality_prediction/XGBoostClassifier.txt.txt
0 → 100644
View file @
bfbfadc6
Introversion - extroversion
Best AUC Score: 0.677028682166271
Accuracy: 0.7785539643730353
[[2229 0]
[ 634 0]]
{'xgb__n_estimators': 200, 'xgb__max_depth': 6, 'xgb__learning_rate': 0.01, 'xgb__gamma': 0.2, 'xgb__colsample_bytree': 0.1}
NS: Intuition (N) – Sensing (S) ...
Best AUC Score: 0.6527666346929265
Accuracy: 0.854697869367796
[[2431 32]
[ 384 16]]
{'xgb__n_estimators': 150, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.3, 'xgb__gamma': 0.2, 'xgb__colsample_bytree': 0.2}
FT: Feeling (F) - Thinking (T) ...
Best AUC Score: 0.8139538376202319
Accuracy: 0.728955640936081
[[1199 355]
[ 421 888]]
{'xgb__n_estimators': 150, 'xgb__max_depth': 4, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.1, 'xgb__colsample_bytree': 0.1}
JP: Judging (J) – Perceiving (P) ...
Best AUC Score: 0.6638994402640133
Accuracy: 0.6521131680055885
[[ 252 867]
[ 129 1615]]
{'xgb__n_estimators': 50, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.0, 'xgb__colsample_bytree': 0.2}
it17173100/Personality_prediction/XGBoostClassifier_tunning_results.txt
0 → 100644
View file @
bfbfadc6
Introversion - extroversion
Best AUC Score: 0.677028682166271
Accuracy: 0.7785539643730353
[[2229 0]
[ 634 0]]
{'xgb__n_estimators': 200, 'xgb__max_depth': 6, 'xgb__learning_rate': 0.01, 'xgb__gamma': 0.2, 'xgb__colsample_bytree': 0.1}
NS: Intuition (N) – Sensing (S) ...
Best AUC Score: 0.6527666346929265
Accuracy: 0.854697869367796
[[2431 32]
[ 384 16]]
{'xgb__n_estimators': 150, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.3, 'xgb__gamma': 0.2, 'xgb__colsample_bytree': 0.2}
FT: Feeling (F) - Thinking (T) ...
Best AUC Score: 0.8139538376202319
Accuracy: 0.728955640936081
[[1199 355]
[ 421 888]]
{'xgb__n_estimators': 150, 'xgb__max_depth': 4, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.1, 'xgb__colsample_bytree': 0.1}
JP: Judging (J) – Perceiving (P) ...
Best AUC Score: 0.6638994402640133
Accuracy: 0.6521131680055885
[[ 252 867]
[ 129 1615]]
{'xgb__n_estimators': 50, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.0, 'xgb__colsample_bytree': 0.2}
it17173100/Personality_prediction/get_tweets.py
0 → 100644
View file @
bfbfadc6
# Code to extract a person's tweets using their Twitter username
import
sys
import
csv
import
tweepy
# Twitter API credentials are read from environment variables so that secrets
# are never stored in version control. Export TWITTER_CONSUMER_KEY,
# TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_KEY and TWITTER_ACCESS_SECRET
# before running this script.
# NOTE(security): the key/secret pairs that used to be hard-coded here are
# part of the repository history and should be revoked and regenerated.
import os

consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "")
# method to get a user's last tweets
def get_tweets(username):
    """Download a user's recent tweets and write them to ``<username>_tweets.csv``.

    Each CSV row holds the requested screen name, a few profile fields of
    that user and one tweet (id, creation time, text). The file is written
    to the current working directory.

    Args:
        username: Twitter screen name whose timeline is fetched.
    """
    # http://tweepy.readthedocs.org/en/v3.1.0/getting_started.html#api
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # Upper bound on the number of tweets requested from the timeline.
    number_of_tweets = 3000

    # Passed by keyword so the lookup is explicitly by screen name.
    user = api.get_user(screen_name=username)

    # Profile fields repeated on every row.
    profile = [
        username,
        user.name,
        user.description,
        user.followers_count,
        user.listed_count,
        user.friends_count,
    ]

    timeline = tweepy.Cursor(api.user_timeline, screen_name=username).items(
        number_of_tweets
    )
    # Keep the tweet text as str: encoding it to bytes made csv.writer emit
    # the "b'...'" repr instead of the text. The file handle below already
    # takes care of the UTF-8 encoding.
    tweets_for_csv = [
        profile + [tweet.id_str, tweet.created_at, tweet.text] for tweet in timeline
    ]

    # One header name per column, in the same order as the row fields above.
    header = [
        "user_name",
        "user",
        "description",
        "no_followers",
        "no_list",
        "no_friends",
        "tweet_id",
        "tweet_date",
        "tweet_text",
    ]

    outfile = username + "_tweets.csv"
    print("writing to " + outfile)
    # newline="" is what the csv module requires of files passed to csv.writer;
    # without it extra blank lines can appear on some platforms.
    with open(outfile, "w", encoding="utf-8", newline="") as file:
        writer = csv.writer(file, delimiter=",")
        writer.writerow(header)
        writer.writerows(tweets_for_csv)
# if we're running this as a script
if __name__ == "__main__":
    # Screen names whose timelines are exported, one CSV file per user.
    users = [
        "mayweather_gh",
        "KhuthTradingWay",
        "PistisMakasi",
        "TUt3YwYDORjfgw2",
        "koaung448",
        "hikuto_e",
        "Gmoncsc",
        "KRISHAN17328009",
        "mtvc36112",
        "Charles46762537",
        "kenanyildirimky",
        "Santanu57201363",
        "GiorgianStrejo1",
        "UsharaniSharm11",
        "SchlangerAndre1",
        "RajKumarChadha8",
        "ChikaMartinO1",
        "Gargipal18",
    ]

    # Fetch the accounts one after another.
    for screen_name in users:
        get_tweets(screen_name)
it17173100/Personality_prediction/personality_predictor.py
0 → 100644
View file @
bfbfadc6
# Predict the personality type from a candidate's tweets.
import
csv
import
os
import
pandas
as
pd
from
future.moves
import
pickle
from
preprocessor
import
pre_process_data
from
preprocessor
import
translate_back
SAVE_MODEL = True
MODELS_DIR = "models"
SAMPLE_DATA_DIRECTORY = "data/sample_data"
SAMPLE_TWEETS_PATH = os.path.join(SAMPLE_DATA_DIRECTORY, "apihandyman_tweets.csv")
DIMENSIONS = ["IE", "NS", "FT", "PJ"]

# Concatenate every raw line of the sample file into one text blob, each line
# preceded by a single space. The lines are used as-is (no CSV parsing), which
# is what the previous loop over the file object did as well.
with open(SAMPLE_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
    x_test = "".join(" " + line for line in f)

type_indicators = [
    "IE: Introversion (I) - Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

# The type is just a dummy so that the data preprocessing function can be reused
mydata = pd.DataFrame(data={"type": ["ENFP"], "posts": [x_test]})
x_test, dummy = pre_process_data(mydata, remove_stop_words=True)

result = []
# Predict each MBTI axis with its own saved pipeline.
for dimension in DIMENSIONS:
    print("%s ..." % (dimension))

    model_path = os.path.join(MODELS_DIR, "sgd_pipeline_{}.pkl".format(dimension))
    # NOTE(security): unpickling executes arbitrary code; only load model
    # files that were produced locally by the training scripts.
    with open(model_path, "rb") as picklefile:
        saved_pipe = pickle.load(picklefile)

    # x_test holds a single document, so only the first prediction is kept.
    predictions = saved_pipe.predict(x_test)
    result.append(predictions[0])

print("The result is: ", translate_back(result))
it17173100/Personality_prediction/pipe.pickle
0 → 100644
View file @
bfbfadc6
File added
it17173100/Personality_prediction/poetry.lock
0 → 100644
View file @
bfbfadc6
This diff is collapsed.
Click to expand it.
it17173100/Personality_prediction/preprocessor.py
0 → 100644
View file @
bfbfadc6
# Code to preprocess the user's text posts
import
re
import
numpy
as
np
import
pandas
as
pd
from
nltk.corpus
import
stopwords
from
nltk.stem
import
PorterStemmer
,
WordNetLemmatizer
# Script-level settings that mirror the names used by the training scripts.
# NOTE(review): none of these four names is referenced by the code visible
# in this module — confirm before removing them.
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
# Short names of the four MBTI axes.
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
# data = pd.read_csv('data/mbti_personality.csv')
# add 4 columns for personality type indicators
# def get_types(row):
# t = row['type']
#
# I = 0
# N = 0
# T = 0
# J = 0
#
# if t[0] == 'I':
# I = 1
# elif t[0] == 'E':
# I = 0
# else:
# print('I-E incorrect')
#
# if t[1] == 'N':
# N = 1
# elif t[1] == 'S':
# N = 0
# else:
# print('N-S incorrect')
#
# if t[2] == 'T':
# T = 1
# elif t[2] == 'F':
# T = 0
# else:
# print('T-F incorrect')
#
# if t[3] == 'J':
# J = 1
# elif t[3] == 'P':
# J = 0
# else:
# print('J-P incorrect')
# return pd.Series({'IE': I, 'NS': N, 'TF': T, 'JP': J})
# data = data.join(data.apply(lambda row: get_types(row), axis=1))
# print("Introversion (I) / Extroversion (E):\t", data['IE'].value_counts()[0], " / ", data['IE'].value_counts()[1])
# print("Intuition (N) – Sensing (S):\t\t", data['NS'].value_counts()[0], " / ", data['NS'].value_counts()[1])
# print("Thinking (T) – Feeling (F):\t\t", data['TF'].value_counts()[0], " / ", data['TF'].value_counts()[1])
# print("Judging (J) – Perceiving (P):\t\t", data['JP'].value_counts()[0], " / ", data['JP'].value_counts()[1])
# Letter -> binary label. Each MBTI axis maps its first letter to 0 and its
# second letter to 1.
b_Pers = {'I': 0, 'E': 1, 'N': 0, 'S': 1, 'F': 0, 'T': 1, 'J': 0, 'P': 1}

# Inverse lookup: one {label: letter} table per axis, in axis order.
b_Pers_list = [
    {0: 'I', 1: 'E'},
    {0: 'N', 1: 'S'},
    {0: 'F', 1: 'T'},
    {0: 'J', 1: 'P'},
]


def translate_personality(personality):
    """Turn an MBTI string such as ``"ENFP"`` into its list of binary labels.

    Raises KeyError for any character that is not a key of ``b_Pers``.
    """
    return list(map(b_Pers.__getitem__, personality))


def translate_back(personality):
    """Turn a sequence of binary labels back into the MBTI letter string.

    The label at position ``i`` is looked up in the ``i``-th axis table.
    """
    letters = (
        b_Pers_list[axis][label] for axis, label in enumerate(personality)
    )
    return "".join(letters)
# The 16 MBTI type names; pre_process_data() removes them from the posts
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
                    'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
# Lower-cased, because the posts are lower-cased before the names are removed
unique_type_list = [x.lower() for x in unique_type_list]
# Stemmer and lemmatizer instances
# NOTE(review): only `lemmatiser` is used by the code visible in this module;
# `stemmer` looks unused — confirm before removing it.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
# Cache the stop words for speed (loaded once at import time)
cachedStopWords = stopwords.words("english")
def pre_process_data(data, remove_stop_words=True, remove_mbti_profiles=True):
    """Clean the posts of every row and convert each MBTI type to binary labels.

    For each row the text in the ``posts`` column has URLs removed, every
    non-letter character replaced by a space, runs of spaces collapsed and is
    lower-cased. Each remaining word is lemmatized, optionally after English
    stop words have been dropped. Finally the lower-cased MBTI type names are
    optionally stripped from the text.

    Args:
        data: pandas DataFrame with a ``posts`` and a ``type`` column.
        remove_stop_words: drop words found in ``cachedStopWords`` before
            lemmatizing.
        remove_mbti_profiles: remove every entry of ``unique_type_list``
            from the cleaned text.

    Returns:
        A tuple ``(list_posts, list_personality)`` of numpy arrays: the
        cleaned post text per row, and the result of
        ``translate_personality`` for the row's ``type``.
    """
    # Compile the patterns once instead of looking them up for every row.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    non_letter_pattern = re.compile(r"[^a-zA-Z]")
    multi_space_pattern = re.compile(r' +')

    # Set membership is O(1); the cached stop words are a list, which made
    # every per-word test a linear scan.
    stop_words = frozenset(cachedStopWords)

    list_personality = []
    list_posts = []
    len_data = len(data)

    for i, (_, row) in enumerate(data.iterrows(), start=1):
        # Progress output on the first row, every 500th row and the last row.
        if i % 500 == 0 or i == 1 or i == len_data:
            print("%s of %s rows" % (i, len_data))

        # Remove URLs, then everything that is not a letter, then collapse
        # repeated spaces and lower-case the text.
        temp = url_pattern.sub(' ', row["posts"])
        temp = non_letter_pattern.sub(" ", temp)
        temp = multi_space_pattern.sub(' ', temp).lower()

        # Optionally drop stop words, then lemmatize whatever is left.
        words = temp.split(' ')
        if remove_stop_words:
            words = [w for w in words if w not in stop_words]
        temp = " ".join(lemmatiser.lemmatize(w) for w in words)

        # Remove MBTI type names so the label cannot leak into the features.
        if remove_mbti_profiles:
            for t in unique_type_list:
                temp = temp.replace(t, "")

        # Transform the MBTI type string into its binary vector.
        list_personality.append(translate_personality(row["type"]))
        list_posts.append(temp)

    return np.array(list_posts), np.array(list_personality)
it17173100/Personality_prediction/pyproject.toml
0 → 100644
View file @
bfbfadc6
[tool.poetry]
name
=
"mbti-rnn"
version
=
"0.1.0"
description
=
""
authors
=
[
"Ian Scott Knight <isk@alumni.stanford.edu>"
]
license
=
"MIT"
[tool.poetry.dependencies]
python
=
"^3.8"
scikit-learn
=
"^0.24.1"
nltk
=
"^3.5"
Keras
=
"^2.4.3"
pandas
=
"^1.2.1"
tensorflow
=
"^2.4.1"
[tool.poetry.dev-dependencies]
pre-commit
=
"^2.10.0"
[build-system]
requires
=
["poetry-core>=1.0.0"]
build-backend
=
"poetry.core.masonry.api"
it17173100/Personality_prediction/sgd parameter tuning.py
0 → 100644
View file @
bfbfadc6
# SGDClassifier parameter tuning
import
numpy
as
np
import
pandas
as
pd
import
scipy
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.linear_model
import
SGDClassifier
from
sklearn.metrics
import
accuracy_score
,
make_scorer
,
confusion_matrix
from
sklearn.model_selection
import
train_test_split
,
RandomizedSearchCV
,
KFold
from
sklearn.pipeline
import
Pipeline
from
sklearn.metrics
import
accuracy_score
from
preprocessor
import
pre_process_data
from
preprocessor
import
translate_personality
from
preprocessor
import
translate_back
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]

data = pd.read_csv("data/mbti_personality.csv")

list_posts, list_personality = pre_process_data(data, remove_stop_words=True)

# TF-IDF vectorizer used as the first pipeline step for every axis.
Tfidf = TfidfVectorizer(
    analyzer="word",
    max_features=1500,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_df=0.7,
    min_df=0.1,
)

# Display label for each MBTI axis, in the same order as DIMENSIONS.
type_indicators = [
    "IE: Introversion (I) - Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

X = list_posts

# Hold-out split settings, identical for every axis.
seed = 7
test_size = 0.33

# Tune one binary classifier per MBTI axis.
for l, indicator_label in enumerate(type_indicators):
    print("%s ..." % (indicator_label))

    # Binary labels for the current axis.
    Y = list_personality[:, l]

    # NOTE: X and Y must NOT be shuffled separately. The previous
    # np.random.shuffle(X) / np.random.shuffle(Y) calls permuted posts and
    # labels independently (and in place, on the shared arrays), so every
    # post was paired with a random label. train_test_split shuffles both
    # arrays together, which is all the shuffling that is needed.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    sgd = SGDClassifier()
    pipe = Pipeline([("Tfidf", Tfidf), ("sgd", sgd)])

    # Search space for the randomized search; alpha is drawn from a
    # continuous uniform distribution.
    search_space = [
        {
            "sgd__loss": ["hinge", "modified_huber"],
            "sgd__penalty": ["none", "l2", "l1"],
            "sgd__alpha": scipy.stats.uniform(0.00005, 0.002),
        }
    ]

    # 10-fold shuffled cross validation with a fixed seed.
    kfold = KFold(n_splits=10, random_state=42, shuffle=True)

    # Report both AUC and accuracy; the search refits on AUC.
    scoring = {"AUC": "roc_auc", "Accuracy": make_scorer(accuracy_score)}

    grid = RandomizedSearchCV(
        pipe,
        param_distributions=search_space,
        cv=kfold,
        scoring=scoring,
        refit="AUC",
        verbose=1,
        n_jobs=-1,
    )

    # Run the search, then evaluate the refitted best model on the hold-out set.
    model = grid.fit(X_train, y_train)
    predict = model.predict(X_test)

    print("Best AUC Score: {}".format(model.best_score_))
    print("Accuracy: {}".format(accuracy_score(y_test, predict)))
    print(confusion_matrix(y_test, predict))

    # Best hyperparameter combination found for this axis.
    print(model.best_params_)
it17173100/Personality_prediction/sgdClassifier.py
0 → 100644
View file @
bfbfadc6
# Train and test the dataset on SGDClassifier
import
os
import
pandas
as
pd
from
future.moves
import
pickle
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.linear_model
import
SGDClassifier
from
sklearn.model_selection
import
train_test_split
from
sklearn.pipeline
import
Pipeline
from
sklearn.metrics
import
accuracy_score
from
preprocessor
import
pre_process_data
from
preprocessor
import
translate_personality
from
preprocessor
import
translate_back
#from preprocessor import get_types
# Train one TF-IDF + SGDClassifier pipeline per MBTI dimension, optionally
# pickle each fitted pipeline, and print its hold-out accuracy.

# --- Configuration ---------------------------------------------------------
SAVE_MODEL = True  # when True, every fitted pipeline is pickled into MODELS_DIR
MODELS_DIR = "models"
DATA_DIR = "data"
# One binary classifier is trained per entry.
# NOTE(review): the last axis is spelled "PJ" here but "JP" in
# type_indicators; the spelling is kept because it is part of the pickle
# file name written below.
DIMENSIONS = ["IE", "NS", "FT", "PJ"]

# Human-readable labels, index-aligned with DIMENSIONS.
type_indicators = [
    "IE: Introversion (I) - Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

# The train/test split settings are identical for every dimension.
seed = 7
test_size = 0.33

# --- Data ------------------------------------------------------------------
data = pd.read_csv(os.path.join(DATA_DIR, "mbti_personality.csv"))

# list_personality is indexed as [:, column] below, i.e. one label column per
# entry of DIMENSIONS.
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)

# Learn the vocabulary dictionary and return term-document matrix
Tfidf = TfidfVectorizer(
    analyzer="word",
    max_features=1500,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_df=0.7,
    min_df=0.1,
)

X = list_posts

# Create the output directory up front so the first pickle.dump cannot fail
# just because the directory is missing.
if SAVE_MODEL:
    os.makedirs(MODELS_DIR, exist_ok=True)

# --- Training --------------------------------------------------------------
# Let's train type indicator individually
for index, (dimension, indicator) in enumerate(zip(DIMENSIONS, type_indicators)):
    print("%s ..." % indicator)

    # Labels for the current dimension only.
    Y = list_personality[:, index]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    sgd = SGDClassifier(loss='modified_huber', penalty='l1', alpha=0.00032)
    pipe = Pipeline([("Tfidf", Tfidf), ("sgd", sgd)])
    pipe.fit(X_train, y_train)

    # Persist the fitted pipeline only when requested via SAVE_MODEL.
    if SAVE_MODEL:
        model_path = os.path.join(
            MODELS_DIR, "sgd_pipeline_{}.pkl".format(dimension)
        )
        with open(model_path, 'wb') as picklefile:
            pickle.dump(pipe, picklefile)

    # make predictions for test data; predict() already returns class labels,
    # so no rounding step is needed before scoring.
    y_pred = pipe.predict(X_test)

    # evaluate predictions
    accuracy = accuracy_score(y_test, y_pred)
    print("* %s Accuracy: %.2f%%" % (indicator, accuracy * 100.0))
it17173100/Personality_prediction/xgb parameter tunning.py
0 → 100644
View file @
bfbfadc6
# XGBoost parameter training
import
pandas
as
pd
from
sklearn.feature_extraction.text
import
TfidfVectorizer
from
sklearn.metrics
import
accuracy_score
,
make_scorer
,
confusion_matrix
from
sklearn.model_selection
import
train_test_split
,
RepeatedStratifiedKFold
,
GridSearchCV
,
StratifiedKFold
,
RandomizedSearchCV
,
KFold
from
sklearn.pipeline
import
Pipeline
from
xgboost
import
XGBClassifier
from
preprocessor
import
pre_process_data
#from preprocessor import get_types
# Randomized hyper-parameter search for a TF-IDF + XGBClassifier pipeline,
# run once per MBTI dimension. For each dimension the script prints the best
# cross-validated AUC, the hold-out accuracy, the confusion matrix and the
# best parameter set.

# --- Configuration ---------------------------------------------------------
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]

data = pd.read_csv('data/mbti_personality.csv')

# Human-readable labels, index-aligned with DIMENSIONS.
type_indicators = [
    "IE: Introversion (I) - Extroversion (E)",
    "NS: Intuition (N) – Sensing (S)",
    "FT: Feeling (F) - Thinking (T)",
    "JP: Judging (J) – Perceiving (P)",
]

# list_personality is indexed as [:, column] below, i.e. one label column per
# entry of DIMENSIONS.
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)

# Posts in tf-idf representation
Tfidf = TfidfVectorizer(
    analyzer="word",
    max_features=1500,
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_df=0.7,
    min_df=0.1,
)

X = list_posts

# setup parameters for xgboost
# NOTE(review): this dict is not passed to any estimator in this script; the
# values actually tried come from search_space below.
param = {
    'n_estimators': 200,
    'max_depth': 2,
    'nthread': 8,
    'learning_rate': 0.2,
}

# --- Search setup (identical for every dimension, so built once) -----------
seed = 7
test_size = 0.33

# Define our search space for randomized search
search_space = [
    {
        'xgb__n_estimators': [50, 100, 150, 200],
        'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],
        'xgb__max_depth': range(3, 10),
        'xgb__colsample_bytree': [i / 10.0 for i in range(1, 3)],
        'xgb__gamma': [i / 10.0 for i in range(3)],
    }
]

# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

# AUC and accuracy as score
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}

# --- Tuning ----------------------------------------------------------------
# Let's train type indicator individually
for index, indicator in enumerate(type_indicators[:len(DIMENSIONS)]):
    print("%s ..." % indicator)

    # Labels for the current dimension only.
    Y = list_personality[:, index]

    # split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed
    )

    xgb = XGBClassifier(use_label_encoder=False)
    pipe = Pipeline([("Tfidf", Tfidf), ("xgb", xgb)])

    # Define the randomized search. random_state pins which candidates are
    # sampled, so repeated runs report comparable "best" parameters; the
    # split and the CV folds are already seeded.
    grid = RandomizedSearchCV(
        pipe,
        param_distributions=search_space,
        cv=kfold,
        scoring=scoring,
        refit='AUC',  # the final estimator is refit with the best-AUC candidate
        verbose=1,
        n_jobs=-1,
        random_state=seed,
    )

    # Fit the search
    model = grid.fit(X_train, y_train)

    # The model scores and confusion matrix can be obtained by
    y_pred = model.predict(X_test)
    print('Best AUC Score: {}'.format(model.best_score_))
    print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
    print(confusion_matrix(y_test, y_pred))

    # And the best parameters can be obtained by:
    print(model.best_params_)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment