2020_21 J-25 / Commits

Commit 905d39d8, authored Jun 28, 2021 by Amuthini
Parent: 2f33b0c5

Commit message: source files updated

Showing 21 changed files with 2398 additions and 88 deletions (+2398 / -88)
Changed files:

  Personality_prediction/Personality_prediction/.gitignore (+5 / -0)
  Personality_prediction/Personality_prediction/.idea/.gitignore (+8 / -0)
  Personality_prediction/Personality_prediction/.idea/Personality_prediction.iml (+8 / -0)
  Personality_prediction/Personality_prediction/.idea/inspectionProfiles/profiles_settings.xml (+6 / -0)
  Personality_prediction/Personality_prediction/.idea/misc.xml (+4 / -0)
  Personality_prediction/Personality_prediction/.idea/modules.xml (+8 / -0)
  Personality_prediction/Personality_prediction/.pre-commit-config.yaml (+5 / -0)
  Personality_prediction/Personality_prediction/bidirectional_lstm.py (+288 / -0)
  Personality_prediction/Personality_prediction/make_test_set.py (+36 / -0)
  Personality_prediction/Personality_prediction/make_training_set.py (+64 / -0)
  Personality_prediction/Personality_prediction/poetry.lock (+1134 / -0)
  Personality_prediction/Personality_prediction/pyproject.toml (+21 / -0)
  Personality_prediction/Personality_prediction/sample_predictor.py (+102 / -0)
  Personality_prediction/Personality_prediction/separate_clean_and_unclean.py (+67 / -0)
  Personality_prediction/Personality_prediction/simple_rnn.py (+40 / -62)
  Personality_prediction/bidirectional_lstm.py (+288 / -0)
  Personality_prediction/make_test_set.py (+3 / -3)
  Personality_prediction/make_training_set.py (+3 / -3)
  Personality_prediction/sample_predictor.py (+21 / -16)
  Personality_prediction/separate_clean_and_unclean.py (+4 / -4)
  Personality_prediction/simple_rnn.py (+283 / -0)
Personality_prediction/Personality_prediction/.gitignore (new file, mode 100644)

.DS_Store
data
models
!data/.gitkeep
!models/.gitkeep
\ No newline at end of file
Personality_prediction/Personality_prediction/.idea/.gitignore (new file, mode 100644)
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
Personality_prediction/Personality_prediction/.idea/Personality_prediction.iml (new file, mode 100644)

<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.8 (2)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
\ No newline at end of file
Personality_prediction/Personality_prediction/.idea/inspectionProfiles/profiles_settings.xml (new file, mode 100644)

<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
\ No newline at end of file
Personality_prediction/Personality_prediction/.idea/misc.xml (new file, mode 100644)

<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (2)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
Personality_prediction/Personality_prediction/.idea/modules.xml (new file, mode 100644)

<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/Personality_prediction.iml" filepath="$PROJECT_DIR$/.idea/Personality_prediction.iml" />
    </modules>
  </component>
</project>
\ No newline at end of file
Personality_prediction/Personality_prediction/.pre-commit-config.yaml
0 → 100644
View file @
905d39d8
-
repo
:
https://github.com/psf/black
rev
:
20.8b1
# Replace by any tag/version: https://github.com/psf/black/tags
hooks
:
-
id
:
black
language_version
:
python3
# Should be a command that runs python3.6+
\ No newline at end of file
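This hook pins Black 20.8b1, so staged Python files are reformatted before each commit lands. As a rough sketch of what that does (toy input, not from this repository; assumes black==20.8b1 is installed and exposes black.Mode):

import black

# Black normalizes quote style and spacing; the pre-commit hook applies the
# same rewrite in place to every staged .py file.
src = "x = {'a': 1,'b': 2}\n"
print(black.format_str(src, mode=black.Mode()))  # x = {"a": 1, "b": 2}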
Personality_prediction/Personality_prediction/bidirectional_lstm.py (new file, mode 100644; diff collapsed, +288 lines)
Personality_prediction/Personality_prediction/make_test_set.py (new file, mode 100644)

import os
import collections
import pandas as pd
import csv
import re

DATA_DIRECTORY = "data"
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_clean.csv")
DIMENSIONS = ("IE", "NS", "TF", "PJ")

df = pd.read_csv(MBTI_CLEAN_CSV_PATH)
for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    for letter in [letter_1, letter_2]:
        posts = []
        for index, row in df.iterrows():
            if letter in row["type"]:
                hundred_posts = row["posts"].split("|||")
                for post in hundred_posts:
                    if (
                        ("http" in post)
                        or (post == "")
                        or (post is None)
                        or (not re.search("[a-zA-Z]", post))
                    ):
                        # ignore deformed posts
                        continue
                    posts.append(post)
        test_csv_path = os.path.join(DATA_DIRECTORY, f"test_{letter}.csv")
        with open(test_csv_path, "w", encoding="utf-8") as f:
            writer = csv.writer(f)
            for post in posts:
                writer.writerow([post])
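The filter inside the loop drops any post that contains a URL, is empty or None, or has no alphabetic characters. A standalone sketch of that predicate on toy strings (not from the dataset):

import re

def is_deformed(post):
    # Mirrors the checks in make_test_set.py.
    return (
        post is None
        or "http" in post
        or post == ""
        or not re.search("[a-zA-Z]", post)
    )

samples = ["I love hiking", "http://example.com", "", "12345"]
print([p for p in samples if not is_deformed(p)])  # ['I love hiking']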
Personality_prediction/Personality_prediction/make_training_set.py (new file, mode 100644)

import os
import collections
import pandas as pd
import csv
import re

DATA_DIRECTORY = "data"
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_unclean.csv")
DIMENSIONS = ("IE", "NS", "TF", "PJ")

df = pd.read_csv(MBTI_UNCLEAN_CSV_PATH)
counts = collections.defaultdict(int)
for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    for index, row in df.iterrows():
        mbti = row["type"]
        hundred_posts = row["posts"].split("|||")
        for post in hundred_posts:
            if (
                ("http" in post)
                or (post == "")
                or (post is None)
                or (not re.search("[a-zA-Z]", post))
            ):
                # ignore deformed posts
                continue
            if letter_1 in mbti:
                counts[letter_1] += 1
            if letter_2 in mbti:
                counts[letter_2] += 1

for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    if counts[letter_1] < counts[letter_2]:
        limit = counts[letter_1]
    else:
        limit = counts[letter_2]
    for letter in [letter_1, letter_2]:
        posts = []
        i = 0
        for index, row in df.iterrows():
            if letter in row["type"]:
                hundred_posts = row["posts"].split("|||")
                for post in hundred_posts:
                    if i == limit:
                        break
                    if (
                        ("http" in post)
                        or (post == "")
                        or (post is None)
                        or (not re.search("[a-zA-Z]", post))
                    ):
                        # ignore deformed posts
                        continue
                    posts.append(post)
                    i += 1
        train_csv_path = os.path.join(DATA_DIRECTORY, f"train_{letter}.csv")
        with open(train_csv_path, "w", encoding="utf-8") as f:
            writer = csv.writer(f)
            for post in posts:
                writer.writerow([post])
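The first pass counts usable posts per letter; the second caps both letters of each dimension at the smaller count, so every train_{letter}.csv pair is class-balanced. A toy illustration of the capping rule (invented counts):

# Hypothetical post counts for the I/E dimension.
counts = {"I": 120000, "E": 45000}
limit = min(counts["I"], counts["E"])
print(limit)  # 45000 posts are kept for each of train_I.csv and train_E.csv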
Personality_prediction/Personality_prediction/poetry.lock (new file, mode 100644; diff collapsed, +1134 lines)
Personality_prediction/Personality_prediction/pyproject.toml (new file, mode 100644)

[tool.poetry]
name = "mbti-rnn"
version = "0.1.0"
description = ""
authors = ["Ian Scott Knight <isk@alumni.stanford.edu>"]
license = "MIT"

[tool.poetry.dependencies]
python = "^3.8"
scikit-learn = "^0.24.1"
nltk = "^3.5"
Keras = "^2.4.3"
pandas = "^1.2.1"
tensorflow = "^2.4.1"

[tool.poetry.dev-dependencies]
pre-commit = "^2.10.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
Personality_prediction/Personality_prediction/sample_predictor.py (new file, mode 100644)

import csv
import os
import pickle

import numpy as np
from keras.models import load_model
from keras.preprocessing import sequence
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

MODELS_DIRECTORY = "models"
DATA_DIRECTORY = "data/sample_data"
SAMPLE_TWEETS_PATH = os.path.join(DATA_DIRECTORY, "0xnickrodriguez_tweets.csv")
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
DIMENSIONS_with_strings = [
    "Introversion Extroversion",
    "Intuition Sensing",
    "Feeling Thinking",
    "Perceiving Judging",
]
MODEL_BATCH_SIZE = 128
TOP_WORDS = 2500
MAX_POST_LENGTH = 40
EMBEDDING_VECTOR_LENGTH = 20

final = ""
x_test = []
with open(SAMPLE_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
    reader = csv.reader(f)
    # NOTE: this iterates the raw file handle, so each appended "row" is an
    # unparsed line; the csv.reader created above is never used.
    for row in f:
        x_test.append(row)

types = [
    "INFJ",
    "ENTP",
    "INTP",
    "INTJ",
    "ENTJ",
    "ENFJ",
    "INFP",
    "ENFP",
    "ISFP",
    "ISTP",
    "ISFJ",
    "ISTJ",
    "ESTP",
    "ESFP",
    "ESTJ",
    "ESFJ",
]
types = [x.lower() for x in types]
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")


def lemmatize(x):
    lemmatized = []
    for post in x:
        temp = post.lower()
        # remove MBTI type mentions from the post text
        for type_ in types:
            temp = temp.replace(" " + type_, "")
        temp = " ".join(
            [
                lemmatizer.lemmatize(word)
                for word in temp.split(" ")
                if (word not in stop_words)
            ]
        )
        lemmatized.append(temp)
    return np.array(lemmatized)


for k in range(len(DIMENSIONS)):
    model = load_model(
        os.path.join(MODELS_DIRECTORY, "rnn_model_{}.h5".format(DIMENSIONS[k]))
    )
    tokenizer = None
    with open(
        os.path.join(MODELS_DIRECTORY, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])),
        "rb",
    ) as f:
        tokenizer = pickle.load(f)

    def preprocess(x):
        lemmatized = lemmatize(x)
        tokenized = tokenizer.texts_to_sequences(lemmatized)
        return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)

    predictions = model.predict(preprocess(x_test))
    prediction = float(sum(predictions) / len(predictions))
    print(DIMENSIONS_with_strings[k])
    print(prediction)
    if prediction >= 0.5:
        final += DIMENSIONS[k][1]
        print("Personality type - ", DIMENSIONS[k][1])
    else:
        final += DIMENSIONS[k][0]
        print("Personality type - ", DIMENSIONS[k][0])
    print("")

print("")
print("Personality Type of the Person : {} ".format(final))
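The decision rule at the end averages the model's per-post sigmoid outputs for each dimension and appends the second letter of the pair when the mean is at least 0.5, otherwise the first. A self-contained sketch with made-up averages:

DIMENSIONS = ["IE", "NS", "FT", "PJ"]
mean_probs = [0.31, 0.74, 0.48, 0.66]  # fabricated values, for illustration only

final = ""
for dim, p in zip(DIMENSIONS, mean_probs):
    final += dim[1] if p >= 0.5 else dim[0]  # >= 0.5 picks the second letter
print(final)  # ISFJ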
Personality_prediction/Personality_prediction/separate_clean_and_unclean.py (new file, mode 100644)

import os
import collections
import pandas as pd
import csv

DATA_DIRECTORY = "data"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality.csv")
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality_clean.csv")
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality_unclean.csv")
MBTI_TO_FREQUENCY_DICT = {
    "ISTJ": 0.11,
    "ISFJ": 0.09,
    "INFJ": 0.04,
    "INTJ": 0.05,
    "ISTP": 0.05,
    "ISFP": 0.05,
    "INFP": 0.06,
    "INTP": 0.06,
    "ESTP": 0.04,
    "ESFP": 0.04,
    "ENFP": 0.08,
    "ENTP": 0.06,
    "ESTJ": 0.08,
    "ESFJ": 0.09,
    "ENFJ": 0.05,
    "ENTJ": 0.05,
}

df = pd.read_csv(MBTI_RAW_CSV_PATH)
counts = collections.defaultdict(int)
for mbti in df["type"]:
    counts[mbti] += 1

limiting_type = None
min_size = float("infinity")
for mbti in counts.keys():
    size = counts[mbti] / MBTI_TO_FREQUENCY_DICT[mbti]
    if size < min_size:
        min_size = size
        limiting_type = mbti

dic = collections.defaultdict(list)
for index, row in df.iterrows():
    dic[row["type"]].append(row)

unclean_list = []
with open(MBTI_CLEAN_CSV_PATH, "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["type", "posts"])
    for mbti in MBTI_TO_FREQUENCY_DICT.keys():
        list1 = dic[mbti]
        for x in range(0, int(round(min_size * MBTI_TO_FREQUENCY_DICT[mbti]))):
            writer.writerow(list1[x])
        unclean_list.append(
            list1[int(round(min_size * MBTI_TO_FREQUENCY_DICT[mbti])) : len(list1)]
        )

with open(MBTI_UNCLEAN_CSV_PATH, "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["type", "posts"])
    for mbti in unclean_list:
        for x in mbti:
            writer.writerow(x)
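The split is stratified to the published MBTI base rates: counts[mbti] / frequency measures how large a population each type's sample could represent, the smallest such ratio (the limiting type) fixes the overall scale, and min_size * frequency is each type's quota in the clean file; everything past the quota goes to the unclean file. A toy computation with invented counts:

counts = {"INFJ": 1470, "ENTP": 685, "ISTJ": 205}
freqs = {"INFJ": 0.04, "ENTP": 0.06, "ISTJ": 0.11}

# The limiting type has the smallest count/frequency ratio.
min_size, limiting_type = min((counts[t] / freqs[t], t) for t in counts)
print(limiting_type)                         # ISTJ
print(int(round(min_size * freqs["INFJ"])))  # 75 INFJ rows go to the clean file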
Personality_prediction/rnn.py → Personality_prediction/Personality_prediction/simple_rnn.py (renamed, +40 / -62)

 import csv
 import os
 import pickle
 import warnings

 import numpy as np
 import pandas as pd
 import csv
 import random
 import pickle
 import collections
 import tensorflow as tf
 from tensorflow import keras
 from keras.preprocessing.text import Tokenizer
 from nltk import word_tokenize
 from nltk.stem import WordNetLemmatizer
 from nltk.corpus import stopwords
 import joblib

 # scikit-learn - sklearn
 from sklearn.pipeline import Pipeline
 from sklearn.model_selection import KFold
 from sklearn.metrics import confusion_matrix, accuracy_score
 from keras.wrappers.scikit_learn import KerasClassifier
 from keras.models import Sequential
 from keras.models import load_model
 from keras.layers import Dense
 from keras.layers import LSTM
 from keras.layers import Bidirectional
 from keras.layers import GRU
 from keras.layers import SimpleRNN
 from keras.layers.embeddings import Embedding
 from keras.preprocessing import sequence
 from keras.preprocessing import text
 from keras.models import Sequential
 from keras.optimizers import adam_v2
 # from keras.optimizers import Adam
 from keras.preprocessing import sequence
 from keras.preprocessing.text import Tokenizer
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer
 from sklearn.metrics import confusion_matrix, accuracy_score

 # scikit-learn - sklearn
 from sklearn.model_selection import KFold

 warnings.filterwarnings("ignore")

 MODELS_DIR = "models"
 DATA_DIR = "data"
 GLOVE_PATH = os.path.join(DATA_DIR, "glove.6B.50d.txt")
 DIMENSIONS = ["IE", "NS", "FT", "PJ"]

-### Preprocessing variables
+# Preprocessing variables
 MODEL_BATCH_SIZE = 128
 TOP_WORDS = 2500
 MAX_POST_LENGTH = 40
 EMBEDDING_VECTOR_LENGTH = 50

-### Learning variables
+# Learning variables
 LEARNING_RATE = 0.01
 DROPOUT = 0.1
 NUM_EPOCHS = 1

-### Control variables
+# Control variables
 CROSS_VALIDATION = False
 SAMPLE = True
 WORD_CLOUD = True

@@ -55,16 +44,14 @@ SAVE_MODEL = True
 for k in range(len(DIMENSIONS)):
     ###########################
     ### POST CLASSIFICATION ###
     ###########################
     x_train = []
     y_train = []
     x_test = []
     y_test = []

-    ### Read in data
+    # Read in data
     with open(
         os.path.join(DATA_DIR, "train_{}.csv".format(DIMENSIONS[k][0])),
         "r",
         encoding="utf8",
     ) as f:

@@ -94,7 +81,7 @@ for k in range(len(DIMENSIONS)):
             x_test.append(post)
             y_test.append(1)

-    ### Preprocessing (lemmatization, tokenization, and padding of input)
+    # Preprocessing (lemmatization, tokenization, and padding of input text)
     MBTI_TYPES = [
         "INFJ",
         "ENTP",

@@ -146,13 +133,13 @@ for k in range(len(DIMENSIONS)):
     x_train = lemmatize(x_train)
     x_test = lemmatize(x_test)

-    ### Assign to dataframe and shuffle rows
+    # Assign to dataframe and shuffle rows
     df = pd.DataFrame(data={"x": x_train, "y": y_train})
-    df = df.sample(frac=1).reset_index(drop=True)  ### Shuffle rows
+    df = df.sample(frac=1).reset_index(drop=True)  # Shuffle rows
     if SAMPLE:
-        df = df.head(10000)  ### Small sample for quick runs
+        df = df.head(10000)  # Small sample for quick runs

-    ### Load glove into memory for embedding
+    # Load glove into memory for embedding
     embeddings_index = dict()
     with open(GLOVE_PATH, encoding="utf8") as f:
         for line in f:

@@ -161,7 +148,7 @@ for k in range(len(DIMENSIONS)):
             embeddings_index[word] = np.asarray(values[1:], dtype="float32")
     print("Loaded {} word vectors.".format(len(embeddings_index)))

-    ### Create a weight matrix for words
+    # Create a weight matrix for words
     embedding_matrix = np.zeros((TOP_WORDS, EMBEDDING_VECTOR_LENGTH))
     for word, i in tokenizer.word_index.items():
         if i < TOP_WORDS:

@@ -169,7 +156,7 @@ for k in range(len(DIMENSIONS)):
             if embedding_vector is not None:
                 embedding_matrix[i] = embedding_vector

-    ### Construct model
+    # Construct model
     with tf.device("/gpu:0"):
         model = Sequential()
         model.add(

@@ -182,18 +169,9 @@ for k in range(len(DIMENSIONS)):
                 trainable=True,
             )
         )
-        # model.add(SimpleRNN(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
         # model.add(GRU(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
-        model.add(
-            LSTM(
-                EMBEDDING_VECTOR_LENGTH,
-                dropout=DROPOUT,
-                recurrent_dropout=DROPOUT,
-                activation="sigmoid",
-                kernel_initializer="zeros",
-            )
-        )
         # model.add(Bidirectional(LSTM(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros')))
+        model.add(SimpleRNN(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
         model.add(Dense(1, activation="sigmoid"))
         optimizer = adam_v2.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
         model.compile(

@@ -201,7 +179,7 @@ for k in range(len(DIMENSIONS)):
     )
     print(model.summary())

-    ### Cross-validation classification (individual posts)
+    # Cross-validation classification (individual posts)
     if CROSS_VALIDATION:
         k_fold = KFold(n_splits=6)
         scores_k = []

@@ -223,11 +201,11 @@ for k in range(len(DIMENSIONS)):
             scores_k.append(score_k)
         with open(
-            os.path.join(DATA_DIR, "rnn_cross_validation_{}.txt".format(DIMENSIONS[k])),
+            os.path.join(DATA_DIR, "SimpleRNN_cross_validation_{}.txt".format(DIMENSIONS[k])),
             "w",
             encoding="utf8",
         ) as f:
             f.write(
-                "*** {}/{} TRAINING SET CROSS VALIDATION (POSTS) ***\n".format(
+                "{}/{} TRAINING SET CROSS VALIDATION (POSTS)\n".format(
                     DIMENSIONS[k][0], DIMENSIONS[k][1]
                 )
             )

@@ -236,7 +214,7 @@ for k in range(len(DIMENSIONS)):
             f.write("Confusion matrix:\n")
             f.write(np.array2string(confusion_k, separator=", "))

-    ### Test set classification (individual posts)
+    # Test set classification (individual posts)
     model.fit(
         preprocess(df["x"].values),
         df["y"].values,

@@ -247,10 +225,10 @@ for k in range(len(DIMENSIONS)):
     confusion = confusion_matrix(y_test, predictions)
     score = accuracy_score(y_test, predictions)
     with open(
-        os.path.join(MODELS_DIR, "rnn_accuracy_{}.txt".format(DIMENSIONS[k])), "w",
+        os.path.join(MODELS_DIR, "SimpleRNN_accuracy_{}.txt".format(DIMENSIONS[k])), "w",
         encoding="utf8",
     ) as f:
         f.write(
-            "*** {}/{} TEST SET CLASSIFICATION (POSTS) ***\n".format(
+            "{}/{} TEST SET CLASSIFICATION (POSTS)\n".format(
                 DIMENSIONS[k][0], DIMENSIONS[k][1]
             )
         )

@@ -259,10 +237,10 @@ for k in range(len(DIMENSIONS)):
         f.write("Confusion matrix:\n")
         f.write(np.array2string(confusion, separator=", "))
     print(
-        f"\nWrote training / test results for {DIMENSIONS[k]} here: {os.path.join(MODELS_DIR, 'rnn_accuracy_{}.txt'.format(DIMENSIONS[k]))}\n"
+        f"\nWrote training / test results for {DIMENSIONS[k]} here: {os.path.join(MODELS_DIR, 'SimpleRNN_accuracy_{}.txt'.format(DIMENSIONS[k]))}\n"
     )

-    ### Get most a-like/b-like sentences
+    # Get most a-like/b-like sentences
     if WORD_CLOUD:
         NUM_EXTREME_EXAMPLES = 500
         probs = model.predict(preprocess(x_test))

@@ -279,7 +257,7 @@ for k in range(len(DIMENSIONS)):
                 DATA_DIR, "extreme_examples_{}.txt".format(DIMENSIONS[k][0])
             ),
             "w",
             encoding="utf8",
         ) as f:
             for prob, i in min_prob_indices:
                 # f.write(x_test[i]+'\n')
                 f.write(x_test[i] + "\n")

@@ -290,16 +268,16 @@ for k in range(len(DIMENSIONS)):
                 DATA_DIR, "extreme_examples_{}.txt".format(DIMENSIONS[k][1])
             ),
             "w",
             encoding="utf8",
         ) as f:
             for prob, i in max_prob_indices:
                 # f.write(x_test[i]+'\n')
                 f.write(x_test[i] + "\n")
                 # f.write(str(prob)+'\n')
                 f.write("\n")

-    ### Save model and tokenizer for future use
-    model.save(os.path.join(MODELS_DIR, "rnn_model_{}.h5".format(DIMENSIONS[k])))
+    # Save model and tokenizer for future personality predictions
+    model.save(os.path.join(MODELS_DIR, "SimpleRNN_model_{}.h5".format(DIMENSIONS[k])))
     with open(
-        os.path.join(MODELS_DIR, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "wb"
+        os.path.join(MODELS_DIR, "SimpleRNN_tokenizer_{}.pkl".format(DIMENSIONS[k])), "wb"
     ) as f:
         pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
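The net effect of this diff is to rename rnn.py to simple_rnn.py and swap the LSTM layer for a SimpleRNN, keeping the GloVe-initialized embedding front end and the single sigmoid output per MBTI dimension. A minimal standalone sketch of the resulting architecture (random weights stand in for the GloVe matrix; the constants mirror the script, but the optimizer string is a simplification):

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.layers.embeddings import Embedding

TOP_WORDS = 2500
MAX_POST_LENGTH = 40
EMBEDDING_VECTOR_LENGTH = 50
DROPOUT = 0.1

# Stand-in for the GloVe weight matrix built in simple_rnn.py.
embedding_matrix = np.random.rand(TOP_WORDS, EMBEDDING_VECTOR_LENGTH)

model = Sequential()
model.add(
    Embedding(
        TOP_WORDS,
        EMBEDDING_VECTOR_LENGTH,
        input_length=MAX_POST_LENGTH,
        weights=[embedding_matrix],
        trainable=True,
    )
)
model.add(
    SimpleRNN(
        EMBEDDING_VECTOR_LENGTH,
        dropout=DROPOUT,
        recurrent_dropout=DROPOUT,
        activation="sigmoid",
        kernel_initializer="zeros",
    )
)
model.add(Dense(1, activation="sigmoid"))  # one binary output per MBTI dimension
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()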
Personality_prediction/bidirectional_lstm.py (new file, mode 100644; diff collapsed, +288 lines)
Personality_prediction/make_test_set.py

@@ -5,8 +5,8 @@ import csv
 import re

-DATA_DIR = "data"
-MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean.csv")
+DATA_DIRECTORY = "data"
+MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_clean.csv")
 DIMENSIONS = ("IE", "NS", "TF", "PJ")

@@ -29,7 +29,7 @@ for dimension in DIMENSIONS:
                     continue
                 posts.append(post)
-        test_csv_path = os.path.join(DATA_DIR, f"test_{letter}.csv")
+        test_csv_path = os.path.join(DATA_DIRECTORY, f"test_{letter}.csv")
         with open(test_csv_path, "w", encoding="utf-8") as f:
             writer = csv.writer(f)
             for post in posts:
Personality_prediction/make_training_set.py

@@ -5,8 +5,8 @@ import csv
 import re

-DATA_DIR = "data"
-MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_unclean.csv")
+DATA_DIRECTORY = "data"
+MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_unclean.csv")
 DIMENSIONS = ("IE", "NS", "TF", "PJ")

@@ -57,7 +57,7 @@ for dimension in DIMENSIONS:
                     posts.append(post)
                     i += 1
-        train_csv_path = os.path.join(DATA_DIR, f"train_{letter}.csv")
+        train_csv_path = os.path.join(DATA_DIRECTORY, f"train_{letter}.csv")
         with open(train_csv_path, "w", encoding="utf-8") as f:
             writer = csv.writer(f)
             for post in posts:
Personality_prediction/sample_predictor.py

-import os
 import csv
+import os
 import pickle
-import collections
 import numpy as np
-from nltk import word_tokenize
-from nltk.stem import WordNetLemmatizer
-from nltk.corpus import stopwords
-from keras.preprocessing import sequence
-from keras.preprocessing import text
 from keras.models import load_model
 from keras.preprocessing import sequence
 from nltk.corpus import stopwords
 from nltk.stem import WordNetLemmatizer

-MODELS_DIR = "models"
-DATA_DIR = "data"
-TRUMP_TWEETS_PATH = os.path.join(DATA_DIR, "trumptweets.csv")
+MODELS_DIRECTORY = "models"
+DATA_DIRECTORY = "data/sample_data"
+SAMPLE_TWEETS_PATH = os.path.join(DATA_DIRECTORY, "0xnickrodriguez_tweets.csv")
 DIMENSIONS = ["IE", "NS", "FT", "PJ"]
+DIMENSIONS_with_strings = [
+    "Introversion Extroversion",
+    "Intuition Sensing",
+    "Feeling Thinking",
+    "Perceiving Judging",
+]
 MODEL_BATCH_SIZE = 128
 TOP_WORDS = 2500
 MAX_POST_LENGTH = 40

@@ -24,7 +22,7 @@ EMBEDDING_VECTOR_LENGTH = 20
 final = ""
 x_test = []
-with open(TRUMP_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
+with open(SAMPLE_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
     reader = csv.reader(f)
     for row in f:
         x_test.append(row)

@@ -71,27 +69,34 @@ def lemmatize(x):
 for k in range(len(DIMENSIONS)):
     model = load_model(
-        os.path.join(MODELS_DIR, "rnn_model_{}.h5".format(DIMENSIONS[k]))
+        os.path.join(MODELS_DIRECTORY, "rnn_model_{}.h5".format(DIMENSIONS[k]))
     )
     tokenizer = None
     with open(
-        os.path.join(MODELS_DIR, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "rb"
+        os.path.join(MODELS_DIRECTORY, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "rb"
     ) as f:
         tokenizer = pickle.load(f)

     def preprocess(x):
         lemmatized = lemmatize(x)
         tokenized = tokenizer.texts_to_sequences(lemmatized)
         return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)

     predictions = model.predict(preprocess(x_test))
     prediction = float(sum(predictions) / len(predictions))
-    print(DIMENSIONS[k])
+    print(DIMENSIONS_with_strings[k])
     print(prediction)
     if prediction >= 0.5:
         final += DIMENSIONS[k][1]
+        print("Personality type - ", DIMENSIONS[k][1])
     else:
         final += DIMENSIONS[k][0]
+        print("Personality type - ", DIMENSIONS[k][0])
+    print("")

 print("")
-print("Final prediction: {}".format(final))
+print("Personality Type of the Person : {} ".format(final))
Personality_prediction/separate_clean_and_unclean.py

@@ -4,10 +4,10 @@ import pandas as pd
 import csv

-DATA_DIR = "data"
-MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_personality.csv")
-MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean.csv")
-MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_unclean.csv")
+DATA_DIRECTORY = "data"
+MBTI_RAW_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality.csv")
+MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality_clean.csv")
+MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality_unclean.csv")
 MBTI_TO_FREQUENCY_DICT = {
     "ISTJ": 0.11,
     "ISFJ": 0.09,
Personality_prediction/simple_rnn.py (new file, mode 100644; diff collapsed, +283 lines)