Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
2020_21 J-25
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
3
Merge Requests
3
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
2020_21 J-25
2020_21 J-25
Commits
2f33b0c5
Commit
2f33b0c5
authored
Jun 28, 2021
by
Amuthini
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
fixed errors
parent
3a2e6ebf
Changes
14
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
14 changed files
with
1768 additions
and
0 deletions
+1768
-0
Personality_prediction/.gitignore
Personality_prediction/.gitignore
+5
-0
Personality_prediction/.idea/.gitignore
Personality_prediction/.idea/.gitignore
+8
-0
Personality_prediction/.idea/Personality_prediction.iml
Personality_prediction/.idea/Personality_prediction.iml
+8
-0
Personality_prediction/.idea/inspectionProfiles/profiles_settings.xml
...prediction/.idea/inspectionProfiles/profiles_settings.xml
+6
-0
Personality_prediction/.idea/misc.xml
Personality_prediction/.idea/misc.xml
+4
-0
Personality_prediction/.idea/modules.xml
Personality_prediction/.idea/modules.xml
+8
-0
Personality_prediction/.pre-commit-config.yaml
Personality_prediction/.pre-commit-config.yaml
+5
-0
Personality_prediction/make_test_set.py
Personality_prediction/make_test_set.py
+36
-0
Personality_prediction/make_training_set.py
Personality_prediction/make_training_set.py
+64
-0
Personality_prediction/poetry.lock
Personality_prediction/poetry.lock
+1134
-0
Personality_prediction/pyproject.toml
Personality_prediction/pyproject.toml
+21
-0
Personality_prediction/rnn.py
Personality_prediction/rnn.py
+305
-0
Personality_prediction/sample_predictor.py
Personality_prediction/sample_predictor.py
+97
-0
Personality_prediction/separate_clean_and_unclean.py
Personality_prediction/separate_clean_and_unclean.py
+67
-0
No files found.
Personality_prediction/.gitignore
0 → 100644
View file @
2f33b0c5
.DS_Store
# Ignore the *contents* of the generated data/model directories while
# keeping the directories themselves in git via their .gitkeep files.
# (fixed: the previous bare `data` / `models` patterns ignored the
# directories themselves; git does not descend into an ignored directory,
# so the `!.gitkeep` negations below had no effect)
data/*
models/*
!data/.gitkeep
!models/.gitkeep
\ No newline at end of file
Personality_prediction/.idea/.gitignore
0 → 100644
View file @
2f33b0c5
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
Personality_prediction/.idea/Personality_prediction.iml
0 → 100644
View file @
2f33b0c5
<?xml version="1.0" encoding="UTF-8"?>
<!-- PyCharm module file: one content root, Python 3.8 SDK. -->
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.8 (2)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
\ No newline at end of file
Personality_prediction/.idea/inspectionProfiles/profiles_settings.xml
0 → 100644
View file @
2f33b0c5
<!-- PyCharm inspection-profile settings: project profile disabled. -->
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
\ No newline at end of file
Personality_prediction/.idea/misc.xml
0 → 100644
View file @
2f33b0c5
<?xml version="1.0" encoding="UTF-8"?>
<!-- PyCharm project SDK configuration. -->
<project version="4">
  <component
      name="ProjectRootManager"
      version="2"
      project-jdk-name="Python 3.8 (2)"
      project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
Personality_prediction/.idea/modules.xml
0 → 100644
View file @
2f33b0c5
<?xml version="1.0" encoding="UTF-8"?>
<!-- PyCharm module registry: single module for this project. -->
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module
          fileurl="file://$PROJECT_DIR$/.idea/Personality_prediction.iml"
          filepath="$PROJECT_DIR$/.idea/Personality_prediction.iml" />
    </modules>
  </component>
</project>
\ No newline at end of file
Personality_prediction/.pre-commit-config.yaml
0 → 100644
View file @
2f33b0c5
# pre-commit hook: run black on every commit.
- repo: https://github.com/psf/black
  rev: 20.8b1 # Replace by any tag/version: https://github.com/psf/black/tags
  hooks:
    - id: black
      language_version: python3 # Should be a command that runs python3.6+
\ No newline at end of file
Personality_prediction/make_test_set.py
0 → 100644
View file @
2f33b0c5
"""Build per-letter test CSVs from the frequency-matched clean MBTI set.

Reads data/mbti_clean.csv (produced by separate_clean_and_unclean.py) and,
for each of the eight MBTI letters, writes data/test_<letter>.csv containing
one well-formed forum post per row.
"""
import os
import collections
import pandas as pd
import csv
import re

DATA_DIR = "data"
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean.csv")

# Each dimension is a pair of opposing MBTI letters; one file per letter.
DIMENSIONS = ("IE", "NS", "TF", "PJ")

# Compiled once instead of on every post; matches any ASCII letter.
_HAS_LETTER = re.compile(r"[a-zA-Z]")


def _is_deformed(post):
    """Return True for posts to skip: missing, empty, link-bearing, or
    containing no alphabetic characters.

    The None check comes first (fixed: the original tested
    ``"http" in post`` before ``post == None``, which would raise
    TypeError on a None post; it also compared with ``==`` instead of
    ``is``).
    """
    return (
        post is None
        or post == ""
        or "http" in post
        or not _HAS_LETTER.search(post)
    )


df = pd.read_csv(MBTI_CLEAN_CSV_PATH)

for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    for letter in (letter_1, letter_2):
        # Collect every well-formed post from users whose 4-letter type
        # contains this letter.
        posts = []
        for _, row in df.iterrows():
            if letter in row["type"]:
                for post in row["posts"].split("|||"):
                    if _is_deformed(post):
                        # ignore deformed posts
                        continue
                    posts.append(post)

        test_csv_path = os.path.join(DATA_DIR, f"test_{letter}.csv")
        # newline="" is required by the csv module; without it the writer
        # emits a blank row after every record on Windows (fixed: was
        # missing).
        with open(test_csv_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            for post in posts:
                writer.writerow([post])
Personality_prediction/make_training_set.py
0 → 100644
View file @
2f33b0c5
"""Build class-balanced per-letter training CSVs from the unclean MBTI set.

Reads data/mbti_unclean.csv (produced by separate_clean_and_unclean.py),
counts the well-formed posts available for each MBTI letter, and for each
dimension writes data/train_<letter>.csv for both letters, truncated to the
size of the rarer letter so the two classes are balanced.
"""
import os
import collections
import pandas as pd
import csv
import re

DATA_DIR = "data"
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_unclean.csv")

# Each dimension is a pair of opposing MBTI letters; one file per letter.
DIMENSIONS = ("IE", "NS", "TF", "PJ")

# Compiled once instead of on every post; matches any ASCII letter.
_HAS_LETTER = re.compile(r"[a-zA-Z]")


def _is_deformed(post):
    """Return True for posts to skip: missing, empty, link-bearing, or
    containing no alphabetic characters.

    (Fixed: the original tested ``"http" in post`` before ``post == None``,
    which would raise TypeError on a None post, and compared with ``==``
    instead of ``is``.)
    """
    return (
        post is None
        or post == ""
        or "http" in post
        or not _HAS_LETTER.search(post)
    )


df = pd.read_csv(MBTI_UNCLEAN_CSV_PATH)

# Count the well-formed posts available for each of the eight letters.
counts = collections.defaultdict(int)
for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    for _, row in df.iterrows():
        mbti = row["type"]
        for post in row["posts"].split("|||"):
            if _is_deformed(post):
                # ignore deformed posts
                continue
            if letter_1 in mbti:
                counts[letter_1] += 1
            if letter_2 in mbti:
                counts[letter_2] += 1

for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    # Balance the two classes: take at most as many posts as the rarer
    # letter of the pair has available.
    limit = min(counts[letter_1], counts[letter_2])
    for letter in (letter_1, letter_2):
        posts = []
        for _, row in df.iterrows():
            if len(posts) == limit:
                # Fixed: the original only broke out of the innermost post
                # loop, then kept scanning every remaining row just to
                # break again immediately. Output is unchanged.
                break
            if letter in row["type"]:
                for post in row["posts"].split("|||"):
                    if len(posts) == limit:
                        break
                    if _is_deformed(post):
                        # ignore deformed posts
                        continue
                    posts.append(post)

        train_csv_path = os.path.join(DATA_DIR, f"train_{letter}.csv")
        # newline="" is required by the csv module; without it the writer
        # emits a blank row after every record on Windows (fixed: was
        # missing).
        with open(train_csv_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            for post in posts:
                writer.writerow([post])
Personality_prediction/poetry.lock
0 → 100644
View file @
2f33b0c5
This diff is collapsed.
Click to expand it.
Personality_prediction/pyproject.toml
0 → 100644
View file @
2f33b0c5
[tool.poetry]
name = "mbti-rnn"
version = "0.1.0"
description = ""
authors = ["Ian Scott Knight <isk@alumni.stanford.edu>"]
license = "MIT"

[tool.poetry.dependencies]
python = "^3.8"
scikit-learn = "^0.24.1"
nltk = "^3.5"
Keras = "^2.4.3"
pandas = "^1.2.1"
tensorflow = "^2.4.1"

[tool.poetry.dev-dependencies]
pre-commit = "^2.10.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
Personality_prediction/rnn.py
0 → 100644
View file @
2f33b0c5
This diff is collapsed.
Click to expand it.
Personality_prediction/sample_predictor.py
0 → 100644
View file @
2f33b0c5
"""Predict an MBTI type for a corpus of sample text.

Loads one trained binary RNN model + tokenizer per personality dimension
(from MODELS_DIR), scores every line of the input CSV against each model,
and concatenates the winning letters into a 4-letter prediction.
"""
import os
import csv
import pickle
import collections
import numpy as np
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.models import load_model

MODELS_DIR = "models"
DATA_DIR = "data"
TRUMP_TWEETS_PATH = os.path.join(DATA_DIR, "trumptweets.csv")
# NOTE(review): the dataset scripts in this commit spell the
# thinking/feeling dimension "TF", but here it is "FT" — confirm the model
# files on disk really are rnn_model_FT.h5 / rnn_tokenizer_FT.pkl.
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
MODEL_BATCH_SIZE = 128  # not referenced below; presumably kept for parity with rnn.py
TOP_WORDS = 2500  # not referenced below
MAX_POST_LENGTH = 40  # token sequences are padded/truncated to this length
EMBEDDING_VECTOR_LENGTH = 20  # not referenced below

final = ""  # accumulates one predicted letter per dimension
x_test = []
with open(TRUMP_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
    reader = csv.reader(f)
    # NOTE(review): this loop iterates the raw file object, so `reader` is
    # never used and every element of x_test is an unparsed CSV line
    # (quotes and commas intact). `for row in reader:` was probably
    # intended, but the reader yields lists while lemmatize() below calls
    # .lower() on each element, so fixing this requires a matching change
    # there — confirm intent before changing.
    for row in f:
        x_test.append(row)

# All 16 MBTI type codes; lower-cased below so explicit type mentions can
# be stripped from the text (keeps the model from keying on them).
types = [
    "INFJ",
    "ENTP",
    "INTP",
    "INTJ",
    "ENTJ",
    "ENFJ",
    "INFP",
    "ENFP",
    "ISFP",
    "ISTP",
    "ISFJ",
    "ISTJ",
    "ESTP",
    "ESFP",
    "ESTJ",
    "ESFJ",
]
types = [x.lower() for x in types]
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")


def lemmatize(x):
    """Lower-case each post, strip MBTI type mentions (only when preceded
    by a space) and stopwords, lemmatize the remaining words, and return
    the cleaned posts as an ndarray of strings."""
    lemmatized = []
    for post in x:
        temp = post.lower()
        for type_ in types:
            temp = temp.replace(" " + type_, "")
        temp = " ".join(
            [
                lemmatizer.lemmatize(word)
                for word in temp.split(" ")
                if (word not in stop_words)
            ]
        )
        lemmatized.append(temp)
    return np.array(lemmatized)


for k in range(len(DIMENSIONS)):
    # One binary model + matching tokenizer pair per dimension.
    model = load_model(
        os.path.join(MODELS_DIR, "rnn_model_{}.h5".format(DIMENSIONS[k]))
    )
    tokenizer = None
    with open(
        os.path.join(MODELS_DIR, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])),
        "rb",
    ) as f:
        tokenizer = pickle.load(f)

    def preprocess(x):
        # Re-defined on each iteration so it closes over the current
        # tokenizer; only called immediately below.
        lemmatized = lemmatize(x)
        tokenized = tokenizer.texts_to_sequences(lemmatized)
        return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)

    predictions = model.predict(preprocess(x_test))
    # Average the per-post model outputs into a single corpus-level score.
    prediction = float(sum(predictions) / len(predictions))
    print(DIMENSIONS[k])
    print(prediction)
    # Score >= 0.5 selects the second letter of the pair, else the first.
    if prediction >= 0.5:
        final += DIMENSIONS[k][1]
    else:
        final += DIMENSIONS[k][0]

print("")
print("Final prediction: {}".format(final))
Personality_prediction/separate_clean_and_unclean.py
0 → 100644
View file @
2f33b0c5
"""Split the raw MBTI dataset into frequency-matched and leftover subsets.

Reads data/mbti_personality.csv and writes:
  * data/mbti_clean.csv   - the largest subset whose per-type proportions
    match MBTI_TO_FREQUENCY_DICT (sized by the scarcest "limiting" type);
  * data/mbti_unclean.csv - every remaining row.
"""
import os
import collections
import pandas as pd
import csv

DATA_DIR = "data"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_personality.csv")
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean.csv")
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_unclean.csv")

# Target proportion of each type in the clean set (sums to 1.0).
MBTI_TO_FREQUENCY_DICT = {
    "ISTJ": 0.11,
    "ISFJ": 0.09,
    "INFJ": 0.04,
    "INTJ": 0.05,
    "ISTP": 0.05,
    "ISFP": 0.05,
    "INFP": 0.06,
    "INTP": 0.06,
    "ESTP": 0.04,
    "ESFP": 0.04,
    "ENFP": 0.08,
    "ENTP": 0.06,
    "ESTJ": 0.08,
    "ESFJ": 0.09,
    "ENFJ": 0.05,
    "ENTJ": 0.05,
}

df = pd.read_csv(MBTI_RAW_CSV_PATH)

# How many rows the raw dataset has per type.
counts = collections.Counter(df["type"])

# The limiting type is the one scarcest relative to its target frequency;
# min_size is the total clean-set size it allows.
limiting_type = None  # kept for debugging/inspection; not used below
min_size = float("inf")
for mbti, count in counts.items():
    size = count / MBTI_TO_FREQUENCY_DICT[mbti]
    if size < min_size:
        min_size = size
        limiting_type = mbti

# Group the dataframe rows by type.
dic = collections.defaultdict(list)
for _, row in df.iterrows():
    dic[row["type"]].append(row)

unclean_list = []
# newline="" is required by the csv module; without it the writer emits a
# blank row after every record on Windows (fixed: was missing, both files).
with open(MBTI_CLEAN_CSV_PATH, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["type", "posts"])
    for mbti in MBTI_TO_FREQUENCY_DICT.keys():
        rows = dic[mbti]
        # Number of this type's rows that belong in the clean set
        # (hoisted: was recomputed three times per type).
        clean_count = int(round(min_size * MBTI_TO_FREQUENCY_DICT[mbti]))
        for x in range(clean_count):
            writer.writerow(rows[x])
        # Everything past the clean quota goes to the unclean set.
        unclean_list.append(rows[clean_count:])

with open(MBTI_UNCLEAN_CSV_PATH, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["type", "posts"])
    for leftover_rows in unclean_list:
        for leftover in leftover_rows:
            writer.writerow(leftover)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment