Commit 2f33b0c5 authored by Amuthini

fixed errors

parent 3a2e6ebf
.DS_Store
# Ignore the directory contents but keep the .gitkeep placeholders;
# git cannot re-include files under an ignored directory, so ignore
# "data/*" rather than "data".
data/*
models/*
!data/.gitkeep
!models/.gitkeep
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$" />
    <orderEntry type="jdk" jdkName="Python 3.8 (2)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (2)" project-jdk-type="Python SDK" />
</project>
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/Personality_prediction.iml" filepath="$PROJECT_DIR$/.idea/Personality_prediction.iml" />
    </modules>
  </component>
</project>
repos:
  - repo: https://github.com/psf/black
    rev: 20.8b1 # Replace by any tag/version: https://github.com/psf/black/tags
    hooks:
      - id: black
        language_version: python3 # Should be a command that runs python3.6+
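For reference: with pre-commit installed (it is listed in the dev dependencies below), running "pre-commit install" once in the repository activates this hook, and "pre-commit run --all-files" applies Black to the whole tree; both are standard pre-commit commands.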
import os
import csv
import re

import pandas as pd

DATA_DIR = "data"
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean.csv")
DIMENSIONS = ("IE", "NS", "TF", "PJ")

df = pd.read_csv(MBTI_CLEAN_CSV_PATH)

# For each MBTI letter, collect every usable post written by users whose
# type contains that letter, then write one post per row to
# data/test_<letter>.csv.
for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    for letter in [letter_1, letter_2]:
        posts = []
        for index, row in df.iterrows():
            if letter in row["type"]:
                hundred_posts = row["posts"].split("|||")
                for post in hundred_posts:
                    # Ignore deformed posts: links, empty strings, and
                    # posts with no alphabetic characters.
                    if (
                        ("http" in post)
                        or (post == "")
                        or (not re.search("[a-zA-Z]", post))
                    ):
                        continue
                    posts.append(post)
        test_csv_path = os.path.join(DATA_DIR, f"test_{letter}.csv")
        with open(test_csv_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            for post in posts:
                writer.writerow([post])
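A quick, illustrative way to spot-check the files this script writes (not part of the commit; assumes the script has been run from the project root):

import csv

# Print the first three posts written to data/test_I.csv as a spot check.
with open("data/test_I.csv", encoding="utf-8") as f:
    for i, row in enumerate(csv.reader(f)):
        if i == 3:
            break
        print(row[0])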
import os
import collections
import csv
import re

import pandas as pd

DATA_DIR = "data"
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_unclean.csv")
DIMENSIONS = ("IE", "NS", "TF", "PJ")

df = pd.read_csv(MBTI_UNCLEAN_CSV_PATH)

# Count the usable posts available for each MBTI letter.
counts = collections.defaultdict(int)
for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    for index, row in df.iterrows():
        mbti = row["type"]
        hundred_posts = row["posts"].split("|||")
        for post in hundred_posts:
            # Ignore deformed posts: links, empty strings, and posts
            # with no alphabetic characters.
            if (
                ("http" in post)
                or (post == "")
                or (not re.search("[a-zA-Z]", post))
            ):
                continue
            if letter_1 in mbti:
                counts[letter_1] += 1
            if letter_2 in mbti:
                counts[letter_2] += 1

# For each dimension, cap both letters at the count of the rarer letter
# so the two classes are balanced, then write one post per row to
# data/train_<letter>.csv.
for dimension in DIMENSIONS:
    letter_1, letter_2 = dimension
    limit = min(counts[letter_1], counts[letter_2])
    for letter in [letter_1, letter_2]:
        posts = []
        i = 0
        for index, row in df.iterrows():
            if i == limit:
                break
            if letter in row["type"]:
                hundred_posts = row["posts"].split("|||")
                for post in hundred_posts:
                    if i == limit:
                        break
                    if (
                        ("http" in post)
                        or (post == "")
                        or (not re.search("[a-zA-Z]", post))
                    ):
                        continue
                    posts.append(post)
                    i += 1
        train_csv_path = os.path.join(DATA_DIR, f"train_{letter}.csv")
        with open(train_csv_path, "w", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            for post in posts:
                writer.writerow([post])
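Because both letters of a dimension are capped at the same limit, each pair of train files should contain the same number of rows. A minimal sanity check under that assumption (not part of the commit):

import csv

# Each pair of balanced train files should have equal post counts.
for letter_1, letter_2 in ("IE", "NS", "TF", "PJ"):
    sizes = {}
    for letter in (letter_1, letter_2):
        with open(f"data/train_{letter}.csv", encoding="utf-8") as f:
            sizes[letter] = sum(1 for _ in csv.reader(f))
    print(letter_1 + letter_2, sizes)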
[tool.poetry]
name = "mbti-rnn"
version = "0.1.0"
description = ""
authors = ["Ian Scott Knight <isk@alumni.stanford.edu>"]
license = "MIT"

[tool.poetry.dependencies]
python = "^3.8"
scikit-learn = "^0.24.1"
nltk = "^3.5"
Keras = "^2.4.3"
pandas = "^1.2.1"
tensorflow = "^2.4.1"

[tool.poetry.dev-dependencies]
pre-commit = "^2.10.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
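For reference, "poetry install" resolves and installs the dependencies declared above into a virtual environment, and "poetry run python <script>" executes a script inside it; both are standard Poetry commands.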
import os
import csv
import pickle

import numpy as np
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from keras.preprocessing import sequence
from keras.models import load_model

MODELS_DIR = "models"
DATA_DIR = "data"
TRUMP_TWEETS_PATH = os.path.join(DATA_DIR, "trumptweets.csv")
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
MODEL_BATCH_SIZE = 128
TOP_WORDS = 2500
MAX_POST_LENGTH = 40
EMBEDDING_VECTOR_LENGTH = 20

final = ""

# Read the tweets to classify; the tweet text is assumed to be in the
# first column of trumptweets.csv.
x_test = []
with open(TRUMP_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
    reader = csv.reader(f)
    for row in reader:
        if row:
            x_test.append(row[0])

types = [
    "INFJ",
    "ENTP",
    "INTP",
    "INTJ",
    "ENTJ",
    "ENFJ",
    "INFP",
    "ENFP",
    "ISFP",
    "ISTP",
    "ISFJ",
    "ISTJ",
    "ESTP",
    "ESFP",
    "ESTJ",
    "ESFJ",
]
types = [x.lower() for x in types]

# Requires the NLTK "wordnet" and "stopwords" corpora (nltk.download).
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")


def lemmatize(x):
    # Lowercase each post, strip mentions of MBTI type names, drop
    # stopwords, and lemmatize the remaining words.
    lemmatized = []
    for post in x:
        temp = post.lower()
        for type_ in types:
            temp = temp.replace(" " + type_, "")
        temp = " ".join(
            [
                lemmatizer.lemmatize(word)
                for word in temp.split(" ")
                if (word not in stop_words)
            ]
        )
        lemmatized.append(temp)
    return np.array(lemmatized)


def preprocess(x, tokenizer):
    # Lemmatize, tokenize with the dimension's fitted tokenizer, and pad
    # each post to a fixed length.
    lemmatized = lemmatize(x)
    tokenized = tokenizer.texts_to_sequences(lemmatized)
    return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)


# For each dimension, load the trained model and its tokenizer, average
# the per-post probabilities, and append the more likely letter.
for k in range(len(DIMENSIONS)):
    model = load_model(
        os.path.join(MODELS_DIR, "rnn_model_{}.h5".format(DIMENSIONS[k]))
    )
    with open(
        os.path.join(MODELS_DIR, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "rb"
    ) as f:
        tokenizer = pickle.load(f)
    predictions = model.predict(preprocess(x_test, tokenizer))
    prediction = float(np.mean(predictions))
    print(DIMENSIONS[k])
    print(prediction)
    if prediction >= 0.5:
        final += DIMENSIONS[k][1]
    else:
        final += DIMENSIONS[k][0]
print("")
print("Final prediction: {}".format(final))
import os
import collections
import csv

import pandas as pd

DATA_DIR = "data"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_personality.csv")
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean.csv")
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_unclean.csv")

# Approximate frequency of each MBTI type in the general population.
MBTI_TO_FREQUENCY_DICT = {
    "ISTJ": 0.11,
    "ISFJ": 0.09,
    "INFJ": 0.04,
    "INTJ": 0.05,
    "ISTP": 0.05,
    "ISFP": 0.05,
    "INFP": 0.06,
    "INTP": 0.06,
    "ESTP": 0.04,
    "ESFP": 0.04,
    "ENFP": 0.08,
    "ENTP": 0.06,
    "ESTJ": 0.08,
    "ESFJ": 0.09,
    "ENFJ": 0.05,
    "ENTJ": 0.05,
}

df = pd.read_csv(MBTI_RAW_CSV_PATH)

# Count the rows available for each type in the raw dataset.
counts = collections.defaultdict(int)
for mbti in df["type"]:
    counts[mbti] += 1

# Find the type that most constrains a population-proportional sample:
# the smallest count relative to its real-world frequency.
limiting_type = None
min_size = float("inf")
for mbti in counts:
    size = counts[mbti] / MBTI_TO_FREQUENCY_DICT[mbti]
    if size < min_size:
        min_size = size
        limiting_type = mbti

# Group the rows by type.
dic = collections.defaultdict(list)
for index, row in df.iterrows():
    dic[row["type"]].append(row)

# Write a population-proportional sample to mbti_clean.csv; all leftover
# rows go to mbti_unclean.csv.
unclean_list = []
with open(MBTI_CLEAN_CSV_PATH, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["type", "posts"])
    for mbti in MBTI_TO_FREQUENCY_DICT:
        rows = dic[mbti]
        n_clean = int(round(min_size * MBTI_TO_FREQUENCY_DICT[mbti]))
        for x in range(n_clean):
            writer.writerow(rows[x])
        unclean_list.append(rows[n_clean:])

with open(MBTI_UNCLEAN_CSV_PATH, "w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["type", "posts"])
    for rows in unclean_list:
        for leftover_row in rows:
            writer.writerow(leftover_row)
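A quick, illustrative check that the clean split mirrors the target frequencies (not part of the commit; assumes the script has been run from the project root):

import collections

import pandas as pd

# Type counts in mbti_clean.csv should be roughly proportional to
# MBTI_TO_FREQUENCY_DICT.
df_check = pd.read_csv("data/mbti_clean.csv")
print(collections.Counter(df_check["type"]))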