Commit 905d39d8 authored by Amuthini

source files updated

parent 2f33b0c5
.DS_Store
data
models
!data/.gitkeep
!models/.gitkeep
\ No newline at end of file
# Default ignored files
/shelf/
/workspace.xml
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
# Editor-based HTTP Client requests
/httpRequests/
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$" />
<orderEntry type="jdk" jdkName="Python 3.8 (2)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>
\ No newline at end of file
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.8 (2)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Personality_prediction.iml" filepath="$PROJECT_DIR$/.idea/Personality_prediction.iml" />
</modules>
</component>
</project>
\ No newline at end of file
- repo: https://github.com/psf/black
  rev: 20.8b1 # Replace by any tag/version: https://github.com/psf/black/tags
  hooks:
    - id: black
      language_version: python3 # Should be a command that runs python3.6+
\ No newline at end of file
import os
import collections
import pandas as pd
import csv
import re
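# Build one test CSV per MBTI letter from the cleaned dataset: each source row holds
# a type plus that user's posts joined by "|||"; empty, link-containing, or
# letter-free posts are skipped.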
DATA_DIRECTORY = "data"
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_clean.csv")
DIMENSIONS = ("IE", "NS", "TF", "PJ")
df = pd.read_csv(MBTI_CLEAN_CSV_PATH)
for dimension in DIMENSIONS:
letter_1, letter_2 = dimension
for letter in [letter_1, letter_2]:
posts = []
for index, row in df.iterrows():
if letter in row["type"]:
hundred_posts = row["posts"].split("|||")
for post in hundred_posts:
if (
("http" in post)
or (post == "")
or (post is None)
or (not re.search("[a-zA-Z]", post))
): # ignore deformed posts
continue
posts.append(post)
test_csv_path = os.path.join(DATA_DIRECTORY, f"test_{letter}.csv")
with open(test_csv_path, "w", encoding="utf-8") as f:
writer = csv.writer(f)
for post in posts:
writer.writerow([post])
import os
import collections
import pandas as pd
import csv
import re
DATA_DIRECTORY = "data"
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_unclean.csv")
DIMENSIONS = ("IE", "NS", "TF", "PJ")
df = pd.read_csv(MBTI_UNCLEAN_CSV_PATH)
counts = collections.defaultdict(int)
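# First pass: count how many usable posts exist for each letter, so the two letters
# of every dimension can later be balanced.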
for dimension in DIMENSIONS:
letter_1, letter_2 = dimension
for index, row in df.iterrows():
mbti = row["type"]
hundred_posts = row["posts"].split("|||")
for post in hundred_posts:
if (
("http" in post)
or (post == "")
or (post is None)
or (not re.search("[a-zA-Z]", post))
): # ignore deformed posts
continue
if letter_1 in mbti:
counts[letter_1] += 1
if letter_2 in mbti:
counts[letter_2] += 1
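# Second pass: write train_<letter>.csv files, capping each letter at the smaller of
# the two per-letter counts so both classes end up the same size.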
for dimension in DIMENSIONS:
letter_1, letter_2 = dimension
if counts[letter_1] < counts[letter_2]:
limit = counts[letter_1]
else:
limit = counts[letter_2]
for letter in [letter_1, letter_2]:
posts = []
i = 0
for index, row in df.iterrows():
if letter in row["type"]:
hundred_posts = row["posts"].split("|||")
for post in hundred_posts:
if i == limit:
break
if (
("http" in post)
or (post == "")
or (post is None)
or (not re.search("[a-zA-Z]", post))
): # ignore deformed posts
continue
posts.append(post)
i += 1
train_csv_path = os.path.join(DATA_DIRECTORY, f"train_{letter}.csv")
with open(train_csv_path, "w", encoding="utf-8") as f:
writer = csv.writer(f)
for post in posts:
writer.writerow([post])
[tool.poetry]
name = "mbti-rnn"
version = "0.1.0"
description = ""
authors = ["Ian Scott Knight <isk@alumni.stanford.edu>"]
license = "MIT"
[tool.poetry.dependencies]
python = "^3.8"
scikit-learn = "^0.24.1"
nltk = "^3.5"
Keras = "^2.4.3"
pandas = "^1.2.1"
tensorflow = "^2.4.1"
[tool.poetry.dev-dependencies]
pre-commit = "^2.10.0"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
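# A typical environment setup with this file (assuming Poetry is installed) would be:
#   poetry install
#   poetry run pre-commit install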
import csv
import os
import pickle
import numpy as np
from keras.models import load_model
from keras.preprocessing import sequence
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
MODELS_DIRECTORY = "models"
DATA_DIRECTORY = "data/sample_data"
SAMPLE_TWEETS_PATH = os.path.join(DATA_DIRECTORY, "0xnickrodriguez_tweets.csv")
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
DIMENSIONS_with_strings = ["Introversion Extroversion", "Intuition Sensing", "Feeling Thinking", "Perceiving Judging"]
MODEL_BATCH_SIZE = 128
TOP_WORDS = 2500
MAX_POST_LENGTH = 40
EMBEDDING_VECTOR_LENGTH = 20
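# MAX_POST_LENGTH must match the padding length used when the models were trained;
# MODEL_BATCH_SIZE, TOP_WORDS and EMBEDDING_VECTOR_LENGTH are kept for reference but
# are not used below.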
final = ""
x_test = []
with open(SAMPLE_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
    reader = csv.reader(f)
    for row in reader:
        # each CSV row is a list of cells; join them so every tweet is one text string
        x_test.append(" ".join(row))
types = [
"INFJ",
"ENTP",
"INTP",
"INTJ",
"ENTJ",
"ENFJ",
"INFP",
"ENFP",
"ISFP",
"ISTP",
"ISFJ",
"ISTJ",
"ESTP",
"ESFP",
"ESTJ",
"ESFJ",
]
types = [x.lower() for x in types]
lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words("english")
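# NOTE: assumes the NLTK "stopwords" and "wordnet" corpora are available, e.g. via
# nltk.download("stopwords") and nltk.download("wordnet").
# lemmatize() lower-cases each post, strips explicit MBTI type mentions so the model
# cannot simply key on them, drops stopwords, and lemmatizes the remaining words.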
def lemmatize(x):
lemmatized = []
for post in x:
temp = post.lower()
for type_ in types:
temp = temp.replace(" " + type_, "")
temp = " ".join(
[
lemmatizer.lemmatize(word)
for word in temp.split(" ")
if (word not in stop_words)
]
)
lemmatized.append(temp)
return np.array(lemmatized)
for k in range(len(DIMENSIONS)):
model = load_model(
os.path.join(MODELS_DIRECTORY, "rnn_model_{}.h5".format(DIMENSIONS[k]))
)
tokenizer = None
with open(
os.path.join(MODELS_DIRECTORY, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "rb"
) as f:
tokenizer = pickle.load(f)
def preprocess(x):
lemmatized = lemmatize(x)
tokenized = tokenizer.texts_to_sequences(lemmatized)
return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)
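# Average the per-post probabilities into a single score for this dimension;
# >= 0.5 maps to the second letter, < 0.5 to the first.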
predictions = model.predict(preprocess(x_test))
prediction = float(sum(predictions) / len(predictions))
print(DIMENSIONS_with_strings[k])
print(prediction)
if prediction >= 0.5:
final += DIMENSIONS[k][1]
print("Personality type - ", DIMENSIONS[k][1])
else:
final += DIMENSIONS[k][0]
print("Personality type - ", DIMENSIONS[k][0])
print("")
print("")
print("Personality Type of the Person : {} ".format(final))
import os
import collections
import pandas as pd
import csv
DATA_DIRECTORY = "data"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality.csv")
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality_clean.csv")
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality_unclean.csv")
MBTI_TO_FREQUENCY_DICT = {
"ISTJ": 0.11,
"ISFJ": 0.09,
"INFJ": 0.04,
"INTJ": 0.05,
"ISTP": 0.05,
"ISFP": 0.05,
"INFP": 0.06,
"INTP": 0.06,
"ESTP": 0.04,
"ESFP": 0.04,
"ENFP": 0.08,
"ENTP": 0.06,
"ESTJ": 0.08,
"ESFJ": 0.09,
"ENFJ": 0.05,
"ENTJ": 0.05,
}
df = pd.read_csv(MBTI_RAW_CSV_PATH)
counts = collections.defaultdict(int)
for mbti in df["type"]:
counts[mbti] += 1
limiting_type = None
min_size = float("infinity")
for mbti in counts.keys():
size = counts[mbti] / MBTI_TO_FREQUENCY_DICT[mbti]
if size < min_size:
min_size = size
limiting_type = mbti
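# min_size is the largest frequency-matched sample the scarcest type allows; each
# type contributes round(min_size * frequency) rows to the clean CSV below, and the
# leftover rows are written to the unclean CSV.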
dic = collections.defaultdict(list)
for index, row in df.iterrows():
dic[row["type"]].append(row)
unclean_list = []
with open(MBTI_CLEAN_CSV_PATH, "w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["type", "posts"])
for mbti in MBTI_TO_FREQUENCY_DICT.keys():
list1 = dic[mbti]
for x in range(0, int(round(min_size * MBTI_TO_FREQUENCY_DICT[mbti]))):
writer.writerow(list1[x])
unclean_list.append(
list1[int(round(min_size * MBTI_TO_FREQUENCY_DICT[mbti])) : len(list1)]
)
with open(MBTI_UNCLEAN_CSV_PATH, "w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["type", "posts"])
for mbti in unclean_list:
for x in mbti:
writer.writerow(x)
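# Typical run order for the scripts in this commit: 1) this preparation script
# (expects the raw dataset at data/mbti_personality.csv), 2) the train/test CSV
# builders, 3) the RNN training script, 4) the prediction script pointed at a CSV of
# tweets.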
import csv
import os
import pickle
import warnings
import numpy as np
import pandas as pd
import csv
import random
import pickle
import collections
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import joblib
# scikit-learn - sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, accuracy_score
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Bidirectional
from keras.layers import GRU
from keras.layers import SimpleRNN
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.models import Sequential
from keras.optimizers import adam_v2
#from keras.optimizers import Adam
-from keras.preprocessing import sequence
-from keras.preprocessing.text import Tokenizer
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
-from sklearn.metrics import confusion_matrix, accuracy_score
-# scikit-learn - sklearn
-from sklearn.model_selection import KFold
warnings.filterwarnings("ignore")
MODELS_DIR = "models"
DATA_DIR = "data"
GLOVE_PATH = os.path.join(DATA_DIR, "glove.6B.50d.txt")
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
-### Preprocessing variables
+# Preprocessing variables
MODEL_BATCH_SIZE = 128
TOP_WORDS = 2500
MAX_POST_LENGTH = 40
EMBEDDING_VECTOR_LENGTH = 50
-### Learning variables
+# Learning variables
LEARNING_RATE = 0.01
DROPOUT = 0.1
NUM_EPOCHS = 1
-### Control variables
+# Control variables
CROSS_VALIDATION = False
SAMPLE = True
WORD_CLOUD = True
@@ -55,16 +44,14 @@ SAVE_MODEL = True
for k in range(len(DIMENSIONS)):
###########################
### POST CLASSIFICATION ###
###########################
x_train = []
y_train = []
x_test = []
y_test = []
-### Read in data
+# Read in data
with open(
os.path.join(DATA_DIR, "train_{}.csv".format(DIMENSIONS[k][0])), "r"
, encoding="utf8") as f:
@@ -94,7 +81,7 @@ for k in range(len(DIMENSIONS)):
x_test.append(post)
y_test.append(1)
-### Preprocessing (lemmatization, tokenization, and padding of input)
+# Preprocessing (lemmatization, tokenization, and padding of input text)
MBTI_TYPES = [
"INFJ",
"ENTP",
@@ -146,13 +133,13 @@ for k in range(len(DIMENSIONS)):
x_train = lemmatize(x_train)
x_test = lemmatize(x_test)
-### Assign to dataframe and shuffle rows
+# Assign to dataframe and shuffle rows
df = pd.DataFrame(data={"x": x_train, "y": y_train})
-df = df.sample(frac=1).reset_index(drop=True) ### Shuffle rows
+df = df.sample(frac=1).reset_index(drop=True) # Shuffle rows
if SAMPLE:
-df = df.head(10000) ### Small sample for quick runs
+df = df.head(10000) # Small sample for quick runs
-### Load glove into memory for embedding
+# Load glove into memory for embedding
embeddings_index = dict()
with open(GLOVE_PATH, encoding="utf8") as f:
for line in f:
@@ -161,7 +148,7 @@ for k in range(len(DIMENSIONS)):
embeddings_index[word] = np.asarray(values[1:], dtype="float32")
print("Loaded {} word vectors.".format(len(embeddings_index)))
-### Create a weight matrix for words
+# Create a weight matrix for words
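# embedding_matrix rows are indexed by the tokenizer's word ids; words without a
# GloVe vector keep an all-zero row.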
embedding_matrix = np.zeros((TOP_WORDS, EMBEDDING_VECTOR_LENGTH))
for word, i in tokenizer.word_index.items():
if i < TOP_WORDS:
@@ -169,7 +156,7 @@ for k in range(len(DIMENSIONS)):
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
-### Construct model
+# Construct model
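# Architecture: GloVe-initialised Embedding -> SimpleRNN -> Dense(1, sigmoid),
# compiled with Adam at LEARNING_RATE as a binary classifier for one MBTI dimension.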
with tf.device("/gpu:0"):
model = Sequential()
model.add(
@@ -182,18 +169,9 @@ for k in range(len(DIMENSIONS)):
trainable=True,
)
)
-# model.add(SimpleRNN(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
-# model.add(GRU(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros'))
-model.add(
-LSTM(
-EMBEDDING_VECTOR_LENGTH,
-dropout=DROPOUT,
-recurrent_dropout=DROPOUT,
-activation="sigmoid",
-kernel_initializer="zeros",
-)
-)
-# model.add(Bidirectional(LSTM(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT, activation='sigmoid', kernel_initializer='zeros')))
+model.add(SimpleRNN(EMBEDDING_VECTOR_LENGTH, dropout=DROPOUT, recurrent_dropout=DROPOUT,
+activation='sigmoid', kernel_initializer='zeros'))
model.add(Dense(1, activation="sigmoid"))
optimizer = adam_v2.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
model.compile(
@@ -201,7 +179,7 @@ for k in range(len(DIMENSIONS)):
)
print(model.summary())
-### Cross-validation classification (individual posts)
+# Cross-validation classification (individual posts)
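# 6-fold cross-validation over the training posts; accuracy and a confusion matrix
# are written to a per-dimension text report.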
if CROSS_VALIDATION:
k_fold = KFold(n_splits=6)
scores_k = []
@@ -223,11 +201,11 @@ for k in range(len(DIMENSIONS)):
scores_k.append(score_k)
with open(
os.path.join(
DATA_DIR, "rnn_cross_validation_{}.txt".format(DIMENSIONS[k])
DATA_DIR, "SimpleRNN_cross_validation_{}.txt".format(DIMENSIONS[k])
),
"w", encoding="utf8") as f:
f.write(
"*** {}/{} TRAINING SET CROSS VALIDATION (POSTS) ***\n".format(
" {}/{} TRAINING SET CROSS VALIDATION (POSTS) \n".format(
DIMENSIONS[k][0], DIMENSIONS[k][1]
)
)
@@ -236,7 +214,7 @@ for k in range(len(DIMENSIONS)):
f.write("Confusion matrix: \n")
f.write(np.array2string(confusion_k, separator=", "))
-### Test set classification (individual posts)
+# Test set classification (individual posts)
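# Fit on the (shuffled, optionally sampled) training frame, then report accuracy and
# a confusion matrix on the held-out test posts.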
model.fit(
preprocess(df["x"].values),
df["y"].values,
@@ -247,10 +225,10 @@ for k in range(len(DIMENSIONS)):
confusion = confusion_matrix(y_test, predictions)
score = accuracy_score(y_test, predictions)
with open(
os.path.join(MODELS_DIR, "rnn_accuracy_{}.txt".format(DIMENSIONS[k])), "w"
os.path.join(MODELS_DIR, "SimpleRNN_accuracy_{}.txt".format(DIMENSIONS[k])), "w"
, encoding="utf8") as f:
f.write(
"*** {}/{} TEST SET CLASSIFICATION (POSTS) ***\n".format(
" {}/{} TEST SET CLASSIFICATION (POSTS) \n".format(
DIMENSIONS[k][0], DIMENSIONS[k][1]
)
)
@@ -259,10 +237,10 @@ for k in range(len(DIMENSIONS)):
f.write("Confusion matrix: \n")
f.write(np.array2string(confusion, separator=", "))
print(
f"\nWrote training / test results for {DIMENSIONS[k]} here: {os.path.join(MODELS_DIR, 'rnn_accuracy_{}.txt'.format(DIMENSIONS[k]))}\n"
f"\nWrote training / test results for {DIMENSIONS[k]} here: {os.path.join(MODELS_DIR, 'SimpleRNN_accuracy_{}.txt'.format(DIMENSIONS[k]))}\n"
)
-### Get most a-like/b-like sentences
+# Get most a-like/b-like sentences
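# Rank test posts by predicted probability and dump the NUM_EXTREME_EXAMPLES most
# characteristic posts for each pole (useful input for word clouds).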
if WORD_CLOUD:
NUM_EXTREME_EXAMPLES = 500
probs = model.predict(preprocess(x_test))
@@ -279,7 +257,7 @@ for k in range(len(DIMENSIONS)):
DATA_DIR, "extreme_examples_{}.txt".format(DIMENSIONS[k][0])
),
"w"
, encoding="utf8")as f:
, encoding="utf8")as f:
for prob, i in min_prob_indices:
# f.write(x_test[i]+'\n')
f.write(x_test[i] + "\n")
@@ -290,16 +268,16 @@ for k in range(len(DIMENSIONS)):
DATA_DIR, "extreme_examples_{}.txt".format(DIMENSIONS[k][1])
),
"w"
, encoding="utf8") as f:
, encoding="utf8") as f:
for prob, i in max_prob_indices:
# f.write(x_test[i]+'\n')
f.write(x_test[i] + "\n")
# f.write(str(prob)+'\n')
f.write("\n")
-### Save model and tokenizer for future use
-model.save(os.path.join(MODELS_DIR, "rnn_model_{}.h5".format(DIMENSIONS[k])))
+# Save model and tokenizer for future personality predictions
+model.save(os.path.join(MODELS_DIR, "SimpleRNN_model_{}.h5".format(DIMENSIONS[k])))
with open(
os.path.join(MODELS_DIR, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "wb"
) as f:
os.path.join(MODELS_DIR, "SimpleRNN_tokenizer_{}.pkl".format(DIMENSIONS[k])), "wb"
) as f:
pickle.dump(tokenizer, f, protocol=pickle.HIGHEST_PROTOCOL)
@@ -5,8 +5,8 @@ import csv
import re
DATA_DIR = "data"
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean.csv")
DATA_DIRECTORY = "data"
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_clean.csv")
DIMENSIONS = ("IE", "NS", "TF", "PJ")
@@ -29,7 +29,7 @@ for dimension in DIMENSIONS:
continue
posts.append(post)
-test_csv_path = os.path.join(DATA_DIR, f"test_{letter}.csv")
+test_csv_path = os.path.join(DATA_DIRECTORY, f"test_{letter}.csv")
with open(test_csv_path, "w", encoding="utf-8") as f:
writer = csv.writer(f)
for post in posts:
@@ -5,8 +5,8 @@ import csv
import re
DATA_DIR = "data"
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_unclean.csv")
DATA_DIRECTORY = "data"
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_unclean.csv")
DIMENSIONS = ("IE", "NS", "TF", "PJ")
@@ -57,7 +57,7 @@ for dimension in DIMENSIONS:
posts.append(post)
i += 1
-train_csv_path = os.path.join(DATA_DIR, f"train_{letter}.csv")
+train_csv_path = os.path.join(DATA_DIRECTORY, f"train_{letter}.csv")
with open(train_csv_path, "w", encoding="utf-8") as f:
writer = csv.writer(f)
for post in posts:
-import os
import csv
+import os
import pickle
-import collections
import numpy as np
-from nltk import word_tokenize
-from nltk.stem import WordNetLemmatizer
-from nltk.corpus import stopwords
-from keras.preprocessing import sequence
-from keras.preprocessing import text
from keras.models import load_model
+from keras.preprocessing import sequence
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
MODELS_DIR = "models"
DATA_DIR = "data"
TRUMP_TWEETS_PATH = os.path.join(DATA_DIR, "trumptweets.csv")
MODELS_DIRECTORY = "models"
DATA_DIRECTORY = "data/sample_data"
SAMPLE_TWEETS_PATH = os.path.join(DATA_DIRECTORY, "0xnickrodriguez_tweets.csv")
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
DIMENSIONS_with_strings = ["Introversion Extroversion", "Intuition Sensing", "Feeling Thinking", "Perceiving Judging"]
MODEL_BATCH_SIZE = 128
TOP_WORDS = 2500
MAX_POST_LENGTH = 40
@@ -24,7 +22,7 @@ EMBEDDING_VECTOR_LENGTH = 20
final = ""
x_test = []
-with open(TRUMP_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
+with open(SAMPLE_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
reader = csv.reader(f)
for row in f:
x_test.append(row)
@@ -71,27 +69,34 @@ def lemmatize(x):
for k in range(len(DIMENSIONS)):
model = load_model(
os.path.join(MODELS_DIR, "rnn_model_{}.h5".format(DIMENSIONS[k]))
os.path.join(MODELS_DIRECTORY, "rnn_model_{}.h5".format(DIMENSIONS[k]))
)
tokenizer = None
with open(
os.path.join(MODELS_DIR, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "rb"
os.path.join(MODELS_DIRECTORY, "rnn_tokenizer_{}.pkl".format(DIMENSIONS[k])), "rb"
) as f:
tokenizer = pickle.load(f)
def preprocess(x):
lemmatized = lemmatize(x)
tokenized = tokenizer.texts_to_sequences(lemmatized)
return sequence.pad_sequences(tokenized, maxlen=MAX_POST_LENGTH)
predictions = model.predict(preprocess(x_test))
prediction = float(sum(predictions) / len(predictions))
-print(DIMENSIONS[k])
+print(DIMENSIONS_with_strings[k])
print(prediction)
if prediction >= 0.5:
final += DIMENSIONS[k][1]
print("Personality type - ", DIMENSIONS[k][1])
else:
final += DIMENSIONS[k][0]
print("Personality type - ", DIMENSIONS[k][0])
print("")
print("")
print("Final prediction: {}".format(final))
print("Personality Type of the Person : {} ".format(final))
@@ -4,10 +4,10 @@ import pandas as pd
import csv
DATA_DIR = "data"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIR, "mbti_personality.csv")
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_clean.csv")
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIR, "mbti_unclean.csv")
DATA_DIRECTORY = "data"
MBTI_RAW_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality.csv")
MBTI_CLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality_clean.csv")
MBTI_UNCLEAN_CSV_PATH = os.path.join(DATA_DIRECTORY, "mbti_personality_unclean.csv")
MBTI_TO_FREQUENCY_DICT = {
"ISTJ": 0.11,
"ISFJ": 0.09,