Commit bfbfadc6 authored by Amuthini

source files updated

parent 9606323e
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master.iml" filepath="$PROJECT_DIR$/.idea/Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master.iml" />
</modules>
</component>
</project>
\ No newline at end of file
.DS_Store
!data/.gitkeep
!models/.gitkeep
/__pycache__
/data
/models
/venv
/pipe
/.idea
\ No newline at end of file
- repo: https://github.com/psf/black
rev: 20.8b1 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
- id: black
language_version: python3 # Should be a command that runs python3.6+
\ No newline at end of file
# AdaBoost classifier hyperparameter tuning
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from preprocessor import pre_process_data
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
#data = data.join(data.apply(lambda row: get_types(row), axis=1))
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
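# max_features=1500 keeps only the 1500 most frequent terms in the vocabulary;
# max_df=0.7 drops terms that appear in more than 70% of documents;
# min_df=0.1 drops terms that appear in fewer than 10% of documents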
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
X = list_posts
# train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# to improve performance, a support vector classifier can be used as the base_estimator
# svc = SVC(probability=True, kernel='linear')
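# a sketch of that alternative (not used here; requires `from sklearn.svm import SVC`,
# and base_estimator is the parameter name in scikit-learn 0.24):
#   abc = AdaBoostClassifier(base_estimator=SVC(probability=True, kernel='linear'), n_estimators=50)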
# Create AdaBoost classifier object with default parameters
# for AdaBoost the default base_estimator is a decision tree
# n_estimators - number of weak learners to train iteratively
# learning_rate - the contribution of each model to the ensemble weights
abc = AdaBoostClassifier()
pipe = Pipeline([("Tfidf", Tfidf), ("abc", abc)])
# Define our search space for randomized search
search_space = [
{
'abc__n_estimators': [50, 100, 300, 500],
'abc__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0]
}
]
# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)  # 10-fold cross validation with shuffling
# AUC and accuracy as score
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
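# both scorers are evaluated on every fold; refit='AUC' below picks the best
# parameters by AUC and refits that pipeline on the whole training split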
# Define Randomized Search
grid = RandomizedSearchCV(
pipe,
param_distributions=search_space,
cv=kfold,
scoring=scoring,
refit='AUC',
verbose=1,
n_jobs=-1
)
# for param in grid.get_params().keys():
# print(param)
# Fit grid search
model = grid.fit(X_train, y_train)
# The model scores and confusion matrix can be obtained by
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test, predict))
# print the best parameters
print(model.best_params_)
# Train and test the dataset on the AdaBoost classifier
import os
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
from future.moves import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back
from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
data = data.join(data.apply(lambda row: get_types(row), axis=1))
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
X = list_posts
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# Create AdaBoost classifier object with the tuned parameters
# for AdaBoost the default base_estimator is a decision tree
# n_estimators - number of weak learners to train iteratively
# learning_rate - the contribution of each model to the ensemble weights
abc = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
pipe = Pipeline([("Tfidf", Tfidf), ("abc", abc)])
pipe.fit(X_train, y_train)
with open (os.path.join(MODELS_DIR, "Adaboost_pipeline_{}.pkl".format(DIMENSIONS[l])), 'wb') as picklefile:
pickle.dump(pipe, picklefile)
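# the pickled pipeline bundles the fitted TF-IDF vectorizer and the classifier,
# so it can later be reloaded with pickle.load(...) and applied to raw post text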
# make predictions for test data
y_pred = pipe.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))
IE: Introversion (I) - Extroversion (E) ...
Best AUC Score: 0.803667
Accuracy: 0.7285539643730353
[[2229 0]
[ 634 0]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 500}
NS: Intuition (N) – Sensing (S) ...
Best AUC Score: 0.6727666369367796
Accuracy: 0.8046946929265
[[2431 32]
[ 384 16]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 300}
FT: Feeling (F) - Thinking (T) ...
Best AUC Score: 0.75395340936081
Accuracy: 0.72895568376202319
[[1199 355]
[ 421 888]]
{'abc__learning_rate': 0.01, 'abc__n_estimators': 500}
JP: Judging (J) – Perceiving (P) ...
Best AUC Score: 0.6638994402640133
Accuracy: 0.6521131680055885
[[ 252 867]
[ 129 1615]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 500}
IE: Introversion (I) - Extroversion (E) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5142753030133241
Accuracy: 0.762486901851205
[[2179 10]
[ 670 4]]
{'sgd__alpha': 0.0003238897879211981, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
NS: Intuition (N) – Sensing (S) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.516264861572238
Accuracy: 0.8620328326929794
[[2468 0]
[ 395 0]]
{'sgd__alpha': 0.0006842577234824017, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
FT: Feeling (F) - Thinking (T) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5050005967000348
Accuracy: 0.5036674816625917
[[858 695]
[726 584]]
{'sgd__alpha': 0.0015350240019106492, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
JP: Judging (J) – Perceiving (P) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5209168542372503
Accuracy: 0.5277680754453371
[[ 335 785]
[ 567 1176]]
{'sgd__alpha': 0.0008934896440956289, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
sgd
IE: Introversion (I) - Extroversion (E) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5292548647534668
Accuracy: 0.7740132727907789
[[2216 0]
[ 647 0]]
{'sgd__alpha': 0.0009265019438562898, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
NS: Intuition (N) – Sensing (S) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5426211797685075
Accuracy: 0.857492141110723
[[2455 1]
[ 407 0]]
{'sgd__alpha': 0.0011441798336083461, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l2'}
FT: Feeling (F) - Thinking (T) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5
Accuracy: 0.5312609151239958
[[1521 0]
[1342 0]]
{'sgd__alpha': 0.0019410296620838965, 'sgd__loss': 'hinge', 'sgd__penalty': 'l1'}
JP: Judging (J) – Perceiving (P) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.4989554047081358
Accuracy: 0.5302130632203982
[[ 336 814]
[ 531 1182]]
{'sgd__alpha': 0.0004231316730058021, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
# Train and test the dataset on the XGBoost classifier
import os
import re
import numpy as np
import pandas as pd
from future.moves import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
print("CountVectorizer...")
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
X = list_posts
# setup parameters for xgboost
param = {}
param['n_estimators'] = 150 # 200
param['max_depth'] = 3 # 2
#param['nthread'] = 8
param['learning_rate'] = 0.01
param['gamma'] = 0.1
# param['xgb__colsample_bytree'] = 0.1
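# gamma is the minimum loss reduction required to make a further split on a leaf node;
# colsample_bytree (commented out above) is the fraction of features sampled per tree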
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
xgb = XGBClassifier(**param)
pipe = Pipeline([("Tfidf", Tfidf), ("xgb", xgb)])
pipe.fit(X_train, y_train)
with open (os.path.join(MODELS_DIR, "xgb_pipeline_{}.pkl".format(DIMENSIONS[l])), 'wb') as picklefile:
pickle.dump(pipe, picklefile)
# make predictions for test data
y_pred = pipe.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))
IE: Introversion (I) - Extroversion (E) ...
Best AUC Score: 0.677028682166271
Accuracy: 0.7785539643730353
[[2229 0]
[ 634 0]]
{'xgb__n_estimators': 200, 'xgb__max_depth': 6, 'xgb__learning_rate': 0.01, 'xgb__gamma': 0.2, 'xgb__colsample_bytree': 0.1}
NS: Intuition (N) – Sensing (S) ...
Best AUC Score: 0.6527666346929265
Accuracy: 0.854697869367796
[[2431 32]
[ 384 16]]
{'xgb__n_estimators': 150, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.3, 'xgb__gamma': 0.2, 'xgb__colsample_bytree': 0.2}
FT: Feeling (F) - Thinking (T) ...
Best AUC Score: 0.8139538376202319
Accuracy: 0.728955640936081
[[1199 355]
[ 421 888]]
{'xgb__n_estimators': 150, 'xgb__max_depth': 4, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.1, 'xgb__colsample_bytree': 0.1}
JP: Judging (J) – Perceiving (P) ...
Best AUC Score: 0.6638994402640133
Accuracy: 0.6521131680055885
[[ 252 867]
[ 129 1615]]
{'xgb__n_estimators': 50, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.0, 'xgb__colsample_bytree': 0.2}
# Code to extract a user's tweets using their Twitter username
import sys
import csv
import tweepy
# Get your Twitter API credentials and enter them here
consumer_key = "6c5V3cQ1pj0DOiMgZ5znPZHDR"
consumer_secret = "yVky0cSsTYrs0Lge39pLwFSggH7Fan8ibtMz4fu10mAoZk9AA8"
access_key = "867670407599579137-8Ap6KyTTvlTiOI2xlMSIp8uFVlDMxHG"
access_secret = "HWchPQN4C3xJ3U6eWzCDovfsRD1dBbPh4z6ir4AuIzxYU"
# method to get a user's last tweets
def get_tweets(username):
# http://tweepy.readthedocs.org/en/v3.1.0/getting_started.html#api
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# set count to however many tweets you want
number_of_tweets = 3000
user = api.get_user(username)
# get tweets
tweets_for_csv = []
# tweets_for_csv.append([username,user.name,user.description,user.followers_count,user.listed_count,user.friends_count])
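# tweepy.Cursor pages through api.user_timeline until number_of_tweets items
# have been yielded or the user's available timeline is exhausted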
for tweet in tweepy.Cursor(api.user_timeline, screen_name=username).items(number_of_tweets):
# create array of tweet information: username, tweet id, date/time, text
tweets_for_csv.append(
[username, user.name, user.description, user.followers_count, user.listed_count, user.friends_count,
tweet.id_str, tweet.created_at, (tweet.text).encode("utf-8")])
# write to a new csv file from the array of tweets
outfile = username + "_tweets.csv"
print("writing to " + outfile)
with open(outfile, 'w', encoding='utf-8') as file:
# with file:
# identifying header
# header = ['user_name','user','description','no_list','no_followers','no_friends','tweet_id','tweet_date','tweet_text']
# writer = csv.writer(file,fieldnames = header, delimiter = ',')
writer = csv.writer(file, delimiter=',')
# writer.writerow(['user_name','user','description','no_list','no_followers','no_friends','tweet_id','tweet_date','tweet_text'])
writer.writerow(['tweet_text'])
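# NOTE: the header above names a single 'tweet_text' column, while each row written
# below carries nine fields (user metadata, tweet id, date and text)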
writer.writerows(tweets_for_csv)
# if we're running this as a script
if __name__ == '__main__':
# get tweets for username passed at command line
# if len(sys.argv) == 2:
# get_tweets(sys.argv[1])
# else:
# print "Error: enter one username"
# alternative method: loop through multiple users
users = ['mayweather_gh', 'KhuthTradingWay', 'PistisMakasi', 'TUt3YwYDORjfgw2', 'koaung448', 'hikuto_e', 'Gmoncsc',
'KRISHAN17328009', 'mtvc36112', 'Charles46762537', 'kenanyildirimky', 'Santanu57201363', 'GiorgianStrejo1',
'UsharaniSharm11', 'SchlangerAndre1', 'RajKumarChadha8',
'ChikaMartinO1', 'Gargipal18']
for user in users:
get_tweets(user)
# Predict the personality type from a candidate's tweets.
import csv
import os
import pandas as pd
from future.moves import pickle
from preprocessor import pre_process_data
from preprocessor import translate_back
SAVE_MODEL = True
MODELS_DIR = "models"
SAMPLE_DATA_DIRECTORY = "data/sample_data"
SAMPLE_TWEETS_PATH = os.path.join(SAMPLE_DATA_DIRECTORY, "apihandyman_tweets.csv")
x_test = ""
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
with open(SAMPLE_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
reader = csv.reader(f)
for row in f:
# x_test.append(row)
x_test = x_test + " " + row
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
# The type is just a dummy so that the data preprocessing function can be reused
mydata = pd.DataFrame(data={'type': ['ENFP'], 'posts': x_test})
x_test, dummy = pre_process_data(mydata, remove_stop_words=True)
result = []
# train type indicator individually
# for l in range(len(type_indicators)):
for k in range(len(DIMENSIONS)):
print("%s ..." % (DIMENSIONS[k]))
with open(os.path.join(MODELS_DIR, "sgd_pipeline_{}.pkl".format(DIMENSIONS[k])), 'rb') as picklefile:
saved_pipe = pickle.load(picklefile)
predictions = saved_pipe.predict(x_test)
# make predictions for my data
result.append(predictions[0])
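# each pipeline predicts 0/1 for its own dimension; translate_back maps the four
# binary predictions back to a four-letter MBTI string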
print("The result is: ", translate_back(result))
# Code to preprocess the user's text posts
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
# data = pd.read_csv('data/mbti_personality.csv')
# add 4 columns for personality type indicators
# def get_types(row):
# t = row['type']
#
# I = 0
# N = 0
# T = 0
# J = 0
#
# if t[0] == 'I':
# I = 1
# elif t[0] == 'E':
# I = 0
# else:
# print('I-E incorrect')
#
# if t[1] == 'N':
# N = 1
# elif t[1] == 'S':
# N = 0
# else:
# print('N-S incorrect')
#
# if t[2] == 'T':
# T = 1
# elif t[2] == 'F':
# T = 0
# else:
# print('T-F incorrect')
#
# if t[3] == 'J':
# J = 1
# elif t[3] == 'P':
# J = 0
# else:
# print('J-P incorrect')
# return pd.Series({'IE': I, 'NS': N, 'TF': T, 'JP': J})
# data = data.join(data.apply(lambda row: get_types(row), axis=1))
# print("Introversion (I) / Extroversion (E):\t", data['IE'].value_counts()[0], " / ", data['IE'].value_counts()[1])
# print("Intuition (N) – Sensing (S):\t\t", data['NS'].value_counts()[0], " / ", data['NS'].value_counts()[1])
# print("Thinking (T) – Feeling (F):\t\t", data['TF'].value_counts()[0], " / ", data['TF'].value_counts()[1])
# print("Judging (J) – Perceiving (P):\t\t", data['JP'].value_counts()[0], " / ", data['JP'].value_counts()[1])
b_Pers = {'I': 0, 'E': 1, 'N': 0, 'S': 1, 'F': 0, 'T': 1, 'J': 0, 'P': 1}
b_Pers_list = [{0: 'I', 1: 'E'}, {0: 'N', 1: 'S'}, {0: 'F', 1: 'T'}, {0: 'J', 1: 'P'}]
def translate_personality(personality):
# transform mbti to binary vector
return [b_Pers[l] for l in personality]
def translate_back(personality):
# transform binary vector to mbti personality
s = ""
for i, l in enumerate(personality):
s += b_Pers_list[i][l]
return s
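# e.g. translate_personality("ENTP") == [1, 0, 1, 1] and translate_back([1, 0, 1, 1]) == "ENTP"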
# To remove the personality type words from the posts
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
unique_type_list = [x.lower() for x in unique_type_list]
# Lemmatize
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
# Cache the stop words for speed
cachedStopWords = stopwords.words("english")
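# NOTE: the NLTK corpora must be available before this module is used,
# typically via nltk.download('stopwords') and nltk.download('wordnet')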
def pre_process_data(data, remove_stop_words=True, remove_mbti_profiles=True):
list_personality = []
list_posts = []
len_data = len(data)
i = 0
for row in data.iterrows():
i += 1
if i % 500 == 0 or i == 1 or i == len_data:
print("%s of %s rows" % (i, len_data))
# Remove and clean posts
posts = row[1].posts
# remove urls
temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', posts)
# remove non-alphabetic characters
temp = re.sub("[^a-zA-Z]", " ", temp)
# Collapse repeated spaces and convert to lowercase
temp = re.sub(' +', ' ', temp).lower()
# Remove stop words
if remove_stop_words:
temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in cachedStopWords])
else:
temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ')])
# Remove MBTI personality words from posts
if remove_mbti_profiles:
for t in unique_type_list:
temp = temp.replace(t, "")
# transform mbti to binary vector
type_labelized = translate_personality(row[1].type)
list_personality.append(type_labelized)
list_posts.append(temp)
list_posts = np.array(list_posts)
list_personality = np.array(list_personality)
return list_posts, list_personality
[tool.poetry]
name = "mbti-rnn"
version = "0.1.0"
description = ""
authors = ["Ian Scott Knight <isk@alumni.stanford.edu>"]
license = "MIT"
[tool.poetry.dependencies]
python = "^3.8"
scikit-learn = "^0.24.1"
nltk = "^3.5"
Keras = "^2.4.3"
pandas = "^1.2.1"
tensorflow = "^2.4.1"
[tool.poetry.dev-dependencies]
pre-commit = "^2.10.0"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
# SGDClassifier hyperparameter tuning
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
#data = data.join(data.apply(lambda row: get_types(row), axis=1))
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
X = list_posts
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# shuffle the posts and labels with the same permutation so they stay aligned
perm = np.random.permutation(len(list_posts))
X, Y = list_posts[perm], Y[perm]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# sgd = SGDClassifier(max_iter=10, tol=None)
sgd = SGDClassifier()
pipe = Pipeline([("Tfidf", Tfidf), ("sgd", sgd)])
# Define our search space for Randomized Search
search_space = [
{
'sgd__loss': ['hinge', 'modified_huber'],
'sgd__penalty': ['none', 'l2', 'l1'],
'sgd__alpha': scipy.stats.uniform(0.00005, 0.002)
}
]
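# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale], so alpha is
# drawn uniformly from [0.00005, 0.00205]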
# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)  # 10-fold cross validation with shuffling
# AUC and accuracy as score
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
# Define randomized search
grid = RandomizedSearchCV(
pipe,
param_distributions=search_space,
cv=kfold,
scoring=scoring,
refit='AUC',
verbose=1,
n_jobs=-1
)
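# RandomizedSearchCV samples n_iter=10 parameter candidates by default; with
# 10-fold CV that is the "10 candidates, totalling 100 fits" seen in the run logs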
# for param in grid.get_params().keys():
# print(param)
# Fit grid search
model = grid.fit(X_train, y_train)
# The model scores and confusion matrix can be obtained by
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test, predict))
# And the best parameters can be obtained by:
print(model.best_params_)
# Train and test the dataset on the SGDClassifier
import os
import pandas as pd
from future.moves import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
#data = data.join(data.apply(lambda row: get_types(row), axis=1))
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
X = list_posts
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# sgd = SGDClassifier(max_iter=10, tol=None,)
sgd = SGDClassifier(loss='modified_huber', penalty='l1', alpha=0.00032)
pipe = Pipeline([("Tfidf", Tfidf), ("sgd", sgd)])
pipe.fit(X_train, y_train)
with open (os.path.join(MODELS_DIR, "sgd_pipeline_{}.pkl".format(DIMENSIONS[l])), 'wb') as picklefile:
pickle.dump(pipe, picklefile)
# make predictions for test data
y_pred = pipe.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))
# XGBoost hyperparameter tuning
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, StratifiedKFold, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from preprocessor import pre_process_data
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
#data = data.join(data.apply(lambda row: get_types(row), axis=1))
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Posts in tf-idf representation
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
X = list_posts
# setup parameters for xgboost
param = {}
param['n_estimators'] = 200
param['max_depth'] = 2
param['nthread'] = 8
param['learning_rate'] = 0.2
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
#split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
xgb = XGBClassifier(use_label_encoder=False)
pipe = Pipeline([("Tfidf", Tfidf), ("xgb", xgb)])
# Define our search space for randomized search
search_space = [
{
'xgb__n_estimators': [50, 100, 150, 200],
'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],
'xgb__max_depth': range(3, 10),
'xgb__colsample_bytree': [i / 10.0 for i in range(1, 3)],
'xgb__gamma': [i / 10.0 for i in range(3)]
}
]
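# the list comprehensions expand to colsample_bytree candidates [0.1, 0.2]
# and gamma candidates [0.0, 0.1, 0.2]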
# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)  # 10-fold cross validation with shuffling
# AUC and accuracy as score
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
# Define randomized search
grid = RandomizedSearchCV(
pipe,
param_distributions=search_space,
cv=kfold,
scoring=scoring,
refit='AUC',
verbose=1,
n_jobs=-1
)
# for param in grid.get_params().keys():
# print(param)
# Fit grid search
model = grid.fit(X_train, y_train)
# The model scores and confusion matrix can be obtained by
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test, predict))
# And the best parameters can be obtained by:
print(model.best_params_)