Commit bfbfadc6 authored by Amuthini

source files updated

parent 9606323e
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master)" project-jdk-type="Python SDK" />
</project>
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master.iml" filepath="$PROJECT_DIR$/.idea/Predicting-Myers-Briggs-Type-Indicator-with-Recurrent-Neural-Networks-master.iml" />
</modules>
</component>
</project>
\ No newline at end of file
.DS_Store
!data/.gitkeep
!models/.gitkeep
/__pycache__
/data
/models
/venv
/pipe
/.idea
\ No newline at end of file
- repo: https://github.com/psf/black
rev: 20.8b1 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
- id: black
language_version: python3 # Should be a command that runs python3.6+
\ No newline at end of file
# AdaBoost classifier hyperparameter tuning
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from preprocessor import pre_process_data
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
#data = data.join(data.apply(lambda row: get_types(row), axis=1))
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
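# max_features=1500 keeps only the 1500 most frequent terms in the vocabulary;
# max_df=0.7 drops terms that appear in more than 70% of documents;
# min_df=0.1 drops terms that appear in fewer than 10% of documents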
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
X = list_posts
# train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# to improve performance, a support vector classifier can be used as the base_estimator
# svc = SVC(probability=True, kernel='linear')
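# a sketch of that alternative (not used here; requires `from sklearn.svm import SVC`,
# and base_estimator is the parameter name in scikit-learn 0.24):
#   abc = AdaBoostClassifier(base_estimator=SVC(probability=True, kernel='linear'), n_estimators=50)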
# Create AdaBoost classifier object with default parameters
# for AdaBoost the default base_estimator is a decision tree
# n_estimators - number of weak learners to train iteratively
# learning_rate - the contribution of each model to the ensemble weights
abc = AdaBoostClassifier()
pipe = Pipeline([("Tfidf", Tfidf), ("abc", abc)])
# Define our search space for randomized search
search_space = [
{
'abc__n_estimators': [50, 100, 300, 500],
'abc__learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0]
}
]
# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)  # 10-fold cross validation with shuffling
# AUC and accuracy as score
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
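# both scorers are evaluated on every fold; refit='AUC' below picks the best
# parameters by AUC and refits that pipeline on the whole training split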
# Define Randomized Search
grid = RandomizedSearchCV(
pipe,
param_distributions=search_space,
cv=kfold,
scoring=scoring,
refit='AUC',
verbose=1,
n_jobs=-1
)
# for param in grid.get_params().keys():
# print(param)
# Fit grid search
model = grid.fit(X_train, y_train)
# The model scores and confusion matrix can be obtained by
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test, predict))
# print the best parameters
print(model.best_params_)
# Train and test the dataset on the AdaBoost classifier
import os
from sklearn.ensemble import AdaBoostClassifier
import pandas as pd
from future.moves import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back
from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
data = data.join(data.apply(lambda row: get_types(row), axis=1))
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
X = list_posts
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# Create AdaBoost classifier object with the tuned parameters
# for AdaBoost the default base_estimator is a decision tree
# n_estimators - number of weak learners to train iteratively
# learning_rate - the contribution of each model to the ensemble weights
abc = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
pipe = Pipeline([("Tfidf", Tfidf), ("abc", abc)])
pipe.fit(X_train, y_train)
with open (os.path.join(MODELS_DIR, "Adaboost_pipeline_{}.pkl".format(DIMENSIONS[l])), 'wb') as picklefile:
pickle.dump(pipe, picklefile)
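# the pickled pipeline bundles the fitted TF-IDF vectorizer and the classifier,
# so it can later be reloaded with pickle.load(...) and applied to raw post text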
# make predictions for test data
y_pred = pipe.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))
IE: Introversion (I) - Extroversion (E) ...
Best AUC Score: 0.803667
Accuracy: 0.7285539643730353
[[2229 0]
[ 634 0]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 500}
NS: Intuition (N) – Sensing (S) ...
Best AUC Score: 0.6727666369367796
Accuracy: 0.8046946929265
[[2431 32]
[ 384 16]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 300}
FT: Feeling (F) - Thinking (T) ...
Best AUC Score: 0.75395340936081
Accuracy: 0.72895568376202319
[[1199 355]
[ 421 888]]
{'abc__learning_rate': 0.01, 'abc__n_estimators': 500}
JP: Judging (J) – Perceiving (P) ...
Best AUC Score: 0.6638994402640133
Accuracy: 0.6521131680055885
[[ 252 867]
[ 129 1615]]
{'abc__learning_rate': 0.1, 'abc__n_estimators': 500}
IE: Introversion (I) - Extroversion (E) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5142753030133241
Accuracy: 0.762486901851205
[[2179 10]
[ 670 4]]
{'sgd__alpha': 0.0003238897879211981, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
NS: Intuition (N) – Sensing (S) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.516264861572238
Accuracy: 0.8620328326929794
[[2468 0]
[ 395 0]]
{'sgd__alpha': 0.0006842577234824017, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
FT: Feeling (F) - Thinking (T) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5050005967000348
Accuracy: 0.5036674816625917
[[858 695]
[726 584]]
{'sgd__alpha': 0.0015350240019106492, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
JP: Judging (J) – Perceiving (P) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5209168542372503
Accuracy: 0.5277680754453371
[[ 335 785]
[ 567 1176]]
{'sgd__alpha': 0.0008934896440956289, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
sgd
IE: Introversion (I) - Extroversion (E) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5292548647534668
Accuracy: 0.7740132727907789
[[2216 0]
[ 647 0]]
{'sgd__alpha': 0.0009265019438562898, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l1'}
NS: Intuition (N) – Sensing (S) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5426211797685075
Accuracy: 0.857492141110723
[[2455 1]
[ 407 0]]
{'sgd__alpha': 0.0011441798336083461, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'l2'}
FT: Feeling (F) - Thinking (T) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.5
Accuracy: 0.5312609151239958
[[1521 0]
[1342 0]]
{'sgd__alpha': 0.0019410296620838965, 'sgd__loss': 'hinge', 'sgd__penalty': 'l1'}
JP: Judging (J) – Perceiving (P) ...
Fitting 10 folds for each of 10 candidates, totalling 100 fits
Best AUC Score: 0.4989554047081358
Accuracy: 0.5302130632203982
[[ 336 814]
[ 531 1182]]
{'sgd__alpha': 0.0004231316730058021, 'sgd__loss': 'modified_huber', 'sgd__penalty': 'none'}
# Train and test the dataset on the XGBoost classifier
import os
import re
import numpy as np
import pandas as pd
from future.moves import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
print("CountVectorizer...")
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
X = list_posts
# setup parameters for xgboost
param = {}
param['n_estimators'] = 150 # 200
param['max_depth'] = 3 # 2
#param['nthread'] = 8
param['learning_rate'] = 0.01
param['gamma'] = 0.1
# param['xgb__colsample_bytree'] = 0.1
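# gamma is the minimum loss reduction required to make a further split on a leaf node;
# colsample_bytree (commented out above) is the fraction of features sampled per tree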
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
xgb = XGBClassifier(**param)
pipe = Pipeline([("Tfidf", Tfidf), ("xgb", xgb)])
pipe.fit(X_train, y_train)
with open (os.path.join(MODELS_DIR, "xgb_pipeline_{}.pkl".format(DIMENSIONS[l])), 'wb') as picklefile:
pickle.dump(pipe, picklefile)
# make predictions for test data
y_pred = pipe.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))
IE: Introversion (I) - Extroversion (E) ...
Best AUC Score: 0.677028682166271
Accuracy: 0.7785539643730353
[[2229 0]
[ 634 0]]
{'xgb__n_estimators': 200, 'xgb__max_depth': 6, 'xgb__learning_rate': 0.01, 'xgb__gamma': 0.2, 'xgb__colsample_bytree': 0.1}
NS: Intuition (N) – Sensing (S) ...
Best AUC Score: 0.6527666346929265
Accuracy: 0.854697869367796
[[2431 32]
[ 384 16]]
{'xgb__n_estimators': 150, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.3, 'xgb__gamma': 0.2, 'xgb__colsample_bytree': 0.2}
FT: Feeling (F) - Thinking (T) ...
Best AUC Score: 0.8139538376202319
Accuracy: 0.728955640936081
[[1199 355]
[ 421 888]]
{'xgb__n_estimators': 150, 'xgb__max_depth': 4, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.1, 'xgb__colsample_bytree': 0.1}
JP: Judging (J) – Perceiving (P) ...
Best AUC Score: 0.6638994402640133
Accuracy: 0.6521131680055885
[[ 252 867]
[ 129 1615]]
{'xgb__n_estimators': 50, 'xgb__max_depth': 3, 'xgb__learning_rate': 0.1, 'xgb__gamma': 0.0, 'xgb__colsample_bytree': 0.2}
# Code to extract a user's tweets using their Twitter username
import sys
import csv
import tweepy
# Get your Twitter API credentials and enter them here
consumer_key = "6c5V3cQ1pj0DOiMgZ5znPZHDR"
consumer_secret = "yVky0cSsTYrs0Lge39pLwFSggH7Fan8ibtMz4fu10mAoZk9AA8"
access_key = "867670407599579137-8Ap6KyTTvlTiOI2xlMSIp8uFVlDMxHG"
access_secret = "HWchPQN4C3xJ3U6eWzCDovfsRD1dBbPh4z6ir4AuIzxYU"
# method to get a user's last tweets
def get_tweets(username):
# http://tweepy.readthedocs.org/en/v3.1.0/getting_started.html#api
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# set count to however many tweets you want
number_of_tweets = 3000
user = api.get_user(username)
# get tweets
tweets_for_csv = []
# tweets_for_csv.append([username,user.name,user.description,user.followers_count,user.listed_count,user.friends_count])
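# tweepy.Cursor pages through api.user_timeline until number_of_tweets items
# have been yielded or the user's available timeline is exhausted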
for tweet in tweepy.Cursor(api.user_timeline, screen_name=username).items(number_of_tweets):
# create array of tweet information: username, tweet id, date/time, text
tweets_for_csv.append(
[username, user.name, user.description, user.followers_count, user.listed_count, user.friends_count,
tweet.id_str, tweet.created_at, (tweet.text).encode("utf-8")])
# write to a new csv file from the array of tweets
outfile = username + "_tweets.csv"
print("writing to " + outfile)
with open(outfile, 'w', encoding='utf-8') as file:
# with file:
# identifying header
# header = ['user_name','user','description','no_list','no_followers','no_friends','tweet_id','tweet_date','tweet_text']
# writer = csv.writer(file,fieldnames = header, delimiter = ',')
writer = csv.writer(file, delimiter=',')
# writer.writerow(['user_name','user','description','no_list','no_followers','no_friends','tweet_id','tweet_date','tweet_text'])
writer.writerow(['tweet_text'])
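# NOTE: the header above names a single 'tweet_text' column, while each row written
# below carries nine fields (user metadata, tweet id, date and text)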
writer.writerows(tweets_for_csv)
# if we're running this as a script
if __name__ == '__main__':
# get tweets for username passed at command line
# if len(sys.argv) == 2:
# get_tweets(sys.argv[1])
# else:
# print "Error: enter one username"
# alternative method: loop through multiple users
users = ['mayweather_gh', 'KhuthTradingWay', 'PistisMakasi', 'TUt3YwYDORjfgw2', 'koaung448', 'hikuto_e', 'Gmoncsc',
'KRISHAN17328009', 'mtvc36112', 'Charles46762537', 'kenanyildirimky', 'Santanu57201363', 'GiorgianStrejo1',
'UsharaniSharm11', 'SchlangerAndre1', 'RajKumarChadha8',
'ChikaMartinO1', 'Gargipal18']
for user in users:
get_tweets(user)
# Predict the personality type from a candidate's tweets.
import csv
import os
import pandas as pd
from future.moves import pickle
from preprocessor import pre_process_data
from preprocessor import translate_back
SAVE_MODEL = True
MODELS_DIR = "models"
SAMPLE_DATA_DIRECTORY = "data/sample_data"
SAMPLE_TWEETS_PATH = os.path.join(SAMPLE_DATA_DIRECTORY, "apihandyman_tweets.csv")
x_test = ""
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
with open(SAMPLE_TWEETS_PATH, "r", encoding="ISO-8859-1") as f:
reader = csv.reader(f)
for row in f:
# x_test.append(row)
x_test = x_test + " " + row
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
# The type is just a dummy so that the data preprocessing function can be reused
mydata = pd.DataFrame(data={'type': ['ENFP'], 'posts': x_test})
x_test, dummy = pre_process_data(mydata, remove_stop_words=True)
result = []
# train type indicator individually
# for l in range(len(type_indicators)):
for k in range(len(DIMENSIONS)):
print("%s ..." % (DIMENSIONS[k]))
with open(os.path.join(MODELS_DIR, "sgd_pipeline_{}.pkl".format(DIMENSIONS[k])), 'rb') as picklefile:
saved_pipe = pickle.load(picklefile)
predictions = saved_pipe.predict(x_test)
# make predictions for my data
result.append(predictions[0])
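# each pipeline predicts 0/1 for its own dimension; translate_back maps the four
# binary predictions back to a four-letter MBTI string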
print("The result is: ", translate_back(result))
# Code to preprocess the user's text posts
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
# data = pd.read_csv('data/mbti_personality.csv')
# add 4 columns for personality type indicators
# def get_types(row):
# t = row['type']
#
# I = 0
# N = 0
# T = 0
# J = 0
#
# if t[0] == 'I':
# I = 1
# elif t[0] == 'E':
# I = 0
# else:
# print('I-E incorrect')
#
# if t[1] == 'N':
# N = 1
# elif t[1] == 'S':
# N = 0
# else:
# print('N-S incorrect')
#
# if t[2] == 'T':
# T = 1
# elif t[2] == 'F':
# T = 0
# else:
# print('T-F incorrect')
#
# if t[3] == 'J':
# J = 1
# elif t[3] == 'P':
# J = 0
# else:
# print('J-P incorrect')
# return pd.Series({'IE': I, 'NS': N, 'TF': T, 'JP': J})
# data = data.join(data.apply(lambda row: get_types(row), axis=1))
# print("Introversion (I) / Extroversion (E):\t", data['IE'].value_counts()[0], " / ", data['IE'].value_counts()[1])
# print("Intuition (N) – Sensing (S):\t\t", data['NS'].value_counts()[0], " / ", data['NS'].value_counts()[1])
# print("Thinking (T) – Feeling (F):\t\t", data['TF'].value_counts()[0], " / ", data['TF'].value_counts()[1])
# print("Judging (J) – Perceiving (P):\t\t", data['JP'].value_counts()[0], " / ", data['JP'].value_counts()[1])
b_Pers = {'I': 0, 'E': 1, 'N': 0, 'S': 1, 'F': 0, 'T': 1, 'J': 0, 'P': 1}
b_Pers_list = [{0: 'I', 1: 'E'}, {0: 'N', 1: 'S'}, {0: 'F', 1: 'T'}, {0: 'J', 1: 'P'}]
def translate_personality(personality):
# transform mbti to binary vector
return [b_Pers[l] for l in personality]
def translate_back(personality):
# transform binary vector to mbti personality
s = ""
for i, l in enumerate(personality):
s += b_Pers_list[i][l]
return s
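# e.g. translate_personality("ENTP") == [1, 0, 1, 1] and translate_back([1, 0, 1, 1]) == "ENTP"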
# To remove the personality type words from the posts
unique_type_list = ['INFJ', 'ENTP', 'INTP', 'INTJ', 'ENTJ', 'ENFJ', 'INFP', 'ENFP',
'ISFP', 'ISTP', 'ISFJ', 'ISTJ', 'ESTP', 'ESFP', 'ESTJ', 'ESFJ']
unique_type_list = [x.lower() for x in unique_type_list]
# Lemmatize
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
# Cache the stop words for speed
cachedStopWords = stopwords.words("english")
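# NOTE: the NLTK corpora must be available before this module is used,
# typically via nltk.download('stopwords') and nltk.download('wordnet')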
def pre_process_data(data, remove_stop_words=True, remove_mbti_profiles=True):
list_personality = []
list_posts = []
len_data = len(data)
i = 0
for row in data.iterrows():
i += 1
if i % 500 == 0 or i == 1 or i == len_data:
print("%s of %s rows" % (i, len_data))
# Remove and clean posts
posts = row[1].posts
# remove urls
temp = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', posts)
# remove non-alphabetic characters
temp = re.sub("[^a-zA-Z]", " ", temp)
# Collapse repeated spaces and convert to lowercase
temp = re.sub(' +', ' ', temp).lower()
# Remove stop words
if remove_stop_words:
temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ') if w not in cachedStopWords])
else:
temp = " ".join([lemmatiser.lemmatize(w) for w in temp.split(' ')])
# Remove MBTI personality words from posts
if remove_mbti_profiles:
for t in unique_type_list:
temp = temp.replace(t, "")
# transform mbti to binary vector
type_labelized = translate_personality(row[1].type)
list_personality.append(type_labelized)
list_posts.append(temp)
list_posts = np.array(list_posts)
list_personality = np.array(list_personality)
return list_posts, list_personality
[tool.poetry]
name = "mbti-rnn"
version = "0.1.0"
description = ""
authors = ["Ian Scott Knight <isk@alumni.stanford.edu>"]
license = "MIT"
[tool.poetry.dependencies]
python = "^3.8"
scikit-learn = "^0.24.1"
nltk = "^3.5"
Keras = "^2.4.3"
pandas = "^1.2.1"
tensorflow = "^2.4.1"
[tool.poetry.dev-dependencies]
pre-commit = "^2.10.0"
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
# SGDClassifier hyperparameter tuning
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
#data = data.join(data.apply(lambda row: get_types(row), axis=1))
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
X = list_posts
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# shuffle the posts and labels with the same permutation so they stay aligned
perm = np.random.permutation(len(list_posts))
X, Y = list_posts[perm], Y[perm]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# sgd = SGDClassifier(max_iter=10, tol=None)
sgd = SGDClassifier()
pipe = Pipeline([("Tfidf", Tfidf), ("sgd", sgd)])
# Define our search space for Randomized Search
search_space = [
{
'sgd__loss': ['hinge', 'modified_huber'],
'sgd__penalty': ['none', 'l2', 'l1'],
'sgd__alpha': scipy.stats.uniform(0.00005, 0.002)
}
]
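# scipy.stats.uniform(loc, scale) samples from [loc, loc + scale], so alpha is
# drawn uniformly from [0.00005, 0.00205]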
# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)  # 10-fold cross validation with shuffling
# AUC and accuracy as score
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
# Define randomized search
grid = RandomizedSearchCV(
pipe,
param_distributions=search_space,
cv=kfold,
scoring=scoring,
refit='AUC',
verbose=1,
n_jobs=-1
)
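# RandomizedSearchCV samples n_iter=10 parameter candidates by default; with
# 10-fold CV that is the "10 candidates, totalling 100 fits" seen in the run logs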
# for param in grid.get_params().keys():
# print(param)
# Fit grid search
model = grid.fit(X_train, y_train)
# The model scores and confusion matrix can be obtained by
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test, predict))
# And the best parameters can be obtained by:
print(model.best_params_)
# Train and test the dataset on the SGDClassifier
import os
import pandas as pd
from future.moves import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from preprocessor import pre_process_data
from preprocessor import translate_personality
from preprocessor import translate_back
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
#data = data.join(data.apply(lambda row: get_types(row), axis=1))
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Learn the vocabulary dictionary and return term-document matrix
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
X = list_posts
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
# split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
# sgd = SGDClassifier(max_iter=10, tol=None,)
sgd = SGDClassifier(loss='modified_huber', penalty='l1', alpha=0.00032)
pipe = Pipeline([("Tfidf", Tfidf), ("sgd", sgd)])
pipe.fit(X_train, y_train)
with open (os.path.join(MODELS_DIR, "sgd_pipeline_{}.pkl".format(DIMENSIONS[l])), 'wb') as picklefile:
pickle.dump(pipe, picklefile)
# make predictions for test data
y_pred = pipe.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))
# XGBoost hyperparameter tuning
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, make_scorer, confusion_matrix
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, StratifiedKFold, RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from preprocessor import pre_process_data
#from preprocessor import get_types
SAVE_MODEL = True
MODELS_DIR = "models"
DATA_DIR = "data"
DIMENSIONS = ["IE", "NS", "FT", "PJ"]
data = pd.read_csv('data/mbti_personality.csv')
#data = data.join(data.apply(lambda row: get_types(row), axis=1))
type_indicators = ["IE: Introversion (I) - Extroversion (E)", "NS: Intuition (N) – Sensing (S)",
"FT: Feeling (F) - Thinking (T)", "JP: Judging (J) – Perceiving (P)"]
list_posts, list_personality = pre_process_data(data, remove_stop_words=True)
# Posts in tf-idf representation
Tfidf = TfidfVectorizer(analyzer="word",
max_features=1500,
tokenizer=None,
preprocessor=None,
stop_words=None,
max_df=0.7,
min_df=0.1)
X = list_posts
# setup parameters for xgboost
param = {}
param['n_estimators'] = 200
param['max_depth'] = 2
param['nthread'] = 8
param['learning_rate'] = 0.2
# Let's train type indicator individually
for l in range(len(DIMENSIONS)):
print("%s ..." % (type_indicators[l]))
# Let's train type indicator individually
Y = list_personality[:, l]
#split data into train and test sets
seed = 7
test_size = 0.33
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
xgb = XGBClassifier(use_label_encoder=False)
pipe = Pipeline([("Tfidf", Tfidf), ("xgb", xgb)])
# Define our search space for randomized search
search_space = [
{
'xgb__n_estimators': [50, 100, 150, 200],
'xgb__learning_rate': [0.01, 0.1, 0.2, 0.3],
'xgb__max_depth': range(3, 10),
'xgb__colsample_bytree': [i / 10.0 for i in range(1, 3)],
'xgb__gamma': [i / 10.0 for i in range(3)]
}
]
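# the list comprehensions expand to colsample_bytree candidates [0.1, 0.2]
# and gamma candidates [0.0, 0.1, 0.2]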
# Define cross validation
kfold = KFold(n_splits=10, random_state=42, shuffle=True)  # 10-fold cross validation with shuffling
# AUC and accuracy as score
scoring = {'AUC': 'roc_auc', 'Accuracy': make_scorer(accuracy_score)}
# Define randomized search
grid = RandomizedSearchCV(
pipe,
param_distributions=search_space,
cv=kfold,
scoring=scoring,
refit='AUC',
verbose=1,
n_jobs=-1
)
# for param in grid.get_params().keys():
# print(param)
# Fit grid search
model = grid.fit(X_train, y_train)
# The model scores and confusion matrix can be obtained by
predict = model.predict(X_test)
print('Best AUC Score: {}'.format(model.best_score_))
print('Accuracy: {}'.format(accuracy_score(y_test, predict)))
print(confusion_matrix(y_test, predict))
# And the best parameters can be obtained by:
print(model.best_params_)