Commit 5ed12c09 authored by Shalitha Deshan Jayasekara

Merge branch 'IT18150926_jayasekaraA.P.S.D' into 'master'

Add completed model

See merge request !7
parents a081caf8 e0c8090b
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# code editor file
.idea
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
import nltk, re, string, random
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from flask import Flask
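# Note: the NLTK resources used below (twitter_samples, punkt, wordnet,
# averaged_perceptron_tagger, stopwords) need to be downloaded once via
# nltk.download(...), as done in the Flask version of this script further below.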
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
stop_words = stopwords.words('english')
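# Example (see also the annotated Flask version below):
# print(remove_noise(tweet_tokens[0], stop_words))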
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
def get_words_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
positive_dataset = [(tweet_dict, "Positive")
for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative")
for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
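# Optional sanity check on the held-out split (done explicitly in the Flask version below):
# print("Accuracy is:", classify.accuracy(classifier, test_data))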
def analyzePost(title, content, reviews):
    # Score a post: the title sentiment is worth +/-2 points, the content sentiment +/-5,
    # and each review contributes 1 point for existing plus +/-1 for its sentiment.
    points = 0
    title_tokens = remove_noise(word_tokenize(title))
    titleSentiment = classifier.classify(dict([token, True] for token in title_tokens))
    if titleSentiment == "Positive":
        points += 2
    else:
        points -= 2
    content_tokens = remove_noise(word_tokenize(content))
    contentSentiment = classifier.classify(dict([token, True] for token in content_tokens))
    if contentSentiment == "Positive":
        points += 5
    else:
        points -= 5
    points += len(reviews)
    for review in reviews:
        review_tokens = remove_noise(word_tokenize(review))
        reviewSentiment = classifier.classify(dict([token, True] for token in review_tokens))
        if reviewSentiment == "Positive":
            points += 1
        else:
            points -= 1
    return points
# print(analyzePost(
# "Health",
# "Why Do Dogs Eat Grass?", [
# "Some vets believe dogs eat grass because they’re bored, stressed, anxious, or upset about something. Some dogs are more likely to eat grass when they believe they’re alone in the backyard, which contributes to the idea that they are unhappy when they do so.",
# "Some vets also believe dogs eat grass because it gets their owners’ attention, which is something they want. Even if they’re being told to stop doing something, dogs perceive this as attention, and it’s good enough for many of them."
# ]))
print(analyzePost(
    "good",
    "good", [
        "very",
    ]))
import nltk, re, string, random
# Downloading words data
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
# Importing required libraries and dependencies
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from flask import Flask
from flask import request
app = Flask(__name__)
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
# print(tweet_tokens[0])
# print(pos_tag(tweet_tokens[0]))
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    # Tag words with the NLTK POS tagger: https://www.nltk.org/book/ch05.html
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence
# print(tweet_tokens[0])
# print(lemmatize_sentence(tweet_tokens[0]))
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    # Remove URLs, @mentions and other unnecessary tokens (noise)
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        # WordNet lemmatizer: https://www.programcreek.com/python/example/81649/nltk.WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        # Keep the lemmatized token only if it is not punctuation and not a stop word
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
# Starting NLP
stop_words = stopwords.words('english')
# print(remove_noise(tweet_tokens[0], stop_words))
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
# Remove noise with above function remove_noise()
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
# print(positive_tweet_tokens[500])
# print(positive_cleaned_tokens_list[500])
# Flatten the list of token lists into a single stream of tokens
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
# print(freq_dist_pos.most_common(10))
def get_words_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)
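# Each tweet becomes a feature dictionary of the form {token: True}, which is the
# featureset format NLTK's NaiveBayesClassifier expects for training and classification.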
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
positive_dataset = [(tweet_dict, "Positive")
for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative")
for tweet_dict in negative_tokens_for_model]
# Create a single data set with both negative and positive data sets prepared
dataset = positive_dataset + negative_dataset
# Shuffle the data set so positive and negative examples are mixed before splitting into train and test data
random.shuffle(dataset)
# Split whole data set into train and test data
train_data = dataset[:7000]
test_data = dataset[7000:]
# train the NaiveBayesClassifier model with train_data
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
# print(classifier.show_most_informative_features(10))
# custom_text = "This is a bad supplier."
# custom_text = "This is a very good supplier."
# custom_text = "This is a very good supplier. but there was some delay in shipping."
custom_text = "This is a very good supplier. but there was some delay in shipping. but it is okay."
custom_tokens = remove_noise(word_tokenize(custom_text))
# Test print
print(classifier.classify(dict([token, True] for token in custom_tokens)))
# Flask API to be used by the backend
@app.route("/NLP")
def hello():
# text as a http get request parameter
custom_tweet = request.args.get('text')
custom_tokens = remove_noise(word_tokenize(custom_tweet))
return classifier.classify(dict([token, True] for token in custom_tokens))
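# Example request once the server below is running (sketch, assuming the default local host):
#   GET http://localhost:8083/NLP?text=This+is+a+very+good+supplier
# responds with the plain-text label "Positive" or "Negative".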
if __name__ == '__main__':
    # Set the API port to 8083 and enable debug mode for clearer error output
    app.run(debug=True, port=8083)