Commit 5ed12c09 authored by Shalitha Deshan Jayasekara

Merge branch 'IT18150926_jayasekaraA.P.S.D' into 'master'

Add completed model

See merge request !7
parents a081caf8 e0c8090b
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# code editor file
.idea
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
import nltk, re, string, random
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from flask import Flask
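# Note: the NLTK resources used below (twitter_samples, punkt, wordnet,
# averaged_perceptron_tagger, stopwords) need to be downloaded once via
# nltk.download(...), as done in the Flask version of this script further below.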
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
stop_words = stopwords.words('english')
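# Example (see also the annotated Flask version below):
# print(remove_noise(tweet_tokens[0], stop_words))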
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
def get_words_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
positive_dataset = [(tweet_dict, "Positive")
for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative")
for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)
train_data = dataset[:7000]
test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
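# Optional sanity check on the held-out split (done explicitly in the Flask version below):
# print("Accuracy is:", classify.accuracy(classifier, test_data))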
def analyzePost(title, content, reviews):
    # Score a post: the title sentiment is worth +/-2 points, the content sentiment +/-5,
    # and each review contributes 1 point for existing plus +/-1 for its sentiment.
    points = 0
    title_tokens = remove_noise(word_tokenize(title))
    titleSentiment = classifier.classify(dict([token, True] for token in title_tokens))
    if titleSentiment == "Positive":
        points += 2
    else:
        points -= 2
    content_tokens = remove_noise(word_tokenize(content))
    contentSentiment = classifier.classify(dict([token, True] for token in content_tokens))
    if contentSentiment == "Positive":
        points += 5
    else:
        points -= 5
    points += len(reviews)
    for review in reviews:
        review_tokens = remove_noise(word_tokenize(review))
        reviewSentiment = classifier.classify(dict([token, True] for token in review_tokens))
        if reviewSentiment == "Positive":
            points += 1
        else:
            points -= 1
    return points
# print(analyzePost(
# "Health",
# "Why Do Dogs Eat Grass?", [
# "Some vets believe dogs eat grass because they’re bored, stressed, anxious, or upset about something. Some dogs are more likely to eat grass when they believe they’re alone in the backyard, which contributes to the idea that they are unhappy when they do so.",
# "Some vets also believe dogs eat grass because it gets their owners’ attention, which is something they want. Even if they’re being told to stop doing something, dogs perceive this as attention, and it’s good enough for many of them."
# ]))
print(analyzePost(
    "good",
    "good", [
        "very",
    ]))
import nltk, re, string, random
# Downloading words data
nltk.download('twitter_samples')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
# Importing required libraries and dependencies
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from flask import Flask
from flask import request
app = Flask(__name__)
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
# print(tweet_tokens[0])
# print(pos_tag(tweet_tokens[0]))
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    # Tag words with the NLTK POS tagger: https://www.nltk.org/book/ch05.html
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence
# print(tweet_tokens[0])
# print(lemmatize_sentence(tweet_tokens[0]))
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tokens = []
    # Remove URLs, @mentions and other unnecessary tokens (noise)
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub("(@[A-Za-z0-9_]+)", "", token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        # WordNet lemmatizer: https://www.programcreek.com/python/example/81649/nltk.WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        # Keep the lemmatized token only if it is not punctuation and not a stop word
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
# Starting NLP
stop_words = stopwords.words('english')
# print(remove_noise(tweet_tokens[0], stop_words))
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
# Remove noise with above function remove_noise()
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
# print(positive_tweet_tokens[500])
# print(positive_cleaned_tokens_list[500])
# Flatten the list of token lists into a single stream of tokens
def get_all_words(cleaned_tokens_list):
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
# print(freq_dist_pos.most_common(10))
def get_words_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)
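# Each tweet becomes a feature dictionary of the form {token: True}, which is the
# featureset format NLTK's NaiveBayesClassifier expects for training and classification.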
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
positive_dataset = [(tweet_dict, "Positive")
for tweet_dict in positive_tokens_for_model]
negative_dataset = [(tweet_dict, "Negative")
for tweet_dict in negative_tokens_for_model]
# Create a single data set with both negative and positive data sets prepared
dataset = positive_dataset + negative_dataset
# Shuffle the data set so positive and negative examples are mixed before splitting into train and test data
random.shuffle(dataset)
# Split whole data set into train and test data
train_data = dataset[:7000]
test_data = dataset[7000:]
# train the NaiveBayesClassifier model with train_data
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
# print(classifier.show_most_informative_features(10))
# custom_text = "This is a bad supplier."
# custom_text = "This is a very good supplier."
# custom_text = "This is a very good supplier. but there was some delay in shipping."
custom_text = "This is a very good supplier. but there was some delay in shipping. but it is okay."
custom_tokens = remove_noise(word_tokenize(custom_text))
# Test print
print(classifier.classify(dict([token, True] for token in custom_tokens)))
# Flask API to be used by the backend
@app.route("/NLP")
def hello():
# text as a http get request parameter
custom_tweet = request.args.get('text')
custom_tokens = remove_noise(word_tokenize(custom_tweet))
return classifier.classify(dict([token, True] for token in custom_tokens))
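# Example request once the server below is running (sketch, assuming the default local host):
#   GET http://localhost:8083/NLP?text=This+is+a+very+good+supplier
# responds with the plain-text label "Positive" or "Negative".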
if __name__ == '__main__':
    # Set the API port to 8083 and enable debug mode for clearer error output
    app.run(debug=True, port=8083)