Commit 4443e165 authored by Shalitha Deshan Jayasekara

Merge branch 'IT18150926_jayasekaraA.P.S.D' into 'master'

Add necessary comments to some parts

See merge request !9
parents 0ced088e 42bb130b
......@@ -134,4 +134,7 @@ print(analyzePost(
"good",
"good", [
"very",
"good",
"very good",
"brilliant"
]))
......@@ -21,18 +21,21 @@ from flask import request
app = Flask(__name__)
# load the training data sets
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
# print(tweet_tokens[0])
# print(negative_tweets)
# print(positive_tweets)
# print(pos_tag(tweet_tokens[0]))
# reduce the words in the sentence to their base (lemma) forms
def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()  # https://wordnet.princeton.edu/
    lemmatized_sentence = []
    # tag words with the NLTK POS tagger: https://www.nltk.org/book/ch05.html
    for word, tag in pos_tag(tokens):
......@@ -42,7 +45,7 @@ def lemmatize_sentence(tokens):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))  # append object to the end of the list
    return lemmatized_sentence
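# Illustration (not part of the original commit): a hedged example of what
# lemmatize_sentence produces on a hand-made token list.
# sample_tokens = ['the', 'suppliers', 'are', 'shipping', 'orders', 'quickly']
# print(lemmatize_sentence(sample_tokens))
# Expected output (approximately): ['the', 'supplier', 'be', 'ship', 'order', 'quickly']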
# print(tweet_tokens[0])
......@@ -78,6 +81,8 @@ stop_words = stopwords.words('english')
# print(remove_noise(tweet_tokens[0], stop_words))
# tokenized() returns the text content of the given file's tweets as lists of words, screen names, hashtags, URLs and punctuation symbols
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
......@@ -102,16 +107,21 @@ def get_all_words(cleaned_tokens_list):
all_pos_words = get_all_words(positive_cleaned_tokens_list)
# get the frequency distribution of the positive words
freq_dist_pos = FreqDist(all_pos_words)
# print(freq_dist_pos.most_common(10))
# convert the token lists into the dict format the classifier expects; empty token lists are carried through as well
def get_words_for_model(cleaned_tokens_list):
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)
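# Illustration (not part of the original commit): each yielded item maps every
# token to True, which is the feature format NLTK classifiers accept, e.g.
# next(get_words_for_model([['good', 'supplier']]))  ->  {'good': True, 'supplier': True}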
# prepare the positive and negative data sets for the model
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
# label data set
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]
......@@ -120,6 +130,7 @@ negative_dataset = [(tweet_dict, "Negative")
# Create a single data set with both negative and positive data sets prepared
dataset = positive_dataset + negative_dataset
# print(dataset)
# Shuffle the data set so that positive and negative examples are mixed before the split into train and test data
random.shuffle(dataset)
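# The diff elides the split/training step that must exist before the accuracy
# print below (classifier and test_data are used there); a hedged sketch of
# what it plausibly contains, assuming the standard NLTK tutorial's 7000/3000
# split of the 10,000 tweets:
# train_data = dataset[:7000]
# test_data = dataset[7000:]
# classifier = NaiveBayesClassifier.train(train_data)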
......@@ -138,15 +149,21 @@ print("Accuracy is:", classify.accuracy(classifier, test_data))
# custom_text = "This is a bad supplier."
# custom_text = "This is a very good supplier."
# custom_text = "This is a very good supplier. but there was some delay in shipping."
custom_text = "This is a very good supplier. but there was some delay in shipping. but it is okay."
custom_text = "This is a bad post. it gives out wrong idea to the people."
custom_tokens = remove_noise(word_tokenize(custom_text))
# print(custom_tokens)
# Test print
print(classifier.classify(dict([token, True] for token in custom_tokens)))
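# A hedged refactoring sketch (not part of the original commit): the two steps
# above could be wrapped in a helper so the Flask endpoint below can reuse them.
# def classify_text(text):
#     tokens = remove_noise(word_tokenize(text))
#     return classifier.classify(dict([token, True] for token in tokens))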
# API endpoint
# Flask API to be used in backend
@app.route("/NLP")
def hello():
......
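# The diff is truncated above; a hedged sketch of how the endpoint body might
# continue, assuming a hypothetical "text" query parameter (the real parameter
# name is not shown in this commit):
#     text = request.args.get("text", "")
#     custom_tokens = remove_noise(word_tokenize(text))
#     return classifier.classify(dict([token, True] for token in custom_tokens))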