Commit 3905567d authored by Shalitha Deshan Jayasekara

Merge branch 'IT18150926_jayasekaraA.P.S.D' into 'master'

Add graph needed for the research paper

See merge request !15
parents 392626c9 0873ac50
......@@ -14,7 +14,6 @@ negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
def lemmatize_sentence(tokens):
lemmatizer = WordNetLemmatizer()
lemmatized_sentence = []
......@@ -28,13 +27,12 @@ def lemmatize_sentence(tokens):
lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
return lemmatized_sentence
def remove_noise(tweet_tokens, stop_words=()):
def remove_noise(tweet_tokens, stop_words = ()):
cleaned_tokens = []
for token, tag in pos_tag(tweet_tokens):
token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|' \
'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
token = re.sub("(@[A-Za-z0-9_]+)", "", token)
'(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
token = re.sub("(@[A-Za-z0-9_]+)","", token)
if tag.startswith("NN"):
pos = 'n'
......@@ -49,7 +47,6 @@ def remove_noise(tweet_tokens, stop_words=()):
cleaned_tokens.append(token.lower())
return cleaned_tokens
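# Illustrative sketch, not part of this change: remove_noise strips URLs and
# @mentions with the regexes above, lemmatizes each token by its POS tag, and
# drops punctuation and stop words, so
# print(remove_noise(['Check', 'https://t.co/abc', '@user', 'amazing', '!'], stopwords.words('english')))
# should print roughly ['check', 'amazing'].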
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
......@@ -63,23 +60,19 @@ for tokens in positive_tweet_tokens:
for tokens in negative_tweet_tokens:
negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
def get_all_words(cleaned_tokens_list):
for tokens in cleaned_tokens_list:
for token in tokens:
yield token
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
def get_words_for_model(cleaned_tokens_list):
for tweet_tokens in cleaned_tokens_list:
yield dict([token, True] for token in tweet_tokens)
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
......@@ -97,7 +90,6 @@ test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
def analyzePost(title, content, reviews):
points = 0
title_tokens = remove_noise(word_tokenize(title))
......@@ -122,19 +114,4 @@ def analyzePost(title, content, reviews):
points -= 1
return points
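# Reading of the (partly elided) body above, stated as an assumption: analyzePost
# tokenizes and cleans the title, content and each review, classifies them with
# the trained model, and adjusts points up or down per Positive/Negative result
# before returning the net score.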
# print(analyzePost(
# "Health",
# "Why Do Dogs Eat Grass?", [
# "Some vets believe dogs eat grass because they’re bored, stressed, anxious, or upset about something. Some dogs are more likely to eat grass when they believe they’re alone in the backyard, which contributes to the idea that they are unhappy when they do so.",
# "Some vets also believe dogs eat grass because it gets their owners’ attention, which is something they want. Even if they’re being told to stop doing something, dogs perceive this as attention, and it’s good enough for many of them."
# ]))
print(analyzePost(
"good",
"good", [
"very",
"good",
"very good",
"brilliant"
]))
print(analyzePost("watch your dog's health", "bvjkdn dbfjksdn", ["gdfsdff"]))
......@@ -19,6 +19,9 @@ create python workspace
### 2021-06-11
Update workspace
### 2021-08-11
Add graph needed for the research paper
<!-- Write ontology scripts to module -->
......@@ -21,21 +21,18 @@ from flask import request
app = Flask(__name__)
# import training data set
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
# print(tweet_tokens[0])
# print(negative_tweets)
# print(positive_tweets)
# print(pos_tag(tweet_tokens[0]))
# reduce each word in the sentence to its base (lemma) form
def lemmatize_sentence(tokens):
lemmatizer = WordNetLemmatizer() # https://wordnet.princeton.edu/
lemmatizer = WordNetLemmatizer()
lemmatized_sentence = []
# TAG words with NLTK POS tagger : https://www.nltk.org/book/ch05.html
for word, tag in pos_tag(tokens):
......@@ -45,7 +42,7 @@ def lemmatize_sentence(tokens):
pos = 'v'
else:
pos = 'a'
lemmatized_sentence.append(lemmatizer.lemmatize(word, pos)) # Append object to the end of the list.
lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
return lemmatized_sentence
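# Quick illustrative check (an assumption, not part of the original commit):
# with the required NLTK corpora downloaded,
# print(lemmatize_sentence(['dogs', 'are', 'running', 'happily']))
# should print roughly ['dog', 'be', 'run', 'happily'].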
# print(tweet_tokens[0])
......@@ -81,8 +78,6 @@ stop_words = stopwords.words('english')
# print(remove_noise(tweet_tokens[0], stop_words))
# tokenized() returns each tweet in the given file as a list of words, screen names, hashtags, URLs and punctuation symbols
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
......@@ -107,21 +102,16 @@ def get_all_words(cleaned_tokens_list):
all_pos_words = get_all_words(positive_cleaned_tokens_list)
# get the frequency distribution of words in the positive tweets
freq_dist_pos = FreqDist(all_pos_words)
# print(freq_dist_pos.most_common(10))
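# most_common(10) would list the ten highest-frequency tokens in the cleaned
# positive tweets; in this corpus, emoticons such as ':)' typically top the list.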
# build the feature dictionaries the classifier expects (every token mapped to True)
def get_words_for_model(cleaned_tokens_list):
for tweet_tokens in cleaned_tokens_list:
yield dict([token, True] for token in tweet_tokens)
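# Illustrative sketch of the output: a cleaned tweet such as
# ['top', 'engaged', 'member'] becomes {'top': True, 'engaged': True, 'member': True},
# the per-example feature dict format that nltk's NaiveBayesClassifier trains on.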
# prepare model-ready features for the positive and negative data sets
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
# label data set
positive_dataset = [(tweet_dict, "Positive")
for tweet_dict in positive_tokens_for_model]
......@@ -130,40 +120,29 @@ negative_dataset = [(tweet_dict, "Negative")
# Create a single data set with both negative and positive data sets prepared
dataset = positive_dataset + negative_dataset
# print(dataset)
# Shuffle the data set to mix positive and negative examples before splitting it into train and test data
random.shuffle(dataset)
# Split whole data set into train and test data
train_data = dataset[:7000]
test_data = dataset[7000:]
train_data = dataset[:9000]
test_data = dataset[9000:]
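# Suggestion, not part of this change: seeding the shuffle (e.g. random.seed(42)
# before random.shuffle(dataset)) would make the 9000/1000 split reproducible,
# so the accuracy printed below does not vary between runs.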
# train the NaiveBayesClassifier model with train_data
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
classifier.show_most_informative_features(15)
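# show_most_informative_features(15) prints the 15 features whose presence most
# strongly separates the two labels, with the Positive : Negative likelihood
# ratio for each.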
# print(classifier.show_most_informative_features(10))
# custom_text = "This is a bad supplier."
# custom_text = "This is a very good supplier."
# custom_text = "This is a very good supplier. but there was some delay in shipping."
custom_text = "This is a bad post. it gives out wrong idea to the people."
custom_tokens = remove_noise(word_tokenize(custom_text))
# print(custom_tokens)
# Test print
print(classifier.classify(dict([token, True] for token in custom_tokens)))
# API endpoint
# Flask API to be used in backend
@app.route("/NLP")
def hello():
......