Commit 3905567d authored by Shalitha Deshan Jayasekara

Merge branch 'IT18150926_jayasekaraA.P.S.D' into 'master'

Add graph needed for the research paper

See merge request !15
parents 392626c9 0873ac50
......@@ -14,7 +14,6 @@ negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
def lemmatize_sentence(tokens):
lemmatizer = WordNetLemmatizer()
lemmatized_sentence = []
......@@ -28,13 +27,12 @@ def lemmatize_sentence(tokens):
lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
return lemmatized_sentence
def remove_noise(tweet_tokens, stop_words=()):
def remove_noise(tweet_tokens, stop_words = ()):
cleaned_tokens = []
for token, tag in pos_tag(tweet_tokens):
token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|' \
'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
token = re.sub("(@[A-Za-z0-9_]+)", "", token)
'(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
token = re.sub("(@[A-Za-z0-9_]+)","", token)
if tag.startswith("NN"):
pos = 'n'
......@@ -49,7 +47,6 @@ def remove_noise(tweet_tokens, stop_words=()):
cleaned_tokens.append(token.lower())
return cleaned_tokens
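# Illustrative sketch, not part of this change: remove_noise strips URLs and
# @mentions with the regexes above, lemmatizes each token by its POS tag, and
# drops punctuation and stop words, so
# print(remove_noise(['Check', 'https://t.co/abc', '@user', 'amazing', '!'], stopwords.words('english')))
# should print roughly ['check', 'amazing'].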
stop_words = stopwords.words('english')
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
......@@ -63,23 +60,19 @@ for tokens in positive_tweet_tokens:
for tokens in negative_tweet_tokens:
negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
def get_all_words(cleaned_tokens_list):
for tokens in cleaned_tokens_list:
for token in tokens:
yield token
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
def get_words_for_model(cleaned_tokens_list):
for tweet_tokens in cleaned_tokens_list:
yield dict([token, True] for token in tweet_tokens)
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
......@@ -97,7 +90,6 @@ test_data = dataset[7000:]
classifier = NaiveBayesClassifier.train(train_data)
def analyzePost(title, content, reviews):
points = 0
title_tokens = remove_noise(word_tokenize(title))
......@@ -122,19 +114,4 @@ def analyzePost(title, content, reviews):
points -= 1
return points
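# Reading of the (partly elided) body above, stated as an assumption: analyzePost
# tokenizes and cleans the title, content and each review, classifies them with
# the trained model, and adjusts points up or down per Positive/Negative result
# before returning the net score.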
# print(analyzePost(
# "Health",
# "Why Do Dogs Eat Grass?", [
# "Some vets believe dogs eat grass because they’re bored, stressed, anxious, or upset about something. Some dogs are more likely to eat grass when they believe they’re alone in the backyard, which contributes to the idea that they are unhappy when they do so.",
# "Some vets also believe dogs eat grass because it gets their owners’ attention, which is something they want. Even if they’re being told to stop doing something, dogs perceive this as attention, and it’s good enough for many of them."
# ]))
print(analyzePost(
"good",
"good", [
"very",
"good",
"very good",
"brilliant"
]))
print(analyzePost("watch your dog's health", "bvjkdn dbfjksdn", ["gdfsdff"]))
......@@ -19,6 +19,9 @@ create python workspace
### 2021-06-11
Update workspace
### 2021-08-11
Add graph needed for the research paper
<!-- Write ontology scripts to module -->
......@@ -21,21 +21,18 @@ from flask import request
app = Flask(__name__)
# import training data set
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
# print(tweet_tokens[0])
# print(negative_tweets)
# print(positive_tweets)
# print(pos_tag(tweet_tokens[0]))
# reduce each word in the sentence to its base (lemma) form
def lemmatize_sentence(tokens):
lemmatizer = WordNetLemmatizer() # https://wordnet.princeton.edu/
lemmatizer = WordNetLemmatizer()
lemmatized_sentence = []
# TAG words with NLTK POS tagger : https://www.nltk.org/book/ch05.html
for word, tag in pos_tag(tokens):
......@@ -45,7 +42,7 @@ def lemmatize_sentence(tokens):
pos = 'v'
else:
pos = 'a'
lemmatized_sentence.append(lemmatizer.lemmatize(word, pos)) # Append object to the end of the list.
lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
return lemmatized_sentence
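# Quick illustrative check (an assumption, not part of the original commit):
# with the required NLTK corpora downloaded,
# print(lemmatize_sentence(['dogs', 'are', 'running', 'happily']))
# should print roughly ['dog', 'be', 'run', 'happily'].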
# print(tweet_tokens[0])
......@@ -81,8 +78,6 @@ stop_words = stopwords.words('english')
# print(remove_noise(tweet_tokens[0], stop_words))
# tokenized() returns each tweet in the given file as a list of words, screen names, hashtags, URLs and punctuation symbols
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')
......@@ -107,21 +102,16 @@ def get_all_words(cleaned_tokens_list):
all_pos_words = get_all_words(positive_cleaned_tokens_list)
# get the frequency distribution of words in the positive tweets
freq_dist_pos = FreqDist(all_pos_words)
# print(freq_dist_pos.most_common(10))
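# most_common(10) would list the ten highest-frequency tokens in the cleaned
# positive tweets; in this corpus, emoticons such as ':)' typically top the list.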
# build the feature dictionaries the classifier expects (every token mapped to True)
def get_words_for_model(cleaned_tokens_list):
for tweet_tokens in cleaned_tokens_list:
yield dict([token, True] for token in tweet_tokens)
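# Illustrative sketch of the output: a cleaned tweet such as
# ['top', 'engaged', 'member'] becomes {'top': True, 'engaged': True, 'member': True},
# the per-example feature dict format that nltk's NaiveBayesClassifier trains on.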
# prepare model-ready features for the positive and negative data sets
positive_tokens_for_model = get_words_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_words_for_model(negative_cleaned_tokens_list)
# label data set
positive_dataset = [(tweet_dict, "Positive")
for tweet_dict in positive_tokens_for_model]
......@@ -130,40 +120,29 @@ negative_dataset = [(tweet_dict, "Negative")
# Create a single data set with both negative and positive data sets prepared
dataset = positive_dataset + negative_dataset
# print(dataset)
# Shuffle the data set to mix positive and negative examples before splitting it into train and test data
random.shuffle(dataset)
# Split whole data set into train and test data
train_data = dataset[:7000]
test_data = dataset[7000:]
train_data = dataset[:9000]
test_data = dataset[9000:]
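# Suggestion, not part of this change: seeding the shuffle (e.g. random.seed(42)
# before random.shuffle(dataset)) would make the 9000/1000 split reproducible,
# so the accuracy printed below does not vary between runs.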
# train the NaiveBayesClassifier model with train_data
classifier = NaiveBayesClassifier.train(train_data)
print("Accuracy is:", classify.accuracy(classifier, test_data))
classifier.show_most_informative_features(15)
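# show_most_informative_features(15) prints the 15 features whose presence most
# strongly separates the two labels, with the Positive : Negative likelihood
# ratio for each.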
# print(classifier.show_most_informative_features(10))
# custom_text = "This is a bad supplier."
# custom_text = "This is a very good supplier."
# custom_text = "This is a very good supplier. but there was some delay in shipping."
custom_text = "This is a bad post. it gives out wrong idea to the people."
custom_tokens = remove_noise(word_tokenize(custom_text))
# print(custom_tokens)
# Test print
print(classifier.classify(dict([token, True] for token in custom_tokens)))
# API endpoint
# Flask API to be used in backend
@app.route("/NLP")
def hello():
......