initial question generator py commit

6bc2d94c · keshara · 6bc2d94c
Commit 6bc2d94c authored Oct 26, 2023 by keshara
Hide whitespace changes
Inline Side-by-side

Showing with 97 additions and 0 deletions

Objective_4/question_generator.py Objective_4/question_generator.py +97 -0

No files found.
--- a/Objective_4/question_generator.py
+++ b/Objective_4/question_generator.py
+from textwrap3 import wrap
+import torch
+from transformers import T5ForConditionalGeneration, T5Tokenizer
+import random
+import numpy as np
+import nltk
+nltk.download('punkt')
+nltk.download('brown')
+nltk.download('wordnet')
+nltk.download('stopwords')
+from nltk.corpus import wordnet as wn
+from nltk.tokenize import sent_tokenize
+from nltk.corpus import stopwords
+import string
+import pke
+import traceback
+from flashtext import KeywordProcessor
+
+summary_model = T5ForConditionalGeneration.from_pretrained('t5-base')
+summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+summary_model = summary_model.to(device)
+
+
+def set_seed(seed: int):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+
+
+def postprocesstext(content):
+    final = ""
+    for sent in sent_tokenize(content):
+        sent = sent.capitalize()
+        final = final + " " + sent
+    return final
+
+
+def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
+    text = text.strip().replace("\n", " ")
+    text = "summarize: " + text
+    # print (text)
+    max_len = 512
+    encoding = tokenizer.encode_plus(text, max_length=max_len, pad_to_max_length=False, truncation=True,
+                                     return_tensors="pt").to(device)
+
+    input_ids, attention_mask = encoding["input_ids"], encoding["attention_mask"]
+
+    outs = model.generate(input_ids=input_ids,
+                          attention_mask=attention_mask,
+                          early_stopping=True,
+                          num_beams=3,
+                          num_return_sequences=1,
+                          no_repeat_ngram_size=2,
+                          min_length=75,
+                          max_length=300)
+
+    dec = [tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
+    summary = dec[0]
+    summary = postprocesstext(summary)
+    summary = summary.strip()
+
+    return summary
+
+
+def get_nouns_multipartite(content):
+    out = []
+    try:
+        extractor = pke.unsupervised.MultipartiteRank()
+        extractor.load_document(input=content)
+        #    not contain punctuation marks or stopwords as candidates.
+        pos = {'PROPN', 'NOUN'}
+        # pos = {'PROPN','NOUN'}
+        stoplist = list(string.punctuation)
+        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
+        stoplist += stopwords.words('english')
+        extractor.candidate_selection(pos=pos, stoplist=stoplist)
+        # 4. build the Multipartite graph and rank candidates using random walk,
+        #    alpha controls the weight adjustment mechanism, see TopicRank for
+        #    threshold/method parameters.
+        extractor.candidate_weighting(alpha=1.1,
+                                      threshold=0.75,
+                                      method='average')
+        keyphrases = extractor.get_n_best(n=15)
+
+        for val in keyphrases:
+            out.append(val[0])
+    except:
+        out = []
+        traceback.print_exc()
+
+    return out
+
+
+