Commit ff49d9cd authored by Niyas Inshaf's avatar Niyas Inshaf

Upload New File

parent b9939e66
# -*- coding: utf-8 -*-
"""Q&A gen ML.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1S1Bj59Ziyq1YmJYOeUNRRfpR7wZJyrFT
"""
import io
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
"""# Use the function to extract text from the PDF"""
pdf_path = 'research 1.pdf'
"""# Pre-processing step
# Tokenize the text into sentences
sentences = sent_tokenize(text)
# Tokenize the sentences into words
words = [word_tokenize(sentence) for sentence in sentences]
# Remove stop words and punctuations
stop_words = set(stopwords.words('english'))
filtered_words = [[word.lower() for word in sentence if word.isalnum() and word.lower() not in stop_words] for sentence in words]
# Lemmatize the words
lemmatizer = WordNetLemmatizer()
lemmatized_words = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in filtered_words]
# Generate questions
questions = []
for sentence in lemmatized_words:
if 'who' in sentence or 'what' in sentence or 'when' in sentence or 'where' in sentence or 'why' in sentence or 'how' in sentence:
questions.append(' '.join(sentence))
print( lemmatized_words)
# Download necessary NLTK data
"""
# Fetch the NLTK data packages required downstream: 'punkt' for the
# sentence/word tokenizers, 'stopwords' for the stop-word filter, and
# 'wordnet' for the WordNetLemmatizer.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)
def extract_text_from_pdf(pdf_path):
    """Extract all text from the PDF at *pdf_path* using pdfminer.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file to read.

    Returns
    -------
    str
        The concatenated text of every page in the document.
    """
    with open(pdf_path, 'rb') as fh:
        # Shared resource manager: caches fonts/images across pages.
        rsrcmgr = PDFResourceManager()
        # In-memory buffer that accumulates the extracted text.
        sio = io.StringIO()
        # Device that renders page content as plain text into the buffer.
        device = TextConverter(rsrcmgr, sio, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # Process each page contained in the PDF document.
            for page in PDFPage.get_pages(fh, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Capture the buffer contents before it is closed below.
            text = sio.getvalue()
        finally:
            # Release the converter and buffer even if extraction fails
            # part-way through (the original leaked both on error).
            device.close()
            sio.close()
    return text
# Notebook shell magics (valid only inside Jupyter/Colab): install the
# HuggingFace Transformers library and the SentencePiece tokenizer backend
# required by T5 models.
!pip install transformers
!pip install sentencepiece
from transformers import AutoTokenizer, T5ForConditionalGeneration
# UnifiedQA checkpoint (T5-based QA model); the "-small" suffix selects the
# smallest variant.
model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
# Download (or load from cache) the tokenizer and model weights for the
# checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Commented out IPython magic to ensure Python compatibility.
# Clone the third-party question-generation project (Colab shell magic).
!git clone https://github.com/amontgomerie/question_generator
# %cd question_generator/
import torch
# Select GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `assert` is stripped when Python runs with -O; raising
# RuntimeError would be a more robust guard for the GPU requirement.
assert device == torch.device('cuda'), "Not using CUDA. Set: Runtime > Change runtime type > Hardware Accelerator: GPU"
# Run the cloned project's generation script on its bundled sample article
# (Colab shell magic; assumes the %cd above was executed).
!python 'run_qg.py' --text_file '/content/question_generator/articles/twitter_hack.txt'
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment