Commit ff49d9cd authored by Niyas Inshaf's avatar Niyas Inshaf

Upload New File

parent b9939e66
# -*- coding: utf-8 -*-
"""Q&A gen ML.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1S1Bj59Ziyq1YmJYOeUNRRfpR7wZJyrFT
"""
import io
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
"""# Use the function to extract text from the PDF"""
pdf_path = 'research 1.pdf'
"""# Pre-processing step
# Tokenize the text into sentences
sentences = sent_tokenize(text)
# Tokenize the sentences into words
words = [word_tokenize(sentence) for sentence in sentences]
# Remove stop words and punctuations
stop_words = set(stopwords.words('english'))
filtered_words = [[word.lower() for word in sentence if word.isalnum() and word.lower() not in stop_words] for sentence in words]
# Lemmatize the words
lemmatizer = WordNetLemmatizer()
lemmatized_words = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in filtered_words]
# Generate questions
questions = []
for sentence in lemmatized_words:
if 'who' in sentence or 'what' in sentence or 'when' in sentence or 'where' in sentence or 'why' in sentence or 'how' in sentence:
questions.append(' '.join(sentence))
print( lemmatized_words)
# Download necessary NLTK data
"""
# Fetch the NLTK data packages required downstream: 'punkt' for the
# sentence/word tokenizers, 'stopwords' for the stop-word filter, and
# 'wordnet' for the WordNetLemmatizer.
for _resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)
def extract_text_from_pdf(pdf_path):
    """Extract all text from the PDF at *pdf_path* using pdfminer.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file to read.

    Returns
    -------
    str
        The concatenated text of every page in the document.
    """
    with open(pdf_path, 'rb') as fh:
        # Shared resource manager: caches fonts/images across pages.
        rsrcmgr = PDFResourceManager()
        # In-memory buffer that accumulates the extracted text.
        sio = io.StringIO()
        # Device that renders page content as plain text into the buffer.
        device = TextConverter(rsrcmgr, sio, codec='utf-8', laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            # Process each page contained in the PDF document.
            for page in PDFPage.get_pages(fh, set(), maxpages=0, password="",
                                          caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Capture the buffer contents before it is closed below.
            text = sio.getvalue()
        finally:
            # Release the converter and buffer even if extraction fails
            # part-way through (the original leaked both on error).
            device.close()
            sio.close()
    return text
# Notebook shell magics (valid only inside Jupyter/Colab): install the
# HuggingFace Transformers library and the SentencePiece tokenizer backend
# required by T5 models.
!pip install transformers
!pip install sentencepiece
from transformers import AutoTokenizer, T5ForConditionalGeneration
# UnifiedQA checkpoint (T5-based QA model); the "-small" suffix selects the
# smallest variant.
model_name = "allenai/unifiedqa-t5-small" # you can specify the model size here
# Download (or load from cache) the tokenizer and model weights for the
# checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Commented out IPython magic to ensure Python compatibility.
# Clone the third-party question-generation project (Colab shell magic).
!git clone https://github.com/amontgomerie/question_generator
# %cd question_generator/
import torch
# Select GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `assert` is stripped when Python runs with -O; raising
# RuntimeError would be a more robust guard for the GPU requirement.
assert device == torch.device('cuda'), "Not using CUDA. Set: Runtime > Change runtime type > Hardware Accelerator: GPU"
# Run the cloned project's generation script on its bundled sample article
# (Colab shell magic; assumes the %cd above was executed).
!python 'run_qg.py' --text_file '/content/question_generator/articles/twitter_hack.txt'
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment