Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
S
Smart E- Learn Tracer
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
23_22 - J 01
Smart E- Learn Tracer
Commits
ff49d9cd
Commit
ff49d9cd
authored
Jan 31, 2023
by
Niyas Inshaf
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Upload New File
parent
b9939e66
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
91 additions
and
0 deletions
+91
-0
Automation Question Genaration/q_a_gen_ml.py
Automation Question Genaration/q_a_gen_ml.py
+91
-0
No files found.
Automation Question Genaration/q_a_gen_ml.py
0 → 100644
View file @
ff49d9cd
# -*- coding: utf-8 -*-
"""Q&A gen ML.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1S1Bj59Ziyq1YmJYOeUNRRfpR7wZJyrFT
"""
import
io
import
nltk
from
nltk.tokenize
import
sent_tokenize
,
word_tokenize
from
nltk.corpus
import
stopwords
from
nltk.stem
import
WordNetLemmatizer
"""# Use the function to extract text from the PDF"""
pdf_path
=
'research 1.pdf'
"""# Pre-processing step
# Tokenize the text into sentences
sentences = sent_tokenize(text)
# Tokenize the sentences into words
words = [word_tokenize(sentence) for sentence in sentences]
# Remove stop words and punctuations
stop_words = set(stopwords.words('english'))
filtered_words = [[word.lower() for word in sentence if word.isalnum() and word.lower() not in stop_words] for sentence in words]
# Lemmatize the words
lemmatizer = WordNetLemmatizer()
lemmatized_words = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in filtered_words]
# Generate questions
questions = []
for sentence in lemmatized_words:
if 'who' in sentence or 'what' in sentence or 'when' in sentence or 'where' in sentence or 'why' in sentence or 'how' in sentence:
questions.append(' '.join(sentence))
print( lemmatized_words)
# Download necessary NLTK data
"""
# Fetch the NLTK resources this script depends on: the 'punkt' sentence
# tokenizer, the English stop-word list, and WordNet (for lemmatization).
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)
def extract_text_from_pdf(pdf_path):
    """Extract the full text of a PDF file using pdfminer.

    Parameters
    ----------
    pdf_path : str
        Filesystem path of the PDF to read.

    Returns
    -------
    str
        Concatenated text of every page in the document.

    Raises
    ------
    FileNotFoundError
        If *pdf_path* does not exist.
    Any pdfminer parsing error propagates to the caller.
    """
    # Bug fix: these pdfminer names were referenced but never imported
    # anywhere in this file, so calling the original raised NameError.
    # The file already depends on pdfminer.six de facto; the imports are
    # kept function-local so the module still loads without it.
    from pdfminer.converter import TextConverter
    from pdfminer.layout import LAParams
    from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
    from pdfminer.pdfpage import PDFPage

    with open(pdf_path, 'rb') as fh:
        # Shared resource manager + string buffer for the extracted text.
        rsrcmgr = PDFResourceManager()
        sio = io.StringIO()
        device = TextConverter(rsrcmgr, sio, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # maxpages=0 means "no page limit"; caching=True reuses parsed
            # document objects across pages.
            for page in PDFPage.get_pages(fh, set(), maxpages=0, password="",
                                          caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
            text = sio.getvalue()
        finally:
            # Robustness fix: the original leaked the converter and the
            # buffer when a page failed to parse; release them always.
            device.close()
            sio.close()
    return text
# --- Notebook-only cell: install HuggingFace dependencies ----------------
# NOTE(review): the leading "!" is IPython shell-magic syntax; these lines
# only run inside Colab/Jupyter, not as a plain .py module.
!pip install transformers
!pip install sentencepiece

from transformers import AutoTokenizer, T5ForConditionalGeneration

# UnifiedQA T5 checkpoint from the HuggingFace hub.
# you can specify the model size here
model_name = "allenai/unifiedqa-t5-small"
# Downloads tokenizer and model weights on first use (network required).
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
# Commented out IPython magic to ensure Python compatibility.
# --- Notebook-only cell: clone and run an off-the-shelf question generator
!git clone https://github.com/amontgomerie/question_generator
# %cd question_generator/

import torch

# Prefer GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Hard-stop when no GPU is attached — the generation step below is meant to
# run on CUDA. NOTE(review): `assert` is stripped under `python -O`; an
# explicit raise would be more robust outside a notebook.
assert device == torch.device('cuda'), "Not using CUDA. Set: Runtime > Change runtime type > Hardware Accelerator: GPU"

# Run the cloned project's CLI against its bundled sample article.
!python 'run_qg.py' --text_file '/content/question_generator/articles/twitter_hack.txt'
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment