Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
2022-066
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Commits
Open sidebar
2022-066
2022-066
Commits
b23bb206
Commit
b23bb206
authored
Nov 14, 2022
by
De Silva K.C.C.C
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
text pre processing
parent
532415b9
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
160 additions
and
0 deletions
+160
-0
Indexing/question_generator.py
Indexing/question_generator.py
+160
-0
No files found.
Indexing/question_generator.py
0 → 100644
View file @
b23bb206
from
textwrap3
import
wrap
import
torch
from
transformers
import
T5ForConditionalGeneration
,
T5Tokenizer
import
random
import
numpy
as
np
import
nltk
# nltk.download('punkt')
# nltk.download('brown')
# nltk.download('wordnet')
# nltk.download('stopwords')
from
nltk.corpus
import
wordnet
as
wn
from
nltk.tokenize
import
sent_tokenize
from
nltk.corpus
import
stopwords
import
string
import
pke
import
traceback
from
flashtext
import
KeywordProcessor
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and T5 checkpoint used by summarizer(); the model is moved to
# `device` as soon as it has been loaded.
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
def set_seed(seed: int):
    """Seed every random number generator this module relies on.

    The same value is handed, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch's default generator and PyTorch's
    CUDA generators.

    Args:
        seed: Value passed unchanged to each seeding function.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
def postprocesstext(content):
    """Upper-case the first character of every sentence in *content*.

    The text is split into sentences with ``sent_tokenize`` and the
    sentences are concatenated again, each one preceded by a single space.
    A non-empty result therefore starts with a space; empty input yields
    an empty string.

    Only the first character of each sentence is changed. ``str.capitalize``
    is deliberately not used because it also lower-cases the remainder of
    the sentence, which would mangle acronyms and proper nouns.

    Args:
        content: Text to post-process.

    Returns:
        The re-joined text with sentence-initial capitals.
    """
    return "".join(
        # Slicing with [:1] keeps this safe even for an empty sentence.
        " " + sentence[:1].upper() + sentence[1:]
        for sentence in sent_tokenize(content)
    )
def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
    """Generate an abstractive summary of *text* with a T5 model.

    Newlines are flattened to spaces, the text is prefixed with
    ``"summarize: "``, truncated to 512 tokens and decoded with beam
    search (3 beams, 75-300 generated tokens). The decoded summary is
    passed through ``postprocesstext`` and stripped.

    Args:
        text: Source text to summarize.
        model: Seq2seq model exposing ``generate``; defaults to the
            module-level ``summary_model``.
        tokenizer: Tokenizer matching *model*; defaults to the module-level
            ``summary_tokenizer``.

    Returns:
        The summary string, or ``""`` when *text* is empty or whitespace
        only (there is nothing to summarize, so the model is not called).
    """
    text = text.strip().replace("\n", " ")
    if not text:
        # min_length=75 below would force the model to invent a summary
        # from an empty prompt.
        return ""

    prompt = "summarize: " + text
    max_len = 512
    # `padding=False` replaces the deprecated `pad_to_max_length=False`.
    # Inputs follow the model's own device so that a caller-supplied model
    # does not have to live on the module-level `device`.
    encoding = tokenizer(
        prompt,
        max_length=max_len,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    outs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=3,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        min_length=75,
        max_length=300,
    )

    # num_return_sequences=1, so the first sequence is the only one.
    summary = tokenizer.decode(outs[0], skip_special_tokens=True)
    return postprocesstext(summary).strip()
def get_nouns_multipartite(content):
    """Extract up to 15 noun keyphrases from *content* with MultipartiteRank.

    Candidates are restricted to nouns and proper nouns; punctuation,
    bracket placeholder tokens and English stopwords are excluded.

    Extraction is best-effort: if any step raises, the traceback is
    printed and an empty list is returned instead of propagating the
    error.

    Args:
        content: Text to extract keyphrases from.

    Returns:
        A list of keyphrase strings in the order returned by the extractor,
        or ``[]`` on failure.
    """
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content)

        # not contain punctuation marks or stopwords as candidates.
        pos = {'PROPN', 'NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos, stoplist=stoplist)

        # build the Multipartite graph and rank candidates using random walk,
        # alpha controls the weight adjustment mechanism, see TopicRank for
        # threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1,
                                      threshold=0.75,
                                      method='average')

        # Each entry's first element is the keyphrase itself.
        return [candidate[0] for candidate in extractor.get_n_best(n=15)]
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt
        # and SystemExit are not swallowed.
        traceback.print_exc()
        return []
def get_keywords(originaltext, summarytext):
    """Return at most four keyphrases of the original text found in the summary.

    Keyphrases come from ``get_nouns_multipartite(originaltext)``. Each is
    registered with a flashtext ``KeywordProcessor``, which is then run
    over *summarytext*. Keyphrases reported as present are kept in their
    original ranking order and the first four are returned.

    Args:
        originaltext: Full source text used for keyphrase extraction.
        summarytext: Summary in which the keyphrases must occur.

    Returns:
        A list of up to four keyphrase strings.
    """
    candidates = get_nouns_multipartite(originaltext)

    matcher = KeywordProcessor()
    for phrase in candidates:
        matcher.add_keyword(phrase)

    # De-duplicated hits from the summary; only membership matters here.
    present_in_summary = set(matcher.extract_keywords(summarytext))

    survivors = [phrase for phrase in candidates if phrase in present_in_summary]
    return survivors[:4]
# Tokenizer and T5 checkpoint used by get_question(); the model is placed
# on the same `device` as the summarization model.
question_tokenizer = T5Tokenizer.from_pretrained('ramsrigouthamg/t5_squad_v1')
question_model = T5ForConditionalGeneration.from_pretrained(
    'ramsrigouthamg/t5_squad_v1'
).to(device)
def get_question(context, answer, model, tokenizer):
    """Generate a question about *context* whose answer is *answer*.

    The prompt ``"context: <context> answer: <answer>"`` is truncated to
    384 tokens and decoded with beam search (5 beams, at most 72 generated
    tokens). Any ``"question:"`` marker in the decoded text is removed and
    the result is stripped.

    Args:
        context: Passage the question should be about.
        answer: Answer phrase the question should target.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching *model*.

    Returns:
        The generated question string.
    """
    text = "context: {} answer: {}".format(context, answer)
    # `padding=False` replaces the deprecated `pad_to_max_length=False`.
    # Inputs follow the model's own device so the function does not depend
    # on the module-level `device` matching the model that was passed in.
    encoding = tokenizer(
        text,
        max_length=384,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(model.device)

    outs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=5,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        max_length=72,
    )

    # num_return_sequences=1, so the first sequence is the only one.
    decoded = tokenizer.decode(outs[0], skip_special_tokens=True)
    question = decoded.replace("question:", "")
    return question.strip()
def generate_questions_and_answers(text):
    """Build question/answer pairs for *text*.

    Seeds the random number generators with 42, summarizes *text*, selects
    the keyphrases shared by the text and its summary, and generates one
    question per keyphrase using the summary as context.

    Args:
        text: Source text to generate questions from.

    Returns:
        A list of ``[question, answer]`` lists, where ``answer`` is the
        keyphrase passed through ``str.capitalize``.
    """
    set_seed(42)
    summary = summarizer(text, summary_model, summary_tokenizer)
    return [
        [
            get_question(summary, keyword, question_model, question_tokenizer),
            keyword.capitalize(),
        ]
        for keyword in get_keywords(text, summary)
    ]
# print(generate_questions_and_answers(xxx))
#
# x = generate_questions_and_answers(xxx)
#
# for i in x:
# print(i[0])
# print(i[1])
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment