Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
G
Gather only the necessary facts for documentaries by e-filtering videos
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
TMP-23-384
Gather only the necessary facts for documentaries by e-filtering videos
Commits
6bc2d94c
Commit
6bc2d94c
authored
Oct 26, 2023
by
keshara
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
initial question generator py commit
parents
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
97 additions
and
0 deletions
+97
-0
Objective_4/question_generator.py
Objective_4/question_generator.py
+97
-0
No files found.
Objective_4/question_generator.py
0 → 100644
View file @
6bc2d94c
from
textwrap3
import
wrap
import
torch
from
transformers
import
T5ForConditionalGeneration
,
T5Tokenizer
import
random
import
numpy
as
np
import
nltk
# Fetch the NLTK resources this module relies on (tokenizer models,
# corpora and the English stopword list) at import time.
for _resource in ('punkt', 'brown', 'wordnet', 'stopwords'):
    nltk.download(_resource)
from
nltk.corpus
import
wordnet
as
wn
from
nltk.tokenize
import
sent_tokenize
from
nltk.corpus
import
stopwords
import
string
import
pke
import
traceback
from
flashtext
import
KeywordProcessor
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained T5 checkpoint and place it on the selected device;
# the tokenizer comes from the same checkpoint name.
summary_model = T5ForConditionalGeneration.from_pretrained('t5-base').to(device)
summary_tokenizer = T5Tokenizer.from_pretrained('t5-base')
def set_seed(seed: int):
    """Seed every random number generator this module uses.

    Seeds, in order, Python's ``random``, NumPy's global RNG, the torch
    RNG and the torch RNGs of all CUDA devices with the same value.

    Args:
        seed: Integer seed applied to each generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
def postprocesstext(content):
    """Capitalize each sentence of ``content`` and glue them back together.

    The text is split with ``sent_tokenize``; every sentence is passed
    through ``str.capitalize`` and prefixed with a single space, so a
    non-empty result starts with a space (callers may want to strip it).

    Args:
        content: Text to split into sentences.

    Returns:
        The re-joined string, or ``""`` when no sentences are found.
    """
    pieces = [" " + sentence.capitalize() for sentence in sent_tokenize(content)]
    return "".join(pieces)
def summarizer(text, model=summary_model, tokenizer=summary_tokenizer):
    """Summarize ``text`` with a T5 model and return the cleaned-up summary.

    The input is stripped, newlines are replaced by spaces and the
    ``"summarize: "`` task prefix is prepended. The prompt is truncated to
    512 tokens and passed to ``model.generate`` using 3 beams, a minimum
    length of 75, a maximum length of 300 and no repeated 2-grams. The
    decoded output is run through ``postprocesstext`` and stripped.

    Args:
        text: Passage to summarize.
        model: Model exposing ``generate``; defaults to the module-level
            ``summary_model``.
        tokenizer: Tokenizer used to encode the prompt and decode the
            output; defaults to the module-level ``summary_tokenizer``.

    Returns:
        The summary as a stripped string.
    """
    max_len = 512
    prompt = "summarize: " + text.strip().replace("\n", " ")

    # ``padding=False`` is the supported spelling of the deprecated
    # ``pad_to_max_length=False``; the encoded tensors are moved to the
    # module-level ``device``.
    encoding = tokenizer(
        prompt,
        max_length=max_len,
        padding=False,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    outs = model.generate(
        input_ids=encoding["input_ids"],
        attention_mask=encoding["attention_mask"],
        early_stopping=True,
        num_beams=3,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        min_length=75,
        max_length=300,
    )

    # Only the first generated sequence is used, so decode just that one.
    summary = tokenizer.decode(outs[0], skip_special_tokens=True)
    return postprocesstext(summary).strip()
def get_nouns_multipartite(content):
    """Extract up to 15 noun keyphrases from ``content`` with MultipartiteRank.

    Extraction is best-effort: if any step raises an ordinary exception the
    traceback is printed and an empty list is returned. Interpreter-level
    signals such as ``KeyboardInterrupt`` are not swallowed.

    Args:
        content: Text to extract keyphrases from.

    Returns:
        A list with the first element of each entry returned by
        ``get_n_best(n=15)``, in that order, or ``[]`` on failure.
    """
    try:
        extractor = pke.unsupervised.MultipartiteRank()
        extractor.load_document(input=content)

        # Candidates are limited to nouns and proper nouns and must not
        # contain punctuation marks, bracket placeholders or stopwords.
        pos = {'PROPN', 'NOUN'}
        stoplist = list(string.punctuation)
        stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
        stoplist += stopwords.words('english')
        extractor.candidate_selection(pos=pos, stoplist=stoplist)

        # Build the Multipartite graph and rank candidates using random walk;
        # alpha controls the weight adjustment mechanism, see TopicRank for
        # threshold/method parameters.
        extractor.candidate_weighting(alpha=1.1, threshold=0.75, method='average')

        return [entry[0] for entry in extractor.get_n_best(n=15)]
    except Exception:
        # Keep the best-effort contract: report the failure, return nothing.
        traceback.print_exc()
        return []
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment