22_23-J 25

Commit 7968ab2b authored Feb 03, 2023 by De Silva E.A.A it19991054

add answer_summarization.py file

parent 1b729c11

Showing 1 changed file with 64 additions and 0 deletions

backend/answer_summarization.py  0 → 100644  +64  -0
# Importing libraries
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, PegasusForConditionalGeneration

# Load the answer/summary pairs from the spreadsheet
data_path = 'data/qna-summarization.xlsx'
df = pd.read_excel(data_path)
Answers = df['Answer'].tolist()
Summarized = df['Summarized'].tolist()

# Tokenize source answers and target summaries with the Pegasus tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
tokenized_answers = tokenizer(Answers, truncation=True, padding=True)
tokenized_summarized = tokenizer(Summarized, truncation=True, padding=True)


class SummarizationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


dataset = SummarizationDataset(tokenized_answers, tokenized_summarized.input_ids)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.15)

model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

# define the training arguments
training_args = TrainingArguments(
    output_dir='Answer Summarization',         # output directory
    num_train_epochs=1,                        # total number of training epochs
    per_device_train_batch_size=1,             # batch size per device during training
    per_device_eval_batch_size=1,              # batch size for evaluation
    warmup_steps=500,                          # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                         # strength of weight decay
    logging_dir='Answer Summarization/logs',   # directory for storing logs
    logging_steps=10
)

trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset      # evaluation dataset
)

# train the model
trainer.train()

# Summarize a sample answer with the fine-tuned model
answer = "No, a person cannot be punished with a penalty more severe than the penalty in force at the time the offence was committed."
inputs = tokenizer(answer, return_tensors="pt", truncation=True, padding=True)
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)
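The committed script runs inference immediately after training but never persists the fine-tuned weights. Below is a minimal sketch of how the model could be saved and the generation step wrapped in a reusable helper; the save_dir path and the summarize_answer function are hypothetical names, not part of this commit.

save_dir = 'Answer Summarization/model'   # hypothetical directory, not used in the commit
trainer.save_model(save_dir)              # persist the fine-tuned Pegasus weights
tokenizer.save_pretrained(save_dir)       # persist the tokenizer alongside the model

def summarize_answer(text, model, tokenizer, max_length=100):
    """Summarize a single answer string with the fine-tuned Pegasus model."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    summary_ids = model.generate(
        inputs['input_ids'],
        num_beams=4,
        max_length=max_length,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summarize_answer(answer, model, tokenizer))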
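One further note on the data preparation above: the labels passed to the Trainer are padded input_ids, so padded positions also contribute to the training loss. A common refinement, sketched here under the assumption that the same variables are in scope, is to replace padding token ids in the labels with -100, which the model's cross-entropy loss ignores by default.

pad_id = tokenizer.pad_token_id
masked_labels = [
    [token if token != pad_id else -100 for token in seq]   # -100 positions are ignored by the loss
    for seq in tokenized_summarized.input_ids
]
dataset = SummarizationDataset(tokenized_answers, masked_labels)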