# answer_summarization.py — fine-tune Pegasus (google/pegasus-xsum) to summarize Q&A answers.
#Importing Libraries
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, PegasusForConditionalGeneration
# Path to the Q&A spreadsheet; assumes 'Answer' and 'Summarized' columns exist — TODO confirm schema.
data_path = 'data/qna-summarization.xlsx'
df = pd.read_excel(data_path)
Answers = df['Answer'].tolist()
Summarized = df['Summarized'].tolist()
# Tokenizer matching the pretrained checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
# Batch-tokenize both columns; truncation + padding yield rectangular id lists.
tokenized_answers = tokenizer(Answers, truncation=True, padding=True)
tokenized_summarized = tokenizer(Summarized, truncation=True, padding=True)
class SummarizationDataset(Dataset):
    """Map-style torch Dataset pairing tokenized inputs with target token ids.

    Fix: the original class body had lost all indentation (flattened to
    column 0 by a copy/paste), which made the file unparseable; structure
    restored, behavior unchanged.

    Args:
        encodings: tokenizer output for the source texts — a mapping of
            field name (e.g. 'input_ids', 'attention_mask') to per-example
            token lists.
        labels: list of target token-id sequences (one per example).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings  # dict of per-example token lists
        self.labels = labels        # target token-id sequences

    def __getitem__(self, idx):
        # Convert the idx-th entry of every encoding field to a tensor.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        # NOTE(review): padded label positions keep the pad-token id rather
        # than -100, so the seq2seq loss also covers padding — confirm
        # whether masking to -100 is wanted.
        return item

    def __len__(self):
        return len(self.labels)
# Inputs are the tokenized answers; targets are the summaries' input_ids.
dataset = SummarizationDataset(tokenized_answers, tokenized_summarized.input_ids)
# NOTE(review): train_test_split indexes the Dataset and returns plain lists
# of example dicts (not Dataset objects); Trainer accepts any sized indexable,
# but no random_state is set, so the split is not reproducible — confirm.
train_dataset, val_dataset = train_test_split(dataset, test_size=0.15)
model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')
# define the training arguments
training_args = TrainingArguments(
output_dir='Answer Summarization', # output directory
num_train_epochs=1, # total # of training epochs
per_device_train_batch_size=1, # batch size per device during training
per_device_eval_batch_size=1, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='Answer Summarization/logs', # directory for storing logs
logging_steps=10
# NOTE(review): no evaluation strategy is configured, so the eval_dataset
# passed to Trainer is never evaluated during training — confirm intent.
)
# Wire model, hyperparameters, and datasets into the HF Trainer loop.
trainer = Trainer(
model=model, # the instantiated 🤗 Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=val_dataset # evaluation dataset
)
# train the model
trainer.train()
# Smoke test: summarize one sample answer with the fine-tuned model.
answer = "No, a person cannot be punished with a penalty more severe than the penalty in force at the time the offence was committed."
inputs = tokenizer(answer, return_tensors="pt", truncation=True, padding=True)
# Beam search (4 beams) capped at 100 tokens; early_stopping ends beams at EOS.
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
# Fix: the decoded summary was computed but its value was discarded;
# keep it and print it so the script actually shows the result.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)
# (Removed: GitLab diff-view / UI residue accidentally pasted into the file.)