22_23-J 25

Commit 7968ab2b authored Feb 03, 2023 by De Silva E.A.A it19991054

add answer_summarization.py file

parent 1b729c11

Showing 1 changed file with 64 additions and 0 deletions

backend/answer_summarization.py  0 → 100644  +64  -0
# Importing libraries
import torch
import numpy as np
import pandas as pd
from torch.utils.data import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, PegasusForConditionalGeneration

# Load the answer/summary pairs from the spreadsheet
data_path = 'data/qna-summarization.xlsx'
df = pd.read_excel(data_path)
Answers = df['Answer'].tolist()
Summarized = df['Summarized'].tolist()

# Tokenize source answers and target summaries with the Pegasus tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
tokenized_answers = tokenizer(Answers, truncation=True, padding=True)
tokenized_summarized = tokenizer(Summarized, truncation=True, padding=True)


class SummarizationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


dataset = SummarizationDataset(tokenized_answers, tokenized_summarized.input_ids)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.15)

model = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

# define the training arguments
training_args = TrainingArguments(
    output_dir='Answer Summarization',         # output directory
    num_train_epochs=1,                        # total number of training epochs
    per_device_train_batch_size=1,             # batch size per device during training
    per_device_eval_batch_size=1,              # batch size for evaluation
    warmup_steps=500,                          # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                         # strength of weight decay
    logging_dir='Answer Summarization/logs',   # directory for storing logs
    logging_steps=10
)

trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset      # evaluation dataset
)

# train the model
trainer.train()

# Summarize a sample answer with the fine-tuned model
answer = "No, a person cannot be punished with a penalty more severe than the penalty in force at the time the offence was committed."
inputs = tokenizer(answer, return_tensors="pt", truncation=True, padding=True)
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary)
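The committed script runs inference immediately after training but never persists the fine-tuned weights. Below is a minimal sketch of how the model could be saved and the generation step wrapped in a reusable helper; the save_dir path and the summarize_answer function are hypothetical names, not part of this commit.

save_dir = 'Answer Summarization/model'   # hypothetical directory, not used in the commit
trainer.save_model(save_dir)              # persist the fine-tuned Pegasus weights
tokenizer.save_pretrained(save_dir)       # persist the tokenizer alongside the model

def summarize_answer(text, model, tokenizer, max_length=100):
    """Summarize a single answer string with the fine-tuned Pegasus model."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    summary_ids = model.generate(
        inputs['input_ids'],
        num_beams=4,
        max_length=max_length,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print(summarize_answer(answer, model, tokenizer))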
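One further note on the data preparation above: the labels passed to the Trainer are padded input_ids, so padded positions also contribute to the training loss. A common refinement, sketched here under the assumption that the same variables are in scope, is to replace padding token ids in the labels with -100, which the model's cross-entropy loss ignores by default.

pad_id = tokenizer.pad_token_id
masked_labels = [
    [token if token != pad_id else -100 for token in seq]   # -100 positions are ignored by the loss
    for seq in tokenized_summarized.input_ids
]
dataset = SummarizationDataset(tokenized_answers, masked_labels)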