Commit 6ae2b121 authored by Bandara I.M.R.H (IT18013474)

Public Violation Detection model

parent d1d3ed4c
{
"cells": [
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import numpy as np\n",
"import pandas as pd\n",
"from torch.utils.data import Dataset\n",
"from transformers import TrainingArguments, Trainer\n",
"from sklearn.model_selection import train_test_split\n",
"from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Finetuning"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"data_path = 'data/public-stories.xlsx'\n",
"df = pd.read_excel(data_path)\n",
"\n",
"cases_simple = df['case in simple word'].tolist()\n",
"cases_simple = [i.replace('\\n', '').strip() for i in cases_simple]\n",
"\n",
"ViolateFlag = df['ViolateFlag'].tolist()\n",
"class_dict = {'Yes': 1, 'No': 0}\n",
"ViolateFlag = [i.replace('\\n', '').strip() for i in ViolateFlag]\n",
"ViolateFlags = [class_dict[i] for i in ViolateFlag]"
]
},
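{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the prepared data (a sketch, not part of the original run): confirm how many cases were loaded and how the Yes/No labels are distributed, since a strong class imbalance would matter for a dataset this small."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check sketch: assumes the loading cell above has run\n",
"print(f'loaded {len(cases_simple)} cases')\n",
"print(f'violations (Yes): {sum(ViolateFlags)}, non-violations (No): {len(ViolateFlags) - sum(ViolateFlags)}')\n",
"assert len(cases_simple) == len(ViolateFlags), 'every case needs a label'"
]
},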
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"loading file vocab.txt from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\vocab.txt\n",
"loading file tokenizer.json from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\tokenizer.json\n",
"loading file added_tokens.json from cache at None\n",
"loading file special_tokens_map.json from cache at None\n",
"loading file tokenizer_config.json from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\tokenizer_config.json\n",
"loading configuration file config.json from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\config.json\n",
"Model config DistilBertConfig {\n",
" \"_name_or_path\": \"distilbert-base-uncased\",\n",
" \"activation\": \"gelu\",\n",
" \"architectures\": [\n",
" \"DistilBertForMaskedLM\"\n",
" ],\n",
" \"attention_dropout\": 0.1,\n",
" \"dim\": 768,\n",
" \"dropout\": 0.1,\n",
" \"hidden_dim\": 3072,\n",
" \"initializer_range\": 0.02,\n",
" \"max_position_embeddings\": 512,\n",
" \"model_type\": \"distilbert\",\n",
" \"n_heads\": 12,\n",
" \"n_layers\": 6,\n",
" \"pad_token_id\": 0,\n",
" \"qa_dropout\": 0.1,\n",
" \"seq_classif_dropout\": 0.2,\n",
" \"sinusoidal_pos_embds\": false,\n",
" \"tie_weights_\": true,\n",
" \"transformers_version\": \"4.24.0\",\n",
" \"vocab_size\": 30522\n",
"}\n",
"\n",
"loading configuration file config.json from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\config.json\n",
"Model config DistilBertConfig {\n",
" \"activation\": \"gelu\",\n",
" \"architectures\": [\n",
" \"DistilBertForMaskedLM\"\n",
" ],\n",
" \"attention_dropout\": 0.1,\n",
" \"dim\": 768,\n",
" \"dropout\": 0.1,\n",
" \"hidden_dim\": 3072,\n",
" \"initializer_range\": 0.02,\n",
" \"max_position_embeddings\": 512,\n",
" \"model_type\": \"distilbert\",\n",
" \"n_heads\": 12,\n",
" \"n_layers\": 6,\n",
" \"pad_token_id\": 0,\n",
" \"qa_dropout\": 0.1,\n",
" \"seq_classif_dropout\": 0.2,\n",
" \"sinusoidal_pos_embds\": false,\n",
" \"tie_weights_\": true,\n",
" \"transformers_version\": \"4.24.0\",\n",
" \"vocab_size\": 30522\n",
"}\n",
"\n",
"loading weights file pytorch_model.bin from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\pytorch_model.bin\n",
"Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']\n",
"- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"tokenizer = DistilBertTokenizerFast.from_pretrained(\"distilbert-base-uncased\")\n",
"model = DistilBertForSequenceClassification.from_pretrained(\"distilbert-base-uncased\")"
]
},
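{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`from_pretrained` attaches a freshly initialized two-label classification head by default (hence the 'newly initialized' warning above), which already matches the binary Yes/No task. As a sketch (not in the original run), the label names can be spelled out so later predictions are decodable without remembering the mapping:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: name the two classes explicitly (mirrors class_dict above)\n",
"id2label = {0: 'No', 1: 'Yes'}\n",
"label2id = {'No': 0, 'Yes': 1}\n",
"# Passing num_labels=2, id2label=id2label, label2id=label2id to\n",
"# from_pretrained would store this mapping in model.config."
]
},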
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"class PublicViolationDataset(Dataset):\n",
" def __init__(self, cases_simple, ViolateFlags):\n",
" self.cases_simple = cases_simple\n",
" self.ViolateFlags = ViolateFlags\n",
" self.tokenizer = tokenizer\n",
" self.encodings = tokenizer(cases_simple, truncation=True, padding=True)\n",
"\n",
" def __len__(self):\n",
" return len(self.cases_simple)\n",
"\n",
" def __getitem__(self, idx):\n",
" item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
" item['labels'] = torch.tensor(self.ViolateFlags[idx])\n",
" return item"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"dataset = PublicViolationDataset(\n",
" cases_simple,\n",
" ViolateFlags\n",
" )"
]
},
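{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`train_test_split` is imported at the top but never used: the model trains on all 11 examples. A minimal sketch (hypothetical, not part of the original run) of how a held-out split could be wired in; with this few examples the split is mostly illustrative:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical held-out split using the already-imported train_test_split\n",
"train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
"    cases_simple, ViolateFlags, test_size=0.2, random_state=42\n",
")\n",
"train_dataset = PublicViolationDataset(train_texts, train_labels, tokenizer)\n",
"val_dataset = PublicViolationDataset(val_texts, val_labels, tokenizer)\n",
"print(len(train_dataset), 'train /', len(val_dataset), 'validation examples')"
]
},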
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"PyTorch: setting up devices\n",
"The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n",
"c:\\Users\\Legion\\.conda\\envs\\torch111\\lib\\site-packages\\transformers\\optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"***** Running training *****\n",
" Num examples = 11\n",
" Num Epochs = 30\n",
" Instantaneous batch size per device = 1\n",
" Total train batch size (w. parallel, distributed & accumulation) = 1\n",
" Gradient Accumulation steps = 1\n",
" Total optimization steps = 330\n",
" Number of trainable parameters = 66955010\n",
"Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "be948d8c6455448da350172b0f663c0c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/330 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'loss': 0.7163, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.91}\n",
"{'loss': 0.7165, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.82}\n",
"{'loss': 0.723, 'learning_rate': 3e-06, 'epoch': 2.73}\n",
"{'loss': 0.6927, 'learning_rate': 4.000000000000001e-06, 'epoch': 3.64}\n",
"{'loss': 0.6761, 'learning_rate': 5e-06, 'epoch': 4.55}\n",
"{'loss': 0.6679, 'learning_rate': 6e-06, 'epoch': 5.45}\n",
"{'loss': 0.6409, 'learning_rate': 7.000000000000001e-06, 'epoch': 6.36}\n",
"{'loss': 0.5796, 'learning_rate': 8.000000000000001e-06, 'epoch': 7.27}\n",
"{'loss': 0.6684, 'learning_rate': 9e-06, 'epoch': 8.18}\n",
"{'loss': 0.3824, 'learning_rate': 1e-05, 'epoch': 9.09}\n",
"{'loss': 0.4787, 'learning_rate': 1.1000000000000001e-05, 'epoch': 10.0}\n",
"{'loss': 0.4034, 'learning_rate': 1.2e-05, 'epoch': 10.91}\n",
"{'loss': 0.2614, 'learning_rate': 1.3000000000000001e-05, 'epoch': 11.82}\n",
"{'loss': 0.2659, 'learning_rate': 1.4000000000000001e-05, 'epoch': 12.73}\n",
"{'loss': 0.1433, 'learning_rate': 1.5e-05, 'epoch': 13.64}\n",
"{'loss': 0.1084, 'learning_rate': 1.6000000000000003e-05, 'epoch': 14.55}\n",
"{'loss': 0.1265, 'learning_rate': 1.7000000000000003e-05, 'epoch': 15.45}\n",
"{'loss': 0.0297, 'learning_rate': 1.8e-05, 'epoch': 16.36}\n",
"{'loss': 0.0261, 'learning_rate': 1.9e-05, 'epoch': 17.27}\n",
"{'loss': 0.0113, 'learning_rate': 2e-05, 'epoch': 18.18}\n",
"{'loss': 0.0081, 'learning_rate': 2.1e-05, 'epoch': 19.09}\n",
"{'loss': 0.0062, 'learning_rate': 2.2000000000000003e-05, 'epoch': 20.0}\n",
"{'loss': 0.0042, 'learning_rate': 2.3000000000000003e-05, 'epoch': 20.91}\n",
"{'loss': 0.0032, 'learning_rate': 2.4e-05, 'epoch': 21.82}\n",
"{'loss': 0.0035, 'learning_rate': 2.5e-05, 'epoch': 22.73}\n",
"{'loss': 0.0027, 'learning_rate': 2.6000000000000002e-05, 'epoch': 23.64}\n",
"{'loss': 0.002, 'learning_rate': 2.7000000000000002e-05, 'epoch': 24.55}\n",
"{'loss': 0.0019, 'learning_rate': 2.8000000000000003e-05, 'epoch': 25.45}\n",
"{'loss': 0.0017, 'learning_rate': 2.9e-05, 'epoch': 26.36}\n",
"{'loss': 0.0019, 'learning_rate': 3e-05, 'epoch': 27.27}\n",
"{'loss': 0.0013, 'learning_rate': 3.1e-05, 'epoch': 28.18}\n",
"{'loss': 0.0017, 'learning_rate': 3.2000000000000005e-05, 'epoch': 29.09}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\n",
"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
"\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'loss': 0.0011, 'learning_rate': 3.3e-05, 'epoch': 30.0}\n",
"{'train_runtime': 647.3268, 'train_samples_per_second': 0.51, 'train_steps_per_second': 0.51, 'train_loss': 0.25327446054328573, 'epoch': 30.0}\n"
]
},
{
"data": {
"text/plain": [
"TrainOutput(global_step=330, training_loss=0.25327446054328573, metrics={'train_runtime': 647.3268, 'train_samples_per_second': 0.51, 'train_steps_per_second': 0.51, 'train_loss': 0.25327446054328573, 'epoch': 30.0})"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"training_args = TrainingArguments(\n",
" output_dir='Public Violation Detection', # output directory\n",
" num_train_epochs=30, # total # of training epochs\n",
" per_device_train_batch_size=1, # batch size per device during training\n",
" per_device_eval_batch_size=1, # batch size for evaluation\n",
" warmup_steps=500, # number of warmup steps for learning rate scheduler\n",
" weight_decay=0.01, # strength of weight decay\n",
" logging_dir='Public Violation Detection/logs', # directory for storing logs\n",
" logging_steps=10\n",
" )\n",
"\n",
"trainer = Trainer(\n",
" model=model, # the instantiated 🤗 Transformers model to be trained\n",
" args=training_args, # training arguments, defined above\n",
" train_dataset=dataset # evaluation dataset\n",
" )\n",
"\n",
"# train the model\n",
"trainer.train()"
]
},
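{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The fine-tuned weights live only in memory after `trainer.train()` returns; `output_dir` holds only whatever checkpoints the Trainer wrote along the way. A short sketch (assumed, not in the original run) of saving the model and tokenizer side by side so inference can reload them later; `save_dir` is a hypothetical path:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: persist the fine-tuned model together with its tokenizer\n",
"save_dir = 'Public Violation Detection/final'  # hypothetical output path\n",
"trainer.save_model(save_dir)         # writes pytorch_model.bin and config.json\n",
"tokenizer.save_pretrained(save_dir)  # writes vocab/tokenizer files alongside\n",
"# Reload later with:\n",
"# model = DistilBertForSequenceClassification.from_pretrained(save_dir)\n",
"# tokenizer = DistilBertTokenizerFast.from_pretrained(save_dir)"
]
},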
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inference"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def predict(text):\n",
" input_ids = tokenizer(text, return_tensors='pt').input_ids\n",
" logits = model(input_ids)[0]\n",
" return logits.argmax().item()\n",
"\n",
"predict(\"I am a journalist and union official at 'Associated Newspapers of Ceylon Ltd.' My transfer, ordered by Rohana Ariyarathna, is illegal, arbitrary, and a violation of my constitutional rights. The Rohana Ariyarathna does not have the authority to transfer union officials, and I am seeking to set aside the transfer letter.I request you to perform the ceremony for me.\")"
]
},
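{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`predict` returns the raw class index (the 1 above means 'Yes', a violation). A follow-up sketch (assumed, not in the original run) that decodes indices back to Yes/No labels via `class_dict` and runs the model over the training cases; given the near-zero final loss, it should reproduce them almost perfectly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: decode predictions to label names and inspect fit on the training data\n",
"index_to_label = {v: k for k, v in class_dict.items()}  # {1: 'Yes', 0: 'No'}\n",
"for text, true_flag in zip(cases_simple, ViolateFlags):\n",
"    pred = predict(text)\n",
"    print(f\"pred={index_to_label[pred]:<3} true={index_to_label[true_flag]:<3} | {text[:60]}...\")"
]
},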
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "torch111",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "38d61779fb8a2d479ca2bc1a752fe475f56efe678dc670cf5ac86029018bbcc6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}