Commit 6ae2b121 authored by Bandara I.M.R.H (IT18013474)

Public Violation Detection model

parent d1d3ed4c
{
"cells": [
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import numpy as np\n",
"import pandas as pd\n",
"from torch.utils.data import Dataset\n",
"from transformers import TrainingArguments, Trainer\n",
"from sklearn.model_selection import train_test_split\n",
"from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Finetuning"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"data_path = 'data/public-stories.xlsx'\n",
"df = pd.read_excel(data_path)\n",
"\n",
"cases_simple = df['case in simple word'].tolist()\n",
"cases_simple = [i.replace('\\n', '').strip() for i in cases_simple]\n",
"\n",
"ViolateFlag = df['ViolateFlag'].tolist()\n",
"class_dict = {'Yes': 1, 'No': 0}\n",
"ViolateFlag = [i.replace('\\n', '').strip() for i in ViolateFlag]\n",
"ViolateFlags = [class_dict[i] for i in ViolateFlag]"
]
},
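{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check on the prepared data (a sketch, not part of the original run): confirm how many cases were loaded and how the Yes/No labels are distributed, since a strong class imbalance would matter for a dataset this small."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sanity-check sketch: assumes the loading cell above has run\n",
"print(f'loaded {len(cases_simple)} cases')\n",
"print(f'violations (Yes): {sum(ViolateFlags)}, non-violations (No): {len(ViolateFlags) - sum(ViolateFlags)}')\n",
"assert len(cases_simple) == len(ViolateFlags), 'every case needs a label'"
]
},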
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"loading file vocab.txt from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\vocab.txt\n",
"loading file tokenizer.json from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\tokenizer.json\n",
"loading file added_tokens.json from cache at None\n",
"loading file special_tokens_map.json from cache at None\n",
"loading file tokenizer_config.json from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\tokenizer_config.json\n",
"loading configuration file config.json from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\config.json\n",
"Model config DistilBertConfig {\n",
" \"_name_or_path\": \"distilbert-base-uncased\",\n",
" \"activation\": \"gelu\",\n",
" \"architectures\": [\n",
" \"DistilBertForMaskedLM\"\n",
" ],\n",
" \"attention_dropout\": 0.1,\n",
" \"dim\": 768,\n",
" \"dropout\": 0.1,\n",
" \"hidden_dim\": 3072,\n",
" \"initializer_range\": 0.02,\n",
" \"max_position_embeddings\": 512,\n",
" \"model_type\": \"distilbert\",\n",
" \"n_heads\": 12,\n",
" \"n_layers\": 6,\n",
" \"pad_token_id\": 0,\n",
" \"qa_dropout\": 0.1,\n",
" \"seq_classif_dropout\": 0.2,\n",
" \"sinusoidal_pos_embds\": false,\n",
" \"tie_weights_\": true,\n",
" \"transformers_version\": \"4.24.0\",\n",
" \"vocab_size\": 30522\n",
"}\n",
"\n",
"loading configuration file config.json from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\config.json\n",
"Model config DistilBertConfig {\n",
" \"activation\": \"gelu\",\n",
" \"architectures\": [\n",
" \"DistilBertForMaskedLM\"\n",
" ],\n",
" \"attention_dropout\": 0.1,\n",
" \"dim\": 768,\n",
" \"dropout\": 0.1,\n",
" \"hidden_dim\": 3072,\n",
" \"initializer_range\": 0.02,\n",
" \"max_position_embeddings\": 512,\n",
" \"model_type\": \"distilbert\",\n",
" \"n_heads\": 12,\n",
" \"n_layers\": 6,\n",
" \"pad_token_id\": 0,\n",
" \"qa_dropout\": 0.1,\n",
" \"seq_classif_dropout\": 0.2,\n",
" \"sinusoidal_pos_embds\": false,\n",
" \"tie_weights_\": true,\n",
" \"transformers_version\": \"4.24.0\",\n",
" \"vocab_size\": 30522\n",
"}\n",
"\n",
"loading weights file pytorch_model.bin from cache at C:\\Users\\Legion/.cache\\huggingface\\hub\\models--distilbert-base-uncased\\snapshots\\1c4513b2eedbda136f57676a34eea67aba266e5c\\pytorch_model.bin\n",
"Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight']\n",
"- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
"Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
}
],
"source": [
"tokenizer = DistilBertTokenizerFast.from_pretrained(\"distilbert-base-uncased\")\n",
"model = DistilBertForSequenceClassification.from_pretrained(\"distilbert-base-uncased\")"
]
},
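{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`from_pretrained` attaches a freshly initialized two-label classification head by default (hence the 'newly initialized' warning above), which already matches the binary Yes/No task. As a sketch (not in the original run), the label names can be spelled out so later predictions are decodable without remembering the mapping:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: name the two classes explicitly (mirrors class_dict above)\n",
"id2label = {0: 'No', 1: 'Yes'}\n",
"label2id = {'No': 0, 'Yes': 1}\n",
"# Passing num_labels=2, id2label=id2label, label2id=label2id to\n",
"# from_pretrained would store this mapping in model.config."
]
},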
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"class PublicViolationDataset(Dataset):\n",
" def __init__(self, cases_simple, ViolateFlags):\n",
" self.cases_simple = cases_simple\n",
" self.ViolateFlags = ViolateFlags\n",
" self.tokenizer = tokenizer\n",
" self.encodings = tokenizer(cases_simple, truncation=True, padding=True)\n",
"\n",
" def __len__(self):\n",
" return len(self.cases_simple)\n",
"\n",
" def __getitem__(self, idx):\n",
" item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}\n",
" item['labels'] = torch.tensor(self.ViolateFlags[idx])\n",
" return item"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"dataset = PublicViolationDataset(\n",
" cases_simple,\n",
" ViolateFlags\n",
" )"
]
},
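{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`train_test_split` is imported at the top but never used: the model trains on all 11 examples. A minimal sketch (hypothetical, not part of the original run) of how a held-out split could be wired in; with this few examples the split is mostly illustrative:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical held-out split using the already-imported train_test_split\n",
"train_texts, val_texts, train_labels, val_labels = train_test_split(\n",
"    cases_simple, ViolateFlags, test_size=0.2, random_state=42\n",
")\n",
"train_dataset = PublicViolationDataset(train_texts, train_labels, tokenizer)\n",
"val_dataset = PublicViolationDataset(val_texts, val_labels, tokenizer)\n",
"print(len(train_dataset), 'train /', len(val_dataset), 'validation examples')"
]
},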
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"PyTorch: setting up devices\n",
"The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).\n",
"c:\\Users\\Legion\\.conda\\envs\\torch111\\lib\\site-packages\\transformers\\optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"***** Running training *****\n",
" Num examples = 11\n",
" Num Epochs = 30\n",
" Instantaneous batch size per device = 1\n",
" Total train batch size (w. parallel, distributed & accumulation) = 1\n",
" Gradient Accumulation steps = 1\n",
" Total optimization steps = 330\n",
" Number of trainable parameters = 66955010\n",
"Automatic Weights & Biases logging enabled, to disable set os.environ[\"WANDB_DISABLED\"] = \"true\"\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "be948d8c6455448da350172b0f663c0c",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
" 0%| | 0/330 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'loss': 0.7163, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.91}\n",
"{'loss': 0.7165, 'learning_rate': 2.0000000000000003e-06, 'epoch': 1.82}\n",
"{'loss': 0.723, 'learning_rate': 3e-06, 'epoch': 2.73}\n",
"{'loss': 0.6927, 'learning_rate': 4.000000000000001e-06, 'epoch': 3.64}\n",
"{'loss': 0.6761, 'learning_rate': 5e-06, 'epoch': 4.55}\n",
"{'loss': 0.6679, 'learning_rate': 6e-06, 'epoch': 5.45}\n",
"{'loss': 0.6409, 'learning_rate': 7.000000000000001e-06, 'epoch': 6.36}\n",
"{'loss': 0.5796, 'learning_rate': 8.000000000000001e-06, 'epoch': 7.27}\n",
"{'loss': 0.6684, 'learning_rate': 9e-06, 'epoch': 8.18}\n",
"{'loss': 0.3824, 'learning_rate': 1e-05, 'epoch': 9.09}\n",
"{'loss': 0.4787, 'learning_rate': 1.1000000000000001e-05, 'epoch': 10.0}\n",
"{'loss': 0.4034, 'learning_rate': 1.2e-05, 'epoch': 10.91}\n",
"{'loss': 0.2614, 'learning_rate': 1.3000000000000001e-05, 'epoch': 11.82}\n",
"{'loss': 0.2659, 'learning_rate': 1.4000000000000001e-05, 'epoch': 12.73}\n",
"{'loss': 0.1433, 'learning_rate': 1.5e-05, 'epoch': 13.64}\n",
"{'loss': 0.1084, 'learning_rate': 1.6000000000000003e-05, 'epoch': 14.55}\n",
"{'loss': 0.1265, 'learning_rate': 1.7000000000000003e-05, 'epoch': 15.45}\n",
"{'loss': 0.0297, 'learning_rate': 1.8e-05, 'epoch': 16.36}\n",
"{'loss': 0.0261, 'learning_rate': 1.9e-05, 'epoch': 17.27}\n",
"{'loss': 0.0113, 'learning_rate': 2e-05, 'epoch': 18.18}\n",
"{'loss': 0.0081, 'learning_rate': 2.1e-05, 'epoch': 19.09}\n",
"{'loss': 0.0062, 'learning_rate': 2.2000000000000003e-05, 'epoch': 20.0}\n",
"{'loss': 0.0042, 'learning_rate': 2.3000000000000003e-05, 'epoch': 20.91}\n",
"{'loss': 0.0032, 'learning_rate': 2.4e-05, 'epoch': 21.82}\n",
"{'loss': 0.0035, 'learning_rate': 2.5e-05, 'epoch': 22.73}\n",
"{'loss': 0.0027, 'learning_rate': 2.6000000000000002e-05, 'epoch': 23.64}\n",
"{'loss': 0.002, 'learning_rate': 2.7000000000000002e-05, 'epoch': 24.55}\n",
"{'loss': 0.0019, 'learning_rate': 2.8000000000000003e-05, 'epoch': 25.45}\n",
"{'loss': 0.0017, 'learning_rate': 2.9e-05, 'epoch': 26.36}\n",
"{'loss': 0.0019, 'learning_rate': 3e-05, 'epoch': 27.27}\n",
"{'loss': 0.0013, 'learning_rate': 3.1e-05, 'epoch': 28.18}\n",
"{'loss': 0.0017, 'learning_rate': 3.2000000000000005e-05, 'epoch': 29.09}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n",
"\n",
"Training completed. Do not forget to share your model on huggingface.co/models =)\n",
"\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'loss': 0.0011, 'learning_rate': 3.3e-05, 'epoch': 30.0}\n",
"{'train_runtime': 647.3268, 'train_samples_per_second': 0.51, 'train_steps_per_second': 0.51, 'train_loss': 0.25327446054328573, 'epoch': 30.0}\n"
]
},
{
"data": {
"text/plain": [
"TrainOutput(global_step=330, training_loss=0.25327446054328573, metrics={'train_runtime': 647.3268, 'train_samples_per_second': 0.51, 'train_steps_per_second': 0.51, 'train_loss': 0.25327446054328573, 'epoch': 30.0})"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"training_args = TrainingArguments(\n",
" output_dir='Public Violation Detection', # output directory\n",
" num_train_epochs=30, # total # of training epochs\n",
" per_device_train_batch_size=1, # batch size per device during training\n",
" per_device_eval_batch_size=1, # batch size for evaluation\n",
" warmup_steps=500, # number of warmup steps for learning rate scheduler\n",
" weight_decay=0.01, # strength of weight decay\n",
" logging_dir='Public Violation Detection/logs', # directory for storing logs\n",
" logging_steps=10\n",
" )\n",
"\n",
"trainer = Trainer(\n",
" model=model, # the instantiated 🤗 Transformers model to be trained\n",
" args=training_args, # training arguments, defined above\n",
" train_dataset=dataset # evaluation dataset\n",
" )\n",
"\n",
"# train the model\n",
"trainer.train()"
]
},
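{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"The fine-tuned weights live only in memory after `trainer.train()` returns; `output_dir` holds only whatever checkpoints the Trainer wrote along the way. A short sketch (assumed, not in the original run) of saving the model and tokenizer side by side so inference can reload them later; `save_dir` is a hypothetical path:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: persist the fine-tuned model together with its tokenizer\n",
"save_dir = 'Public Violation Detection/final'  # hypothetical output path\n",
"trainer.save_model(save_dir)         # writes pytorch_model.bin and config.json\n",
"tokenizer.save_pretrained(save_dir)  # writes vocab/tokenizer files alongside\n",
"# Reload later with:\n",
"# model = DistilBertForSequenceClassification.from_pretrained(save_dir)\n",
"# tokenizer = DistilBertTokenizerFast.from_pretrained(save_dir)"
]
},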
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Inference"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def predict(text):\n",
" input_ids = tokenizer(text, return_tensors='pt').input_ids\n",
" logits = model(input_ids)[0]\n",
" return logits.argmax().item()\n",
"\n",
"predict(\"I am a journalist and union official at 'Associated Newspapers of Ceylon Ltd.' My transfer, ordered by Rohana Ariyarathna, is illegal, arbitrary, and a violation of my constitutional rights. The Rohana Ariyarathna does not have the authority to transfer union officials, and I am seeking to set aside the transfer letter.I request you to perform the ceremony for me.\")"
]
},
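{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"`predict` returns the raw class index (the 1 above means 'Yes', a violation). A follow-up sketch (assumed, not in the original run) that decodes indices back to Yes/No labels via `class_dict` and runs the model over the training cases; given the near-zero final loss, it should reproduce them almost perfectly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: decode predictions to label names and inspect fit on the training data\n",
"index_to_label = {v: k for k, v in class_dict.items()}  # {1: 'Yes', 0: 'No'}\n",
"for text, true_flag in zip(cases_simple, ViolateFlags):\n",
"    pred = predict(text)\n",
"    print(f\"pred={index_to_label[pred]:<3} true={index_to_label[true_flag]:<3} | {text[:60]}...\")"
]
},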
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "torch111",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "38d61779fb8a2d479ca2bc1a752fe475f56efe678dc670cf5ac86029018bbcc6"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}