Commit e1a3366f authored by Lelkada L L P S M's avatar Lelkada L L P S M

filtration method 1 - improvements

parent 75b5b6cb
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d6b46218-294a-4e5d-b8b3-f5c191fdb042",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting nltk\n",
" Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.7/site-packages (from nltk) (1.2.0)\n",
"Collecting regex>=2021.8.3\n",
" Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.7/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from click->nltk) (6.0.1)\n",
"Requirement already satisfied: typing-extensions>=3.6.4 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->nltk) (4.5.0)\n",
"Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->nltk) (3.15.0)\n",
"Installing collected packages: regex, nltk\n",
"Successfully installed nltk-3.8.1 regex-2022.10.31\n"
]
}
],
"source": [
"# Install NLTK into the environment of the running kernel: the %pip magic targets the\n",
"# kernel's interpreter, whereas !pip runs whichever pip is first on the shell PATH.\n",
"# Pinned to the version this notebook was last executed with.\n",
"%pip install nltk==3.8.1"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "5dc527d7-d2ba-4fe6-9659-790fd5be1558",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /home/jupyter/nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...\n"
]
}
],
"source": [
"import pandas as pd\n",
"import string\n",
"from collections import defaultdict\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"nltk.download('wordnet')\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import wordnet\n",
"import pymongo"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b71a9ac0-2766-4b51-9262-f027124f124d",
"metadata": {},
"outputs": [],
"source": [
"# Load the training CSV into the working DataFrame\n",
"train_csv_path = '../dataset/offensive_text_train_data.csv'\n",
"data = pd.read_csv(train_csv_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "56f535ce-df4e-424e-8329-39665930dc3d",
"metadata": {},
"outputs": [],
"source": [
"# 1. Merge text columns: join the four conversation-turn columns with single spaces,\n",
"#    treating missing values as empty strings\n",
"turn_columns = ['prev_agent', 'prev_user', 'agent', 'user']\n",
"merged_text = data[turn_columns[0]].fillna('')\n",
"for turn in turn_columns[1:]:\n",
"    merged_text = merged_text + ' ' + data[turn].fillna('')\n",
"\n",
"# Blank out values that are only whitespace, or exactly '_', '-' or ':)'\n",
"for placeholder_pattern in (r'^\\s*$', r'^_$', r'^-$', r'^:\\)$'):\n",
"    merged_text = merged_text.replace(placeholder_pattern, '', regex=True)\n",
"data['text'] = merged_text.str.lower()\n",
"\n",
"# Remove punctuations (every character in string.punctuation)\n",
"translator = str.maketrans('', '', string.punctuation)\n",
"data['text'] = data['text'].map(lambda sentence: sentence.translate(translator))\n",
"\n",
"# 2. Merge is_abuse columns into score\n",
"def _abuse_score(row):\n",
"    \"\"\"Return the score of the first is_abuse.* flag equal to 1, or -3 if none is set.\"\"\"\n",
"    # Checked in this fixed order; the first flag that equals 1 wins.\n",
"    flag_scores = (('is_abuse.1', 1), ('is_abuse.0', 0), ('is_abuse.-1', -1), ('is_abuse.-2', -2))\n",
"    for flag_column, flag_score in flag_scores:\n",
"        if row[flag_column] == 1:\n",
"            return flag_score\n",
"    return -3\n",
"\n",
"data['score'] = data.apply(_abuse_score, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8745f580-cd96-4e65-81cd-ef8a8d758a77",
"metadata": {},
"outputs": [],
"source": [
"# 3. Merge type columns into one column called 'type'\n",
"#    (the list also holds the target.* and direction.* indicator columns)\n",
"type_columns = [\n",
"    'type.ableism',\n",
"    'type.homophobic',\n",
"    'type.intellectual',\n",
"    'type.racist',\n",
"    'type.sexist',\n",
"    'type.sex_harassment',\n",
"    'type.transphobic',\n",
"    'target.generalised',\n",
"    'target.individual',\n",
"    'target.system',\n",
"    'direction.explicit',\n",
"    'direction.implicit',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e29e1b27-6739-4562-aac2-ae9ccbdcef31",
"metadata": {},
"outputs": [],
"source": [
"def get_type(row):\n",
"    \"\"\"Return the suffix (text after the last '.') of the first column in\n",
"    type_columns whose value in `row` equals 1, or None when no column matches.\"\"\"\n",
"    # Lazy generator: columns are inspected in list order and the scan stops at the first hit.\n",
"    flagged = (name for name in type_columns if row[name] == 1)\n",
"    first_flagged = next(flagged, None)\n",
"    if first_flagged is None:\n",
"        return None\n",
"    return first_flagged.rpartition('.')[2]\n",
"\n",
"# Label every row with the result of get_type\n",
"data['type'] = data.apply(get_type, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8fe9b2ee-f186-42ea-adf3-0a35eb20c29f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>does that question interest you because my fin...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>does it please you to believe i am a nobe jock...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>have you tried dick please go on oh what a pic...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>will you be travelling in economy class yes wh...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>please go on hello how do you do please state ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19060</th>\n",
" <td>please go on who are you would you prefer if i...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19061</th>\n",
" <td>and what city will you be flying to to bangalo...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19062</th>\n",
" <td>will you be travelling in economy class yes wh...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19063</th>\n",
" <td>what city will you be flying from from berlin ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19064</th>\n",
" <td>hello</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19065 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" text score\n",
"0 does that question interest you because my fin... 1\n",
"1 does it please you to believe i am a nobe jock... 0\n",
"2 have you tried dick please go on oh what a pic... 1\n",
"3 will you be travelling in economy class yes wh... 1\n",
"4 please go on hello how do you do please state ... 1\n",
"... ... ...\n",
"19060 please go on who are you would you prefer if i... 0\n",
"19061 and what city will you be flying to to bangalo... 1\n",
"19062 will you be travelling in economy class yes wh... 1\n",
"19063 what city will you be flying from from berlin ... 1\n",
"19064 hello 1\n",
"\n",
"[19065 rows x 2 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the merged text and its score; the bare name on the last line displays the frame\n",
"df = data.loc[:, ['text', 'score']]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d84a2580-1ac1-4128-b917-778636df262b",
"metadata": {},
"outputs": [],
"source": [
"# Read the storybook text file; each line (line ending included) becomes one list entry\n",
"with open('../dataset/storybook_text_cleaned0.txt') as storybook_file:\n",
"    lines = list(storybook_file)\n",
"\n",
"# Build a DataFrame with the lines in a \"text\" column and a constant \"score\" of 1 for all rows\n",
"data0 = pd.DataFrame(lines, columns=['text']).assign(score=1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "376ec7ec-4e6c-4dbe-b848-224dcb35f0c2",
"metadata": {},
"outputs": [],
"source": [
"# Stack the storybook rows under the rows already in df. Both inputs are indexed from 0,\n",
"# so ignore_index=True renumbers the result 0..n-1 and no two rows share an index label.\n",
"df = pd.concat([df, data0], ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "823a4233-3133-4e27-884c-7157096fd6a4",
"metadata": {},
"outputs": [],
"source": [
"# Tokens dropped from every text: common function words plus stand-alone punctuation marks\n",
"stop_words = ['oh','im','your', 'bye', 'if', 'because', 'how', 'no', 'not', 'yes','okay', 'ok', 'is', 'a', 'an', 'are', 'the', 'to', 'is', 'so', 'they', 'this', 'in', 'on', 'me', 'my', 'who', 'where', 'here', 'there', 'i', 'we', 'you', 'them', 'will', 'should', 'been', 'he', 'she','be','do', 'does', 'go','it', 'that', '?', ',', '!', '.', \"'\", '\"', ':']\n",
"stop_word_set = frozenset(stop_words)  # constant-time membership test per token\n",
"\n",
"def remove_stop_words(text):\n",
"    \"\"\"Return `text` without stop-word tokens (case-insensitive match), tokens joined by single spaces.\"\"\"\n",
"    return ' '.join(token for token in text.split() if token.lower() not in stop_word_set)\n",
"\n",
"# Filter df's own text column rather than data['text']: data holds only the rows read from\n",
"# the CSV, so assigning its filtered text into df aligns on index labels and replaces the\n",
"# text of the storybook rows that were concatenated into df.\n",
"df = df.assign(text=df['text'].apply(remove_stop_words))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "636445cc-48c8-4b01-a3a7-920b4bec76b5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (Local)",
"language": "python",
"name": "local-base"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment