Project: 22_23-J 18
Commit e1a3366f, authored Apr 10, 2023 by Lelkada L L P S M
filtration method 1 - improvements
Parent: 75b5b6cb
Showing 1 changed file with 0 additions and 311 deletions (+0, -311).

IT19001708/DEV/scripts/word_filtration - method 1.ipynb (deleted, 100644 → 0)
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d6b46218-294a-4e5d-b8b3-f5c191fdb042",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting nltk\n",
" Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.7/site-packages (from nltk) (1.2.0)\n",
"Collecting regex>=2021.8.3\n",
" Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.7/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from click->nltk) (6.0.1)\n",
"Requirement already satisfied: typing-extensions>=3.6.4 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->nltk) (4.5.0)\n",
"Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->nltk) (3.15.0)\n",
"Installing collected packages: regex, nltk\n",
"Successfully installed nltk-3.8.1 regex-2022.10.31\n"
]
}
],
"source": [
"!pip install nltk"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "5dc527d7-d2ba-4fe6-9659-790fd5be1558",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /home/jupyter/nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...\n"
]
}
],
"source": [
"import pandas as pd\n",
"import string\n",
"from collections import defaultdict\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"nltk.download('wordnet')\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import wordnet\n",
"import pymongo"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b71a9ac0-2766-4b51-9262-f027124f124d",
"metadata": {},
"outputs": [],
"source": [
"# Read the CSV file\n",
"data = pd.read_csv('../dataset/offensive_text_train_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "56f535ce-df4e-424e-8329-39665930dc3d",
"metadata": {},
"outputs": [],
"source": [
"# 1. Merge text columns\n",
"data['text'] = data['prev_agent'].fillna('') + ' ' + data['prev_user'].fillna('') + ' ' + data['agent'].fillna('') + ' ' + data['user'].fillna('')\n",
"data['text'] = data['text'].replace(r'^\\s*$', '', regex=True).replace(r'^_$', '', regex=True).replace(r'^-$', '', regex=True).replace(r'^:\\)$', '', regex=True)\n",
"data['text'] = data['text'].str.lower()\n",
"\n",
"# Remove punctuations\n",
"translator = str.maketrans('', '', string.punctuation)\n",
"data['text'] = data['text'].apply(lambda x: x.translate(translator))\n",
"\n",
"# 2. Merge is_abuse columns into score\n",
"data['score'] = data.apply(lambda row: 1 if row['is_abuse.1'] == 1 else 0 if row['is_abuse.0'] == 1 else -1 if row['is_abuse.-1'] == 1 else -2 if row['is_abuse.-2'] == 1 else -3, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8745f580-cd96-4e65-81cd-ef8a8d758a77",
"metadata": {},
"outputs": [],
"source": [
"# 3. Merge type columns into one column called 'type'\n",
"type_columns = ['type.ableism', 'type.homophobic', 'type.intellectual', 'type.racist', 'type.sexist', 'type.sex_harassment', 'type.transphobic', 'target.generalised', 'target.individual', 'target.system', 'direction.explicit', 'direction.implicit']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e29e1b27-6739-4562-aac2-ae9ccbdcef31",
"metadata": {},
"outputs": [],
"source": [
"def get_type(row):\n",
" for col in type_columns:\n",
" if row[col] == 1:\n",
" return col.split('.')[-1]\n",
" return None\n",
"\n",
"data['type'] = data.apply(get_type, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8fe9b2ee-f186-42ea-adf3-0a35eb20c29f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>does that question interest you because my fin...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>does it please you to believe i am a nobe jock...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>have you tried dick please go on oh what a pic...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>will you be travelling in economy class yes wh...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>please go on hello how do you do please state ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19060</th>\n",
" <td>please go on who are you would you prefer if i...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19061</th>\n",
" <td>and what city will you be flying to to bangalo...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19062</th>\n",
" <td>will you be travelling in economy class yes wh...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19063</th>\n",
" <td>what city will you be flying from from berlin ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19064</th>\n",
" <td>hello</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19065 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" text score\n",
"0 does that question interest you because my fin... 1\n",
"1 does it please you to believe i am a nobe jock... 0\n",
"2 have you tried dick please go on oh what a pic... 1\n",
"3 will you be travelling in economy class yes wh... 1\n",
"4 please go on hello how do you do please state ... 1\n",
"... ... ...\n",
"19060 please go on who are you would you prefer if i... 0\n",
"19061 and what city will you be flying to to bangalo... 1\n",
"19062 will you be travelling in economy class yes wh... 1\n",
"19063 what city will you be flying from from berlin ... 1\n",
"19064 hello 1\n",
"\n",
"[19065 rows x 2 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = data[['text', 'score']]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d84a2580-1ac1-4128-b917-778636df262b",
"metadata": {},
"outputs": [],
"source": [
"# read in the text file and split the lines into a list\n",
"with open('../dataset/storybook_text_cleaned0.txt') as f:\n",
" lines = f.readlines()\n",
" \n",
"# create a pandas DataFrame with one column named \"text\" and populate it with the lines from the text file\n",
"data0 = pd.DataFrame(lines, columns=['text'])\n",
"\n",
"# add a new column named \"score\" with a score of 1 for all rows\n",
"data0 ['score'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "376ec7ec-4e6c-4dbe-b848-224dcb35f0c2",
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat([df, data0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "823a4233-3133-4e27-884c-7157096fd6a4",
"metadata": {},
"outputs": [],
"source": [
"stop_words = ['oh','im','your', 'bye', 'if', 'because', 'how', 'no', 'not', 'yes','okay', 'ok', 'is', 'a', 'an', 'are', 'the', 'to', 'is', 'so', 'they', 'this', 'in', 'on', 'me', 'my', 'who', 'where', 'here', 'there', 'i', 'we', 'you', 'them', 'will', 'should', 'been', 'he', 'she','be','do', 'does', 'go','it', 'that', '?', ',', '!', '.', \"'\", '\"', ':']\n",
"df.loc[:, 'text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "636445cc-48c8-4b01-a3a7-920b4bec76b5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (Local)",
"language": "python",
"name": "local-base"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}