Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
22_23-J 18
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
22_23-J 18
22_23-J 18
Commits
1bb8987a
Commit
1bb8987a
authored
Apr 05, 2023
by
Lelkada L L P S M
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
filtration method 1 - data clean
parent
c9e0a138
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
0 additions
and
954 deletions
+0
-954
IT19001708/DEV/scripts/word_filtration.ipynb
IT19001708/DEV/scripts/word_filtration.ipynb
+0
-954
No files found.
IT19001708/DEV/scripts/word_filtration.ipynb
deleted
100644 → 0
View file @
c9e0a138
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d6b46218-294a-4e5d-b8b3-f5c191fdb042",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting nltk\n",
" Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.7/site-packages (from nltk) (1.2.0)\n",
"Collecting regex>=2021.8.3\n",
" Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.7/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from click->nltk) (6.0.1)\n",
"Requirement already satisfied: typing-extensions>=3.6.4 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->nltk) (4.5.0)\n",
"Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->nltk) (3.15.0)\n",
"Installing collected packages: regex, nltk\n",
"Successfully installed nltk-3.8.1 regex-2022.10.31\n"
]
}
],
"source": [
"!pip install nltk"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "5dc527d7-d2ba-4fe6-9659-790fd5be1558",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /home/jupyter/nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...\n"
]
}
],
"source": [
"import pandas as pd\n",
"import string\n",
"from collections import defaultdict\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"nltk.download('wordnet')\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.corpus import wordnet"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b71a9ac0-2766-4b51-9262-f027124f124d",
"metadata": {},
"outputs": [],
"source": [
"# Read the CSV file\n",
"data = pd.read_csv('../dataset/offensive_text_train_data.csv')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "56f535ce-df4e-424e-8329-39665930dc3d",
"metadata": {},
"outputs": [],
"source": [
"# 1. Merge text columns\n",
"data['text'] = data['prev_agent'].fillna('') + ' ' + data['prev_user'].fillna('') + ' ' + data['agent'].fillna('') + ' ' + data['user'].fillna('')\n",
"data['text'] = data['text'].replace(r'^\\s*$', '', regex=True).replace(r'^_$', '', regex=True).replace(r'^-$', '', regex=True).replace(r'^:\\)$', '', regex=True)\n",
"data['text'] = data['text'].str.lower()\n",
"\n",
"# Remove punctuations\n",
"translator = str.maketrans('', '', string.punctuation)\n",
"data['text'] = data['text'].apply(lambda x: x.translate(translator))\n",
"\n",
"# 2. Merge is_abuse columns into score\n",
"data['score'] = data.apply(lambda row: 1 if row['is_abuse.1'] == 1 else 0 if row['is_abuse.0'] == 1 else -1 if row['is_abuse.-1'] == 1 else -2 if row['is_abuse.-2'] == 1 else -3, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8745f580-cd96-4e65-81cd-ef8a8d758a77",
"metadata": {},
"outputs": [],
"source": [
"# 3. Merge type columns into one column called 'type'\n",
"type_columns = ['type.ableism', 'type.homophobic', 'type.intellectual', 'type.racist', 'type.sexist', 'type.sex_harassment', 'type.transphobic', 'target.generalised', 'target.individual', 'target.system', 'direction.explicit', 'direction.implicit']"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e29e1b27-6739-4562-aac2-ae9ccbdcef31",
"metadata": {},
"outputs": [],
"source": [
"def get_type(row):\n",
" for col in type_columns:\n",
" if row[col] == 1:\n",
" return col.split('.')[-1]\n",
" return None\n",
"\n",
"data['type'] = data.apply(get_type, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8fe9b2ee-f186-42ea-adf3-0a35eb20c29f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>does that question interest you because my fin...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>does it please you to believe i am a nobe jock...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>have you tried dick please go on oh what a pic...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>will you be travelling in economy class yes wh...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>please go on hello how do you do please state ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19060</th>\n",
" <td>please go on who are you would you prefer if i...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19061</th>\n",
" <td>and what city will you be flying to to bangalo...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19062</th>\n",
" <td>will you be travelling in economy class yes wh...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19063</th>\n",
" <td>what city will you be flying from from berlin ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19064</th>\n",
" <td>hello</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19065 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" text score\n",
"0 does that question interest you because my fin... 1\n",
"1 does it please you to believe i am a nobe jock... 0\n",
"2 have you tried dick please go on oh what a pic... 1\n",
"3 will you be travelling in economy class yes wh... 1\n",
"4 please go on hello how do you do please state ... 1\n",
"... ... ...\n",
"19060 please go on who are you would you prefer if i... 0\n",
"19061 and what city will you be flying to to bangalo... 1\n",
"19062 will you be travelling in economy class yes wh... 1\n",
"19063 what city will you be flying from from berlin ... 1\n",
"19064 hello 1\n",
"\n",
"[19065 rows x 2 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = data[['text', 'score']]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d84a2580-1ac1-4128-b917-778636df262b",
"metadata": {},
"outputs": [],
"source": [
"# read in the text file and split the lines into a list\n",
"with open('../dataset/storybook_text_cleaned0.txt') as f:\n",
" lines = f.readlines()\n",
" \n",
"# create a pandas DataFrame with one column named \"text\" and populate it with the lines from the text file\n",
"data0 = pd.DataFrame(lines, columns=['text'])\n",
"\n",
"# add a new column named \"score\" with a score of 1 for all rows\n",
"data0 ['score'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "376ec7ec-4e6c-4dbe-b848-224dcb35f0c2",
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat([df, data0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "823a4233-3133-4e27-884c-7157096fd6a4",
"metadata": {},
"outputs": [],
"source": [
"stop_words = ['oh','im','your', 'bye', 'if', 'because', 'how', 'no', 'not', 'yes','okay', 'ok', 'is', 'a', 'an', 'are', 'the', 'to', 'is', 'so', 'they', 'this', 'in', 'on', 'me', 'my', 'who', 'where', 'here', 'there', 'i', 'we', 'you', 'them', 'will', 'should', 'been', 'he', 'she','be','do', 'does', 'go','it', 'that', '?', ',', '!', '.', \"'\", '\"', ':']\n",
"df.loc[:, 'text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))\n"
]
},
{
"cell_type": "markdown",
"id": "da4e56f5-1e99-4a71-9457-801f316cd3b8",
"metadata": {},
"source": [
"df"
]
},
{
"cell_type": "markdown",
"id": "6b539ed0-bee0-4b94-8841-163936ce9613",
"metadata": {},
"source": [
"#### offensive scale - method 1"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "688e5d15-0be4-42a6-8ec5-bd3564fce076",
"metadata": {},
"outputs": [],
"source": [
"word_scores = defaultdict(int)\n",
"word_counts = defaultdict(int)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "7da18128-a410-458f-869d-1f1f5a839cf7",
"metadata": {},
"outputs": [],
"source": [
"# Loop through the dataset\n",
"for index, row in df.iterrows():\n",
" text, score = row['text'], row['score']\n",
" \n",
" # Tokenize the sentence (assuming preprocessed text is space-separated)\n",
" words = text.split()\n",
" \n",
" # Update word scores and counts\n",
" for word in words:\n",
" word_scores[word] += score\n",
" word_counts[word] += 1"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b18a31b7-a0ca-414c-a4c9-114386875543",
"metadata": {},
"outputs": [],
"source": [
"# Calculate average word scores\n",
"average_word_scores = {word: score / word_counts[word] for word, score in word_scores.items()}"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c087fe01-8184-48f3-8404-f09f3355566b",
"metadata": {},
"outputs": [],
"source": [
"# Create a new dataset with word scores\n",
"word_scores_df = pd.DataFrame(list(average_word_scores.items()), columns=['word', 'offensive_scale'])"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4492e07d-41ae-4b83-9298-eb9ef8839d25",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>offensive_scale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3141</th>\n",
" <td>entered</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3047</th>\n",
" <td>motherfucking</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3448</th>\n",
" <td>nnklkm</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3568</th>\n",
" <td>heard</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3569</th>\n",
" <td>conciousness</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1331</th>\n",
" <td>athens</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2494</th>\n",
" <td>inzicchillo</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2493</th>\n",
" <td>pisticchio</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1496</th>\n",
" <td>visit</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2099</th>\n",
" <td>sphincter</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3647 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" word offensive_scale\n",
"3141 entered -3.0\n",
"3047 motherfucking -3.0\n",
"3448 nnklkm -3.0\n",
"3568 heard -3.0\n",
"3569 conciousness -3.0\n",
"... ... ...\n",
"1331 athens 1.0\n",
"2494 inzicchillo 1.0\n",
"2493 pisticchio 1.0\n",
"1496 visit 1.0\n",
"2099 sphincter 1.0\n",
"\n",
"[3647 rows x 2 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_scores_df = word_scores_df.sort_values('offensive_scale', ascending=True)\n",
"word_scores_df"
]
},
{
"cell_type": "markdown",
"id": "6b172121-5cd7-4408-8591-6b20ed6dec33",
"metadata": {},
"source": [
"#### offensive scale - method 2"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "3aa9017f-da3b-4074-b61e-4cd1391a84d0",
"metadata": {},
"outputs": [],
"source": [
"word_scores = defaultdict(lambda: {'sum': 0, 'count': 0})\n",
"\n",
"for index, row in df.iterrows():\n",
" tokens = word_tokenize(row['text'])\n",
" score = row['score']\n",
"\n",
" for token in tokens:\n",
" word_scores[token.lower()]['sum'] += score\n",
" word_scores[token.lower()]['count'] += 1\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b5db525e-0571-4eb0-8ad0-e3eb03add816",
"metadata": {},
"outputs": [],
"source": [
"for word, values in word_scores.items():\n",
" word_scores[word] = values['sum'] / values['count']\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "1165fd1b-8624-4118-b096-71d1ea9993cf",
"metadata": {},
"outputs": [],
"source": [
"word_offensive_df = pd.DataFrame(list(word_scores.items()), columns=['word', 'offensive_scale'])\n",
"word_offensive_df['offensive_scale'] = word_offensive_df['offensive_scale'].apply(lambda x: round(x, 2))\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "9d5d67b7-d493-4784-994b-3b7da03f07f0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>offensive_scale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>question</td>\n",
" <td>0.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>interest</td>\n",
" <td>0.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>finger</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>happy</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>why</td>\n",
" <td>0.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3640</th>\n",
" <td>non</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3641</th>\n",
" <td>twenty</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3642</th>\n",
" <td>bananas</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3643</th>\n",
" <td>jimmy</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3644</th>\n",
" <td>tease</td>\n",
" <td>-2.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3645 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" word offensive_scale\n",
"0 question 0.70\n",
"1 interest 0.70\n",
"2 finger 1.00\n",
"3 happy 0.75\n",
"4 why 0.61\n",
"... ... ...\n",
"3640 non 1.00\n",
"3641 twenty 1.00\n",
"3642 bananas 0.50\n",
"3643 jimmy 0.50\n",
"3644 tease -2.00\n",
"\n",
"[3645 rows x 2 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_offensive_df"
]
},
{
"cell_type": "markdown",
"id": "64b9b371-41b6-4222-a653-be60d2538111",
"metadata": {},
"source": [
"### offensive scale - method 3"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "33c6c135-a4d9-4fa3-b8e4-fffad5d2443f",
"metadata": {},
"outputs": [],
"source": [
"def get_wordnet_pos(treebank_tag):\n",
" if treebank_tag.startswith('N'):\n",
" return wordnet.NOUN\n",
" elif treebank_tag.startswith('V'):\n",
" return wordnet.VERB\n",
" else:\n",
" return ''\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "3ebed656-8bf0-4e43-91df-b57e78c44870",
"metadata": {},
"outputs": [],
"source": [
"word_scores = defaultdict(lambda: {'sum': 0, 'count': 0})"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "a35f0427-6d89-4bfc-b619-a11e07f6812f",
"metadata": {},
"outputs": [],
"source": [
"for index, row in df.iterrows():\n",
" tokens = word_tokenize(row['text'])\n",
" score = row['score']\n",
"\n",
" tagged_tokens = nltk.pos_tag(tokens)\n",
" for token, pos in tagged_tokens:\n",
" token = token.lower()\n",
" wordnet_pos = get_wordnet_pos(pos)\n",
"\n",
" if wordnet_pos == wordnet.NOUN or wordnet_pos == wordnet.VERB:\n",
" weighted_score = score * 0.65\n",
" else:\n",
" weighted_score = score\n",
"\n",
" word_scores[token]['sum'] += weighted_score\n",
" word_scores[token]['count'] += 1\n",
"\n",
"for word, values in word_scores.items():\n",
" word_scores[word] = values['sum'] / values['count']\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "e28e58bd-5da8-42d6-8baf-f6125a5ba74d",
"metadata": {},
"outputs": [],
"source": [
"word_offensive_df = pd.DataFrame(list(word_scores.items()), columns=['word', 'offensive_scale'])\n",
"word_offensive_df['offensive_scale'] = word_offensive_df['offensive_scale'].apply(lambda x: round(x, 2))"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "010250bc-284a-4f9b-8c2f-ae94d4a84609",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>offensive_scale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>question</td>\n",
" <td>0.46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>interest</td>\n",
" <td>0.46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>finger</td>\n",
" <td>0.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>happy</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>why</td>\n",
" <td>0.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3640</th>\n",
" <td>non</td>\n",
" <td>0.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3641</th>\n",
" <td>twenty</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3642</th>\n",
" <td>bananas</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3643</th>\n",
" <td>jimmy</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3644</th>\n",
" <td>tease</td>\n",
" <td>-1.30</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3645 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" word offensive_scale\n",
"0 question 0.46\n",
"1 interest 0.46\n",
"2 finger 0.65\n",
"3 happy 0.75\n",
"4 why 0.61\n",
"... ... ...\n",
"3640 non 0.65\n",
"3641 twenty 1.00\n",
"3642 bananas 0.33\n",
"3643 jimmy 0.33\n",
"3644 tease -1.30\n",
"\n",
"[3645 rows x 2 columns]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_offensive_df"
]
},
{
"cell_type": "markdown",
"id": "2f630f03-7792-41cd-a075-d0dc2a0098cd",
"metadata": {},
"source": [
"### Save in MongoDB"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "ca805d34-be3e-49ec-9ec5-700a837d1d68",
"metadata": {},
"outputs": [],
"source": [
"import pymongo"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "9de9f8e6-9c8c-4313-823e-49339b7d3a28",
"metadata": {},
"outputs": [],
"source": [
"# connect to the MongoDB instance\n",
"client = pymongo.MongoClient(\"mongodb+srv://hearme:hearme678@cluster0.kz66vdr.mongodb.net/\")\n",
"\n",
"# select the database and collection\n",
"db = client[\"word_filtration\"]\n",
"collection = db[\"sensitivity_score\"]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "71256a3f-044e-4bc1-98bc-ac4d2e128fc6",
"metadata": {},
"outputs": [],
"source": [
"# insert each row of the DataFrame as a document into the collection\n",
"for index, row in word_offensive_df.iterrows():\n",
" document = {\"word\": row[\"word\"], \"score\": row[\"offensive_scale\"]}\n",
" result = collection.insert_one(document)"
]
},
{
"cell_type": "markdown",
"id": "aafbf51b-ff3b-49c7-952f-902ee628861c",
"metadata": {},
"source": [
"#### Retrieve score"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "3cd54b06-c132-45ba-9602-1a5ceee220fa",
"metadata": {},
"outputs": [],
"source": [
"def get_sensitivity_score(word):\n",
" client = pymongo.MongoClient(\"mongodb+srv://hearme:hearme678@cluster0.kz66vdr.mongodb.net/\")\n",
"\n",
" # select the database and collection\n",
" db = client[\"word_filtration\"]\n",
" collection = db[\"sensitivity_score\"]\n",
" \n",
" # find the word in the collection and return its sensitivity score\n",
" result = collection.find_one({\"word\": word})\n",
" if result:\n",
" return result[\"score\"]\n",
" else:\n",
" return -99"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "6f05439f-5fc6-4de0-9420-7117a9608cde",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.65"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_sensitivity_score('play')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "636445cc-48c8-4b01-a3a7-920b4bec76b5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (Local)",
"language": "python",
"name": "local-base"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment