Commit 1bb8987a authored by Lelkada L L P S M's avatar Lelkada L L P S M

filtration method 1 - data clean

parent c9e0a138
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d6b46218-294a-4e5d-b8b3-f5c191fdb042",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting nltk\n",
" Using cached nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.7/site-packages (from nltk) (4.64.1)\n",
"Requirement already satisfied: joblib in /opt/conda/lib/python3.7/site-packages (from nltk) (1.2.0)\n",
"Collecting regex>=2021.8.3\n",
" Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)\n",
"Requirement already satisfied: click in /opt/conda/lib/python3.7/site-packages (from nltk) (8.1.3)\n",
"Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from click->nltk) (6.0.1)\n",
"Requirement already satisfied: typing-extensions>=3.6.4 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->nltk) (4.5.0)\n",
"Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->click->nltk) (3.15.0)\n",
"Installing collected packages: regex, nltk\n",
"Successfully installed nltk-3.8.1 regex-2022.10.31\n"
]
}
],
"source": [
"%pip install nltk==3.8.1"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "5dc527d7-d2ba-4fe6-9659-790fd5be1558",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n",
"[nltk_data] Downloading package averaged_perceptron_tagger to\n",
"[nltk_data] /home/jupyter/nltk_data...\n",
"[nltk_data] Package averaged_perceptron_tagger is already up-to-\n",
"[nltk_data] date!\n",
"[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...\n"
]
}
],
"source": [
"import os\n",
"import string\n",
"from collections import defaultdict\n",
"\n",
"import nltk\n",
"import pandas as pd\n",
"from nltk.corpus import wordnet\n",
"from nltk.tokenize import word_tokenize\n",
"\n",
"nltk.download('punkt')\n",
"nltk.download('averaged_perceptron_tagger')\n",
"nltk.download('wordnet')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b71a9ac0-2766-4b51-9262-f027124f124d",
"metadata": {},
"outputs": [],
"source": [
"# Load the offensive-text training set from the dataset folder\n",
"train_csv_path = '../dataset/offensive_text_train_data.csv'\n",
"data = pd.read_csv(train_csv_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "56f535ce-df4e-424e-8329-39665930dc3d",
"metadata": {},
"outputs": [],
"source": [
"# 1. Merge the four dialogue-turn columns into one lower-cased text column.\n",
"#    Missing turns become empty strings so the concatenation never yields NaN.\n",
"text_columns = ['prev_agent', 'prev_user', 'agent', 'user']\n",
"merged_text = data[text_columns[0]].fillna('')\n",
"for column in text_columns[1:]:\n",
"    merged_text = merged_text + ' ' + data[column].fillna('')\n",
"\n",
"# Rows whose turns were all empty contain only the separators; blank them out.\n",
"# (The former '^_$', '^-$' and '^:\\)$' replacements were dropped: the merged\n",
"# string always contains separator spaces, so those anchored patterns could\n",
"# never match, and the punctuation is stripped below anyway.)\n",
"merged_text = merged_text.replace(r'^\\s*$', '', regex=True).str.lower()\n",
"\n",
"# Remove punctuation\n",
"translator = str.maketrans('', '', string.punctuation)\n",
"data['text'] = merged_text.apply(lambda x: x.translate(translator))\n",
"\n",
"# 2. Merge the one-hot is_abuse columns into a single score.\n",
"#    Labels are applied from lowest to highest priority so that, when several\n",
"#    flags are set on one row, the result keeps the original precedence\n",
"#    1 > 0 > -1 > -2; rows with no flag set get -3.\n",
"score = pd.Series(-3, index=data.index)\n",
"for label in (-2, -1, 0, 1):\n",
"    score = score.mask(data[f'is_abuse.{label}'] == 1, label)\n",
"data['score'] = score"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "8745f580-cd96-4e65-81cd-ef8a8d758a77",
"metadata": {},
"outputs": [],
"source": [
"# 3. Label columns (type.*, target.*, direction.*) that are merged into one 'type' column\n",
"type_columns = [\n",
"    'type.ableism', 'type.homophobic', 'type.intellectual', 'type.racist',\n",
"    'type.sexist', 'type.sex_harassment', 'type.transphobic',\n",
"    'target.generalised', 'target.individual', 'target.system',\n",
"    'direction.explicit', 'direction.implicit',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "e29e1b27-6739-4562-aac2-ae9ccbdcef31",
"metadata": {},
"outputs": [],
"source": [
"def get_type(row):\n",
"    \"\"\"Return the suffix of the first column in type_columns whose value in row is 1.\n",
"\n",
"    Columns are checked in the order they appear in type_columns; the part of\n",
"    the column name after the last '.' is returned, or None when no column\n",
"    equals 1.\n",
"    \"\"\"\n",
"    first_flagged = next((name for name in type_columns if row[name] == 1), None)\n",
"    if first_flagged is None:\n",
"        return None\n",
"    return first_flagged.split('.')[-1]\n",
"\n",
"data['type'] = data.apply(get_type, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8fe9b2ee-f186-42ea-adf3-0a35eb20c29f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>text</th>\n",
" <th>score</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>does that question interest you because my fin...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>does it please you to believe i am a nobe jock...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>have you tried dick please go on oh what a pic...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>will you be travelling in economy class yes wh...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>please go on hello how do you do please state ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19060</th>\n",
" <td>please go on who are you would you prefer if i...</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19061</th>\n",
" <td>and what city will you be flying to to bangalo...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19062</th>\n",
" <td>will you be travelling in economy class yes wh...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19063</th>\n",
" <td>what city will you be flying from from berlin ...</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19064</th>\n",
" <td>hello</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>19065 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" text score\n",
"0 does that question interest you because my fin... 1\n",
"1 does it please you to believe i am a nobe jock... 0\n",
"2 have you tried dick please go on oh what a pic... 1\n",
"3 will you be travelling in economy class yes wh... 1\n",
"4 please go on hello how do you do please state ... 1\n",
"... ... ...\n",
"19060 please go on who are you would you prefer if i... 0\n",
"19061 and what city will you be flying to to bangalo... 1\n",
"19062 will you be travelling in economy class yes wh... 1\n",
"19063 what city will you be flying from from berlin ... 1\n",
"19064 hello 1\n",
"\n",
"[19065 rows x 2 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep only the two columns used for word scoring\n",
"df = data.loc[:, ['text', 'score']]\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "d84a2580-1ac1-4128-b917-778636df262b",
"metadata": {},
"outputs": [],
"source": [
"# Read the storybook corpus, one text per line. The encoding is given explicitly\n",
"# so the result does not depend on the platform default (assumes the file is\n",
"# UTF-8 -- TODO confirm), and the trailing newline that readlines() would keep\n",
"# on every entry is stripped.\n",
"with open('../dataset/storybook_text_cleaned0.txt', encoding='utf-8') as f:\n",
"    lines = [line.rstrip('\\n') for line in f]\n",
"\n",
"# One-column DataFrame named \"text\" holding the lines of the file\n",
"data0 = pd.DataFrame(lines, columns=['text'])\n",
"\n",
"# Every storybook line gets a score of 1\n",
"data0['score'] = 1"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "376ec7ec-4e6c-4dbe-b848-224dcb35f0c2",
"metadata": {},
"outputs": [],
"source": [
"df = pd.concat([df, data0])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "823a4233-3133-4e27-884c-7157096fd6a4",
"metadata": {},
"outputs": [],
"source": [
"stop_words = ['oh','im','your', 'bye', 'if', 'because', 'how', 'no', 'not', 'yes','okay', 'ok', 'is', 'a', 'an', 'are', 'the', 'to', 'is', 'so', 'they', 'this', 'in', 'on', 'me', 'my', 'who', 'where', 'here', 'there', 'i', 'we', 'you', 'them', 'will', 'should', 'been', 'he', 'she','be','do', 'does', 'go','it', 'that', '?', ',', '!', '.', \"'\", '\"', ':']\n",
"# Set membership is O(1) per word; the list above is kept unchanged for readers.\n",
"stop_word_set = set(stop_words)\n",
"\n",
"# Filter df's own text. The previous version read data['text'], which covers only\n",
"# the offensive-text rows; index alignment then wrote those texts over the\n",
"# storybook rows appended by pd.concat, so the storybook corpus was lost.\n",
"df['text'] = df['text'].apply(\n",
"    lambda x: ' '.join(word for word in x.split() if word.lower() not in stop_word_set)\n",
")\n"
]
},
{
"cell_type": "markdown",
"id": "da4e56f5-1e99-4a71-9457-801f316cd3b8",
"metadata": {},
"source": [
"df"
]
},
{
"cell_type": "markdown",
"id": "6b539ed0-bee0-4b94-8841-163936ce9613",
"metadata": {},
"source": [
"#### offensive scale - method 1"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "688e5d15-0be4-42a6-8ec5-bd3564fce076",
"metadata": {},
"outputs": [],
"source": [
"word_scores = defaultdict(int)\n",
"word_counts = defaultdict(int)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "7da18128-a410-458f-869d-1f1f5a839cf7",
"metadata": {},
"outputs": [],
"source": [
"# For every word, add up the scores of the rows it occurs in and count its\n",
"# occurrences (the preprocessed text is split on whitespace).\n",
"for text, score in zip(df['text'], df['score']):\n",
"    for word in text.split():\n",
"        word_scores[word] += score\n",
"        word_counts[word] += 1"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "b18a31b7-a0ca-414c-a4c9-114386875543",
"metadata": {},
"outputs": [],
"source": [
"# Mean score per word: summed score divided by number of occurrences\n",
"average_word_scores = {}\n",
"for word, total in word_scores.items():\n",
"    average_word_scores[word] = total / word_counts[word]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "c087fe01-8184-48f3-8404-f09f3355566b",
"metadata": {},
"outputs": [],
"source": [
"# Tabulate the averaged scores, one row per word\n",
"word_scores_df = pd.DataFrame(\n",
"    data=list(average_word_scores.items()),\n",
"    columns=['word', 'offensive_scale'],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "4492e07d-41ae-4b83-9298-eb9ef8839d25",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>offensive_scale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>3141</th>\n",
" <td>entered</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3047</th>\n",
" <td>motherfucking</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3448</th>\n",
" <td>nnklkm</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3568</th>\n",
" <td>heard</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3569</th>\n",
" <td>conciousness</td>\n",
" <td>-3.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1331</th>\n",
" <td>athens</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2494</th>\n",
" <td>inzicchillo</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2493</th>\n",
" <td>pisticchio</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1496</th>\n",
" <td>visit</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2099</th>\n",
" <td>sphincter</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3647 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" word offensive_scale\n",
"3141 entered -3.0\n",
"3047 motherfucking -3.0\n",
"3448 nnklkm -3.0\n",
"3568 heard -3.0\n",
"3569 conciousness -3.0\n",
"... ... ...\n",
"1331 athens 1.0\n",
"2494 inzicchillo 1.0\n",
"2493 pisticchio 1.0\n",
"1496 visit 1.0\n",
"2099 sphincter 1.0\n",
"\n",
"[3647 rows x 2 columns]"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Order the words from lowest to highest offensive_scale\n",
"word_scores_df = word_scores_df.sort_values(by='offensive_scale')\n",
"word_scores_df"
]
},
{
"cell_type": "markdown",
"id": "6b172121-5cd7-4408-8591-6b20ed6dec33",
"metadata": {},
"source": [
"#### offensive scale - method 2"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "3aa9017f-da3b-4074-b61e-4cd1391a84d0",
"metadata": {},
"outputs": [],
"source": [
"# Method 2: tokenise with NLTK and key the totals by the lower-cased token,\n",
"# keeping the score sum and the occurrence count together per word.\n",
"word_scores = defaultdict(lambda: {'sum': 0, 'count': 0})\n",
"\n",
"for index, row in df.iterrows():\n",
"    score = row['score']\n",
"    for token in word_tokenize(row['text']):\n",
"        totals = word_scores[token.lower()]\n",
"        totals['sum'] += score\n",
"        totals['count'] += 1\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b5db525e-0571-4eb0-8ad0-e3eb03add816",
"metadata": {},
"outputs": [],
"source": [
"# Replace each {'sum', 'count'} accumulator with its mean. Entries that are no\n",
"# longer dicts were already averaged by an earlier run of this cell; skipping\n",
"# them lets the cell be re-run without raising a TypeError.\n",
"for word, values in word_scores.items():\n",
"    if isinstance(values, dict):\n",
"        word_scores[word] = values['sum'] / values['count']\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "1165fd1b-8624-4118-b096-71d1ea9993cf",
"metadata": {},
"outputs": [],
"source": [
"# Tabulate the per-word scores and round them to two decimals\n",
"word_offensive_df = pd.DataFrame(list(word_scores.items()), columns=['word', 'offensive_scale'])\n",
"rounded_scale = word_offensive_df['offensive_scale'].map(lambda value: round(value, 2))\n",
"word_offensive_df['offensive_scale'] = rounded_scale\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "9d5d67b7-d493-4784-994b-3b7da03f07f0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>offensive_scale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>question</td>\n",
" <td>0.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>interest</td>\n",
" <td>0.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>finger</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>happy</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>why</td>\n",
" <td>0.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3640</th>\n",
" <td>non</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3641</th>\n",
" <td>twenty</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3642</th>\n",
" <td>bananas</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3643</th>\n",
" <td>jimmy</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3644</th>\n",
" <td>tease</td>\n",
" <td>-2.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3645 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" word offensive_scale\n",
"0 question 0.70\n",
"1 interest 0.70\n",
"2 finger 1.00\n",
"3 happy 0.75\n",
"4 why 0.61\n",
"... ... ...\n",
"3640 non 1.00\n",
"3641 twenty 1.00\n",
"3642 bananas 0.50\n",
"3643 jimmy 0.50\n",
"3644 tease -2.00\n",
"\n",
"[3645 rows x 2 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_offensive_df"
]
},
{
"cell_type": "markdown",
"id": "64b9b371-41b6-4222-a653-be60d2538111",
"metadata": {},
"source": [
"#### offensive scale - method 3"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "33c6c135-a4d9-4fa3-b8e4-fffad5d2443f",
"metadata": {},
"outputs": [],
"source": [
"def get_wordnet_pos(treebank_tag):\n",
"    \"\"\"Map a Treebank POS tag to wordnet.NOUN or wordnet.VERB.\n",
"\n",
"    Tags starting with 'N' map to wordnet.NOUN and tags starting with 'V' to\n",
"    wordnet.VERB; any other tag yields an empty string.\n",
"    \"\"\"\n",
"    leading = treebank_tag[:1]\n",
"    if leading == 'N':\n",
"        return wordnet.NOUN\n",
"    if leading == 'V':\n",
"        return wordnet.VERB\n",
"    return ''\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "3ebed656-8bf0-4e43-91df-b57e78c44870",
"metadata": {},
"outputs": [],
"source": [
"word_scores = defaultdict(lambda: {'sum': 0, 'count': 0})"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "a35f0427-6d89-4bfc-b619-a11e07f6812f",
"metadata": {},
"outputs": [],
"source": [
"# Tokens tagged as nouns or verbs contribute this fraction of the row score;\n",
"# all other tokens contribute the full score.\n",
"NOUN_VERB_WEIGHT = 0.65\n",
"\n",
"# Accumulate in a dict local to this cell. word_scores is only rebound at the end,\n",
"# so re-running the cell starts from empty accumulators instead of failing on the\n",
"# float averages left in word_scores by a previous run.\n",
"score_totals = defaultdict(lambda: {'sum': 0, 'count': 0})\n",
"\n",
"for index, row in df.iterrows():\n",
"    score = row['score']\n",
"    tagged_tokens = nltk.pos_tag(word_tokenize(row['text']))\n",
"\n",
"    for token, pos in tagged_tokens:\n",
"        is_noun_or_verb = get_wordnet_pos(pos) in (wordnet.NOUN, wordnet.VERB)\n",
"        weighted_score = score * NOUN_VERB_WEIGHT if is_noun_or_verb else score\n",
"\n",
"        totals = score_totals[token.lower()]\n",
"        totals['sum'] += weighted_score\n",
"        totals['count'] += 1\n",
"\n",
"# Average per word: weighted score sum divided by occurrence count\n",
"word_scores = {word: values['sum'] / values['count'] for word, values in score_totals.items()}\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "e28e58bd-5da8-42d6-8baf-f6125a5ba74d",
"metadata": {},
"outputs": [],
"source": [
"# Tabulate the per-word scores and round them to two decimals\n",
"word_offensive_df = pd.DataFrame(list(word_scores.items()), columns=['word', 'offensive_scale'])\n",
"rounded_scale = word_offensive_df['offensive_scale'].map(lambda value: round(value, 2))\n",
"word_offensive_df['offensive_scale'] = rounded_scale"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "010250bc-284a-4f9b-8c2f-ae94d4a84609",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>offensive_scale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>question</td>\n",
" <td>0.46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>interest</td>\n",
" <td>0.46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>finger</td>\n",
" <td>0.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>happy</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>why</td>\n",
" <td>0.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3640</th>\n",
" <td>non</td>\n",
" <td>0.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3641</th>\n",
" <td>twenty</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3642</th>\n",
" <td>bananas</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3643</th>\n",
" <td>jimmy</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3644</th>\n",
" <td>tease</td>\n",
" <td>-1.30</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3645 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" word offensive_scale\n",
"0 question 0.46\n",
"1 interest 0.46\n",
"2 finger 0.65\n",
"3 happy 0.75\n",
"4 why 0.61\n",
"... ... ...\n",
"3640 non 0.65\n",
"3641 twenty 1.00\n",
"3642 bananas 0.33\n",
"3643 jimmy 0.33\n",
"3644 tease -1.30\n",
"\n",
"[3645 rows x 2 columns]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_offensive_df"
]
},
{
"cell_type": "markdown",
"id": "2f630f03-7792-41cd-a075-d0dc2a0098cd",
"metadata": {},
"source": [
"### Save in MongoDB"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "ca805d34-be3e-49ec-9ec5-700a837d1d68",
"metadata": {},
"outputs": [],
"source": [
"import pymongo"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "9de9f8e6-9c8c-4313-823e-49339b7d3a28",
"metadata": {},
"outputs": [],
"source": [
"# Connect to the MongoDB instance. The connection string (which carries the\n",
"# username and password) is read from the MONGODB_URI environment variable so the\n",
"# credentials are not stored in the notebook. The password that was previously\n",
"# hard-coded here has been committed and should be rotated.\n",
"client = pymongo.MongoClient(os.environ[\"MONGODB_URI\"])\n",
"\n",
"# select the database and collection\n",
"db = client[\"word_filtration\"]\n",
"collection = db[\"sensitivity_score\"]"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "71256a3f-044e-4bc1-98bc-ac4d2e128fc6",
"metadata": {},
"outputs": [],
"source": [
"# Write one document per word in a single bulk request. Each write is an upsert\n",
"# keyed on \"word\", so re-running the cell updates the stored score instead of\n",
"# adding a duplicate document that find_one({\"word\": ...}) could return in place\n",
"# of the new one. float() makes sure a plain Python float is sent to the driver.\n",
"operations = [\n",
"    pymongo.UpdateOne({\"word\": word}, {\"$set\": {\"score\": float(score)}}, upsert=True)\n",
"    for word, score in zip(word_offensive_df[\"word\"], word_offensive_df[\"offensive_scale\"])\n",
"]\n",
"# bulk_write rejects an empty request list, so skip the call when there is nothing to store\n",
"if operations:\n",
"    result = collection.bulk_write(operations, ordered=False)"
]
},
{
"cell_type": "markdown",
"id": "aafbf51b-ff3b-49c7-952f-902ee628861c",
"metadata": {},
"source": [
"#### Retrieve score"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "3cd54b06-c132-45ba-9602-1a5ceee220fa",
"metadata": {},
"outputs": [],
"source": [
"def get_sensitivity_score(word, default=-99):\n",
"    \"\"\"Look up the stored sensitivity score of a word.\n",
"\n",
"    Args:\n",
"        word: Value matched against the \"word\" field of the\n",
"            word_filtration.sensitivity_score collection.\n",
"        default: Value returned when no document matches (-99 if not given).\n",
"\n",
"    Returns:\n",
"        The \"score\" field of the matching document, or default.\n",
"\n",
"    Raises:\n",
"        KeyError: If the MONGODB_URI environment variable is not set.\n",
"    \"\"\"\n",
"    # Credentials come from the environment rather than the notebook source.\n",
"    # The context manager closes the client opened for this call, which the\n",
"    # previous version left open on every lookup.\n",
"    with pymongo.MongoClient(os.environ[\"MONGODB_URI\"]) as client:\n",
"        collection = client[\"word_filtration\"][\"sensitivity_score\"]\n",
"        result = collection.find_one({\"word\": word})\n",
"\n",
"    if result is None:\n",
"        return default\n",
"    return result[\"score\"]"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "6f05439f-5fc6-4de0-9420-7117a9608cde",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.65"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_sensitivity_score('play')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "636445cc-48c8-4b01-a3a7-920b4bec76b5",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (Local)",
"language": "python",
"name": "local-base"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment