Commit 61fad8c5 authored by Lelkada L L P S M's avatar Lelkada L L P S M

word filtration script - improvements

parent e140ae77
......@@ -466,13 +466,363 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "6b172121-5cd7-4408-8591-6b20ed6dec33",
"metadata": {},
"source": [
"#### offensive scale - method 2 (mean row score per token)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "3aa9017f-da3b-4074-b61e-4cd1391a84d0",
"metadata": {},
"outputs": [],
"source": [
"# Method 2: for every lower-cased token, accumulate the score of the row it\n",
"# appears in (added once per occurrence) and the number of occurrences.\n",
"word_scores = defaultdict(lambda: {'sum': 0, 'count': 0})\n",
"\n",
"# Only the 'text' and 'score' columns are read, so iterate over them directly.\n",
"for text, score in zip(df['text'], df['score']):\n",
"    for token in word_tokenize(text):\n",
"        entry = word_scores[token.lower()]  # lower-case once, reuse the bucket\n",
"        entry['sum'] += score\n",
"        entry['count'] += 1\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "b5db525e-0571-4eb0-8ad0-e3eb03add816",
"metadata": {},
"outputs": [],
"source": [
"# Collapse each {'sum', 'count'} accumulator into its mean score.\n",
"# Entries that are already plain numbers (the cell has been run before) are\n",
"# kept unchanged, so the cell can be re-run safely.\n",
"word_scores = {\n",
"    word: stats['sum'] / stats['count'] if isinstance(stats, dict) else stats\n",
"    for word, stats in word_scores.items()\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "1165fd1b-8624-4118-b096-71d1ea9993cf",
"metadata": {},
"outputs": [],
"source": [
"# Two-column frame (word, score) with every score rounded to 2 decimals.\n",
"word_offensive_df = pd.DataFrame(\n",
"    list(word_scores.items()),\n",
"    columns=['word', 'offensive_scale'],\n",
").assign(\n",
"    offensive_scale=lambda frame: frame['offensive_scale'].apply(lambda value: round(value, 2))\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "9d5d67b7-d493-4784-994b-3b7da03f07f0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>offensive_scale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>question</td>\n",
" <td>0.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>interest</td>\n",
" <td>0.70</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>finger</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>happy</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>why</td>\n",
" <td>0.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3640</th>\n",
" <td>non</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3641</th>\n",
" <td>twenty</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3642</th>\n",
" <td>bananas</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3643</th>\n",
" <td>jimmy</td>\n",
" <td>0.50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3644</th>\n",
" <td>tease</td>\n",
" <td>-2.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3645 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" word offensive_scale\n",
"0 question 0.70\n",
"1 interest 0.70\n",
"2 finger 1.00\n",
"3 happy 0.75\n",
"4 why 0.61\n",
"... ... ...\n",
"3640 non 1.00\n",
"3641 twenty 1.00\n",
"3642 bananas 0.50\n",
"3643 jimmy 0.50\n",
"3644 tease -2.00\n",
"\n",
"[3645 rows x 2 columns]"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_offensive_df"
]
},
{
"cell_type": "markdown",
"id": "64b9b371-41b6-4222-a653-be60d2538111",
"metadata": {},
"source": [
"#### offensive scale - method 3 (noun/verb scores weighted by 0.65)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "33c6c135-a4d9-4fa3-b8e4-fffad5d2443f",
"metadata": {},
"outputs": [],
"source": [
"def get_wordnet_pos(treebank_tag):\n",
"    \"\"\"Translate a treebank-style POS tag into a WordNet POS constant.\n",
"\n",
"    Tags starting with 'N' give ``wordnet.NOUN`` and tags starting with 'V'\n",
"    give ``wordnet.VERB``; every other tag gives the empty string.\n",
"    \"\"\"\n",
"    if treebank_tag.startswith('N'):\n",
"        return wordnet.NOUN\n",
"    if treebank_tag.startswith('V'):\n",
"        return wordnet.VERB\n",
"    # Anything that is neither a noun nor a verb has no mapping here.\n",
"    return ''\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "3ebed656-8bf0-4e43-91df-b57e78c44870",
"metadata": {},
"outputs": [],
"source": [
"# Fresh per-word accumulator: running score total and occurrence count.\n",
"word_scores = defaultdict(lambda: dict(sum=0, count=0))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "a35f0427-6d89-4bfc-b619-a11e07f6812f",
"metadata": {},
"outputs": [],
"source": [
"# Method 3: per-word mean score, where a row's score is scaled down before\n",
"# being added when the token is tagged as a noun or a verb.\n",
"NOUN_VERB_WEIGHT = 0.65  # multiplier applied to the score of noun/verb tokens\n",
"WEIGHTED_POS = (wordnet.NOUN, wordnet.VERB)\n",
"\n",
"# Accumulate into a cell-local mapping so this cell can be re-run on its own;\n",
"# `word_scores` is bound once, to the finished averages.\n",
"score_totals = defaultdict(lambda: {'sum': 0, 'count': 0})\n",
"\n",
"for text, score in zip(df['text'], df['score']):\n",
"    # Tag the original-case tokens; lower-case only when keying the totals.\n",
"    for token, pos in nltk.pos_tag(word_tokenize(text)):\n",
"        weight = NOUN_VERB_WEIGHT if get_wordnet_pos(pos) in WEIGHTED_POS else 1\n",
"        entry = score_totals[token.lower()]\n",
"        entry['sum'] += score * weight\n",
"        entry['count'] += 1\n",
"\n",
"# Mean weighted score per word; every entry has count >= 1 at this point.\n",
"word_scores = {\n",
"    word: totals['sum'] / totals['count']\n",
"    for word, totals in score_totals.items()\n",
"}\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "e28e58bd-5da8-42d6-8baf-f6125a5ba74d",
"metadata": {},
"outputs": [],
"source": [
"# Two-column frame (word, score) with every score rounded to 2 decimals.\n",
"word_offensive_df = pd.DataFrame(\n",
"    list(word_scores.items()),\n",
"    columns=['word', 'offensive_scale'],\n",
").assign(\n",
"    offensive_scale=lambda frame: frame['offensive_scale'].apply(lambda value: round(value, 2))\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "010250bc-284a-4f9b-8c2f-ae94d4a84609",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>word</th>\n",
" <th>offensive_scale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>question</td>\n",
" <td>0.46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>interest</td>\n",
" <td>0.46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>finger</td>\n",
" <td>0.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>happy</td>\n",
" <td>0.75</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>why</td>\n",
" <td>0.61</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3640</th>\n",
" <td>non</td>\n",
" <td>0.65</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3641</th>\n",
" <td>twenty</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3642</th>\n",
" <td>bananas</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3643</th>\n",
" <td>jimmy</td>\n",
" <td>0.33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3644</th>\n",
" <td>tease</td>\n",
" <td>-1.30</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3645 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" word offensive_scale\n",
"0 question 0.46\n",
"1 interest 0.46\n",
"2 finger 0.65\n",
"3 happy 0.75\n",
"4 why 0.61\n",
"... ... ...\n",
"3640 non 0.65\n",
"3641 twenty 1.00\n",
"3642 bananas 0.33\n",
"3643 jimmy 0.33\n",
"3644 tease -1.30\n",
"\n",
"[3645 rows x 2 columns]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"word_offensive_df"
]
}
],
"metadata": {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment