word filtration script - improvements

61fad8c5 · Lelkada L L P S M · e140ae77 · 61fad8c5
Commit 61fad8c5 authored Apr 12, 2023 by Lelkada L L P S M
Hide whitespace changes
Inline Side-by-side

Showing with 351 additions and 1 deletion

IT19001708/STG/script/word_filtration.ipynb IT19001708/STG/script/word_filtration.ipynb +351 -1

No files found.
--- a/IT19001708/STG/script/word_filtration.ipynb
+++ b/IT19001708/STG/script/word_filtration.ipynb
@@ -466,13 +466,363 @@
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "id": "6b172121-5cd7-4408-8591-6b20ed6dec33",
   "metadata": {},
   "source": [
    "#### offensive scale - method 2"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "3aa9017f-da3b-4074-b61e-4cd1391a84d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For every lower-cased token, accumulate the total of the scores of the\n",
+    "# rows it appears in ('sum') and the number of occurrences ('count').\n",
+    "word_scores = defaultdict(lambda: {'sum': 0, 'count': 0})\n",
+    "\n",
+    "for _, record in df.iterrows():\n",
+    "    row_score = record['score']\n",
+    "\n",
+    "    for raw_token in word_tokenize(record['text']):\n",
+    "        stats = word_scores[raw_token.lower()]\n",
+    "        stats['sum'] += row_score\n",
+    "        stats['count'] += 1\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "b5db525e-0571-4eb0-8ad0-e3eb03add816",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Collapse each word's {'sum', 'count'} accumulator into its mean score.\n",
+    "# A new dict is built instead of overwriting entries of the dict being\n",
+    "# iterated, and values that are already numeric are passed through, so\n",
+    "# re-running this cell does not raise TypeError on the float entries.\n",
+    "word_scores = {\n",
+    "    word: (values['sum'] / values['count']) if isinstance(values, dict) else values\n",
+    "    for word, values in word_scores.items()\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "1165fd1b-8624-4118-b096-71d1ea9993cf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tabulate the per-word scores, rounding each one to two decimal places.\n",
+    "word_offensive_df = pd.DataFrame(\n",
+    "    list(word_scores.items()),\n",
+    "    columns=['word', 'offensive_scale'],\n",
+    ").assign(\n",
+    "    offensive_scale=lambda frame: frame['offensive_scale'].apply(lambda value: round(value, 2))\n",
+    ")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "9d5d67b7-d493-4784-994b-3b7da03f07f0",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "      <th>offensive_scale</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>question</td>\n",
+       "      <td>0.70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>interest</td>\n",
+       "      <td>0.70</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>finger</td>\n",
+       "      <td>1.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>happy</td>\n",
+       "      <td>0.75</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>why</td>\n",
+       "      <td>0.61</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3640</th>\n",
+       "      <td>non</td>\n",
+       "      <td>1.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3641</th>\n",
+       "      <td>twenty</td>\n",
+       "      <td>1.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3642</th>\n",
+       "      <td>bananas</td>\n",
+       "      <td>0.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3643</th>\n",
+       "      <td>jimmy</td>\n",
+       "      <td>0.50</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3644</th>\n",
+       "      <td>tease</td>\n",
+       "      <td>-2.00</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3645 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          word  offensive_scale\n",
+       "0     question             0.70\n",
+       "1     interest             0.70\n",
+       "2       finger             1.00\n",
+       "3        happy             0.75\n",
+       "4          why             0.61\n",
+       "...        ...              ...\n",
+       "3640       non             1.00\n",
+       "3641    twenty             1.00\n",
+       "3642   bananas             0.50\n",
+       "3643     jimmy             0.50\n",
+       "3644     tease            -2.00\n",
+       "\n",
+       "[3645 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 22,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_offensive_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "64b9b371-41b6-4222-a653-be60d2538111",
+   "metadata": {},
+   "source": [
+    "#### offensive scale - method 3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "33c6c135-a4d9-4fa3-b8e4-fffad5d2443f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_wordnet_pos(treebank_tag):\n",
+    "    '''Translate a POS tag into a WordNet part-of-speech constant.\n",
+    "\n",
+    "    Tags starting with 'N' give wordnet.NOUN and tags starting with 'V'\n",
+    "    give wordnet.VERB; any other tag gives the empty string.\n",
+    "    '''\n",
+    "    prefix_to_pos = (('N', wordnet.NOUN), ('V', wordnet.VERB))\n",
+    "    for prefix, wordnet_pos in prefix_to_pos:\n",
+    "        if treebank_tag.startswith(prefix):\n",
+    "            return wordnet_pos\n",
+    "    return ''\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "3ebed656-8bf0-4e43-91df-b57e78c44870",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "word_scores = defaultdict(lambda: {'sum': 0, 'count': 0})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "a35f0427-6d89-4bfc-b619-a11e07f6812f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Per-word mean score in which a row's score is scaled by NOUN_VERB_WEIGHT\n",
+    "# whenever the token is tagged as a noun or a verb.\n",
+    "NOUN_VERB_WEIGHT = 0.65\n",
+    "\n",
+    "# Start from empty accumulators inside this cell: after the averaging step\n",
+    "# below has run, word_scores holds plain floats, so re-running the loop\n",
+    "# against that object would raise TypeError.\n",
+    "word_scores = defaultdict(lambda: {'sum': 0, 'count': 0})\n",
+    "\n",
+    "for index, row in df.iterrows():\n",
+    "    tokens = word_tokenize(row['text'])\n",
+    "    score = row['score']\n",
+    "\n",
+    "    for token, pos in nltk.pos_tag(tokens):\n",
+    "        if get_wordnet_pos(pos) in (wordnet.NOUN, wordnet.VERB):\n",
+    "            weighted_score = score * NOUN_VERB_WEIGHT\n",
+    "        else:\n",
+    "            weighted_score = score\n",
+    "\n",
+    "        stats = word_scores[token.lower()]\n",
+    "        stats['sum'] += weighted_score\n",
+    "        stats['count'] += 1\n",
+    "\n",
+    "# Build a new word -> mean score mapping rather than overwriting entries of\n",
+    "# the dict that is being iterated.\n",
+    "word_scores = {\n",
+    "    word: values['sum'] / values['count']\n",
+    "    for word, values in word_scores.items()\n",
+    "}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "e28e58bd-5da8-42d6-8baf-f6125a5ba74d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tabulate the per-word scores, rounding each one to two decimal places.\n",
+    "word_offensive_df = pd.DataFrame(\n",
+    "    list(word_scores.items()),\n",
+    "    columns=['word', 'offensive_scale'],\n",
+    ").assign(\n",
+    "    offensive_scale=lambda frame: frame['offensive_scale'].apply(lambda value: round(value, 2))\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "010250bc-284a-4f9b-8c2f-ae94d4a84609",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "      <th>offensive_scale</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>question</td>\n",
+       "      <td>0.46</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>interest</td>\n",
+       "      <td>0.46</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>finger</td>\n",
+       "      <td>0.65</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>happy</td>\n",
+       "      <td>0.75</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>why</td>\n",
+       "      <td>0.61</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3640</th>\n",
+       "      <td>non</td>\n",
+       "      <td>0.65</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3641</th>\n",
+       "      <td>twenty</td>\n",
+       "      <td>1.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3642</th>\n",
+       "      <td>bananas</td>\n",
+       "      <td>0.33</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3643</th>\n",
+       "      <td>jimmy</td>\n",
+       "      <td>0.33</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3644</th>\n",
+       "      <td>tease</td>\n",
+       "      <td>-1.30</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3645 rows × 2 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          word  offensive_scale\n",
+       "0     question             0.46\n",
+       "1     interest             0.46\n",
+       "2       finger             0.65\n",
+       "3        happy             0.75\n",
+       "4          why             0.61\n",
+       "...        ...              ...\n",
+       "3640       non             0.65\n",
+       "3641    twenty             1.00\n",
+       "3642   bananas             0.33\n",
+       "3643     jimmy             0.33\n",
+       "3644     tease            -1.30\n",
+       "\n",
+       "[3645 rows x 2 columns]"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "word_offensive_df"
+   ]
  }
 ],
 "metadata": {