clustering script

353f995f · Lelkada L L P S M · 0933145e · 353f995f · 353f995f · 0933145e
Commit 353f995f authored Mar 09, 2023 by Lelkada L L P S M
4 changed files
--- a/IT19001708/STG/dataset/kid_story_dataset.txt
+++ b/IT19001708/STG/dataset/kid_story_dataset.txt
--- a/IT19001708/STG/script/.ipynb_checkpoints/Untitled-checkpoint.ipynb
+++ b/IT19001708/STG/script/.ipynb_checkpoints/Untitled-checkpoint.ipynb
--- a/IT19001708/STG/script/Untitled.ipynb
+++ b/IT19001708/STG/script/Untitled.ipynb
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "51481245-33cd-4f7c-ae5e-2376e8006545",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# %pip (unlike !pip) installs into the environment of the running kernel.\n",
-    "%pip install -q transformers\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "aeea3911-8296-40a7-9770-9834031fb3d5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "from transformers import AutoTokenizer, TFAutoModel\n",
-    "import tensorflow as tf"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "70bad982-8669-4559-a579-e8b7ba905517",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Load the pretrained tokenizer and the TensorFlow model from the same checkpoint name.\n",
-    "checkpoint = 'bert-base-uncased'\n",
-    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
-    "model = TFAutoModel.from_pretrained(checkpoint)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "a3ad97ea-6074-4a09-b66e-ae8b79d816ef",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Read the filtered table and collect its 'word' column as a plain Python list.\n",
-    "df = pd.read_csv('../dataset/df_filtered.csv')\n",
-    "words = df.loc[:, 'word'].tolist()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "ab859a35-451e-4168-8277-e4d4d6e30e78",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Tokenize each word on its own, asking the tokenizer for TensorFlow tensors.\n",
-    "encoded_inputs = list(\n",
-    "    tokenizer(w, return_tensors='tf') for w in words\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "de2ce194-c5b6-48c6-bab9-6687abd51077",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Inspect the tokenizer output for the first word.\n",
-    "encoded_inputs[0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "d2d4363e-eec8-4559-8ea1-850ce2319b2c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Build one fixed-length vector per word.\n",
-    "# outputs[0][0] holds one row per token of the word, and the number of tokens\n",
-    "# differs between words; collecting those matrices as-is gives a ragged,\n",
-    "# object-typed result. Averaging over the token axis yields a single vector\n",
-    "# per word, so the embeddings stack into a regular 2-D table.\n",
-    "# NOTE(review): the mean includes any special tokens the tokenizer adds -\n",
-    "# confirm this is the desired word representation.\n",
-    "embeddings = []\n",
-    "with tf.device('/GPU:0'):\n",
-    "    for encoded_input in encoded_inputs:\n",
-    "        outputs = model(encoded_input)\n",
-    "        token_vectors = outputs[0][0].numpy()\n",
-    "        embeddings.append(token_vectors.mean(axis=0))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "dff2f4e7-a31b-439c-93b4-a9b75819d56a",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/lib/python3.7/site-packages/pandas/core/internals/construction.py:540: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
-      "  values = np.array([convert(v) for v in values])\n"
-     ]
-    },
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>0</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>[[-0.2112137, -0.057769418, -0.01924374, -0.00...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>[[-0.32676977, 0.032084987, -0.52493036, -0.44...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>[[-0.19533601, 0.24660303, -0.002797313, -0.06...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>[[-0.4519474, 0.18390995, -0.53327066, -0.2371...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>[[-0.052097924, 0.30908817, -0.039699733, 0.01...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3305</th>\n",
-       "      <td>[[-0.1235517, 0.43329853, -0.13414639, -0.1531...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3306</th>\n",
-       "      <td>[[-0.5598528, -0.07930106, -0.13959903, -0.069...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3307</th>\n",
-       "      <td>[[-0.39649808, 0.12856255, 0.21035239, -0.1410...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3308</th>\n",
-       "      <td>[[-0.5090915, 0.18303792, -0.29818898, 0.09537...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3309</th>\n",
-       "      <td>[[-0.25211066, 0.5173553, -0.0053291023, -0.33...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>3310 rows × 1 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                      0\n",
-       "0     [[-0.2112137, -0.057769418, -0.01924374, -0.00...\n",
-       "1     [[-0.32676977, 0.032084987, -0.52493036, -0.44...\n",
-       "2     [[-0.19533601, 0.24660303, -0.002797313, -0.06...\n",
-       "3     [[-0.4519474, 0.18390995, -0.53327066, -0.2371...\n",
-       "4     [[-0.052097924, 0.30908817, -0.039699733, 0.01...\n",
-       "...                                                 ...\n",
-       "3305  [[-0.1235517, 0.43329853, -0.13414639, -0.1531...\n",
-       "3306  [[-0.5598528, -0.07930106, -0.13959903, -0.069...\n",
-       "3307  [[-0.39649808, 0.12856255, 0.21035239, -0.1410...\n",
-       "3308  [[-0.5090915, 0.18303792, -0.29818898, 0.09537...\n",
-       "3309  [[-0.25211066, 0.5173553, -0.0053291023, -0.33...\n",
-       "\n",
-       "[3310 rows x 1 columns]"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Put the collected embeddings into a DataFrame and display it.\n",
-    "df_emb = pd.DataFrame(data=embeddings)\n",
-    "df_emb"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "55cb94bc-ce3c-4b4e-a58a-eb7ab728b34e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Persist the embeddings table as CSV (the row index is written too, pandas' default).\n",
-    "df_emb.to_csv(path_or_buf='../dataset/word_embeddings.csv')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bacca570-1e23-4ca7-b6ce-d6e29be434bd",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "TensorFlow 2 (Local)",
-   "language": "python",
-   "name": "local-tf2"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.7.12"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
--- a/IT19001708/STG/script/embeddigns_to_clustering.ipynb
+++ b/IT19001708/STG/script/embeddigns_to_clustering.ipynb