embeddings added

e07e034a · Lelkada L L P S M · b8a2a524 · e07e034a · e07e034a · e07e034a
Commit e07e034a authored Feb 26, 2023 by Lelkada L L P S M
7 changed files
--- a/IT19001708/STG/dataset/.ipynb_checkpoints/word_embeddings-checkpoint.csv
+++ b/IT19001708/STG/dataset/.ipynb_checkpoints/word_embeddings-checkpoint.csv
--- a/IT19001708/STG/dataset/df_filtered.csv
+++ b/IT19001708/STG/dataset/df_filtered.csv
--- a/IT19001708/STG/dataset/word_embeddings.csv
+++ b/IT19001708/STG/dataset/word_embeddings.csv
--- a/IT19001708/STG/script/.ipynb_checkpoints/Untitled-checkpoint.ipynb
+++ b/IT19001708/STG/script/.ipynb_checkpoints/Untitled-checkpoint.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "51481245-33cd-4f7c-ae5e-2376e8006545",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting transformers\n",
+      "  Using cached transformers-4.26.1-py3-none-any.whl (6.3 MB)\n",
+      "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n",
+      "  Using cached tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n",
+      "Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.7/site-packages (from transformers) (4.64.1)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from transformers) (6.0)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from transformers) (2.28.2)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.7/site-packages (from transformers) (23.0)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.7/site-packages (from transformers) (2022.10.31)\n",
+      "Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.7/site-packages (from transformers) (1.21.6)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from transformers) (3.9.0)\n",
+      "Collecting huggingface-hub<1.0,>=0.11.0\n",
+      "  Using cached huggingface_hub-0.12.1-py3-none-any.whl (190 kB)\n",
+      "Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from transformers) (6.0.0)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n",
+      "Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->transformers) (3.14.0)\n",
+      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (1.26.14)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2022.12.7)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2.1.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (3.4)\n",
+      "Installing collected packages: tokenizers, huggingface-hub, transformers\n",
+      "Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install transformers\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aeea3911-8296-40a7-9770-9834031fb3d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from transformers import AutoTokenizer, TFAutoModel\n",
+    "import tensorflow as tf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70bad982-8669-4559-a579-e8b7ba905517",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+    "model = TFAutoModel.from_pretrained('bert-base-uncased')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "a3ad97ea-6074-4a09-b66e-ae8b79d816ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the filtered word list first: the tokenization cell below reads `words`.\n",
+    "df = pd.read_csv('../dataset/df_filtered.csv')\n",
+    "words = df['word'].tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ab859a35-451e-4168-8277-e4d4d6e30e78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Tokenize each word on its own, producing one batch-of-one TF input per word.\n",
+    "encoded_inputs = [tokenizer(word, return_tensors='tf') for word in words]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "de2ce194-c5b6-48c6-bab9-6687abd51077",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'input_ids': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[ 101, 2612,  102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[1, 1, 1]], dtype=int32)>}"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "encoded_inputs[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2d4363e-eec8-4559-8ea1-850ce2319b2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build one fixed-length vector per word.\n",
+    "# outputs[0][0] holds one row per token ([CLS], word pieces, [SEP]), so its\n",
+    "# length varies from word to word; storing it as-is yields ragged arrays that\n",
+    "# pandas cannot lay out as numeric columns. Averaging over the token axis\n",
+    "# gives every word a vector of the same size.\n",
+    "embeddings = []\n",
+    "with tf.device('/GPU:0'):\n",
+    "    for encoded_input in encoded_inputs:\n",
+    "        outputs = model(encoded_input)\n",
+    "        token_vectors = outputs[0][0].numpy()\n",
+    "        embeddings.append(token_vectors.mean(axis=0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dff2f4e7-a31b-439c-93b4-a9b75819d56a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_emb = pd.DataFrame(embeddings)\n",
+    "df_emb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "55cb94bc-ce3c-4b4e-a58a-eb7ab728b34e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_emb.to_csv('../dataset/word_embeddings.csv')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "TensorFlow 2 (Local)",
+   "language": "python",
+   "name": "local-tf2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/IT19001708/STG/script/.ipynb_checkpoints/word_embeddings_for_train_dataset-checkpoint.ipynb
+++ b/IT19001708/STG/script/.ipynb_checkpoints/word_embeddings_for_train_dataset-checkpoint.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
@@ -10,15 +10,29 @@
    "id": "wrohN8tt5Ty8",
    "outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Installing collected packages: tokenizers, sentencepiece, regex, huggingface-hub, transformers, nltk, sentence-transformers\n",
+      "Successfully installed huggingface-hub-0.12.1 nltk-3.8.1 regex-2022.10.31 sentence-transformers-2.2.2 sentencepiece-0.1.97 tokenizers-0.13.2 transformers-4.26.1\n",
+      "Collecting langdetect\n",
+      "  Using cached langdetect-1.0.9-py3-none-any.whl\n",
+      "Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from langdetect) (1.16.0)\n",
+      "Installing collected packages: langdetect\n",
+      "Successfully installed langdetect-1.0.9\n"
+     ]
+    }
+   ],
   "source": [
    "!pip install sentence-transformers \n",
-    "!pip install langdetect"
+    "#!pip install langdetect"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
   "metadata": {
    "id": "XK-Fhom65eWi"
   },
@@ -28,12 +42,12 @@
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "import numpy as np\n",
    "import pandas as pd\n",
-    "from langdetect import detect"
+    "#from langdetect import detect"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": null,
   "metadata": {
    "id": "hqMzBp6m5iV6"
   },
@@ -44,7 +58,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": null,
   "metadata": {
    "id": "P4Prq9HG7d4j"
   },
@@ -56,7 +70,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": null,
   "metadata": {
    "id": "lxn2CudUEAGz"
   },
@@ -72,7 +86,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
   "metadata": {
    "id": "PH45i7Ym5wCF"
   },
@@ -83,6 +97,7 @@
    "\n",
    "  for index, value in df['word'].iteritems():\n",
    "    embedding_list.append(model.encode(value))\n",
+    "    print(value)\n",
    "  \n",
    "  print('Done')\n",
    "  \n",
@@ -98,7 +113,7 @@
   },
   "outputs": [],
   "source": [
-    "df_filtered = df_word[df_word['word'].apply(detect_language)]"
+    "#df_filtered = df_word[df_word['word'].apply(detect_language)]"
   ]
  },
  {
@@ -109,7 +124,7 @@
   },
   "outputs": [],
   "source": [
-    "li = generate_embeddings(df_filtered)"
+    "li = generate_embeddings(df_word)"
   ]
  },
  {
@@ -120,6 +135,22 @@
   "source": [
    "df_embbeding = pd.dataframe(li, columns=['embedding'])"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_embbeding.to_csv('../dataset/word_embeddings.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {
@@ -127,9 +158,9 @@
   "provenance": []
  },
  "kernelspec": {
-   "display_name": "default:Python",
+   "display_name": "Pytorch (Local)",
   "language": "python",
-   "name": "conda-env-default-py"
+   "name": "local-pytorch"
  },
  "language_info": {
   "codemirror_mode": {
@@ -141,7 +172,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.7.12"
  }
 },
 "nbformat": 4,

--- a/IT19001708/STG/script/Untitled.ipynb
+++ b/IT19001708/STG/script/Untitled.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51481245-33cd-4f7c-ae5e-2376e8006545",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %pip installs into the running kernel's environment; the version is pinned for reproducible runs.\n",
+    "%pip install transformers==4.26.1\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "aeea3911-8296-40a7-9770-9834031fb3d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from transformers import AutoTokenizer, TFAutoModel\n",
+    "import tensorflow as tf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70bad982-8669-4559-a579-e8b7ba905517",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
+    "model = TFAutoModel.from_pretrained('bert-base-uncased')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "a3ad97ea-6074-4a09-b66e-ae8b79d816ef",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('../dataset/df_filtered.csv')\n",
+    "words = df['word'].tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "ab859a35-451e-4168-8277-e4d4d6e30e78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "encoded_inputs = [tokenizer(word, return_tensors='tf') for word in words]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de2ce194-c5b6-48c6-bab9-6687abd51077",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "encoded_inputs[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d2d4363e-eec8-4559-8ea1-850ce2319b2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build one fixed-length vector per word.\n",
+    "# outputs[0][0] holds one row per token ([CLS], word pieces, [SEP]), so its\n",
+    "# length varies from word to word; storing it as-is yields ragged arrays that\n",
+    "# pandas cannot lay out as numeric columns. Averaging over the token axis\n",
+    "# gives every word a vector of the same size.\n",
+    "embeddings = []\n",
+    "with tf.device('/GPU:0'):\n",
+    "    for encoded_input in encoded_inputs:\n",
+    "        outputs = model(encoded_input)\n",
+    "        token_vectors = outputs[0][0].numpy()\n",
+    "        embeddings.append(token_vectors.mean(axis=0))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "dff2f4e7-a31b-439c-93b4-a9b75819d56a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.7/site-packages/pandas/core/internals/construction.py:540: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
+      "  values = np.array([convert(v) for v in values])\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>[[-0.2112137, -0.057769418, -0.01924374, -0.00...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>[[-0.32676977, 0.032084987, -0.52493036, -0.44...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>[[-0.19533601, 0.24660303, -0.002797313, -0.06...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>[[-0.4519474, 0.18390995, -0.53327066, -0.2371...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>[[-0.052097924, 0.30908817, -0.039699733, 0.01...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3305</th>\n",
+       "      <td>[[-0.1235517, 0.43329853, -0.13414639, -0.1531...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3306</th>\n",
+       "      <td>[[-0.5598528, -0.07930106, -0.13959903, -0.069...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3307</th>\n",
+       "      <td>[[-0.39649808, 0.12856255, 0.21035239, -0.1410...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3308</th>\n",
+       "      <td>[[-0.5090915, 0.18303792, -0.29818898, 0.09537...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3309</th>\n",
+       "      <td>[[-0.25211066, 0.5173553, -0.0053291023, -0.33...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>3310 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                      0\n",
+       "0     [[-0.2112137, -0.057769418, -0.01924374, -0.00...\n",
+       "1     [[-0.32676977, 0.032084987, -0.52493036, -0.44...\n",
+       "2     [[-0.19533601, 0.24660303, -0.002797313, -0.06...\n",
+       "3     [[-0.4519474, 0.18390995, -0.53327066, -0.2371...\n",
+       "4     [[-0.052097924, 0.30908817, -0.039699733, 0.01...\n",
+       "...                                                 ...\n",
+       "3305  [[-0.1235517, 0.43329853, -0.13414639, -0.1531...\n",
+       "3306  [[-0.5598528, -0.07930106, -0.13959903, -0.069...\n",
+       "3307  [[-0.39649808, 0.12856255, 0.21035239, -0.1410...\n",
+       "3308  [[-0.5090915, 0.18303792, -0.29818898, 0.09537...\n",
+       "3309  [[-0.25211066, 0.5173553, -0.0053291023, -0.33...\n",
+       "\n",
+       "[3310 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_emb = pd.DataFrame(embeddings)\n",
+    "df_emb"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "55cb94bc-ce3c-4b4e-a58a-eb7ab728b34e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_emb.to_csv('../dataset/word_embeddings.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bacca570-1e23-4ca7-b6ce-d6e29be434bd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "TensorFlow 2 (Local)",
+   "language": "python",
+   "name": "local-tf2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/IT19001708/STG/script/word_embeddings_for_train_dataset.ipynb
+++ b/IT19001708/STG/script/word_embeddings_for_train_dataset.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
@@ -10,7 +10,19 @@
    "id": "wrohN8tt5Ty8",
    "outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting langdetect\n",
+      "  Using cached langdetect-1.0.9-py3-none-any.whl\n",
+      "Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from langdetect) (1.16.0)\n",
+      "Installing collected packages: langdetect\n",
+      "Successfully installed langdetect-1.0.9\n"
+     ]
+    }
+   ],
   "source": [
    "!pip install sentence-transformers \n",
    "!pip install langdetect"
@@ -18,7 +30,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 17,
   "metadata": {
    "id": "XK-Fhom65eWi"
   },
@@ -33,7 +45,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 3,
   "metadata": {
    "id": "hqMzBp6m5iV6"
   },
@@ -44,11 +56,107 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 4,
   "metadata": {
    "id": "P4Prq9HG7d4j"
   },
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>word</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>peacock</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>kind</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>gentle</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>bird</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>lived</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15423</th>\n",
+       "      <td>downhearted</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15424</th>\n",
+       "      <td>slits</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15425</th>\n",
+       "      <td>whisky</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15426</th>\n",
+       "      <td>pocketful</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>15427</th>\n",
+       "      <td>sackfuls</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>15428 rows × 1 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              word\n",
+       "0          peacock\n",
+       "1             kind\n",
+       "2           gentle\n",
+       "3             bird\n",
+       "4            lived\n",
+       "...            ...\n",
+       "15423  downhearted\n",
+       "15424        slits\n",
+       "15425       whisky\n",
+       "15426    pocketful\n",
+       "15427     sackfuls\n",
+       "\n",
+       "[15428 rows x 1 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "df_word = pd.read_csv('../dataset/word_no_label.csv')\n",
    "df_word"
@@ -56,7 +164,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 18,
   "metadata": {
    "id": "lxn2CudUEAGz"
   },
@@ -72,7 +180,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 6,
   "metadata": {
    "id": "PH45i7Ym5wCF"
   },
@@ -80,9 +188,13 @@
   "source": [
    "def generate_embeddings(df):\n",
    "  embedding_list = []\n",
+    "  i = 0\n",
+    "\n",
    "\n",
    "  for index, value in df['word'].iteritems():\n",
+    "    i = i+1\n",
    "    embedding_list.append(model.encode(value))\n",
+    "    print(i)\n",
    "  \n",
    "  print('Done')\n",
    "  \n",
@@ -92,7 +204,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "metadata": {
    "id": "CSvPFot643YO"
   },
@@ -101,6 +213,15 @@
    "df_filtered = df_word[df_word['word'].apply(detect_language)]"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_filtered.to_csv('../dataset/df_filtered.csv')"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
@@ -118,7 +239,25 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "df_embbeding = pd.dataframe(li, columns=['embedding'])"
+    "df_embbeding = pd.DataFrame(li, columns=['embedding'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_embbeding.to_csv('../dataset/word_embeddings.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "l"
   ]
  }
 ],
@@ -127,9 +266,9 @@
   "provenance": []
  },
  "kernelspec": {
-   "display_name": "default:Python",
+   "display_name": "Pytorch (Local)",
   "language": "python",
-   "name": "conda-env-default-py"
+   "name": "local-pytorch"
  },
  "language_info": {
   "codemirror_mode": {
@@ -141,7 +280,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.7.12"
  }
 },
 "nbformat": 4,