Commit 0933145e authored by Lelkada L L P S M

rebase

parents 1aa40656 e5255808
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "51481245-33cd-4f7c-ae5e-2376e8006545",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting transformers\n",
" Using cached transformers-4.26.1-py3-none-any.whl (6.3 MB)\n",
"Collecting tokenizers!=0.11.3,<0.14,>=0.11.1\n",
" Using cached tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)\n",
"Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.7/site-packages (from transformers) (4.64.1)\n",
"Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from transformers) (6.0)\n",
"Requirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from transformers) (2.28.2)\n",
"Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.7/site-packages (from transformers) (23.0)\n",
"Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.7/site-packages (from transformers) (2022.10.31)\n",
"Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.7/site-packages (from transformers) (1.21.6)\n",
"Requirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from transformers) (3.9.0)\n",
"Collecting huggingface-hub<1.0,>=0.11.0\n",
" Using cached huggingface_hub-0.12.1-py3-none-any.whl (190 kB)\n",
"Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from transformers) (6.0.0)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n",
"Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->transformers) (3.14.0)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (1.26.14)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2022.12.7)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2.1.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (3.4)\n",
"Installing collected packages: tokenizers, huggingface-hub, transformers\n",
"Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1\n"
]
}
],
"source": [
"!pip install transformers\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aeea3911-8296-40a7-9770-9834031fb3d5",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from transformers import AutoTokenizer, TFAutoModel\n",
"import tensorflow as tf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70bad982-8669-4559-a579-e8b7ba905517",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
"model = TFAutoModel.from_pretrained('bert-base-uncased')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ab859a35-451e-4168-8277-e4d4d6e30e78",
"metadata": {},
"outputs": [],
"source": [
"encoded_inputs = [tokenizer(word, return_tensors='tf') for word in words]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "a3ad97ea-6074-4a09-b66e-ae8b79d816ef",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('../dataset/df_filtered.csv')\n",
"words = df['word'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "de2ce194-c5b6-48c6-bab9-6687abd51077",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'input_ids': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[ 101, 2612, 102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 3), dtype=int32, numpy=array([[1, 1, 1]], dtype=int32)>}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"encoded_inputs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d2d4363e-eec8-4559-8ea1-850ce2319b2c",
"metadata": {},
"outputs": [],
"source": [
"embeddings = []\n",
"with tf.device('/GPU:0'):\n",
" for encoded_input in encoded_inputs:\n",
" outputs = model(encoded_input)\n",
" embedding = outputs[0][0].numpy()\n",
" embeddings.append(embedding)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dff2f4e7-a31b-439c-93b4-a9b75819d56a",
"metadata": {},
"outputs": [],
"source": [
"df_emb = pd.DataFrame(embeddings)\n",
"df_emb"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "55cb94bc-ce3c-4b4e-a58a-eb7ab728b34e",
"metadata": {},
"outputs": [],
"source": [
"df_emb.to_csv('../dataset/word_embeddings.csv')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "TensorFlow 2 (Local)",
"language": "python",
"name": "local-tf2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
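In the notebook above, `outputs[0][0]` keeps BERT's full last_hidden_state for the single item in the batch, so each stored embedding is a (num_tokens, 768) array rather than one vector per word. For reference, a minimal standalone sketch of the same pipeline, with an added mean-pooling step (an assumption, not in the original) to reduce each word to a single 768-dim vector:

import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = TFAutoModel.from_pretrained('bert-base-uncased')

df = pd.read_csv('../dataset/df_filtered.csv')
words = df['word'].tolist()

embeddings = []
for word in words:
    encoded = tokenizer(word, return_tensors='tf')
    outputs = model(encoded)
    token_vectors = outputs.last_hidden_state[0].numpy()  # (num_tokens, 768)
    embeddings.append(token_vectors.mean(axis=0))  # mean-pooling: an assumed step

pd.DataFrame(embeddings).to_csv('../dataset/word_embeddings.csv', index=False)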
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 1,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -10,15 +10,29 @@
 "id": "wrohN8tt5Ty8",
 "outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Installing collected packages: tokenizers, sentencepiece, regex, huggingface-hub, transformers, nltk, sentence-transformers\n",
+"Successfully installed huggingface-hub-0.12.1 nltk-3.8.1 regex-2022.10.31 sentence-transformers-2.2.2 sentencepiece-0.1.97 tokenizers-0.13.2 transformers-4.26.1\n",
+"Collecting langdetect\n",
+" Using cached langdetect-1.0.9-py3-none-any.whl\n",
+"Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from langdetect) (1.16.0)\n",
+"Installing collected packages: langdetect\n",
+"Successfully installed langdetect-1.0.9\n"
+]
+}
+],
 "source": [
 "!pip install sentence-transformers \n",
-"!pip install langdetect"
+"#!pip install langdetect"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": null,
 "metadata": {
 "id": "XK-Fhom65eWi"
 },
@@ -28,12 +42,12 @@
 "from sklearn.metrics.pairwise import cosine_similarity\n",
 "import numpy as np\n",
 "import pandas as pd\n",
-"from langdetect import detect"
+"#from langdetect import detect"
 ]
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": null,
 "metadata": {
 "id": "hqMzBp6m5iV6"
 },
@@ -44,7 +58,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": null,
 "metadata": {
 "id": "P4Prq9HG7d4j"
 },
@@ -56,7 +70,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": null,
 "metadata": {
 "id": "lxn2CudUEAGz"
 },
@@ -72,7 +86,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 15,
+"execution_count": null,
 "metadata": {
 "id": "PH45i7Ym5wCF"
 },
@@ -83,6 +97,7 @@
 "\n",
 " for index, value in df['word'].iteritems():\n",
 " embedding_list.append(model.encode(value))\n",
+" print(value)\n",
 " \n",
 " print('Done')\n",
 " \n",
@@ -98,7 +113,7 @@
 },
 "outputs": [],
 "source": [
-"df_filtered = df_word[df_word['word'].apply(detect_language)]"
+"#df_filtered = df_word[df_word['word'].apply(detect_language)]"
 ]
 },
 {
@@ -136,7 +151,11 @@
 "metadata": {},
 "outputs": [],
 "source": [
+<<<<<<< HEAD
 "df_embedding.to_csv(\"../dataset/word_embeddings.csv\")"
+=======
+"df_embbeding.to_csv('../dataset/word_embeddings.csv')"
+>>>>>>> e5255808ccac5081b6253049b2c08282f39bc16c
 ]
 },
 {
@@ -152,9 +171,9 @@
 "provenance": []
 },
 "kernelspec": {
-"display_name": "default:Python",
+"display_name": "Pytorch (Local)",
 "language": "python",
-"name": "conda-env-default-py"
+"name": "local-pytorch"
 },
 "language_info": {
 "codemirror_mode": {
@@ -166,7 +185,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.9.13"
+"version": "3.7.12"
 }
 },
 "nbformat": 4,
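The `detect_language` helper used in the `df_filtered` cell above is defined in a collapsed hunk. A plausible implementation consistent with its use as a boolean filter over `df_word['word']`, assuming it wraps langdetect, is:

from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

def detect_language(word):
    # Keep only tokens langdetect classifies as English; very short or
    # ambiguous strings raise LangDetectException and are dropped.
    try:
        return detect(word) == 'en'
    except LangDetectException:
        return False

df_filtered = df_word[df_word['word'].apply(detect_language)]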
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "51481245-33cd-4f7c-ae5e-2376e8006545",
"metadata": {},
"outputs": [],
"source": [
"!pip install transformers\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "aeea3911-8296-40a7-9770-9834031fb3d5",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from transformers import AutoTokenizer, TFAutoModel\n",
"import tensorflow as tf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70bad982-8669-4559-a579-e8b7ba905517",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
"model = TFAutoModel.from_pretrained('bert-base-uncased')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a3ad97ea-6074-4a09-b66e-ae8b79d816ef",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('../dataset/df_filtered.csv')\n",
"words = df['word'].tolist()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ab859a35-451e-4168-8277-e4d4d6e30e78",
"metadata": {},
"outputs": [],
"source": [
"encoded_inputs = [tokenizer(word, return_tensors='tf') for word in words]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de2ce194-c5b6-48c6-bab9-6687abd51077",
"metadata": {},
"outputs": [],
"source": [
"encoded_inputs[0]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d2d4363e-eec8-4559-8ea1-850ce2319b2c",
"metadata": {},
"outputs": [],
"source": [
"embeddings = []\n",
"with tf.device('/GPU:0'):\n",
" for encoded_input in encoded_inputs:\n",
" outputs = model(encoded_input)\n",
" embedding = outputs[0][0].numpy()\n",
" embeddings.append(embedding)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "dff2f4e7-a31b-439c-93b4-a9b75819d56a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.7/site-packages/pandas/core/internals/construction.py:540: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
" values = np.array([convert(v) for v in values])\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[[-0.2112137, -0.057769418, -0.01924374, -0.00...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[[-0.32676977, 0.032084987, -0.52493036, -0.44...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[[-0.19533601, 0.24660303, -0.002797313, -0.06...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[[-0.4519474, 0.18390995, -0.53327066, -0.2371...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[[-0.052097924, 0.30908817, -0.039699733, 0.01...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3305</th>\n",
" <td>[[-0.1235517, 0.43329853, -0.13414639, -0.1531...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3306</th>\n",
" <td>[[-0.5598528, -0.07930106, -0.13959903, -0.069...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3307</th>\n",
" <td>[[-0.39649808, 0.12856255, 0.21035239, -0.1410...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3308</th>\n",
" <td>[[-0.5090915, 0.18303792, -0.29818898, 0.09537...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3309</th>\n",
" <td>[[-0.25211066, 0.5173553, -0.0053291023, -0.33...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3310 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 [[-0.2112137, -0.057769418, -0.01924374, -0.00...\n",
"1 [[-0.32676977, 0.032084987, -0.52493036, -0.44...\n",
"2 [[-0.19533601, 0.24660303, -0.002797313, -0.06...\n",
"3 [[-0.4519474, 0.18390995, -0.53327066, -0.2371...\n",
"4 [[-0.052097924, 0.30908817, -0.039699733, 0.01...\n",
"... ...\n",
"3305 [[-0.1235517, 0.43329853, -0.13414639, -0.1531...\n",
"3306 [[-0.5598528, -0.07930106, -0.13959903, -0.069...\n",
"3307 [[-0.39649808, 0.12856255, 0.21035239, -0.1410...\n",
"3308 [[-0.5090915, 0.18303792, -0.29818898, 0.09537...\n",
"3309 [[-0.25211066, 0.5173553, -0.0053291023, -0.33...\n",
"\n",
"[3310 rows x 1 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_emb = pd.DataFrame(embeddings)\n",
"df_emb"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "55cb94bc-ce3c-4b4e-a58a-eb7ab728b34e",
"metadata": {},
"outputs": [],
"source": [
"df_emb.to_csv('../dataset/word_embeddings.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bacca570-1e23-4ca7-b6ce-d6e29be434bd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "TensorFlow 2 (Local)",
"language": "python",
"name": "local-tf2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
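The VisibleDeprecationWarning in cell 8 above is a signal that each embedding is itself a ragged (num_tokens, 768) array, so `to_csv` stringifies the arrays and reading word_embeddings.csv back yields strings, not numbers. A sketch of a lossless alternative, assuming an .npz file is acceptable in place of the CSV:

import numpy as np

# Save one array per word, keyed by row index (an assumed naming scheme).
np.savez('../dataset/word_embeddings.npz',
         **{str(i): emb for i, emb in enumerate(embeddings)})

# Reload with shapes and dtypes intact.
loaded = np.load('../dataset/word_embeddings.npz')
embeddings_back = [loaded[str(i)] for i in range(len(loaded.files))]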
@@ -2,7 +2,7 @@
 "cells": [
 {
 "cell_type": "code",
-"execution_count": null,
+"execution_count": 16,
 "metadata": {
 "colab": {
 "base_uri": "https://localhost:8080/"
@@ -10,7 +10,19 @@
 "id": "wrohN8tt5Ty8",
 "outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
 },
-"outputs": [],
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"Collecting langdetect\n",
+" Using cached langdetect-1.0.9-py3-none-any.whl\n",
+"Requirement already satisfied: six in /opt/conda/lib/python3.7/site-packages (from langdetect) (1.16.0)\n",
+"Installing collected packages: langdetect\n",
+"Successfully installed langdetect-1.0.9\n"
+]
+}
+],
 "source": [
 "!pip install sentence-transformers \n",
 "!pip install langdetect"
@@ -18,7 +30,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 9,
+"execution_count": 17,
 "metadata": {
 "id": "XK-Fhom65eWi"
 },
@@ -33,7 +45,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 10,
+"execution_count": 3,
 "metadata": {
 "id": "hqMzBp6m5iV6"
 },
@@ -44,11 +56,107 @@
 },
 {
 "cell_type": "code",
-"execution_count": 12,
+"execution_count": 4,
 "metadata": {
 "id": "P4Prq9HG7d4j"
 },
-"outputs": [],
+"outputs": [
+{
+"data": {
+"text/html": [
+"<div>\n",
+"<style scoped>\n",
+" .dataframe tbody tr th:only-of-type {\n",
+" vertical-align: middle;\n",
+" }\n",
+"\n",
+" .dataframe tbody tr th {\n",
+" vertical-align: top;\n",
+" }\n",
+"\n",
+" .dataframe thead th {\n",
+" text-align: right;\n",
+" }\n",
+"</style>\n",
+"<table border=\"1\" class=\"dataframe\">\n",
+" <thead>\n",
+" <tr style=\"text-align: right;\">\n",
+" <th></th>\n",
+" <th>word</th>\n",
+" </tr>\n",
+" </thead>\n",
+" <tbody>\n",
+" <tr>\n",
+" <th>0</th>\n",
+" <td>peacock</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>1</th>\n",
+" <td>kind</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>2</th>\n",
+" <td>gentle</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>3</th>\n",
+" <td>bird</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>4</th>\n",
+" <td>lived</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>...</th>\n",
+" <td>...</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>15423</th>\n",
+" <td>downhearted</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>15424</th>\n",
+" <td>slits</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>15425</th>\n",
+" <td>whisky</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>15426</th>\n",
+" <td>pocketful</td>\n",
+" </tr>\n",
+" <tr>\n",
+" <th>15427</th>\n",
+" <td>sackfuls</td>\n",
+" </tr>\n",
+" </tbody>\n",
+"</table>\n",
+"<p>15428 rows × 1 columns</p>\n",
+"</div>"
+],
+"text/plain": [
+" word\n",
+"0 peacock\n",
+"1 kind\n",
+"2 gentle\n",
+"3 bird\n",
+"4 lived\n",
+"... ...\n",
+"15423 downhearted\n",
+"15424 slits\n",
+"15425 whisky\n",
+"15426 pocketful\n",
+"15427 sackfuls\n",
+"\n",
+"[15428 rows x 1 columns]"
+]
+},
+"execution_count": 4,
+"metadata": {},
+"output_type": "execute_result"
+}
+],
 "source": [
 "df_word = pd.read_csv('../dataset/word_no_label.csv')\n",
 "df_word"
@@ -56,7 +164,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 14,
+"execution_count": 18,
 "metadata": {
 "id": "lxn2CudUEAGz"
 },
@@ -72,7 +180,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 15,
+"execution_count": 6,
 "metadata": {
 "id": "PH45i7Ym5wCF"
 },
@@ -80,9 +188,13 @@
 "source": [
 "def generate_embeddings(df):\n",
 " embedding_list = []\n",
+" i = 0\n",
+"\n",
 "\n",
 " for index, value in df['word'].iteritems():\n",
+" i = i+1\n",
 " embedding_list.append(model.encode(value))\n",
+" print(i)\n",
 " \n",
 " print('Done')\n",
 " \n",
@@ -92,7 +204,11 @@
 },
 {
 "cell_type": "code",
+<<<<<<< HEAD
 "execution_count": 17,
+=======
+"execution_count": 19,
+>>>>>>> e5255808ccac5081b6253049b2c08282f39bc16c
 "metadata": {
 "id": "CSvPFot643YO"
 },
@@ -101,6 +217,15 @@
 "df_filtered = df_word[df_word['word'].apply(detect_language)]"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 21,
+"metadata": {},
+"outputs": [],
+"source": [
+"df_filtered.to_csv('../dataset/df_filtered.csv')"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -127,7 +252,25 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"df_embbeding = pd.dataframe(li, columns=['embedding'])"
+"df_embbeding = pd.DataFrame(li, columns=['embedding'])"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": 11,
+"metadata": {},
+"outputs": [],
+"source": [
+"df_embbeding.to_csv('../dataset/word_embeddings.csv')"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"l"
+]
 },
 {
@@ -152,9 +295,9 @@
 "provenance": []
 },
 "kernelspec": {
-"display_name": "default:Python",
+"display_name": "Pytorch (Local)",
 "language": "python",
-"name": "conda-env-default-py"
+"name": "local-pytorch"
 },
 "language_info": {
 "codemirror_mode": {
@@ -166,7 +309,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.9.13"
+"version": "3.7.12"
 }
 },
 "nbformat": 4,
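Both sides of the conflicted hunk above add per-word progress printing inside generate_embeddings. SentenceTransformer.encode can batch the whole column in one call with a built-in progress bar instead; a sketch, with the checkpoint name assumed since the model-loading cell is collapsed:

from sentence_transformers import SentenceTransformer
import pandas as pd

model = SentenceTransformer('all-MiniLM-L6-v2')  # assumed checkpoint name

def generate_embeddings(df):
    # encode() accepts a list, batches internally, and reports progress,
    # replacing the manual loop with its print(i)/print(value) counters.
    return model.encode(df['word'].tolist(), show_progress_bar=True)

df_word = pd.read_csv('../dataset/word_no_label.csv')
embedding_list = generate_embeddings(df_word)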