Commit d0d487ac authored by Lelkada L L P S M's avatar Lelkada L L P S M

embedding generation script added

parent 78f96434
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wrohN8tt5Ty8",
"outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
},
"outputs": [],
"source": [
"# %pip (not !pip) installs into the kernel's own environment.\n",
"%pip install sentence-transformers\n",
"%pip install langdetect"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "XK-Fhom65eWi"
},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import pandas as pd\n",
"from langdetect import detect"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "hqMzBp6m5iV6"
},
"outputs": [],
"source": [
"model = SentenceTransformer('bert-base-nli-mean-tokens')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "P4Prq9HG7d4j"
},
"outputs": [],
"source": [
"df_word = pd.read_csv('../dataset/word_no_label.csv')\n",
"df_word"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "lxn2CudUEAGz"
},
"outputs": [],
"source": [
"def detect_language(word):\n",
"    \"\"\"Return True if langdetect classifies `word` as English, else False.\n",
"\n",
"    langdetect raises LangDetectException on very short or non-alphabetic\n",
"    input; such words are treated as non-English.\n",
"    \"\"\"\n",
"    try:\n",
"        return detect(word) == 'en'\n",
"    except Exception:\n",
"        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit\n",
"        # are no longer swallowed.\n",
"        return False"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "PH45i7Ym5wCF"
},
"outputs": [],
"source": [
"def generate_embeddings(df):\n",
"    \"\"\"Encode each entry of df['word'] with the sentence-transformer model.\n",
"\n",
"    Returns a list of embedding vectors, one per row, in row order.\n",
"    \"\"\"\n",
"    # Series.items() replaces iteritems(), which was removed in pandas 2.0.\n",
"    embedding_list = [model.encode(value) for _, value in df['word'].items()]\n",
"    print('Done')\n",
"    return embedding_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CSvPFot643YO"
},
"outputs": [],
"source": [
"df_filtered = df_word[df_word['word'].apply(detect_language)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "67i3XOPZ4y29"
},
"outputs": [],
"source": [
"li = generate_embeddings(df_filtered)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pd.DataFrame (capitalized) -- pd.dataframe does not exist and raises AttributeError.\n",
"# Build from a dict so each multi-dimensional embedding vector occupies one cell\n",
"# of a single 'embedding' column (positional columns= would require 1-wide rows).\n",
"df_embedding = pd.DataFrame({'embedding': li})\n",
"df_embedding"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
......@@ -12,8 +12,8 @@
},
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
......@@ -22,14 +22,14 @@
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"execution_count": 7,
"metadata": {},
"execution_count": 7
"output_type": "execute_result"
}
],
"source": [
......@@ -150,14 +150,14 @@
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"3014651"
]
},
"execution_count": 16,
"metadata": {},
"execution_count": 16
"output_type": "execute_result"
}
],
"source": [
......@@ -187,14 +187,14 @@
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"77937"
]
},
"execution_count": 18,
"metadata": {},
"execution_count": 18
"output_type": "execute_result"
}
],
"source": [
......@@ -205,15 +205,14 @@
"cell_type": "code",
"execution_count": 19,
"metadata": {
"id": "xCwh_I9VwPEw",
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xCwh_I9VwPEw",
"outputId": "faa25216-b368-4e7a-8728-344f0450b83e"
},
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/plain": [
"['lived',\n",
......@@ -1219,8 +1218,9 @@
" ...]"
]
},
"execution_count": 19,
"metadata": {},
"execution_count": 19
"output_type": "execute_result"
}
],
"source": [
......@@ -1255,13 +1255,23 @@
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 0
"nbformat_minor": 4
}
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wrohN8tt5Ty8",
"outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
},
"outputs": [],
"source": [
"# %pip (not !pip) installs into the kernel's own environment.\n",
"%pip install sentence-transformers\n",
"%pip install langdetect"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "XK-Fhom65eWi"
},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import pandas as pd\n",
"from langdetect import detect"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "hqMzBp6m5iV6"
},
"outputs": [],
"source": [
"model = SentenceTransformer('bert-base-nli-mean-tokens')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "P4Prq9HG7d4j"
},
"outputs": [],
"source": [
"df_word = pd.read_csv('../dataset/word_no_label.csv')\n",
"df_word"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "lxn2CudUEAGz"
},
"outputs": [],
"source": [
"def detect_language(word):\n",
"    \"\"\"Return True if langdetect classifies `word` as English, else False.\n",
"\n",
"    langdetect raises LangDetectException on very short or non-alphabetic\n",
"    input; such words are treated as non-English.\n",
"    \"\"\"\n",
"    try:\n",
"        return detect(word) == 'en'\n",
"    except Exception:\n",
"        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit\n",
"        # are no longer swallowed.\n",
"        return False"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "PH45i7Ym5wCF"
},
"outputs": [],
"source": [
"def generate_embeddings(df):\n",
"    \"\"\"Encode each entry of df['word'] with the sentence-transformer model.\n",
"\n",
"    Returns a list of embedding vectors, one per row, in row order.\n",
"    \"\"\"\n",
"    # Series.items() replaces iteritems(), which was removed in pandas 2.0.\n",
"    embedding_list = [model.encode(value) for _, value in df['word'].items()]\n",
"    print('Done')\n",
"    return embedding_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CSvPFot643YO"
},
"outputs": [],
"source": [
"df_filtered = df_word[df_word['word'].apply(detect_language)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "67i3XOPZ4y29"
},
"outputs": [],
"source": [
"li = generate_embeddings(df_filtered)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pd.DataFrame (capitalized) -- pd.dataframe does not exist and raises AttributeError.\n",
"# Build from a dict so each multi-dimensional embedding vector occupies one cell\n",
"# of a single 'embedding' column (positional columns= would require 1-wide rows).\n",
"df_embedding = pd.DataFrame({'embedding': li})\n",
"df_embedding"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment