Commit b8a2a524 authored by Lelkada L L P S M's avatar Lelkada L L P S M
parents dc169c64 d0d487ac
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wrohN8tt5Ty8",
"outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
},
"outputs": [],
"source": [
"# %pip (unlike !pip) installs into the environment of the running kernel.\n",
"# TODO: pin package versions for reproducibility.\n",
"%pip install sentence-transformers langdetect"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "XK-Fhom65eWi"
},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import pandas as pd\n",
"from langdetect import detect"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "hqMzBp6m5iV6"
},
"outputs": [],
"source": [
"# Sentence-embedding model; generate_embeddings() reads this global.\n",
"model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "P4Prq9HG7d4j"
},
"outputs": [],
"source": [
"# Load the word list; the bare name on the last line displays the frame.\n",
"df_word = pd.read_csv(filepath_or_buffer='../dataset/word_no_label.csv')\n",
"df_word"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "lxn2CudUEAGz"
},
"outputs": [],
"source": [
"def detect_language(word):\n",
"    \"\"\"Return True when langdetect classifies `word` as English ('en').\n",
"\n",
"    Returns False for any other detected language, for input that is not a\n",
"    non-blank string, and whenever detection raises.\n",
"    \"\"\"\n",
"    # Missing values / non-text cells cannot be classified; skip the detector.\n",
"    if not isinstance(word, str) or not word.strip():\n",
"        return False\n",
"    try:\n",
"        return detect(word) == 'en'\n",
"    except Exception:\n",
"        # Best-effort filter: a failed detection counts as not English. Catching\n",
"        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.\n",
"        return False"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "PH45i7Ym5wCF"
},
"outputs": [],
"source": [
"def generate_embeddings(df):\n",
"    \"\"\"Encode every entry of df['word'] with the global `model`.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a 'word' column.\n",
"\n",
"    Returns:\n",
"        list holding one `model.encode` result per row, in row order.\n",
"    \"\"\"\n",
"    # Iterate the Series directly: Series.iteritems() was removed in pandas 2.0.\n",
"    # NOTE(review): model.encode presumably also accepts a list of sentences;\n",
"    # batching would likely be faster - confirm before switching.\n",
"    embedding_list = [model.encode(value) for value in df['word']]\n",
"\n",
"    print('Done')\n",
"\n",
"    return embedding_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CSvPFot643YO"
},
"outputs": [],
"source": [
"# Keep only the rows whose 'word' value detect_language() accepts.\n",
"df_filtered = df_word.loc[df_word['word'].apply(detect_language)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "67i3XOPZ4y29"
},
"outputs": [],
"source": [
"# One embedding per row of the filtered frame.\n",
"li = generate_embeddings(df=df_filtered)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pd.DataFrame is the constructor (pd.dataframe does not exist). Building from a\n",
"# dict keeps each embedding vector as a single cell of the 'embedding' column.\n",
"df_embbeding = pd.DataFrame({'embedding': li})"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wrohN8tt5Ty8",
"outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
},
"outputs": [],
"source": [
"# %pip (unlike !pip) installs into the environment of the running kernel.\n",
"# TODO: pin package versions for reproducibility.\n",
"%pip install sentence-transformers langdetect"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "XK-Fhom65eWi"
},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import pandas as pd\n",
"from langdetect import detect"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "hqMzBp6m5iV6"
},
"outputs": [],
"source": [
"# Sentence-embedding model; generate_embeddings() reads this global.\n",
"model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "P4Prq9HG7d4j"
},
"outputs": [],
"source": [
"# Load the word list; the bare name on the last line displays the frame.\n",
"df_word = pd.read_csv(filepath_or_buffer='../dataset/word_no_label.csv')\n",
"df_word"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "lxn2CudUEAGz"
},
"outputs": [],
"source": [
"def detect_language(word):\n",
"    \"\"\"Return True when langdetect classifies `word` as English ('en').\n",
"\n",
"    Returns False for any other detected language, for input that is not a\n",
"    non-blank string, and whenever detection raises.\n",
"    \"\"\"\n",
"    # Missing values / non-text cells cannot be classified; skip the detector.\n",
"    if not isinstance(word, str) or not word.strip():\n",
"        return False\n",
"    try:\n",
"        return detect(word) == 'en'\n",
"    except Exception:\n",
"        # Best-effort filter: a failed detection counts as not English. Catching\n",
"        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.\n",
"        return False"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "PH45i7Ym5wCF"
},
"outputs": [],
"source": [
"def generate_embeddings(df):\n",
"    \"\"\"Encode every entry of df['word'] with the global `model`.\n",
"\n",
"    Args:\n",
"        df: DataFrame with a 'word' column.\n",
"\n",
"    Returns:\n",
"        list holding one `model.encode` result per row, in row order.\n",
"    \"\"\"\n",
"    # Iterate the Series directly: Series.iteritems() was removed in pandas 2.0.\n",
"    # NOTE(review): model.encode presumably also accepts a list of sentences;\n",
"    # batching would likely be faster - confirm before switching.\n",
"    embedding_list = [model.encode(value) for value in df['word']]\n",
"\n",
"    print('Done')\n",
"\n",
"    return embedding_list"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CSvPFot643YO"
},
"outputs": [],
"source": [
"# Keep only the rows whose 'word' value detect_language() accepts.\n",
"df_filtered = df_word.loc[df_word['word'].apply(detect_language)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "67i3XOPZ4y29"
},
"outputs": [],
"source": [
"# One embedding per row of the filtered frame.\n",
"li = generate_embeddings(df=df_filtered)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pd.DataFrame is the constructor (pd.dataframe does not exist). Building from a\n",
"# dict keeps each embedding vector as a single cell of the 'embedding' column.\n",
"df_embbeding = pd.DataFrame({'embedding': li})"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment