Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
22_23-J 18
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
22_23-J 18
22_23-J 18
Commits
d0d487ac
Commit
d0d487ac
authored
Feb 25, 2023
by
Lelkada L L P S M
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
embedding generation script added
parent
78f96434
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
2825 additions
and
1250 deletions
+2825
-1250
IT19001708/STG/script/.ipynb_checkpoints/dataset_clean-checkpoint.ipynb
.../script/.ipynb_checkpoints/dataset_clean-checkpoint.ipynb
+1267
-0
IT19001708/STG/script/.ipynb_checkpoints/word_embeddings_for_train_dataset-checkpoint.ipynb
...points/word_embeddings_for_train_dataset-checkpoint.ipynb
+149
-0
IT19001708/STG/script/dataset_clean.ipynb
IT19001708/STG/script/dataset_clean.ipynb
+1260
-1250
IT19001708/STG/script/word_embeddings_for_train_dataset.ipynb
...001708/STG/script/word_embeddings_for_train_dataset.ipynb
+149
-0
No files found.
IT19001708/STG/script/.ipynb_checkpoints/dataset_clean-checkpoint.ipynb
0 → 100644
View file @
d0d487ac
This diff is collapsed.
Click to expand it.
IT19001708/STG/script/.ipynb_checkpoints/word_embeddings_for_train_dataset-checkpoint.ipynb
0 → 100644
View file @
d0d487ac
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wrohN8tt5Ty8",
"outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
},
"outputs": [],
"source": [
"!pip install sentence-transformers \n",
"!pip install langdetect"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "XK-Fhom65eWi"
},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import pandas as pd\n",
"from langdetect import detect"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "hqMzBp6m5iV6"
},
"outputs": [],
"source": [
"model = SentenceTransformer('bert-base-nli-mean-tokens')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "P4Prq9HG7d4j"
},
"outputs": [],
"source": [
"df_word = pd.read_csv('../dataset/word_no_label.csv')\n",
"df_word"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "lxn2CudUEAGz"
},
"outputs": [],
"source": [
"def detect_language(word):\n",
"    \"\"\"Return True if *word* is detected as English, False otherwise.\n",
"\n",
"    langdetect raises on empty or undetectable input; treat that as\n",
"    non-English rather than letting the exception escape.\n",
"    \"\"\"\n",
"    try:\n",
"        return detect(word) == 'en'\n",
"    # A bare except would also swallow KeyboardInterrupt/SystemExit;\n",
"    # Exception is the narrowest safe catch without importing\n",
"    # langdetect's LangDetectException explicitly.\n",
"    except Exception:\n",
"        return False"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "PH45i7Ym5wCF"
},
"outputs": [],
"source": [
"def generate_embeddings(df):\n",
"    \"\"\"Encode every entry of df['word'] with the module-level model.\n",
"\n",
"    Returns a list with one embedding vector per row, in row order.\n",
"    \"\"\"\n",
"    # Series.iteritems() was deprecated in pandas 1.5 and removed in\n",
"    # 2.0; Series.items() is the supported equivalent.\n",
"    embedding_list = [model.encode(value) for _, value in df['word'].items()]\n",
"    print('Done')\n",
"    return embedding_list\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CSvPFot643YO"
},
"outputs": [],
"source": [
"df_filtered = df_word[df_word['word'].apply(detect_language)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "67i3XOPZ4y29"
},
"outputs": [],
"source": [
"li = generate_embeddings(df_filtered)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pd.dataframe does not exist (AttributeError); the constructor is pd.DataFrame.\n",
"df_embedding = pd.DataFrame(li, columns=['embedding'])"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
IT19001708/STG/script/dataset_clean.ipynb
View file @
d0d487ac
This diff is collapsed.
Click to expand it.
IT19001708/STG/script/word_embeddings_for_train_dataset.ipynb
0 → 100644
View file @
d0d487ac
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "wrohN8tt5Ty8",
"outputId": "a7b1b675-0f13-4e6d-c554-a4e6fb6f23e1"
},
"outputs": [],
"source": [
"!pip install sentence-transformers \n",
"!pip install langdetect"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"id": "XK-Fhom65eWi"
},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import pandas as pd\n",
"from langdetect import detect"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"id": "hqMzBp6m5iV6"
},
"outputs": [],
"source": [
"model = SentenceTransformer('bert-base-nli-mean-tokens')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"id": "P4Prq9HG7d4j"
},
"outputs": [],
"source": [
"df_word = pd.read_csv('../dataset/word_no_label.csv')\n",
"df_word"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"id": "lxn2CudUEAGz"
},
"outputs": [],
"source": [
"def detect_language(word):\n",
"    \"\"\"Return True if *word* is detected as English, False otherwise.\n",
"\n",
"    langdetect raises on empty or undetectable input; treat that as\n",
"    non-English rather than letting the exception escape.\n",
"    \"\"\"\n",
"    try:\n",
"        return detect(word) == 'en'\n",
"    # A bare except would also swallow KeyboardInterrupt/SystemExit;\n",
"    # Exception is the narrowest safe catch without importing\n",
"    # langdetect's LangDetectException explicitly.\n",
"    except Exception:\n",
"        return False"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"id": "PH45i7Ym5wCF"
},
"outputs": [],
"source": [
"def generate_embeddings(df):\n",
"    \"\"\"Encode every entry of df['word'] with the module-level model.\n",
"\n",
"    Returns a list with one embedding vector per row, in row order.\n",
"    \"\"\"\n",
"    # Series.iteritems() was deprecated in pandas 1.5 and removed in\n",
"    # 2.0; Series.items() is the supported equivalent.\n",
"    embedding_list = [model.encode(value) for _, value in df['word'].items()]\n",
"    print('Done')\n",
"    return embedding_list\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "CSvPFot643YO"
},
"outputs": [],
"source": [
"df_filtered = df_word[df_word['word'].apply(detect_language)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "67i3XOPZ4y29"
},
"outputs": [],
"source": [
"li = generate_embeddings(df_filtered)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# pd.dataframe does not exist (AttributeError); the constructor is pd.DataFrame.\n",
"df_embedding = pd.DataFrame(li, columns=['embedding'])"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment