Commit f924fad8 authored by Lelkada L L P S M

new embedding script

parent 7f0d87e8
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "2b755866-3d8c-4086-aa36-3c4b255df43f",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.cluster import KMeans\n",
"import matplotlib.pyplot as plt\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModel\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c22ae4d-aee3-4f24-9830-88122f46dffc",
"metadata": {},
"outputs": [],
"source": [
"# Define the tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5095fae7-ea55-48d6-921c-75cc3ff1bc66",
"metadata": {},
"outputs": [],
"source": [
"def generate_embeddings(sentences):\n",
" tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
" model = AutoModel.from_pretrained(\"bert-base-uncased\")\n",
"\n",
" embeddings = []\n",
" for sentence in sentences:\n",
" input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)\n",
" outputs = model(input_ids)\n",
" last_hidden_state = outputs[0][:, 0, :]\n",
" embeddings.append(last_hidden_state.tolist()[0])\n",
"\n",
" return embeddings\n",
"\n",
"\n",
"def create_index(sentences):\n",
" tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
" model = AutoModel.from_pretrained(\"bert-base-uncased\")\n",
"\n",
" word_to_idx = {}\n",
" idx_to_word = {}\n",
" idx = 0\n",
" for sentence in sentences:\n",
" tokens = tokenizer.tokenize(sentence)\n",
" for token in tokens:\n",
" if token not in word_to_idx:\n",
" input_ids = torch.tensor(tokenizer.encode(token)).unsqueeze(0)\n",
" outputs = model(input_ids)\n",
" last_hidden_state = outputs[0][:, 0, :]\n",
" embedding = last_hidden_state.tolist()[0]\n",
" word_to_idx[token] = idx\n",
" idx_to_word[idx] = {\"word\": token, \"embedding\": embedding}\n",
" idx += 1\n",
"\n",
" return word_to_idx, idx_to_word\n",
"\n",
"\n",
"def get_embedding(word, word_to_idx, idx_to_word):\n",
" idx = word_to_idx.get(word, None)\n",
" if idx is not None:\n",
" return idx_to_word[idx][\"embedding\"]\n",
" else:\n",
" return None\n",
"\n",
"\n",
"def get_word(embedding, word_to_idx, idx_to_word):\n",
" for idx, word_info in idx_to_word.items():\n",
" if word_info[\"embedding\"] == embedding:\n",
" return word_info[\"word\"]\n",
" return None\n",
"\n",
"\n",
"sentences = [\"This is the first sentence.\", \"This is the second sentence.\", \"kite is flying in the sky\", \"i sat on the bench under the tree\", \"beach is blue\", \"today is a rainy day\", \"i forgot the umbrella at home\"]\n",
"embeddings = generate_embeddings(sentences)\n",
"\n",
"word_to_idx, idx_to_word = create_index(sentences)\n",
"\n",
"print(word_to_idx)\n",
"\n",
"print(get_embedding(\"sky\", word_to_idx, idx_to_word))\n",
"\n",
"kmeans = KMeans(n_clusters=5, random_state=42).fit(embeddings)\n",
"labels = kmeans.labels_\n",
"centroids = kmeans.cluster_centers_\n",
"\n",
"# Print cluster centroids\n",
"print(\"Cluster Centroids:\")\n",
"for i in range(len(centroids)):\n",
" print(f\"Cluster {i+1}: {centroids[i]}\")\n",
" \n",
" \n",
"# Print words in each cluster\n",
"idx_to_word = {}\n",
"word_to_idx = {}\n",
"for i, sentence in enumerate(sentences):\n",
" words = sentence.split()\n",
" for word in words:\n",
" if word not in word_to_idx:\n",
" embedding = embeddings[i][len(word_to_idx)]\n",
" word_to_idx[word] = len(word_to_idx)\n",
" idx_to_word[len(idx_to_word)] = {\"word\": word, \"embedding\": embedding}\n",
" \n",
"for i in range(5):\n",
" print(f\"\\nWords in Cluster {i+1}:\")\n",
" cluster_words = [idx_to_word[j][\"word\"] for j in range(len(labels)) if labels[j] == i]\n",
" print(cluster_words)\n",
"\n",
" \n",
"cluster_embeddings = [[] for _ in range(5)]\n",
"for i in range(len(labels)):\n",
" cluster_idx = labels[i]\n",
" word_idx = word_to_idx[sentences[i].split()[0]]\n",
" embedding = embeddings[cluster_idx][word_idx]\n",
" cluster_embeddings[cluster_idx].append(embedding)\n",
"\n",
"cluster1_embeddings = cluster_embeddings[0]\n",
"cluster2_embeddings = cluster_embeddings[1]\n",
"cluster3_embeddings = cluster_embeddings[2]\n",
"cluster4_embeddings = cluster_embeddings[3]\n",
"cluster5_embeddings = cluster_embeddings[4]\n",
"\n",
"\n",
"# Plot words in clusters\n",
"fig, ax = plt.subplots(figsize=(8,8))\n",
"for i in range(5):\n",
" cluster_embeddings = [embeddings[j] for j in range(len(labels)) if labels[j] == i]\n",
" cluster_words = [idx_to_word[j][\"word\"] for j in range(len(labels)) if labels[j] == i]\n",
" x = [embedding[0] for embedding in cluster_embeddings]\n",
" y = [embedding[1] for embedding in cluster_embeddings]\n",
" ax.scatter(x, y, label=f\"Cluster {i+1}\")\n",
" for j, word in enumerate(cluster_words):\n",
" ax.annotate(word, (x[j], y[j]))\n",
"ax.legend()\n",
"plt.show()\n",
"\n",
"\n",
"def generate_embedding_for_word(word):\n",
" tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
" model = AutoModel.from_pretrained(\"bert-base-uncased\")\n",
"\n",
" input_ids = torch.tensor(tokenizer.encode(word)).unsqueeze(0)\n",
" outputs = model(input_ids)\n",
" last_hidden_state = outputs[0][:, 0, :]\n",
" embedding = last_hidden_state.tolist()[0]\n",
"\n",
" return embedding\n",
"\n",
"newemb = generate_embedding_for_word(\"hello\")\n",
"\n",
"\n",
"# Calculate distances between new word's embedding and cluster centroids\n",
"distances = []\n",
"for centroid in centroids:\n",
" distance = np.linalg.norm(new_embedding - centroid)\n",
" distances.append(distance)\n",
"\n",
"# Print distances\n",
"print(f\"Distances between '{new_word}' and cluster centroids:\")\n",
"for i, distance in enumerate(distances):\n",
" print(f\"Cluster {i+1}: {distance:.4f}\")\n",
" \n",
"\n",
"def calculate_distances(new_embedding, centroids):\n",
" distances = []\n",
" for i, centroid in enumerate(centroids):\n",
" distance = np.linalg.norm(new_embedding - centroid)\n",
" distances.append(distance)\n",
" #print(f\"Distance to Cluster {i+1}: {distance}\")\n",
" closest_cluster = np.argmin(distances)\n",
" #print(f\"Closest Cluster: {closest_cluster+1}\")\n",
" return closest_cluster\n",
"\n",
"closest_cluster = calculate_distances(newemb, centroids)\n",
"\n",
"\n",
"def find_closest_words(n, cluster_embeddings, new_embedding):\n",
" # Get the embeddings in the specified cluster\n",
" cluster_words = cluster_embeddings[n]\n",
" \n",
" # Calculate the distances to each word embedding in the cluster\n",
" distances = [np.linalg.norm(new_embedding - np.array(word_embedding)) for word_embedding in cluster_words]\n",
" \n",
" # Find the indices of the closest words based on the distances\n",
" closest_indices = np.argsort(distances)\n",
" \n",
" # Print the distances to each word in the cluster\n",
" print(\"Distances to each word in the cluster:\")\n",
" for i in closest_indices:\n",
" print(f\"{idx_to_word[i]['word']}: {distances[i]}\")\n",
" \n",
" # Get the words corresponding to the closest indices\n",
" closest_words = [idx_to_word[i][\"word\"] for i in closest_indices[:5]]\n",
" \n",
" return closest_words\n",
"\n",
"\n",
"find_closest_words(closest_cluster, cluster_embeddings,newemb)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
This is a sentence.
Here is another sentence.
Yet another sentence is here.
One more sentence to go.
lazy dog lie on the floor
rain falling to forest
row your boat gently down the stream