Commit f924fad8 authored by Lelkada L L P S M

new embedding script

parent 7f0d87e8
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "2b755866-3d8c-4086-aa36-3c4b255df43f",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.cluster import KMeans\n",
"import matplotlib.pyplot as plt\n",
"from mpl_toolkits.mplot3d import Axes3D\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModel\n",
"from sklearn.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1c22ae4d-aee3-4f24-9830-88122f46dffc",
"metadata": {},
"outputs": [],
"source": [
"# Define the tokenizer\n",
"tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5095fae7-ea55-48d6-921c-75cc3ff1bc66",
"metadata": {},
"outputs": [],
"source": [
"def generate_embeddings(sentences):\n",
" tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
" model = AutoModel.from_pretrained(\"bert-base-uncased\")\n",
"\n",
" embeddings = []\n",
" for sentence in sentences:\n",
" input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)\n",
" outputs = model(input_ids)\n",
" last_hidden_state = outputs[0][:, 0, :]\n",
" embeddings.append(last_hidden_state.tolist()[0])\n",
"\n",
" return embeddings\n",
"\n",
"\n",
"def create_index(sentences):\n",
" tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
" model = AutoModel.from_pretrained(\"bert-base-uncased\")\n",
"\n",
" word_to_idx = {}\n",
" idx_to_word = {}\n",
" idx = 0\n",
" for sentence in sentences:\n",
" tokens = tokenizer.tokenize(sentence)\n",
" for token in tokens:\n",
" if token not in word_to_idx:\n",
" input_ids = torch.tensor(tokenizer.encode(token)).unsqueeze(0)\n",
" outputs = model(input_ids)\n",
" last_hidden_state = outputs[0][:, 0, :]\n",
" embedding = last_hidden_state.tolist()[0]\n",
" word_to_idx[token] = idx\n",
" idx_to_word[idx] = {\"word\": token, \"embedding\": embedding}\n",
" idx += 1\n",
"\n",
" return word_to_idx, idx_to_word\n",
"\n",
"\n",
"def get_embedding(word, word_to_idx, idx_to_word):\n",
" idx = word_to_idx.get(word, None)\n",
" if idx is not None:\n",
" return idx_to_word[idx][\"embedding\"]\n",
" else:\n",
" return None\n",
"\n",
"\n",
"def get_word(embedding, word_to_idx, idx_to_word):\n",
" for idx, word_info in idx_to_word.items():\n",
" if word_info[\"embedding\"] == embedding:\n",
" return word_info[\"word\"]\n",
" return None\n",
"\n",
"\n",
"sentences = [\"This is the first sentence.\", \"This is the second sentence.\", \"kite is flying in the sky\", \"i sat on the bench under the tree\", \"beach is blue\", \"today is a rainy day\", \"i forgot the umbrella at home\"]\n",
"embeddings = generate_embeddings(sentences)\n",
"\n",
"word_to_idx, idx_to_word = create_index(sentences)\n",
"\n",
"print(word_to_idx)\n",
"\n",
"print(get_embedding(\"sky\", word_to_idx, idx_to_word))\n",
"\n",
"kmeans = KMeans(n_clusters=5, random_state=42).fit(embeddings)\n",
"labels = kmeans.labels_\n",
"centroids = kmeans.cluster_centers_\n",
"\n",
"# Print cluster centroids\n",
"print(\"Cluster Centroids:\")\n",
"for i in range(len(centroids)):\n",
" print(f\"Cluster {i+1}: {centroids[i]}\")\n",
" \n",
" \n",
"# Print words in each cluster\n",
"idx_to_word = {}\n",
"word_to_idx = {}\n",
"for i, sentence in enumerate(sentences):\n",
" words = sentence.split()\n",
" for word in words:\n",
" if word not in word_to_idx:\n",
" embedding = embeddings[i][len(word_to_idx)]\n",
" word_to_idx[word] = len(word_to_idx)\n",
" idx_to_word[len(idx_to_word)] = {\"word\": word, \"embedding\": embedding}\n",
" \n",
"for i in range(5):\n",
" print(f\"\\nWords in Cluster {i+1}:\")\n",
" cluster_words = [idx_to_word[j][\"word\"] for j in range(len(labels)) if labels[j] == i]\n",
" print(cluster_words)\n",
"\n",
" \n",
"cluster_embeddings = [[] for _ in range(5)]\n",
"for i in range(len(labels)):\n",
" cluster_idx = labels[i]\n",
" word_idx = word_to_idx[sentences[i].split()[0]]\n",
" embedding = embeddings[cluster_idx][word_idx]\n",
" cluster_embeddings[cluster_idx].append(embedding)\n",
"\n",
"cluster1_embeddings = cluster_embeddings[0]\n",
"cluster2_embeddings = cluster_embeddings[1]\n",
"cluster3_embeddings = cluster_embeddings[2]\n",
"cluster4_embeddings = cluster_embeddings[3]\n",
"cluster5_embeddings = cluster_embeddings[4]\n",
"\n",
"\n",
"# Plot words in clusters\n",
"fig, ax = plt.subplots(figsize=(8,8))\n",
"for i in range(5):\n",
" cluster_embeddings = [embeddings[j] for j in range(len(labels)) if labels[j] == i]\n",
" cluster_words = [idx_to_word[j][\"word\"] for j in range(len(labels)) if labels[j] == i]\n",
" x = [embedding[0] for embedding in cluster_embeddings]\n",
" y = [embedding[1] for embedding in cluster_embeddings]\n",
" ax.scatter(x, y, label=f\"Cluster {i+1}\")\n",
" for j, word in enumerate(cluster_words):\n",
" ax.annotate(word, (x[j], y[j]))\n",
"ax.legend()\n",
"plt.show()\n",
"\n",
"\n",
"def generate_embedding_for_word(word):\n",
" tokenizer = AutoTokenizer.from_pretrained(\"bert-base-uncased\")\n",
" model = AutoModel.from_pretrained(\"bert-base-uncased\")\n",
"\n",
" input_ids = torch.tensor(tokenizer.encode(word)).unsqueeze(0)\n",
" outputs = model(input_ids)\n",
" last_hidden_state = outputs[0][:, 0, :]\n",
" embedding = last_hidden_state.tolist()[0]\n",
"\n",
" return embedding\n",
"\n",
"newemb = generate_embedding_for_word(\"hello\")\n",
"\n",
"\n",
"# Calculate distances between new word's embedding and cluster centroids\n",
"distances = []\n",
"for centroid in centroids:\n",
" distance = np.linalg.norm(new_embedding - centroid)\n",
" distances.append(distance)\n",
"\n",
"# Print distances\n",
"print(f\"Distances between '{new_word}' and cluster centroids:\")\n",
"for i, distance in enumerate(distances):\n",
" print(f\"Cluster {i+1}: {distance:.4f}\")\n",
" \n",
"\n",
"def calculate_distances(new_embedding, centroids):\n",
" distances = []\n",
" for i, centroid in enumerate(centroids):\n",
" distance = np.linalg.norm(new_embedding - centroid)\n",
" distances.append(distance)\n",
" #print(f\"Distance to Cluster {i+1}: {distance}\")\n",
" closest_cluster = np.argmin(distances)\n",
" #print(f\"Closest Cluster: {closest_cluster+1}\")\n",
" return closest_cluster\n",
"\n",
"closest_cluster = calculate_distances(newemb, centroids)\n",
"\n",
"\n",
"def find_closest_words(n, cluster_embeddings, new_embedding):\n",
" # Get the embeddings in the specified cluster\n",
" cluster_words = cluster_embeddings[n]\n",
" \n",
" # Calculate the distances to each word embedding in the cluster\n",
" distances = [np.linalg.norm(new_embedding - np.array(word_embedding)) for word_embedding in cluster_words]\n",
" \n",
" # Find the indices of the closest words based on the distances\n",
" closest_indices = np.argsort(distances)\n",
" \n",
" # Print the distances to each word in the cluster\n",
" print(\"Distances to each word in the cluster:\")\n",
" for i in closest_indices:\n",
" print(f\"{idx_to_word[i]['word']}: {distances[i]}\")\n",
" \n",
" # Get the words corresponding to the closest indices\n",
" closest_words = [idx_to_word[i][\"word\"] for i in closest_indices[:5]]\n",
" \n",
" return closest_words\n",
"\n",
"\n",
"find_closest_words(closest_cluster, cluster_embeddings,newemb)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "default:Python",
"language": "python",
"name": "conda-env-default-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
This is a sentence.
Here is another sentence.
Yet another sentence is here.
One more sentence to go.
lazy dog lie on the floor
rain falling to forest
row your boat gently down the stream