Commit b868bcab authored by Lelkada L L P S M's avatar Lelkada L L P S M

clustering technique update

parent 5d365cbf
......@@ -2,10 +2,47 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "c3d5322b-ce1b-482d-b224-135da10982c1",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting transformers\n",
" Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.0/7.0 MB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
"\u001b[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1\n",
" Using cached tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n",
"Requirement already satisfied: requests in /opt/conda/lib/python3.7/site-packages (from transformers) (2.28.2)\n",
"Requirement already satisfied: tqdm>=4.27 in /opt/conda/lib/python3.7/site-packages (from transformers) (4.64.1)\n",
"Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.7/site-packages (from transformers) (23.0)\n",
"Collecting huggingface-hub<1.0,>=0.11.0\n",
" Using cached huggingface_hub-0.13.4-py3-none-any.whl (200 kB)\n",
"Requirement already satisfied: filelock in /opt/conda/lib/python3.7/site-packages (from transformers) (3.10.6)\n",
"Collecting regex!=2019.12.17\n",
" Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)\n",
"Requirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.7/site-packages (from transformers) (1.21.6)\n",
"Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.7/site-packages (from transformers) (6.0)\n",
"Requirement already satisfied: importlib-metadata in /opt/conda/lib/python3.7/site-packages (from transformers) (6.0.1)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.7/site-packages (from huggingface-hub<1.0,>=0.11.0->transformers) (4.5.0)\n",
"Requirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.7/site-packages (from importlib-metadata->transformers) (3.15.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2022.12.7)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (2.1.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (3.4)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.7/site-packages (from requests->transformers) (1.26.15)\n",
"Installing collected packages: tokenizers, regex, huggingface-hub, transformers\n",
"Successfully installed huggingface-hub-0.13.4 regex-2022.10.31 tokenizers-0.13.3 transformers-4.28.1\n",
"Collecting pymongo\n",
" Using cached pymongo-4.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (484 kB)\n",
"Collecting dnspython<3.0.0,>=1.16.0\n",
" Using cached dnspython-2.3.0-py3-none-any.whl (283 kB)\n",
"Installing collected packages: dnspython, pymongo\n",
"Successfully installed dnspython-2.3.0 pymongo-4.3.3\n"
]
}
],
"source": [
"!pip install transformers\n",
"!pip install pymongo"
......@@ -13,7 +50,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 2,
"id": "5aeb5561-495d-4225-881f-1d3ee830b27c",
"metadata": {},
"outputs": [],
......@@ -27,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"id": "f0f0fd1f-7f30-4c0e-86ff-47978850a3a4",
"metadata": {},
"outputs": [],
......@@ -56,7 +93,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"id": "44b8acee-7a5e-40cb-9278-b9bda61dbeef",
"metadata": {},
"outputs": [],
......@@ -82,7 +119,7 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 10,
"id": "1735effa-2020-48d9-889a-39c9d954aa3a",
"metadata": {},
"outputs": [],
......@@ -92,14 +129,14 @@
" closest_cluster_no = None\n",
"\n",
" for cluster_no, centroid_data in centroids.items():\n",
" print(cluster_no)\n",
" centroid_embedding = centroid_data['centroid']\n",
" current_distance = distance.euclidean(input_embedding, centroid_embedding)\n",
" print(current_distance)\n",
" if current_distance < min_distance:\n",
" min_distance = current_distance\n",
" closest_cluster_no = cluster_no\n",
"\n",
" return closest_cluster_no"
" closest_cluster_no = int(cluster_no) # Convert cluster_no to an integer\n",
" print(min_distance, closest_cluster_no)\n",
" return closest_cluster_no\n"
]
},
{
......@@ -120,7 +157,7 @@
},
{
"cell_type": "code",
"execution_count": 26,
"execution_count": 11,
"id": "cf2e79ca-461b-4368-98cc-26d9e55488fd",
"metadata": {},
"outputs": [
......@@ -128,7 +165,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']\n",
"Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']\n",
"- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
"- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
]
......@@ -137,11 +174,14 @@
"name": "stdout",
"output_type": "stream",
"text": [
"13.285225168586905\n",
"11.51259043024684\n",
"8.962974422826848\n",
"12.52214870605026\n",
"10.757946725715826\n",
"1\n",
"13.285225168586905 1\n",
"4\n",
"11.51259043024684 4\n",
"0\n",
"8.962974422826848 0\n",
"2\n",
"3\n",
"The closest cluster for the word 'ocean' is cluster 0\n"
]
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment