Commit 353f995f authored by Lelkada L L P S M

clustering script

parent 0933145e
This diff is collapsed.
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "51481245-33cd-4f7c-ae5e-2376e8006545",
"metadata": {},
"outputs": [],
"source": [
"# Use the %pip magic (not !pip) so the package is installed into the environment\n",
"# of the running kernel rather than whichever pip the subshell happens to find.\n",
"%pip install transformers\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "aeea3911-8296-40a7-9770-9834031fb3d5",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from transformers import AutoTokenizer, TFAutoModel\n",
"import tensorflow as tf"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70bad982-8669-4559-a579-e8b7ba905517",
"metadata": {},
"outputs": [],
"source": [
"# Load the tokenizer and the TensorFlow model from the same pretrained checkpoint.\n",
"MODEL_NAME = 'bert-base-uncased'\n",
"tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
"model = TFAutoModel.from_pretrained(MODEL_NAME)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "a3ad97ea-6074-4a09-b66e-ae8b79d816ef",
"metadata": {},
"outputs": [],
"source": [
"# Read the filtered dataset and collect its 'word' column as a plain Python list.\n",
"df = pd.read_csv('../dataset/df_filtered.csv')\n",
"word_column = df['word']\n",
"words = word_column.tolist()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "ab859a35-451e-4168-8277-e4d4d6e30e78",
"metadata": {},
"outputs": [],
"source": [
"# Tokenize every word separately, requesting TensorFlow tensors for each encoding.\n",
"encoded_inputs = list(map(lambda w: tokenizer(w, return_tensors='tf'), words))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de2ce194-c5b6-48c6-bab9-6687abd51077",
"metadata": {},
"outputs": [],
"source": [
"encoded_inputs[0]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "d2d4363e-eec8-4559-8ea1-850ce2319b2c",
"metadata": {},
"outputs": [],
"source": [
"# Embed each word as ONE fixed-length vector by averaging the model's last hidden\n",
"# states over the word's tokens (this includes the special tokens the tokenizer adds).\n",
"# Keeping the full per-token matrix instead gives arrays whose first dimension varies\n",
"# with the number of word pieces; pandas can only hold those as ragged object cells,\n",
"# and they cannot be recovered once written to CSV.\n",
"# Fall back to the CPU when no GPU is visible instead of pinning '/GPU:0'.\n",
"device = '/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'\n",
"embeddings = []\n",
"with tf.device(device):\n",
"    for encoded_input in encoded_inputs:\n",
"        outputs = model(encoded_input)\n",
"        # outputs[0] is the first model output; [0] selects the only sequence in the batch.\n",
"        token_states = outputs[0][0]\n",
"        embedding = tf.reduce_mean(token_states, axis=0).numpy()\n",
"        embeddings.append(embedding)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "dff2f4e7-a31b-439c-93b4-a9b75819d56a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.7/site-packages/pandas/core/internals/construction.py:540: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.\n",
" values = np.array([convert(v) for v in values])\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>[[-0.2112137, -0.057769418, -0.01924374, -0.00...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>[[-0.32676977, 0.032084987, -0.52493036, -0.44...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>[[-0.19533601, 0.24660303, -0.002797313, -0.06...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>[[-0.4519474, 0.18390995, -0.53327066, -0.2371...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>[[-0.052097924, 0.30908817, -0.039699733, 0.01...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3305</th>\n",
" <td>[[-0.1235517, 0.43329853, -0.13414639, -0.1531...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3306</th>\n",
" <td>[[-0.5598528, -0.07930106, -0.13959903, -0.069...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3307</th>\n",
" <td>[[-0.39649808, 0.12856255, 0.21035239, -0.1410...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3308</th>\n",
" <td>[[-0.5090915, 0.18303792, -0.29818898, 0.09537...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3309</th>\n",
" <td>[[-0.25211066, 0.5173553, -0.0053291023, -0.33...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3310 rows × 1 columns</p>\n",
"</div>"
],
"text/plain": [
" 0\n",
"0 [[-0.2112137, -0.057769418, -0.01924374, -0.00...\n",
"1 [[-0.32676977, 0.032084987, -0.52493036, -0.44...\n",
"2 [[-0.19533601, 0.24660303, -0.002797313, -0.06...\n",
"3 [[-0.4519474, 0.18390995, -0.53327066, -0.2371...\n",
"4 [[-0.052097924, 0.30908817, -0.039699733, 0.01...\n",
"... ...\n",
"3305 [[-0.1235517, 0.43329853, -0.13414639, -0.1531...\n",
"3306 [[-0.5598528, -0.07930106, -0.13959903, -0.069...\n",
"3307 [[-0.39649808, 0.12856255, 0.21035239, -0.1410...\n",
"3308 [[-0.5090915, 0.18303792, -0.29818898, 0.09537...\n",
"3309 [[-0.25211066, 0.5173553, -0.0053291023, -0.33...\n",
"\n",
"[3310 rows x 1 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Build a table with one row per entry of `embeddings`, then display it.\n",
"df_emb = pd.DataFrame(data=embeddings)\n",
"df_emb"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "55cb94bc-ce3c-4b4e-a58a-eb7ab728b34e",
"metadata": {},
"outputs": [],
"source": [
"# Write the embeddings table to a CSV file in the dataset directory.\n",
"output_path = '../dataset/word_embeddings.csv'\n",
"df_emb.to_csv(output_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bacca570-1e23-4ca7-b6ce-d6e29be434bd",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "TensorFlow 2 (Local)",
"language": "python",
"name": "local-tf2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment