Data processing and model training started

parent 8c0fd60c
......@@ -79,6 +79,109 @@
"source": [
"train_data_file = '/content/drive/MyDrive/RP_GoviMitura/Kavee - ChatBot/DataSets/intents.json'"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## 💬 Data Preprocessing and Model Training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Used in the TensorFlow model\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"import tflearn\n",
"import random\n",
"\n",
"# Used for contextualisation and other NLP tasks.\n",
"import nltk\n",
"from nltk.stem.lancaster import LancasterStemmer\n",
"stemmer = LancasterStemmer()\n",
"\n",
"#Other\n",
"import json\n",
"import pickle\n",
"import warnings\n",
"warnings.filterwarnings(\"ignore\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Processing the Intents.....\")\n",
"with open(train_data_file) as json_data:\n",
" intents = json.load(json_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"words = []\n",
"classes = []\n",
"documents = []\n",
"ignore_words = ['?']\n",
"print(\"Looping through the Intents to Convert them to words, classes, documents and ignore_words.......\")\n",
"for intent in intents['intents']:\n",
" for pattern in intent['patterns']:\n",
" # tokenize each word in the sentence\n",
" w = nltk.word_tokenize(pattern)\n",
" # add to our words list\n",
" words.extend(w)\n",
" # add to documents in our corpus\n",
" documents.append((w, intent['tag']))\n",
" # add to our classes list\n",
" if intent['tag'] not in classes:\n",
" classes.append(intent['tag'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(\"Stemming, Lowering and Removing Duplicates.......\")\n",
"words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]\n",
"words = sorted(list(set(words)))\n",
"\n",
"# remove duplicates\n",
"classes = sorted(list(set(classes)))\n",
"\n",
"print (len(documents), \"documents\")\n",
"print (len(classes), \"classes\", classes)\n",
"print (len(words), \"unique stemmed words\", words)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(documents)"
]
}
],
"metadata": {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment