Commit 9cafb8de authored by thili97's avatar thili97

pushing the project

parent 17d4ba9c
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "ab90d4da",
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"import cv2\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from tensorflow.keras.models import load_model\n",
"\n",
"dataset = tf.keras.datasets.mnist\n",
"\n",
"#### train - test - split ####\n",
"(X_train, y_train), (X_test, y_test) = dataset.load_data()\n",
"\n",
"\n",
"#### normalize value to b/w 0and1 ###\n",
"X_train= X_train/255.0\n",
"X_test= X_test/255.0\n",
"\n",
"\n",
"### CNN (BATCH , HEIGHT, WIDTH, 1)\n",
"#### ANN (BATCH_SIZE, FEATURES)\n",
"#### FEATURES = WIDTH * HEIGHT\n",
"#### reshape array to fit in network ####\n",
"\n",
"X_train = X_train.reshape(X_train.shape[0], -1)\n",
"X_test = X_test.reshape(X_test.shape[0], -1)\n",
"\n",
"# (batch_size, height, width, 1)\n",
"#### ANN ########\n",
"\n",
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout\n",
"\n",
"# 0-1\n",
"model = Sequential()\n",
"model.add(Dense(128, activation='relu'))\n",
"model.add(Dropout(0.2))\n",
"\n",
"model.add(Dense(128, activation='relu'))\n",
"model.add(Dropout(0.2))\n",
"\n",
"## [0-9] ##\n",
"model.add(Dense(10, activation='softmax'))\n",
"\n",
"model.compile('adam', 'sparse_categorical_crossentropy', metrics=['acc'])\n",
"\n",
"model.fit(X_train, y_train, epochs=3, batch_size=12, validation_split=0.1)\n",
"\n",
"\n",
"#### making prediction #######\n",
"plt.imshow(X_test[1255].reshape(28,28), cmap='gray')\n",
"plt.xlabel(y_test[1255])\n",
"plt.ylabel(np.argmax(model.predict(X_test)[1255]))\n",
"\n",
"\n",
"model.save('digit_trained.h5')\n",
"\n",
"\n",
"##### open cv for capture and predicting through camera #####\n",
"'''\n",
"##### cv2\n",
"\n",
"\n",
"cap = cv2.VideoCapture(0)\n",
"while True:\n",
" ret, img = cap.read()\n",
" #img = cv2.flip(img, 1)\n",
" img = img[200:400, 200:400]\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
" _, gray = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)\n",
" cv2.imshow(\"gray_wind\", gray)\n",
" gray = cv2.resize(gray, (28, 28))\n",
" #cv2.imshow('resized')\n",
" gray = gray.reshape(1, 784)\n",
" result = np.argmax(model.predict(gray))\n",
" result = 'cnn : {}'.format(result)\n",
" cv2.putText(img, org=(25,25), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, text= result, color=(255,0,0), thickness=1)\n",
" cv2.imshow(\"image\", img)\n",
" \n",
" if cv2.waitKey(1) == 13:\n",
" break\n",
"\n",
"cap.release()\n",
"cv2.destroyAllWindows()\n",
"#plt.imshow(img)\n",
"'''\n",
"\n",
"\n",
"############ prediction via paints ##########\n",
"### glob\n",
"run = False\n",
"ix,iy = -1,-1\n",
"follow = 25\n",
"img = np.zeros((512,512,1))\n",
"\n",
"### func\n",
"def draw(event, x, y, flag, params):\n",
" global run,ix,iy,img,follow\n",
" if event == cv2.EVENT_LBUTTONDOWN:\n",
" run = True\n",
" ix, iy = x, y\n",
" elif event == cv2.EVENT_MOUSEMOVE:\n",
" if run == True:\n",
" cv2.circle(img, (x,y), 20, (255,255,255), -1)\n",
"\n",
" elif event == cv2.EVENT_LBUTTONUP:\n",
" run = False\n",
" cv2.circle(img, (x,y), 20, (255,255,255), -1)\n",
" gray = cv2.resize(img, (28, 28))\n",
" gray = gray.reshape(1, 784)\n",
" result = np.argmax(model.predict(gray))\n",
" result = 'cnn : {}'.format(result)\n",
" cv2.putText(img, org=(25,follow), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, text= result, color=(255,0,0), thickness=1)\n",
" follow += 25\n",
" elif event == cv2.EVENT_RBUTTONDOWN:\n",
" img = np.zeros((512,512,1))\n",
" follow = 25\n",
"\n",
"\n",
"### param\n",
"cv2.namedWindow('image')\n",
"cv2.setMouseCallback('image', draw)\n",
"\n",
"\n",
"\n",
"while True: \n",
" cv2.imshow(\"image\", img)\n",
" \n",
" if cv2.waitKey(1) == 27:\n",
" break\n",
"\n",
"cv2.destroyAllWindows()\n",
"\n",
"########## THANKS ##########\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "906b56b1",
"metadata": {},
"outputs": [],
"source": [
"from tensorflow.compat.v1 import ConfigProto\n",
"from tensorflow.compat.v1 import Session\n",
"import os\n",
"import librosa\n",
"import IPython.display as ipd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"from scipy.io import wavfile\n",
"from tqdm import tqdm\n",
"import warnings\n",
"\n",
"\n",
"config = ConfigProto()\n",
"config.gpu_options.allow_growth = True\n",
"sess = Session(config=config)\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"labels = [\n",
" 'left', 'cat', 'wow', 'six', 'go', 'one', 'dog', 'nine', 'sheila', 'yes',\n",
" 'down', 'bird', 'tree', 'up', 'eight', 'bed', 'three', 'on', 'house',\n",
" 'five', 'seven', 'zero', 'right', 'four', 'no', 'two', 'off', 'happy',\n",
" 'stop', 'marvin'\n",
"]\n",
"\n",
"\n",
"train_audio_path = './train/audio/'\n",
"\n",
"all_wave = []\n",
"all_label = []\n",
"for label in tqdm(labels):\n",
" waves = [f for f in os.listdir(train_audio_path + '/'+ label) if f.endswith('.wav')]\n",
" for wav in waves:\n",
" samples, sample_rate = librosa.load(train_audio_path + '/' + label + '/' + wav, sr = 16000)\n",
" samples = librosa.resample(samples, sample_rate, 8000)\n",
" if(len(samples)== 8000) : \n",
" all_wave.append(samples)\n",
" all_label.append(label)\n",
" \n",
"from sklearn.preprocessing import LabelEncoder\n",
"from keras.utils import np_utils\n",
"\n",
"label_enconder = LabelEncoder()\n",
"y = label_enconder.fit_transform(all_label)\n",
"classes = list(label_enconder.classes_)\n",
"y = np_utils.to_categorical(y, num_classes=len(labels))\n",
"\n",
"all_wave = np.array(all_wave).reshape(-1,8000,1)\n",
"\n",
"from sklearn.model_selection import train_test_split\n",
"x_train, x_valid, y_train, y_valid = train_test_split(np.array(all_wave),np.array(y),stratify=y,test_size = 0.2,random_state=777,shuffle=True)\n",
"\n",
"from keras.layers import Bidirectional, BatchNormalization, CuDNNGRU, TimeDistributed\n",
"\n",
"from keras.layers import Dense, Dropout, Flatten, Conv1D, Input, MaxPooling1D\n",
"from keras.models import Model\n",
"from keras.callbacks import EarlyStopping, ModelCheckpoint\n",
"from keras import backend as K\n",
"K.clear_session()\n",
"\n",
"inputs = Input(shape=(8000,1))\n",
"x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(inputs)\n",
"\n",
"#First Conv1D layer\n",
"x = Conv1D(8,13, padding='valid', activation='relu', strides=1)(x)\n",
"x = MaxPooling1D(3)(x)\n",
"x = Dropout(0.3)(x)\n",
"\n",
"#Second Conv1D layer\n",
"x = Conv1D(16, 11, padding='valid', activation='relu', strides=1)(x)\n",
"x = MaxPooling1D(3)(x)\n",
"x = Dropout(0.3)(x)\n",
"\n",
"#Third Conv1D layer\n",
"x = Conv1D(32, 9, padding='valid', activation='relu', strides=1)(x)\n",
"x = MaxPooling1D(3)(x)\n",
"x = Dropout(0.3)(x)\n",
"\n",
"x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(x)\n",
"\n",
"x = Bidirectional(CuDNNGRU(128, return_sequences=True), merge_mode='sum')(x)\n",
"x = Bidirectional(CuDNNGRU(128, return_sequences=True), merge_mode='sum')(x)\n",
"x = Bidirectional(CuDNNGRU(128, return_sequences=False), merge_mode='sum')(x)\n",
"\n",
"x = BatchNormalization(axis=-1, momentum=0.99, epsilon=1e-3, center=True, scale=True)(x)\n",
"\n",
"#Flatten layer\n",
"# x = Flatten()(x)\n",
"\n",
"#Dense Layer 1\n",
"x = Dense(256, activation='relu')(x)\n",
"outputs = Dense(len(labels), activation=\"softmax\")(x)\n",
"\n",
"model = Model(inputs, outputs)\n",
"model.summary()\n",
"\n",
"model.compile(loss='categorical_crossentropy',optimizer='nadam',metrics=['accuracy'])\n",
"\n",
"early_stop = EarlyStopping(monitor='val_loss', mode='min', \n",
" verbose=1, patience=10, min_delta=0.0001)\n",
"\n",
"checkpoint = ModelCheckpoint('speech2text_model.hdf5', monitor='val_acc', \n",
" verbose=1, save_best_only=True, mode='max')\n",
"\n",
"hist = model.fit(\n",
" x=x_train, \n",
" y=y_train,\n",
" epochs=100, \n",
" callbacks=[early_stop, checkpoint], \n",
" batch_size=32, \n",
" validation_data=(x_valid,y_valid)\n",
")\n",
"\n",
"from matplotlib import pyplot\n",
"pyplot.plot(hist.history['loss'], label='train')\n",
"pyplot.plot(hist.history['val_loss'], label='test')\n",
"pyplot.legend()\n",
"pyplot.show()\n",
"\n",
"model.save('speech2text_model.hdf5')\n",
"\n",
"from keras.models import load_model\n",
"model = load_model('speech2text_model.hdf5')\n",
"\n",
"def s2t_predict(audio, shape_num=8000):\n",
" prob=model.predict(audio.reshape(1,shape_num,1))\n",
" index=np.argmax(prob[0])\n",
" return classes[index]\n",
"\n",
"import random\n",
"index=random.randint(0,len(x_valid)-1)\n",
"samples=x_valid[index].ravel()\n",
"print(\"Audio:\",classes[np.argmax(y_valid[index])])\n",
"ipd.Audio(samples, rate=8000)\n",
"\n",
"samples.shape\n",
"\n",
"print(\"Text:\",s2t_predict(samples))\n",
"\n",
"import sounddevice as sd\n",
"import soundfile as sf\n",
"\n",
"samplerate = 16000 \n",
"duration = 1 # seconds\n",
"filename = 'zero.wav'\n",
"print(\"start\")\n",
"mydata = sd.rec(int(samplerate * duration), samplerate=samplerate,\n",
" channels=1, blocking=True)\n",
"print(\"end\")\n",
"sd.wait()\n",
"sf.write(filename, mydata, samplerate)\n",
"\n",
"test, test_rate = librosa.load('yes.wav', sr = 16000)\n",
"test_sample = librosa.resample(test, test_rate, 8000)\n",
"print(test_sample.shape)\n",
"ipd.Audio(test_sample,rate=8000) \n",
"\n",
"s2t_predict(test_sample)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "060a52c4",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Epoch 1/3\n",
"4500/4500 [==============================] - 3s 637us/step - loss: 0.2888 - acc: 0.9111 - val_loss: 0.0949 - val_acc: 0.9733\n",
"Epoch 2/3\n",
"4500/4500 [==============================] - 3s 612us/step - loss: 0.1527 - acc: 0.9536 - val_loss: 0.0909 - val_acc: 0.9710\n",
"Epoch 3/3\n",
"4500/4500 [==============================] - 3s 632us/step - loss: 0.1241 - acc: 0.9625 - val_loss: 0.0878 - val_acc: 0.9740\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"import cv2\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from tensorflow.keras.models import load_model\n",
"\n",
"dataset = tf.keras.datasets.mnist\n",
"\n",
"#### train - test - split ####\n",
"(X_train, y_train), (X_test, y_test) = dataset.load_data()\n",
"\n",
"\n",
"#### normalize value to b/w 0and1 ###\n",
"X_train= X_train/255.0\n",
"X_test= X_test/255.0\n",
"\n",
"\n",
"### CNN (BATCH , HEIGHT, WIDTH, 1)\n",
"#### ANN (BATCH_SIZE, FEATURES)\n",
"#### FEATURES = WIDTH * HEIGHT\n",
"#### reshape array to fit in network ####\n",
"\n",
"X_train = X_train.reshape(X_train.shape[0], -1)\n",
"X_test = X_test.reshape(X_test.shape[0], -1)\n",
"\n",
"# (batch_size, height, width, 1)\n",
"#### ANN ########\n",
"\n",
"from tensorflow.keras.models import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout\n",
"\n",
"# 0-1\n",
"model = Sequential()\n",
"model.add(Dense(128, activation='relu'))\n",
"model.add(Dropout(0.2))\n",
"\n",
"model.add(Dense(128, activation='relu'))\n",
"model.add(Dropout(0.2))\n",
"\n",
"## [0-9] ##\n",
"model.add(Dense(10, activation='softmax'))\n",
"\n",
"model.compile('adam', 'sparse_categorical_crossentropy', metrics=['acc'])\n",
"\n",
"model.fit(X_train, y_train, epochs=3, batch_size=12, validation_split=0.1)\n",
"\n",
"\n",
"#### making prediction #######\n",
"plt.imshow(X_test[1255].reshape(28,28), cmap='gray')\n",
"plt.xlabel(y_test[1255])\n",
"plt.ylabel(np.argmax(model.predict(X_test)[1255]))\n",
"\n",
"\n",
"model.save('digit_trained.h5')\n",
"\n",
"\n",
"##### open cv for capture and predicting through camera #####\n",
"'''\n",
"##### cv2\n",
"\n",
"\n",
"cap = cv2.VideoCapture(0)\n",
"while True:\n",
" ret, img = cap.read()\n",
" #img = cv2.flip(img, 1)\n",
" img = img[200:400, 200:400]\n",
" gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n",
" _, gray = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV)\n",
" cv2.imshow(\"gray_wind\", gray)\n",
" gray = cv2.resize(gray, (28, 28))\n",
" #cv2.imshow('resized')\n",
" gray = gray.reshape(1, 784)\n",
" result = np.argmax(model.predict(gray))\n",
" result = 'cnn : {}'.format(result)\n",
" cv2.putText(img, org=(25,25), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, text= result, color=(255,0,0), thickness=1)\n",
" cv2.imshow(\"image\", img)\n",
" \n",
" if cv2.waitKey(1) == 13:\n",
" break\n",
"\n",
"cap.release()\n",
"cv2.destroyAllWindows()\n",
"#plt.imshow(img)\n",
"'''\n",
"\n",
"\n",
"############ prediction via paints ##########\n",
"### glob\n",
"run = False\n",
"ix,iy = -1,-1\n",
"follow = 25\n",
"img = np.zeros((512,512,1))\n",
"\n",
"### func\n",
"def draw(event, x, y, flag, params):\n",
" global run,ix,iy,img,follow\n",
" if event == cv2.EVENT_LBUTTONDOWN:\n",
" run = True\n",
" ix, iy = x, y\n",
" elif event == cv2.EVENT_MOUSEMOVE:\n",
" if run == True:\n",
" cv2.circle(img, (x,y), 20, (255,255,255), -1)\n",
"\n",
" elif event == cv2.EVENT_LBUTTONUP:\n",
" run = False\n",
" cv2.circle(img, (x,y), 20, (255,255,255), -1)\n",
" gray = cv2.resize(img, (28, 28))\n",
" gray = gray.reshape(1, 784)\n",
" result = np.argmax(model.predict(gray))\n",
" result = 'cnn : {}'.format(result)\n",
" cv2.putText(img, org=(25,follow), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, text= result, color=(255,0,0), thickness=1)\n",
" follow += 25\n",
" elif event == cv2.EVENT_RBUTTONDOWN:\n",
" img = np.zeros((512,512,1))\n",
" follow = 25\n",
"\n",
"\n",
"### param\n",
"cv2.namedWindow('image')\n",
"cv2.setMouseCallback('image', draw)\n",
"\n",
"\n",
"\n",
"while True: \n",
" cv2.imshow(\"image\", img)\n",
" \n",
" if cv2.waitKey(1) == 27:\n",
" break\n",
"\n",
"cv2.destroyAllWindows()\n",
"\n",
"########## THANKS ##########\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3624a4a0",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "127a71a9",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment