#The dataset's audio clips are stored in eight folders, one per speech command: `no`, `yes`, `down`, `go`, `left`, `up`, `right`, and `stop`:
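#As a quick sanity check, you can list those folders on disk. This is a minimal sketch; `data_dir` is assumed to be the directory the dataset was extracted to:
import pathlib

data_dir = pathlib.Path('data/mini_speech_commands')  # assumed extraction path
commands = sorted(p.name for p in data_dir.iterdir() if p.is_dir())
print('Commands:', commands)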
#The audio clips are 1 second or less, sampled at 16 kHz. Setting `output_sequence_length=16000` pads the short ones to exactly 1 second (and would trim longer ones) so that they can be batched easily.
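#For reference, here is a sketch of the loading call this describes, using `tf.keras.utils.audio_dataset_from_directory`; the batch size and seed below are illustrative choices, not prescribed by this section:
import tensorflow as tf

train_ds, val_ds = tf.keras.utils.audio_dataset_from_directory(
    directory=data_dir,
    batch_size=64,                 # illustrative batch size
    validation_split=0.2,          # hold out 20% of the clips for validation
    seed=0,                        # illustrative seed for a reproducible split
    output_sequence_length=16000,  # pad/trim every clip to 1 second at 16 kHz
    subset='both')                 # return both the training and validation splits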
#The dataset now contains batches of audio clips and integer labels. The audio clips have a shape of `(batch, samples, channels)`.
train_ds.element_spec
#This dataset only contains single channel audio, so use the `tf.squeeze` function to drop the extra axis:
def squeeze(audio, labels):
  audio = tf.squeeze(audio, axis=-1)
  return audio, labels

train_ds = train_ds.map(squeeze, tf.data.AUTOTUNE)
val_ds = val_ds.map(squeeze, tf.data.AUTOTUNE)
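#You can confirm the channel axis is gone by inspecting the element spec again; the audio clips should now have a shape of `(batch, samples)`:
train_ds.element_spec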
#The `utils.audio_dataset_from_directory` function only returns up to two splits. It's a good idea to keep a test set separate from your validation set.
#Ideally you'd keep it in a separate directory, but in this case you can use `Dataset.shard` to split the validation set into two halves. Note that iterating over **any** shard will load **all** the data, and only keep its fraction.
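#For example (a sketch: shard 0 becomes the test set, shard 1 the new, halved validation set):
test_ds = val_ds.shard(num_shards=2, index=0)
val_ds = val_ds.shard(num_shards=2, index=1)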