Commit 9c1e53da authored by kulvinu

Added Backend

parent 7de86a70
import wave
import struct
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
def read_file(file_name, sample_rate):
    """Read a 16-bit PCM WAV file and return its samples as a tuple of ints."""
    wav_file = wave.open(file_name, mode="rb")
    channels = wav_file.getnchannels()
    num_frames = wav_file.getnframes()
    if wav_file.getframerate() != sample_rate:
        raise ValueError("Audio file should have a sample rate of %d, got %d" % (sample_rate, wav_file.getframerate()))
    samples = wav_file.readframes(num_frames)
    wav_file.close()
    frames = struct.unpack('h' * num_frames * channels, samples)
    if channels == 2:
        print("Picovoice processes single-channel audio but a stereo file was provided. Processing left channel only.")
    return frames[::channels]
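# A minimal usage sketch for read_file (illustrative only; the 16 kHz rate and
# the file path below are assumptions, not part of the original commit):
#
#   pcm = read_file('/datasets/live_recordings/one.wav', 16000)
#   pcm = np.array(pcm, dtype=np.int16)  # convert the unpacked tuple to a numpy array
#   print("Loaded %d samples" % len(pcm))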
## Loading audio
dataset_dir = '/datasets/live_recordings/'
audio_name = 'one.wav'
y, sample_rate = librosa.load(dataset_dir + audio_name, res_type='kaiser_fast')
# Play the original audio
print("Original audio - downsampled by librosa")
ipd.Audio(y, rate=sample_rate)
#------------------------------------------------------------------------------------
## Trim the beginning and ending silence
y_trimmed, _ = librosa.effects.trim(y)
print("Original duration: ", librosa.get_duration(y))
print("Trimmed duration: ", librosa.get_duration(y_trimmed))
figure = plt.figure()
# Trimmed audio - without silence
trimmed = figure.add_subplot(2, 1, 2)
librosa.display.waveplot(y_trimmed, sr=sample_rate, color='r')
plt.title('Trimmed')
# Original audio - with silence at the end
original = figure.add_subplot(2, 1, 1, sharex=trimmed)
librosa.display.waveplot(y, sr=sample_rate)
plt.title('Original')
plt.tight_layout()
plt.show()
# Play the trimmed audio
print("Trimmed audio")
ipd.Audio(y_trimmed, rate=sample_rate)
### Audio Segmentation into windows
from pydub import AudioSegment
from pydub.silence import split_on_silence
sound_file = AudioSegment.from_wav(dataset_dir + audio_name)
audio_chunks = split_on_silence(sound_file, min_silence_len=500, silence_thresh=-40)
print("AudioChunks", audio_chunks)
for i, chunk in enumerate(audio_chunks):
    out_file = "./a/chunk{0}.wav".format(i)
    print("exporting", out_file)
    chunk.export(out_file, format="wav")
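# A quick sanity check on the chunks (illustrative only): the length of a pydub
# AudioSegment is its duration in milliseconds, so this prints each chunk's duration.
#
#   for i, chunk in enumerate(audio_chunks):
#       print("chunk %d: %d ms" % (i, len(chunk)))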
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from mlengine import transform_audio, get_prediction, DIGITS
import pickle
import pyaudio
import numpy as np
from queue import Queue
import matplotlib.pyplot as plt
from python_speech_features import logfbank
class StreamPrediction:
    """
    Class for predicting digit keywords in streaming audio data.
    Heavily adapted from an existing open-source implementation.
    """

    def __init__(self, model_path):
        # Load model
        self.feature_extractor = None
        self.pca = None
        self.marvin_svm = None
        self.load_models(model_path)

        # Recording parameters
        self.sr = 16000
        self.chunk_duration = 0.75
        self.chunk_samples = int(self.sr * self.chunk_duration)
        self.window_duration = 1
        self.window_samples = int(self.sr * self.window_duration)
        self.silence_threshold = 100

        # Data structures and buffers
        self.queue = Queue()
        self.data = np.zeros(self.window_samples, dtype="int16")

        # Plotting parameters
        self.change_bkg_frames = 2
        self.change_bkg_counter = 0
        self.change_bkg = False
    def load_models(self, model_path):
        """
        Loads the models for hotword detection
        :param model_path: Path to model directory
        :return: None
        """
        # Load model structure
        model = load_model(model_path, compile=True)
        # layer_name = "features256"
        # self.feature_extractor = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
        # # Load trained PCA object
        # with open(model_path + "/marvin_kws_pca.pickle", "rb") as file:
        #     self.pca = pickle.load(file)
        # # Load trained SVM
        # with open(model_path + "/marvin_kws_svm.pickle", "rb") as file:
        #     self.marvin_svm = pickle.load(file)
        print("Loaded models from disk")
    def start_stream(self):
        """
        Start audio data streaming from microphone
        :return: None
        """
        stream = pyaudio.PyAudio().open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sr,
            input=True,
            frames_per_buffer=self.chunk_samples,
            # input_device_index=6,
            stream_callback=self.callback,
        )
        stream.start_stream()
        try:
            while True:
                data = self.queue.get()
                # fbank = logfbank(data, samplerate=self.sr, nfilt=40)
                # pred = self.detect_keyword(fbank)
                tensor = transform_audio(self.sr, data)
                pred = get_prediction(tensor)
                self.plotter(data, pred)
                # get_prediction returns the predicted digit as a string, so it
                # can be printed directly instead of branching on each value.
                if pred in DIGITS:
                    print(pred, sep="", end="", flush=True)
        except (KeyboardInterrupt, SystemExit):
            stream.stop_stream()
            stream.close()
    # def detect_keyword(self, fbank):
    #     """
    #     Detect hotword presence in current window
    #     :param fbank: Log Mel filterbank energies
    #     :return: Prediction
    #     """
    #     fbank = np.expand_dims(fbank, axis=0)
    #     feature_embeddings = self.feature_extractor.predict(fbank)
    #     feature_embeddings_scaled = self.pca.transform(feature_embeddings)
    #     prediction = self.marvin_svm.predict(feature_embeddings_scaled)
    #     return prediction
    def callback(self, in_data, frame_count, time_info, status):
        """
        Obtain the data from buffer and load it to queue
        :param in_data: Data buffer
        :param frame_count: Frame count
        :param time_info: Time information
        :param status: Status
        :return: The input buffer and a paContinue flag
        """
        data0 = np.frombuffer(in_data, dtype="int16")
        if np.abs(data0).mean() < self.silence_threshold:
            print(".", sep="", end="", flush=True)
        else:
            print("-", sep="", end="", flush=True)
        # Maintain a sliding window over the most recent window_samples samples
        self.data = np.append(self.data, data0)
        if len(self.data) > self.window_samples:
            self.data = self.data[-self.window_samples:]
        self.queue.put(self.data)
        return in_data, pyaudio.paContinue
    def plotter(self, data, pred):
        """
        Plot waveform and hotword presence
        :param data: Audio data array
        :param pred: Prediction
        :return: None
        """
        plt.clf()

        # Wave
        plt.subplot(311)
        plt.plot(data[-len(data) // 2:])
        plt.gca().xaxis.set_major_locator(plt.NullLocator())
        plt.ylabel("Amplitude")

        # Filterbank energies
        # plt.subplot(312)
        # plt.imshow(fbank[-fbank.shape[0] // 2:, :].T, aspect="auto")
        # plt.gca().xaxis.set_major_locator(plt.NullLocator())
        # plt.gca().invert_yaxis()
        # plt.ylim(0, 40)
        # plt.ylabel("$\log \, E_{m}$")

        # Hotword detection
        plt.subplot(313)
        ax = plt.gca()
        # get_prediction returns the digit as a string, so a single membership
        # test replaces the original per-digit comparisons.
        if pred in DIGITS:
            self.change_bkg = True
        if self.change_bkg and self.change_bkg_counter < self.change_bkg_frames:
            ax.set_facecolor("lightgreen")
            ax.text(
                x=0.5,
                y=0.5,
                s=f"{pred}",
                horizontalalignment="center",
                verticalalignment="center",
                fontsize=30,
                color="red",
                fontweight="bold",
                transform=ax.transAxes,
            )
            self.change_bkg_counter += 1
        else:
            ax.set_facecolor("salmon")
            self.change_bkg = False
            self.change_bkg_counter = 0
        plt.tight_layout()
        plt.pause(0.01)
if __name__ == "__main__":
audio_stream = StreamPrediction("./saved_model")
audio_stream.start_stream()
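# If the default microphone is not picked up, a device index can be passed via
# input_device_index (commented out in start_stream above). A minimal sketch to
# list available input devices (illustrative only):
#
#   pa = pyaudio.PyAudio()
#   for i in range(pa.get_device_count()):
#       info = pa.get_device_info_by_index(i)
#       if info.get("maxInputChannels", 0) > 0:
#           print(i, info["name"])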
import numpy as np
import scipy.signal as sps
from python_speech_features import mfcc
from tensorflow.keras.models import load_model
modelfilepath = './saved_model'
datafilepath = './data'
size = 48
DIGITS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
model = load_model(modelfilepath, compile=True)
def transform_audio(rate, sig):
    # Resample the audio signal to 16 kHz
    number_of_samples = round(len(sig) * float(16000) / rate)
    sig = sps.resample(sig, number_of_samples)
    # Compute MFCC features from the audio signal; the signal has just been
    # resampled, so the MFCCs are computed at 16 kHz rather than the original rate
    mfcc_feat = mfcc(sig, 16000, nfft=2048)
    # Encode the features as a fixed 48x13 matrix for the model input
    mfcc_feat = np.resize(mfcc_feat, (size, 13))
    return mfcc_feat
def get_prediction(X):
    # The model expects input of shape (batch, 48, 13, 1)
    pred = model.predict(X.reshape(-1, size, 13, 1))
    prediction = DIGITS[np.argmax(pred)]
    print("\n\033[1mPredicted digit sound: %.0f" % pred.argmax(), "\033[0m \n")
    print("Predicted probability array:")
    print(pred)
    return prediction
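# A minimal offline usage sketch of the pipeline (illustrative only; the file
# path below is an assumption, not part of the original commit):
#
#   from scipy.io import wavfile
#   rate, sig = wavfile.read('/datasets/live_recordings/one.wav')
#   features = transform_audio(rate, sig)
#   digit = get_prediction(features)
#   print("Predicted digit:", digit)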