Commit 226feab3 authored by NaweenTharuka

feat: Audio emotion detection py

parent c2e71338
import numpy as np
import librosa  # To extract speech features
import glob
import os


# Extract feature function
def extract_audio_features(file_name, should_augment=False, **kwargs):
    """
    Extract features from the audio file `file_name`.
    Features supported:
        - MFCC (mfcc)
        - Chroma (chroma)
        - MEL Spectrogram Frequency (mel)
    e.g.:
        `features = extract_audio_features(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    # https://stackoverflow.com/questions/9458480/read-mp3-in-python-3
    # https://librosa.org/doc/latest/tutorial.html#quickstart
    # https://github.com/librosa/librosa/issues/1015
    X, sample_rate = librosa.load(file_name)
    if chroma:
        stft = np.abs(librosa.stft(X))
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
        # print('mfccs shape', mfccs.shape)
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel))
        # print('mel shape', mel.shape)
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma))
        # print('chroma shape', chroma.shape)
    return result
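

# Illustrative usage (editor's sketch, not part of the original commit;
# "example.wav" is a hypothetical local audio file):
if __name__ == "__main__":
    features = extract_audio_features("example.wav", mfcc=True, chroma=True, mel=True)
    # With librosa defaults this yields 40 MFCCs + 128 mel bands + 12 chroma bins = 180 values
    print(features.shape)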
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "predict_emotion_mer_thesis_app.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNROFZ0mqm4VUJjZk5Jgyiy",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/HonzaCuhel/mer-thesis-app/blob/main/predict_emotion_mer_thesis_app.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "coKAZLXvspj9"
},
"source": [
"# Multimodal Speech Emotion Recognition - Demo app\n",
"<hr/>\n",
"<b>Description:</b> This notebook contains a demo application of Emotion Recognition models trained on the IEMOCAP dataset using 4 basic emotions.<br/>\n",
"<b>Model Architecture:</b> Electra small (TER), TRILL (SER), YAMNet (SER), Electra + TRILL (MER), Electra + YAMNet (MER)<br/>\n",
"<b>Author:</b> Jan Čuhel<br/>\n",
"<b>Date:</b> 5.5.2021<br/>\n",
"<b>Dataset:</b> <a href='https://usc.edu/iemocap/'>IEMOCAP</a>, <a href='https://zenodo.org/record/1188976'>RAVDESS</a>, <a href='https://github.com/bfelbo/DeepMoji/tree/master/data/PsychExp'>PsychExp</a><br/>\n",
"<b>Predicting emotions:</b><br/>\n",
"- IEMOCAP: [happy + excited, sad, angry, neutral]<br/>\n",
"- RAVDESS: [neutral, calm, happy, sad, angry, fearful, disgust, surprised]<br/>\n",
"- PsychExp: [joy, fear, anger, sadness, disgust, shame, guilt]<br/>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DHBgi40Tycgh"
},
"source": [
"\n",
"###Resources \n",
"- https://ricardodeazambuja.com/deep_learning/2019/03/09/audio_and_video_google_colab/\n",
"- https://stackoverflow.com/questions/9031783/hide-all-warnings-in-ipython\n",
"- https://getemoji.com/\n",
"- https://realpython.com/python-speech-recognition/\n",
"- https://github.com/Uberi/speech_recognition#readme\n",
"- https://www.howtogeek.com/423214/how-to-use-the-rename-command-on-linux/"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vj-Bbk2DyK1-"
},
"source": [
"## Code preparation\n",
"### Please run this code cell (click inside the cell and press `Ctrl + Enter`, or click on the `run icon` in the top left corner of the cell)\n",
"\n",
"What this does:\n",
"\n",
"1. Installs the required Python packages\n",
"2. Clones a Github repository containing classes for Emotion Recognition models\n",
"3. Downloads the trained saved models\n",
"4. Moves the downloaded trained saved models\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "ktVYqO43jrPs"
},
"source": [
"# Step 1) Installation\n",
"!pip install -q ffmpeg-python SpeechRecognition gTTS pydub librosa tensorflow-text\n",
"\n",
"# Step 2) Connect to GitHub\n",
"!git clone https://github.com/HonzaCuhel/mer-thesis-app\n",
"\n",
"# Step 3) Download the models\n",
"!wget 'https://alquist2021-data.s3.amazonaws.com/public/mer_electra_yamnet_iemocap_model+(1).h5'\n",
"!wget 'https://alquist2021-data.s3.amazonaws.com/public/mer_trill_electra_small_model.h5'\n",
"!wget 'https://alquist2021-data.s3.amazonaws.com/public/ter_electra_iemocap_model+(1).h5'\n",
"!wget 'https://alquist2021-data.s3.amazonaws.com/public/ter_electra_model_psychexp.h5'\n",
"\n",
"# Step 4) Move the models\n",
"!mv '/content/mer_electra_yamnet_iemocap_model+(1).h5' /content/mer-thesis-app/result_models/mer_electra_yamnet_iemocap_model.h5\n",
"!mv '/content/mer_trill_electra_small_model.h5' /content/mer-thesis-app/result_models/mer_trill_electra_small_model.h5\n",
"!mv '/content/ter_electra_iemocap_model+(1).h5' /content/mer-thesis-app/result_models/ter_electra_iemocap_model.h5\n",
"!mv '/content/ter_electra_model_psychexp.h5' /content/mer-thesis-app/result_models/ter_electra_model_psychexp.h5"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "A17BHwScB7DF"
},
"source": [
"## Code definition\n",
"### Please run this code cell (click inside the cell and press `Ctrl + Enter`, or click on the `run icon` in the top left corner of the cell) as well\n",
"\n",
"What this does:\n",
"\n",
"1. Imports the required packages\n",
"2. Defines some constants\n",
"3. Load the trained models\n",
"4. Defines functions for Emotion Recognition"
]
},
{
"cell_type": "code",
"metadata": {
"id": "n00MeETqutfK"
},
"source": [
"# Step 1) Imports\n",
"import sys\n",
"import gtts\n",
"import os\n",
"import IPython.display as display\n",
"import matplotlib.pyplot as plt\n",
"import librosa\n",
"import speech_recognition as sr\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"sys.path.append('/content/mer-thesis-app/')\n",
"from record_audio import get_audio\n",
"from predict_emotion_tf import *\n",
"\n",
"\n",
"# Step 2) Defining constants\n",
"lang = 'en'\n",
"dur = 11\n",
"emoji_dict = {\"happy\":\"😊\", \"fear\":\"😱\", \"angry\":\"😡\", \"sad\":\"😢\", \"disgust\":\"🤮\", \"shame\":\"😳\", \"guilt\":\"😓\", \"neutral\": \"😐\"}\n",
"NO = 'no'\n",
"DEFAULT_SAMPLE_RATE = 16000\n",
"output_file = 'output_emotion.mp3'\n",
"\n",
"# Step 3) Model loading\n",
"print('Models are being loaded, it will take some time...')\n",
"\n",
"ser_trill_model_iemocap = TRILLSERModel(SER_TRILL_MODEL_IEMOCAP, TRILL_URL, emotions_iemocap, input_length_iemocap, SAMPLING_RATE)\n",
"ser_trill_model_ravdess = TRILLSERModel(SER_TRILL_MODEL_RAVDESS, TRILL_URL, emotions_ravdess, input_length_ravdess, SAMPLING_RATE)\n",
"mer_electra_trill_model_iemocap = ElectraTRILLMERModel(MER_ELECTRA_TRILL, TRILL_URL, emotions_iemocap, input_length_iemocap, SAMPLING_RATE)\n",
"\n",
"ser_yamnet_model_iemocap = YAMNetSERModel(SER_YAMNET_MODEL_IEMOCAP, YAMNET_URL, emotions_iemocap, input_length_iemocap, SAMPLING_RATE)\n",
"ser_yamnet_model_ravdess = YAMNetSERModel(SER_YAMNET_MODEL_RAVDESS, YAMNET_URL, emotions_ravdess, input_length_ravdess, SAMPLING_RATE)\n",
"mer_electra_yamnet_model_iemocap = ElectraYAMNetMERModel(MER_ELECTRA_YAMNET, YAMNET_URL, emotions_iemocap, input_length_iemocap, SAMPLING_RATE)\n",
"\n",
"ter_electra_model_iemocap = TERModel(TER_ELECTRA_IEMOCAP, emotions_iemocap)\n",
"ter_electra_model_psychexp = TERModel(TER_ELECTRA_PSYCHEXP, emotion_psychexp)\n",
"\n",
"print('Models are loaded!')\n",
"\n",
"# Step 4) Definition of functions\n",
"def get_transription(audio_file):\n",
" # use the audio file as the audio source\n",
" r = sr.Recognizer()\n",
" with sr.AudioFile(audio_file) as source:\n",
" audio = r.record(source, duration=dur) # read the entire audio file\n",
" \n",
" # Resource: https://github.com/Uberi/speech_recognition/blob/master/examples/audio_transcribe.py\n",
" # Recognize speech using Google Speech Recognition\n",
" try:\n",
" text = r.recognize_google(audio, language=lang)\n",
" except sr.UnknownValueError:\n",
" print(\"Google Speech Recognition could not understand audio\")\n",
" return \"\"\n",
" except sr.RequestError as e:\n",
" print(\"Could not request results from Google Speech Recognition service; {0}\".format(e))\n",
" return \"\"\n",
" \n",
" return text\n",
"\n",
"def predict_emotion(audio_file, print_intro=True):\n",
" if print_intro:\n",
" print('Welcome to the Multimodal Speech Emotion Recognizer app from audio and text!')\n",
" print('-'*80)\n",
" print('Help:')\n",
" print(' - record a speech and the program will recognize your emotion')\n",
"\n",
" print('Recognizing emotion...')\n",
" # Recognize the emotion\n",
" text = get_transription(audio_file)\n",
"\n",
" # TRILL predictions\n",
" ser_trill_iemocap = ser_trill_model_iemocap.predict_emotion(audio_file)\n",
" ser_trill_ravdess = ser_trill_model_ravdess.predict_emotion(audio_file)\n",
" mer_trill_electra = mer_electra_trill_model_iemocap.predict_emotion(text, audio_file)\n",
"\n",
" # Yamnet predictions\n",
" ser_yamnet_iemocap = ser_yamnet_model_iemocap.predict_emotion(audio_file)\n",
" ser_yamnet_ravdess = ser_yamnet_model_ravdess.predict_emotion(audio_file)\n",
" mer_yamnet_electra = mer_electra_yamnet_model_iemocap.predict_emotion(text, audio_file)\n",
"\n",
" # TER Electra predictions\n",
" ter_electra_iemocap = ter_electra_model_iemocap.predict_emotion(text)\n",
" ter_electra_psychexp = ter_electra_model_psychexp.predict_emotion(text)\n",
"\n",
" print('\\n' + '='*60)\n",
" print(f'\\nYou\\'ve said: {text}.\\n')\n",
" print(\"Audio's waveform:\")\n",
" plt.figure(figsize=(10,5))\n",
" plt.plot(librosa.load(audio_file)[0])\n",
" plt.title(f'Audio\\'s waveform (sample rate {round(DEFAULT_SAMPLE_RATE/1000)}kHz)')\n",
" plt.xlabel('Time')\n",
" plt.ylabel('Amplitude')\n",
" plt.show()\n",
" print('='*60)\n",
" print(\"Predictions:\")\n",
" print('-'*40)\n",
" print('TRILL models:')\n",
" print(f'MER Electra TRILL (IEMOCAP): {mer_trill_electra}')\n",
" print(f'SER TRILL (IEMOCAP): {ser_trill_iemocap}')\n",
" print(f'SER TRILL (RAVDESS): {ser_trill_ravdess}')\n",
" print('-'*40)\n",
" print('YAMNet models:')\n",
" print(f'MER Electra YAMNet (IEMOCAP): {mer_yamnet_electra}')\n",
" print(f'SER YAMNet (IEMOCAP): {ser_yamnet_iemocap}')\n",
" print(f'SER YAMNet (RAVDESS): {ser_yamnet_ravdess}')\n",
" print('-'*40)\n",
" print('Only text - Electra small')\n",
" print(f'TER Electra small (IEMOCAP): {ter_electra_iemocap}')\n",
" print(f'TER Electra small (PsychExp): {ter_electra_psychexp}')\n",
" print('='*60)\n",
"\n",
" return mer_trill_electra"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ETpnzv33QOzP"
},
"source": [
"### Record a speech\n",
"\n",
"Here you can record a sample of your speech. To record just execute the next cell by either hitting the `run icon` or by clicking inside of the cell and then press `Ctrl + Enter`. After you've sad something click on the button `Stop recording` to stop recording. \n",
"\n",
"<b>WARNING: ONLY 11 SECONDS OF YOUR SPEECH WILL BE USED, SO IF YOU WILL SPEAK LONGER THE AUDIO FILE WILL BE TRUNCATED, IF YOU WILL SPEAK LESS, IT IS FINE, THE AUDIO RECORDING WILL BE PADDED.</b>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "RPTSXUW9u3Ak"
},
"source": [
"audio, sample_rate, audio_file = get_audio()\n",
"\n",
"print('Speech recorded!')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "AhbvAYwJP2Ym"
},
"source": [
"#### Emotion recognition"
]
},
{
"cell_type": "code",
"metadata": {
"id": "QJ7Fcv6WJx2A"
},
"source": [
"pred_emotion = predict_emotion(audio_file)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "OEacJF59wbM4"
},
"source": [
"### Audio uploading\n",
"\n",
"You can test out the models by uploading `.wav` audio files and the models will try to predict emotions from them. Try it! \n",
"\n",
"<b>WARNING: ONLY 11 SECONDS OF THE AUDIO FILES WILL BE USED, SO IF YOU WILL SPEAK LONGER THE AUDIO FILE WILL BE TRUNCATED, IF YOU WILL SPEAK LESS, IT IS FINE, THE AUDIO RECORDING WILL BE PADDED.</b>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "btlhd4b5wmjp"
},
"source": [
"from google.colab import files\n",
"\n",
"# Upload files\n",
"uploaded = files.upload()\n",
"# Wav files counter \n",
"i_num = 1\n",
"\n",
"for uf in uploaded.keys():\n",
" if '.wav' in uf:\n",
" print('User uploaded file \"{name}\" with length {length} bytes'.format(\n",
" name=uf, length=len(uploaded[uf])))\n",
" \n",
" print('*'*80)\n",
" print(f'{i_num}) RESULTS FOR {uf}:')\n",
" # Predict the emotion\n",
" pred_emotion = predict_emotion(uf, print_intro=False)\n",
" print('*'*80)\n",
"\n",
" # Actualize counter\n",
" i_num += 1"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "66McRKEM_N5i"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
\ No newline at end of file
#!/usr/bin/env python3
# Author: Jan Cuhel
# Date: 2.5.2021
import os
import gtts
import librosa
import numpy as np
import pickle
from pydub import AudioSegment
from pydub.playback import play
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import scipy
import speech_recognition as sr
# Import TF 2.X and make sure we're running eager.
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
assert tf.executing_eagerly()
import warnings
warnings.filterwarnings('ignore')
from extract_audio_features import extract_audio_features
# Audio constants
DURATION_RAVDESS = 3
DURATION_IEMOCAP = 11
SAMPLING_RATE = 16000
input_length_iemocap = SAMPLING_RATE * DURATION_IEMOCAP
input_length_ravdess = SAMPLING_RATE * DURATION_RAVDESS
DEFAULT_FILE = 'microphone-results.wav'
# TRILL models
SER_TRILL_MODEL_IEMOCAP = '/content/mer-thesis-app/result_models/ser_trill_lstm_iemocap_model.h5'
SER_TRILL_MODEL_RAVDESS = '/content/mer-thesis-app/result_models/ser_trill_lstm_ravdess_model.h5'
MER_ELECTRA_TRILL = '/content/mer-thesis-app/result_models/mer_trill_electra_small_model.h5'
# Yamnet models
SER_YAMNET_MODEL_IEMOCAP = '/content/mer-thesis-app/result_models/ser_yamnet_iemocap_model.h5'
SER_YAMNET_MODEL_RAVDESS = '/content/mer-thesis-app/result_models/ser_yamnet_ravdess_model.h5'
MER_ELECTRA_YAMNET = '/content/mer-thesis-app/result_models/mer_electra_yamnet_iemocap_model.h5'
# TER Electra
TER_ELECTRA_IEMOCAP = '/content/mer-thesis-app/result_models/ter_electra_iemocap_model.h5'
TER_ELECTRA_PSYCHEXP = '/content/mer-thesis-app/result_models/ter_electra_model_psychexp.h5'
# Emotions available in the datasets
emotions_iemocap = ['neutral', 'happy', 'sad', 'angry']
emotions_ravdess = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
emotion_psychexp = ['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt']
# Language of the models
LANG='en'
# URL addresses for the audio embeddings
TRILL_URL = 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/3'
YAMNET_URL = 'https://tfhub.dev/google/yamnet/1'
class DeepLearningModel():
    """ Definition of a class for a Deep Learning Emotion Recognition model """
    def __init__(self, model_filename, emotions=emotions_iemocap):
        self.model_filename = model_filename
        self.emotions = emotions
        self.model = self.load_model()

    def load_model(self):
        """ Loads the saved Keras model (with TF Hub custom layers) """
        return tf.keras.models.load_model(
            self.model_filename, custom_objects={'KerasLayer': hub.KerasLayer})

class TERModel(DeepLearningModel):
    """ Definition of a class for Text Emotion Recognition model (TER) """
    def __init__(self, model_filename, emotions=emotions_iemocap):
        super().__init__(model_filename, emotions)

    def predict_emotion(self, text):
        """ Predicts an emotion of the given text """
        X_text = np.array([text])
        # Make prediction
        pred_id = tf.argmax(self.model.predict(X_text), 1).numpy()[0]
        return self.emotions[pred_id]

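# Illustrative usage (editor's sketch, not part of the original commit;
# assumes the model files defined above have already been downloaded):
# ter = TERModel(TER_ELECTRA_IEMOCAP, emotions_iemocap)
# print(ter.predict_emotion("I am so happy to see you!"))
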
class SERModel(DeepLearningModel):
    """ Definition of a class for Speech Emotion Recognition model (SER) """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, emotions)
        self.input_length = input_length
        self.embedding = hub.load(embedding_url)
        self.sample_rate = sample_rate

    def load_model(self):
        """ Loads the model """
        return tf.keras.models.load_model(self.model_filename)

    def predict_emotion(self, audio_file):
        """ Predicts an emotion of the given audio file """
        y, _ = librosa.load(audio_file, sr=self.sample_rate)
        # y, _ = librosa.effects.trim(y, top_db=25)
        # https://en.wikipedia.org/wiki/Wiener_filter
        # https://cs.wikipedia.org/wiki/Wiener%C5%AFv_filtr
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.wiener.html
        y = scipy.signal.wiener(y)
        if len(y) > self.input_length:
            # Cut to the same length
            y = y[0:self.input_length]
        elif self.input_length > len(y):
            # Pad the sequence
            max_offset = self.input_length - len(y)
            y = np.pad(y, (0, max_offset), "constant")
        X_audio = self.get_audio_embedding(y)
        # Make prediction
        pred_id = tf.argmax(self.model.predict(X_audio), 1).numpy()[0]
        return self.emotions[pred_id]

    def get_audio_embedding(self, audio):
        return np.array([audio])

class TRILLSERModel(SERModel):
    """
    Definition of a class for Speech Emotion Recognition model (SER) that
    uses TRILL Embedding
    """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, embedding_url, emotions, input_length, sample_rate)

    def get_audio_embedding(self, audio):
        return np.array([self.embedding(samples=audio, sample_rate=self.sample_rate)['embedding'].numpy()])

class YAMNetSERModel(SERModel):
    """
    Definition of a class for Speech Emotion Recognition model (SER) that
    uses YAMNet as an Embedding
    """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, embedding_url, emotions, input_length, sample_rate)

    def get_audio_embedding(self, audio):
        # Get the embeddings from YAMNet
        _, embeddings, _ = self.embedding(audio)
        return np.array([embeddings.numpy()])

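# Note (editor's sketch, not in the original commit): for a 16 kHz mono waveform,
# the YAMNet hub model returns (scores, embeddings, log_mel_spectrogram), where
# `embeddings` has shape (num_frames, 1024).
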
class MERModel(DeepLearningModel):
    """ Definition of a class for Multimodal Emotion Recognition model (MER) """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, emotions)
        self.input_length = input_length
        self.embedding = hub.load(embedding_url)
        self.sample_rate = sample_rate

    def predict_emotion(self, text, audio_file):
        """ Predicts an emotion of the given text and audio file """
        y, _ = librosa.load(audio_file, sr=self.sample_rate)
        # y, _ = librosa.effects.trim(y, top_db=25)
        # https://en.wikipedia.org/wiki/Wiener_filter
        # https://cs.wikipedia.org/wiki/Wiener%C5%AFv_filtr
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.wiener.html
        y = scipy.signal.wiener(y)
        if len(y) > self.input_length:
            # Cut to the same length
            y = y[0:self.input_length]
        elif self.input_length > len(y):
            # Pad the sequence
            max_offset = self.input_length - len(y)
            y = np.pad(y, (0, max_offset), "constant")
        X_audio = self.get_audio_embedding(y)
        X_text = np.array([text])
        # Make prediction
        pred_id = tf.argmax(self.model.predict([X_text, X_audio]), 1).numpy()[0]
        return self.emotions[pred_id]

    def get_audio_embedding(self, audio):
        return np.array([audio])

class ElectraTRILLMERModel(MERModel):
    """
    Definition of a class for Multimodal Emotion Recognition model (MER) that
    uses TRILL Embedding
    """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, embedding_url, emotions, input_length, sample_rate)

    def get_audio_embedding(self, audio):
        return np.array([self.embedding(samples=audio, sample_rate=self.sample_rate)['embedding'].numpy()])

class ElectraYAMNetMERModel(MERModel):
    """
    Definition of a class for Multimodal Emotion Recognition model (MER) that
    uses YAMNet as an Embedding
    """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, embedding_url, emotions, input_length, sample_rate)

    def get_audio_embedding(self, audio):
        # Get the embeddings from YAMNet
        _, embeddings, _ = self.embedding(audio)
        return np.array([embeddings.numpy()])

def record_speech(lang=LANG, dur=DURATION_IEMOCAP, filepath=DEFAULT_FILE):
    """
    This function records speech from a microphone and returns the recognized text.
    params:
        - lang: the language of the recorded speech
        - dur: how long (in seconds) the function should record
        - filepath: path to the file where the audio recording should be saved
    returns:
        - text: transcript of the audio recording
        - filepath: where the audio recording was saved
    """
    # Initialize the recognizer
    r = sr.Recognizer()
    try:
        with sr.Microphone() as source:
            print(f'Starting recording for the next {dur}s.\nPlease speak...')
            # Read the audio data from the default microphone
            audio_data = r.record(source, duration=dur)
            print("Recording ended.\nRecognizing...")
            # Convert speech to text
            text = r.recognize_google(audio_data, language=lang)
            print('Done.')
            print(f'\nYou\'ve said {text}.\n')
            # Write the audio to a WAV file
            with open(filepath, "wb") as f:
                f.write(audio_data.get_wav_data())
            print('Done.')
            return text, filepath
    except Exception:
        print('Something went wrong... Try to speak again')
        return None, None
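

# Illustrative usage (editor's sketch, not part of the original commit; requires a
# local microphone and the PyAudio backend, so it will not work inside Colab,
# where record_audio.get_audio() is used instead):
# text, wav_path = record_speech(lang=LANG, dur=5, filepath='sample.wav')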
"""
Resource: https://ricardodeazambuja.com/deep_learning/2019/03/09/audio_and_video_google_colab/
Author references:
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio, display
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
# from scipy.io.wavfile import read as wav_read
import librosa
import io
import ffmpeg
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");
my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data);
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
};

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);

function toggleRecording() {
  if (recorder && recorder.state == "recording") {
    recorder.stop();
    gumStream.getAudioTracks()[0].stop();
    recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve => {
  //recordButton.addEventListener("click", toggleRecording);
  recordButton.onclick = () => {
    toggleRecording()
    sleep(2000).then(() => {
      // wait 2000ms for the data to be available...
      // ideally this should use something like await...
      //console.log("Inside data:" + base64data)
      resolve(base64data.toString())
    });
  }
});
</script>
"""
def get_audio():
    display(HTML(AUDIO_HTML))
    data = eval_js("data")
    binary = b64decode(data.split(',')[1])

    process = (ffmpeg
               .input('pipe:0')
               .output('pipe:1', format='wav')
               .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
               )
    output, err = process.communicate(input=binary)

    riff_chunk_size = len(output) - 8
    # Break up the chunk size into four bytes, held in b.
    q = riff_chunk_size
    b = []
    for i in range(4):
        q, r = divmod(q, 256)
        b.append(r)
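    # (editor's note: equivalent to riff_chunk_size.to_bytes(4, 'little'))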
    # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.
    riff = output[:4] + bytes(b) + output[8:]

    # sr, audio = wav_read(io.BytesIO(riff))
    audio, sr = librosa.load(io.BytesIO(riff), sr=16000)

    audio_file = 'audio.wav'
    with open(audio_file, 'wb') as f:
        f.write(riff)

    return audio, sr, audio_file