Commit 226feab3 authored by NaweenTharuka

feat: Audio emotion detection py

parent c2e71338
import numpy as np
import librosa  # To extract speech features
import glob
import os


# Extract feature function
def extract_audio_features(file_name, should_augment=False, **kwargs):
    """
    Extract features from the audio file `file_name`.
    Features supported:
        - MFCC (mfcc)
        - Chroma (chroma)
        - MEL Spectrogram Frequency (mel)
    e.g.:
        `features = extract_audio_features(path, mel=True, mfcc=True)`
    """
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    # https://stackoverflow.com/questions/9458480/read-mp3-in-python-3
    # https://librosa.org/doc/latest/tutorial.html#quickstart
    # https://github.com/librosa/librosa/issues/1015
    X, sample_rate = librosa.load(file_name)
    if chroma:
        stft = np.abs(librosa.stft(X))
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
        # print('mfccs shape', mfccs.shape)
    if mel:
        mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel))
        # print('mel shape', mel.shape)
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma))
        # print('chroma shape', chroma.shape)
    return result
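

# Illustrative usage (editor's sketch, not part of the original commit;
# "example.wav" is a hypothetical local audio file):
if __name__ == "__main__":
    features = extract_audio_features("example.wav", mfcc=True, chroma=True, mel=True)
    # With librosa defaults this yields 40 MFCCs + 128 mel bands + 12 chroma bins = 180 values
    print(features.shape)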
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "predict_emotion_mer_thesis_app.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNROFZ0mqm4VUJjZk5Jgyiy",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/HonzaCuhel/mer-thesis-app/blob/main/predict_emotion_mer_thesis_app.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "coKAZLXvspj9"
},
"source": [
"# Multimodal Speech Emotion Recognition - Demo app\n",
"<hr/>\n",
"<b>Description:</b> This notebook contains a demo application of Emotion Recognition models trained on the IEMOCAP dataset using 4 basic emotions.<br/>\n",
"<b>Model Architecture:</b> Electra small (TER), TRILL (SER), YAMNet (SER), Electra + TRILL (MER), Electra + YAMNet (MER)<br/>\n",
"<b>Author:</b> Jan Čuhel<br/>\n",
"<b>Date:</b> 5.5.2021<br/>\n",
"<b>Dataset:</b> <a href='https://usc.edu/iemocap/'>IEMOCAP</a>, <a href='https://zenodo.org/record/1188976'>RAVDESS</a>, <a href='https://github.com/bfelbo/DeepMoji/tree/master/data/PsychExp'>PsychExp</a><br/>\n",
"<b>Predicting emotions:</b><br/>\n",
"- IEMOCAP: [happy + excited, sad, angry, neutral]<br/>\n",
"- RAVDESS: [neutral, calm, happy, sad, angry, fearful, disgust, surprised]<br/>\n",
"- PsychExp: [joy, fear, anger, sadness, disgust, shame, guilt]<br/>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DHBgi40Tycgh"
},
"source": [
"\n",
"###Resources \n",
"- https://ricardodeazambuja.com/deep_learning/2019/03/09/audio_and_video_google_colab/\n",
"- https://stackoverflow.com/questions/9031783/hide-all-warnings-in-ipython\n",
"- https://getemoji.com/\n",
"- https://realpython.com/python-speech-recognition/\n",
"- https://github.com/Uberi/speech_recognition#readme\n",
"- https://www.howtogeek.com/423214/how-to-use-the-rename-command-on-linux/"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "vj-Bbk2DyK1-"
},
"source": [
"## Code preparation\n",
"### Please run this code cell (click inside the cell and press `Ctrl + Enter`, or click on the `run icon` in the top left corner of the cell)\n",
"\n",
"What this does:\n",
"\n",
"1. Installs the required Python packages\n",
"2. Clones a Github repository containing classes for Emotion Recognition models\n",
"3. Downloads the trained saved models\n",
"4. Moves the downloaded trained saved models\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "ktVYqO43jrPs"
},
"source": [
"# Step 1) Installation\n",
"!pip install -q ffmpeg-python SpeechRecognition gTTS pydub librosa tensorflow-text\n",
"\n",
"# Step 2) Connect to GitHub\n",
"!git clone https://github.com/HonzaCuhel/mer-thesis-app\n",
"\n",
"# Step 3) Download the models\n",
"!wget 'https://alquist2021-data.s3.amazonaws.com/public/mer_electra_yamnet_iemocap_model+(1).h5'\n",
"!wget 'https://alquist2021-data.s3.amazonaws.com/public/mer_trill_electra_small_model.h5'\n",
"!wget 'https://alquist2021-data.s3.amazonaws.com/public/ter_electra_iemocap_model+(1).h5'\n",
"!wget 'https://alquist2021-data.s3.amazonaws.com/public/ter_electra_model_psychexp.h5'\n",
"\n",
"# Step 4) Move the models\n",
"!mv '/content/mer_electra_yamnet_iemocap_model+(1).h5' /content/mer-thesis-app/result_models/mer_electra_yamnet_iemocap_model.h5\n",
"!mv '/content/mer_trill_electra_small_model.h5' /content/mer-thesis-app/result_models/mer_trill_electra_small_model.h5\n",
"!mv '/content/ter_electra_iemocap_model+(1).h5' /content/mer-thesis-app/result_models/ter_electra_iemocap_model.h5\n",
"!mv '/content/ter_electra_model_psychexp.h5' /content/mer-thesis-app/result_models/ter_electra_model_psychexp.h5"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "A17BHwScB7DF"
},
"source": [
"## Code definition\n",
"### Please run this code cell (click inside the cell and press `Ctrl + Enter`, or click on the `run icon` in the top left corner of the cell) as well\n",
"\n",
"What this does:\n",
"\n",
"1. Imports the required packages\n",
"2. Defines some constants\n",
"3. Load the trained models\n",
"4. Defines functions for Emotion Recognition"
]
},
{
"cell_type": "code",
"metadata": {
"id": "n00MeETqutfK"
},
"source": [
"# Step 1) Imports\n",
"import sys\n",
"import gtts\n",
"import os\n",
"import IPython.display as display\n",
"import matplotlib.pyplot as plt\n",
"import librosa\n",
"import speech_recognition as sr\n",
"\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"sys.path.append('/content/mer-thesis-app/')\n",
"from record_audio import get_audio\n",
"from predict_emotion_tf import *\n",
"\n",
"\n",
"# Step 2) Defining constants\n",
"lang = 'en'\n",
"dur = 11\n",
"emoji_dict = {\"happy\":\"😊\", \"fear\":\"😱\", \"angry\":\"😡\", \"sad\":\"😢\", \"disgust\":\"🤮\", \"shame\":\"😳\", \"guilt\":\"😓\", \"neutral\": \"😐\"}\n",
"NO = 'no'\n",
"DEFAULT_SAMPLE_RATE = 16000\n",
"output_file = 'output_emotion.mp3'\n",
"\n",
"# Step 3) Model loading\n",
"print('Models are being loaded, it will take some time...')\n",
"\n",
"ser_trill_model_iemocap = TRILLSERModel(SER_TRILL_MODEL_IEMOCAP, TRILL_URL, emotions_iemocap, input_length_iemocap, SAMPLING_RATE)\n",
"ser_trill_model_ravdess = TRILLSERModel(SER_TRILL_MODEL_RAVDESS, TRILL_URL, emotions_ravdess, input_length_ravdess, SAMPLING_RATE)\n",
"mer_electra_trill_model_iemocap = ElectraTRILLMERModel(MER_ELECTRA_TRILL, TRILL_URL, emotions_iemocap, input_length_iemocap, SAMPLING_RATE)\n",
"\n",
"ser_yamnet_model_iemocap = YAMNetSERModel(SER_YAMNET_MODEL_IEMOCAP, YAMNET_URL, emotions_iemocap, input_length_iemocap, SAMPLING_RATE)\n",
"ser_yamnet_model_ravdess = YAMNetSERModel(SER_YAMNET_MODEL_RAVDESS, YAMNET_URL, emotions_ravdess, input_length_ravdess, SAMPLING_RATE)\n",
"mer_electra_yamnet_model_iemocap = ElectraYAMNetMERModel(MER_ELECTRA_YAMNET, YAMNET_URL, emotions_iemocap, input_length_iemocap, SAMPLING_RATE)\n",
"\n",
"ter_electra_model_iemocap = TERModel(TER_ELECTRA_IEMOCAP, emotions_iemocap)\n",
"ter_electra_model_psychexp = TERModel(TER_ELECTRA_PSYCHEXP, emotion_psychexp)\n",
"\n",
"print('Models are loaded!')\n",
"\n",
"# Step 4) Definition of functions\n",
"def get_transription(audio_file):\n",
" # use the audio file as the audio source\n",
" r = sr.Recognizer()\n",
" with sr.AudioFile(audio_file) as source:\n",
" audio = r.record(source, duration=dur) # read the entire audio file\n",
" \n",
" # Resource: https://github.com/Uberi/speech_recognition/blob/master/examples/audio_transcribe.py\n",
" # Recognize speech using Google Speech Recognition\n",
" try:\n",
" text = r.recognize_google(audio, language=lang)\n",
" except sr.UnknownValueError:\n",
" print(\"Google Speech Recognition could not understand audio\")\n",
" return \"\"\n",
" except sr.RequestError as e:\n",
" print(\"Could not request results from Google Speech Recognition service; {0}\".format(e))\n",
" return \"\"\n",
" \n",
" return text\n",
"\n",
"def predict_emotion(audio_file, print_intro=True):\n",
" if print_intro:\n",
" print('Welcome to the Multimodal Speech Emotion Recognizer app from audio and text!')\n",
" print('-'*80)\n",
" print('Help:')\n",
" print(' - record a speech and the program will recognize your emotion')\n",
"\n",
" print('Recognizing emotion...')\n",
" # Recognize the emotion\n",
" text = get_transription(audio_file)\n",
"\n",
" # TRILL predictions\n",
" ser_trill_iemocap = ser_trill_model_iemocap.predict_emotion(audio_file)\n",
" ser_trill_ravdess = ser_trill_model_ravdess.predict_emotion(audio_file)\n",
" mer_trill_electra = mer_electra_trill_model_iemocap.predict_emotion(text, audio_file)\n",
"\n",
" # Yamnet predictions\n",
" ser_yamnet_iemocap = ser_yamnet_model_iemocap.predict_emotion(audio_file)\n",
" ser_yamnet_ravdess = ser_yamnet_model_ravdess.predict_emotion(audio_file)\n",
" mer_yamnet_electra = mer_electra_yamnet_model_iemocap.predict_emotion(text, audio_file)\n",
"\n",
" # TER Electra predictions\n",
" ter_electra_iemocap = ter_electra_model_iemocap.predict_emotion(text)\n",
" ter_electra_psychexp = ter_electra_model_psychexp.predict_emotion(text)\n",
"\n",
" print('\\n' + '='*60)\n",
" print(f'\\nYou\\'ve said: {text}.\\n')\n",
" print(\"Audio's waveform:\")\n",
" plt.figure(figsize=(10,5))\n",
" plt.plot(librosa.load(audio_file)[0])\n",
" plt.title(f'Audio\\'s waveform (sample rate {round(DEFAULT_SAMPLE_RATE/1000)}kHz)')\n",
" plt.xlabel('Time')\n",
" plt.ylabel('Amplitude')\n",
" plt.show()\n",
" print('='*60)\n",
" print(\"Predictions:\")\n",
" print('-'*40)\n",
" print('TRILL models:')\n",
" print(f'MER Electra TRILL (IEMOCAP): {mer_trill_electra}')\n",
" print(f'SER TRILL (IEMOCAP): {ser_trill_iemocap}')\n",
" print(f'SER TRILL (RAVDESS): {ser_trill_ravdess}')\n",
" print('-'*40)\n",
" print('YAMNet models:')\n",
" print(f'MER Electra YAMNet (IEMOCAP): {mer_yamnet_electra}')\n",
" print(f'SER YAMNet (IEMOCAP): {ser_yamnet_iemocap}')\n",
" print(f'SER YAMNet (RAVDESS): {ser_yamnet_ravdess}')\n",
" print('-'*40)\n",
" print('Only text - Electra small')\n",
" print(f'TER Electra small (IEMOCAP): {ter_electra_iemocap}')\n",
" print(f'TER Electra small (PsychExp): {ter_electra_psychexp}')\n",
" print('='*60)\n",
"\n",
" return mer_trill_electra"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "ETpnzv33QOzP"
},
"source": [
"### Record a speech\n",
"\n",
"Here you can record a sample of your speech. To record just execute the next cell by either hitting the `run icon` or by clicking inside of the cell and then press `Ctrl + Enter`. After you've sad something click on the button `Stop recording` to stop recording. \n",
"\n",
"<b>WARNING: ONLY 11 SECONDS OF YOUR SPEECH WILL BE USED, SO IF YOU WILL SPEAK LONGER THE AUDIO FILE WILL BE TRUNCATED, IF YOU WILL SPEAK LESS, IT IS FINE, THE AUDIO RECORDING WILL BE PADDED.</b>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "RPTSXUW9u3Ak"
},
"source": [
"audio, sample_rate, audio_file = get_audio()\n",
"\n",
"print('Speech recorded!')"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "AhbvAYwJP2Ym"
},
"source": [
"#### Emotion recognition"
]
},
{
"cell_type": "code",
"metadata": {
"id": "QJ7Fcv6WJx2A"
},
"source": [
"pred_emotion = predict_emotion(audio_file)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "OEacJF59wbM4"
},
"source": [
"### Audio uploading\n",
"\n",
"You can test out the models by uploading `.wav` audio files and the models will try to predict emotions from them. Try it! \n",
"\n",
"<b>WARNING: ONLY 11 SECONDS OF THE AUDIO FILES WILL BE USED, SO IF YOU WILL SPEAK LONGER THE AUDIO FILE WILL BE TRUNCATED, IF YOU WILL SPEAK LESS, IT IS FINE, THE AUDIO RECORDING WILL BE PADDED.</b>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "btlhd4b5wmjp"
},
"source": [
"from google.colab import files\n",
"\n",
"# Upload files\n",
"uploaded = files.upload()\n",
"# Wav files counter \n",
"i_num = 1\n",
"\n",
"for uf in uploaded.keys():\n",
" if '.wav' in uf:\n",
" print('User uploaded file \"{name}\" with length {length} bytes'.format(\n",
" name=uf, length=len(uploaded[uf])))\n",
" \n",
" print('*'*80)\n",
" print(f'{i_num}) RESULTS FOR {uf}:')\n",
" # Predict the emotion\n",
" pred_emotion = predict_emotion(uf, print_intro=False)\n",
" print('*'*80)\n",
"\n",
" # Actualize counter\n",
" i_num += 1"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "66McRKEM_N5i"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
\ No newline at end of file
#!/usr/bin/env python3
# Author: Jan Cuhel
# Date: 2.5.2021
import os
import gtts
import librosa
import numpy as np
import pickle
from pydub import AudioSegment
from pydub.playback import play
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import scipy
import speech_recognition as sr
# Import TF 2.X and make sure we're running eager.
import tensorflow.compat.v2 as tf
tf.enable_v2_behavior()
assert tf.executing_eagerly()
import warnings
warnings.filterwarnings('ignore')
from extract_audio_features import extract_audio_features
# Audio constants
DURATION_RAVDESS = 3
DURATION_IEMOCAP = 11
SAMPLING_RATE = 16000
input_length_iemocap = SAMPLING_RATE * DURATION_IEMOCAP
input_length_ravdess = SAMPLING_RATE * DURATION_RAVDESS
DEFAULT_FILE = 'microphone-results.wav'
# TRILL models
SER_TRILL_MODEL_IEMOCAP = '/content/mer-thesis-app/result_models/ser_trill_lstm_iemocap_model.h5'
SER_TRILL_MODEL_RAVDESS = '/content/mer-thesis-app/result_models/ser_trill_lstm_ravdess_model.h5'
MER_ELECTRA_TRILL = '/content/mer-thesis-app/result_models/mer_trill_electra_small_model.h5'
# Yamnet models
SER_YAMNET_MODEL_IEMOCAP = '/content/mer-thesis-app/result_models/ser_yamnet_iemocap_model.h5'
SER_YAMNET_MODEL_RAVDESS = '/content/mer-thesis-app/result_models/ser_yamnet_ravdess_model.h5'
MER_ELECTRA_YAMNET = '/content/mer-thesis-app/result_models/mer_electra_yamnet_iemocap_model.h5'
# TER Electra
TER_ELECTRA_IEMOCAP = '/content/mer-thesis-app/result_models/ter_electra_iemocap_model.h5'
TER_ELECTRA_PSYCHEXP = '/content/mer-thesis-app/result_models/ter_electra_model_psychexp.h5'
# Emotions available in the datasets
emotions_iemocap = ['neutral', 'happy', 'sad', 'angry']
emotions_ravdess = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
emotion_psychexp = ['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt']
# Language of the models
LANG='en'
# URL addresses for the audio embeddings
TRILL_URL = 'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill-distilled/3'
YAMNET_URL = 'https://tfhub.dev/google/yamnet/1'
class DeepLearningModel():
    """ Definition of a class for a Deep Learning Emotion Recognition model """
    def __init__(self, model_filename, emotions=emotions_iemocap):
        self.model_filename = model_filename
        self.emotions = emotions
        self.model = self.load_model()

    def load_model(self):
        """ Loads the saved Keras model (with TF Hub custom layers) """
        return tf.keras.models.load_model(
            self.model_filename, custom_objects={'KerasLayer': hub.KerasLayer})

class TERModel(DeepLearningModel):
    """ Definition of a class for Text Emotion Recognition model (TER) """
    def __init__(self, model_filename, emotions=emotions_iemocap):
        super().__init__(model_filename, emotions)

    def predict_emotion(self, text):
        """ Predicts an emotion of the given text """
        X_text = np.array([text])
        # Make prediction
        pred_id = tf.argmax(self.model.predict(X_text), 1).numpy()[0]
        return self.emotions[pred_id]

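# Illustrative usage (editor's sketch, not part of the original commit;
# assumes the model files defined above have already been downloaded):
# ter = TERModel(TER_ELECTRA_IEMOCAP, emotions_iemocap)
# print(ter.predict_emotion("I am so happy to see you!"))
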
class SERModel(DeepLearningModel):
    """ Definition of a class for Speech Emotion Recognition model (SER) """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, emotions)
        self.input_length = input_length
        self.embedding = hub.load(embedding_url)
        self.sample_rate = sample_rate

    def load_model(self):
        """ Loads the model """
        return tf.keras.models.load_model(self.model_filename)

    def predict_emotion(self, audio_file):
        """ Predicts an emotion of the given audio file """
        y, _ = librosa.load(audio_file, sr=self.sample_rate)
        # y, _ = librosa.effects.trim(y, top_db=25)
        # https://en.wikipedia.org/wiki/Wiener_filter
        # https://cs.wikipedia.org/wiki/Wiener%C5%AFv_filtr
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.wiener.html
        y = scipy.signal.wiener(y)
        if len(y) > self.input_length:
            # Cut to the same length
            y = y[0:self.input_length]
        elif self.input_length > len(y):
            # Pad the sequence
            max_offset = self.input_length - len(y)
            y = np.pad(y, (0, max_offset), "constant")
        X_audio = self.get_audio_embedding(y)
        # Make prediction
        pred_id = tf.argmax(self.model.predict(X_audio), 1).numpy()[0]
        return self.emotions[pred_id]

    def get_audio_embedding(self, audio):
        return np.array([audio])

class TRILLSERModel(SERModel):
    """
    Definition of a class for Speech Emotion Recognition model (SER) that
    uses TRILL Embedding
    """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, embedding_url, emotions, input_length, sample_rate)

    def get_audio_embedding(self, audio):
        return np.array([self.embedding(samples=audio, sample_rate=self.sample_rate)['embedding'].numpy()])

class YAMNetSERModel(SERModel):
    """
    Definition of a class for Speech Emotion Recognition model (SER) that
    uses YAMNet as an Embedding
    """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, embedding_url, emotions, input_length, sample_rate)

    def get_audio_embedding(self, audio):
        # Get the embeddings from YAMNet
        _, embeddings, _ = self.embedding(audio)
        return np.array([embeddings.numpy()])

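# Note (editor's sketch, not in the original commit): for a 16 kHz mono waveform,
# the YAMNet hub model returns (scores, embeddings, log_mel_spectrogram), where
# `embeddings` has shape (num_frames, 1024).
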
class MERModel(DeepLearningModel):
    """ Definition of a class for Multimodal Emotion Recognition model (MER) """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, emotions)
        self.input_length = input_length
        self.embedding = hub.load(embedding_url)
        self.sample_rate = sample_rate

    def predict_emotion(self, text, audio_file):
        """ Predicts an emotion of the given text and audio file """
        y, _ = librosa.load(audio_file, sr=self.sample_rate)
        # y, _ = librosa.effects.trim(y, top_db=25)
        # https://en.wikipedia.org/wiki/Wiener_filter
        # https://cs.wikipedia.org/wiki/Wiener%C5%AFv_filtr
        # https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.wiener.html
        y = scipy.signal.wiener(y)
        if len(y) > self.input_length:
            # Cut to the same length
            y = y[0:self.input_length]
        elif self.input_length > len(y):
            # Pad the sequence
            max_offset = self.input_length - len(y)
            y = np.pad(y, (0, max_offset), "constant")
        X_audio = self.get_audio_embedding(y)
        X_text = np.array([text])
        # Make prediction
        pred_id = tf.argmax(self.model.predict([X_text, X_audio]), 1).numpy()[0]
        return self.emotions[pred_id]

    def get_audio_embedding(self, audio):
        return np.array([audio])

class ElectraTRILLMERModel(MERModel):
    """
    Definition of a class for Multimodal Emotion Recognition model (MER) that
    uses TRILL Embedding
    """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, embedding_url, emotions, input_length, sample_rate)

    def get_audio_embedding(self, audio):
        return np.array([self.embedding(samples=audio, sample_rate=self.sample_rate)['embedding'].numpy()])

class ElectraYAMNetMERModel(MERModel):
    """
    Definition of a class for Multimodal Emotion Recognition model (MER) that
    uses YAMNet as an Embedding
    """
    def __init__(self, model_filename, embedding_url, emotions=emotions_iemocap, input_length=input_length_iemocap, sample_rate=SAMPLING_RATE):
        super().__init__(model_filename, embedding_url, emotions, input_length, sample_rate)

    def get_audio_embedding(self, audio):
        # Get the embeddings from YAMNet
        _, embeddings, _ = self.embedding(audio)
        return np.array([embeddings.numpy()])

def record_speech(lang=LANG, dur=DURATION_IEMOCAP, filepath=DEFAULT_FILE):
    """
    This function records speech from a microphone and returns the recognized text.
    params:
        - lang: the language of the recorded speech
        - dur: how long (in seconds) the function should record
        - filepath: path to the file where the audio recording should be saved
    returns:
        - text: transcript of the audio recording
        - filepath: where the audio recording was saved
    """
    # Initialize the recognizer
    r = sr.Recognizer()
    try:
        with sr.Microphone() as source:
            print(f'Starting recording for the next {dur}s.\nPlease speak...')
            # Read the audio data from the default microphone
            audio_data = r.record(source, duration=dur)
            print("Recording ended.\nRecognizing...")
            # Convert speech to text
            text = r.recognize_google(audio_data, language=lang)
            print('Done.')
            print(f'\nYou\'ve said {text}.\n')
            # Write the audio to a WAV file
            with open(filepath, "wb") as f:
                f.write(audio_data.get_wav_data())
            print('Done.')
            return text, filepath
    except Exception:
        print('Something went wrong... Try to speak again')
        return None, None
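

# Illustrative usage (editor's sketch, not part of the original commit; requires a
# local microphone and the PyAudio backend, so it will not work inside Colab,
# where record_audio.get_audio() is used instead):
# text, wav_path = record_speech(lang=LANG, dur=5, filepath='sample.wav')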
"""
Resource: https://ricardodeazambuja.com/deep_learning/2019/03/09/audio_and_video_google_colab/
Author references:
To write this piece of code I took inspiration/code from a lot of places.
It was late night, so I'm not sure how much I created or just copied o.O
Here are some of the possible references:
https://blog.addpipe.com/recording-audio-in-the-browser-using-pure-html5-and-minimal-javascript/
https://stackoverflow.com/a/18650249
https://hacks.mozilla.org/2014/06/easy-audio-capture-with-the-mediarecorder-api/
https://air.ghost.io/recording-to-an-audio-file-using-html5-and-js/
https://stackoverflow.com/a/49019356
"""
from IPython.display import HTML, Audio, display
from google.colab.output import eval_js
from base64 import b64decode
import numpy as np
# from scipy.io.wavfile import read as wav_read
import librosa
import io
import ffmpeg
AUDIO_HTML = """
<script>
var my_div = document.createElement("DIV");
var my_p = document.createElement("P");
var my_btn = document.createElement("BUTTON");
var t = document.createTextNode("Press to start recording");
my_btn.appendChild(t);
//my_p.appendChild(my_btn);
my_div.appendChild(my_btn);
document.body.appendChild(my_div);

var base64data = 0;
var reader;
var recorder, gumStream;
var recordButton = my_btn;

var handleSuccess = function(stream) {
  gumStream = stream;
  var options = {
    //bitsPerSecond: 8000, //chrome seems to ignore, always 48k
    mimeType : 'audio/webm;codecs=opus'
    //mimeType : 'audio/webm;codecs=pcm'
  };
  //recorder = new MediaRecorder(stream, options);
  recorder = new MediaRecorder(stream);
  recorder.ondataavailable = function(e) {
    var url = URL.createObjectURL(e.data);
    var preview = document.createElement('audio');
    preview.controls = true;
    preview.src = url;
    document.body.appendChild(preview);

    reader = new FileReader();
    reader.readAsDataURL(e.data);
    reader.onloadend = function() {
      base64data = reader.result;
      //console.log("Inside FileReader:" + base64data);
    }
  };
  recorder.start();
};

recordButton.innerText = "Recording... press to stop";

navigator.mediaDevices.getUserMedia({audio: true}).then(handleSuccess);

function toggleRecording() {
  if (recorder && recorder.state == "recording") {
    recorder.stop();
    gumStream.getAudioTracks()[0].stop();
    recordButton.innerText = "Saving the recording... pls wait!"
  }
}

// https://stackoverflow.com/a/951057
function sleep(ms) {
  return new Promise(resolve => setTimeout(resolve, ms));
}

var data = new Promise(resolve => {
  //recordButton.addEventListener("click", toggleRecording);
  recordButton.onclick = () => {
    toggleRecording()
    sleep(2000).then(() => {
      // wait 2000ms for the data to be available...
      // ideally this should use something like await...
      //console.log("Inside data:" + base64data)
      resolve(base64data.toString())
    });
  }
});
</script>
"""
def get_audio():
    display(HTML(AUDIO_HTML))
    data = eval_js("data")
    binary = b64decode(data.split(',')[1])

    process = (ffmpeg
               .input('pipe:0')
               .output('pipe:1', format='wav')
               .run_async(pipe_stdin=True, pipe_stdout=True, pipe_stderr=True, quiet=True, overwrite_output=True)
               )
    output, err = process.communicate(input=binary)

    riff_chunk_size = len(output) - 8
    # Break up the chunk size into four bytes, held in b.
    q = riff_chunk_size
    b = []
    for i in range(4):
        q, r = divmod(q, 256)
        b.append(r)
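    # (editor's note: equivalent to riff_chunk_size.to_bytes(4, 'little'))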
    # Replace bytes 4:8 in proc.stdout with the actual size of the RIFF chunk.
    riff = output[:4] + bytes(b) + output[8:]

    # sr, audio = wav_read(io.BytesIO(riff))
    audio, sr = librosa.load(io.BytesIO(riff), sr=16000)

    audio_file = 'audio.wav'
    with open(audio_file, 'wb') as f:
        f.write(riff)

    return audio, sr, audio_file