Commit 9c1e53da authored by kulvinu

Added Backend

parent 7de86a70
import wave
import struct
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
def read_file(file_name, sample_rate):
    """Read a 16-bit PCM WAV file and return its samples as a tuple of ints."""
    wav_file = wave.open(file_name, mode="rb")
    channels = wav_file.getnchannels()
    num_frames = wav_file.getnframes()
    if wav_file.getframerate() != sample_rate:
        raise ValueError("Audio file should have a sample rate of %d, got %d" % (sample_rate, wav_file.getframerate()))
    samples = wav_file.readframes(num_frames)
    wav_file.close()
    frames = struct.unpack('h' * num_frames * channels, samples)
    if channels == 2:
        print("Picovoice processes single-channel audio but a stereo file was provided. Processing left channel only.")
    return frames[::channels]
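# A minimal usage sketch for read_file (illustrative only; the 16 kHz rate and
# the file path below are assumptions, not part of the original commit):
#
#   pcm = read_file('/datasets/live_recordings/one.wav', 16000)
#   pcm = np.array(pcm, dtype=np.int16)  # convert the unpacked tuple to a numpy array
#   print("Loaded %d samples" % len(pcm))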
## Loading audio
dataset_dir = '/datasets/live_recordings/'
audio_name = 'one.wav'
y, sample_rate = librosa.load(dataset_dir + audio_name, res_type='kaiser_fast')
# Play the original audio
print("Original audio - downsampled by librosa")
ipd.Audio(y, rate=sample_rate)
#------------------------------------------------------------------------------------
## Trim the beginning and ending silence
y_trimmed, _ = librosa.effects.trim(y)
print("Original duration: ", librosa.get_duration(y))
print("Trimmed duration: ", librosa.get_duration(y_trimmed))
figure = plt.figure()
# Trimmed audio - without silence
trimmed = figure.add_subplot(2, 1, 2)
librosa.display.waveplot(y_trimmed, sr=sample_rate, color='r')
plt.title('Trimmed')
# Original audio - with silence at the end
original = figure.add_subplot(2, 1, 1, sharex=trimmed)
librosa.display.waveplot(y, sr=sample_rate)
plt.title('Original')
plt.tight_layout()
plt.show()
# Play the trimmed audio
print("Trimmed audio")
ipd.Audio(y_trimmed, rate=sample_rate)
### Audio Segmentation into windows
from pydub import AudioSegment
from pydub.silence import split_on_silence
sound_file = AudioSegment.from_wav(dataset_dir + audio_name)
audio_chunks = split_on_silence(sound_file, min_silence_len=500, silence_thresh=-40)
print("AudioChunks", audio_chunks)
for i, chunk in enumerate(audio_chunks):
    out_file = "./a/chunk{0}.wav".format(i)
    print("exporting", out_file)
    chunk.export(out_file, format="wav")
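# A quick sanity check on the chunks (illustrative only): the length of a pydub
# AudioSegment is its duration in milliseconds, so this prints each chunk's duration.
#
#   for i, chunk in enumerate(audio_chunks):
#       print("chunk %d: %d ms" % (i, len(chunk)))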
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from mlengine import transform_audio, get_prediction, DIGITS
import pickle
import pyaudio
import numpy as np
from queue import Queue
import matplotlib.pyplot as plt
from python_speech_features import logfbank
class StreamPrediction:
    """
    Class for predicting digit keywords in streaming audio data.
    Heavily adapted from an existing open-source implementation.
    """

    def __init__(self, model_path):
        # Load model
        self.feature_extractor = None
        self.pca = None
        self.marvin_svm = None
        self.load_models(model_path)

        # Recording parameters
        self.sr = 16000
        self.chunk_duration = 0.75
        self.chunk_samples = int(self.sr * self.chunk_duration)
        self.window_duration = 1
        self.window_samples = int(self.sr * self.window_duration)
        self.silence_threshold = 100

        # Data structures and buffers
        self.queue = Queue()
        self.data = np.zeros(self.window_samples, dtype="int16")

        # Plotting parameters
        self.change_bkg_frames = 2
        self.change_bkg_counter = 0
        self.change_bkg = False
    def load_models(self, model_path):
        """
        Loads the models for hotword detection
        :param model_path: Path to model directory
        :return: None
        """
        # Load model structure
        model = load_model(model_path, compile=True)
        # layer_name = "features256"
        # self.feature_extractor = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)
        # # Load trained PCA object
        # with open(model_path + "/marvin_kws_pca.pickle", "rb") as file:
        #     self.pca = pickle.load(file)
        # # Load trained SVM
        # with open(model_path + "/marvin_kws_svm.pickle", "rb") as file:
        #     self.marvin_svm = pickle.load(file)
        print("Loaded models from disk")
    def start_stream(self):
        """
        Start audio data streaming from microphone
        :return: None
        """
        stream = pyaudio.PyAudio().open(
            format=pyaudio.paInt16,
            channels=1,
            rate=self.sr,
            input=True,
            frames_per_buffer=self.chunk_samples,
            # input_device_index=6,
            stream_callback=self.callback,
        )
        stream.start_stream()
        try:
            while True:
                data = self.queue.get()
                # fbank = logfbank(data, samplerate=self.sr, nfilt=40)
                # pred = self.detect_keyword(fbank)
                tensor = transform_audio(self.sr, data)
                pred = get_prediction(tensor)
                self.plotter(data, pred)
                # get_prediction returns the predicted digit as a string, so it
                # can be printed directly instead of branching on each value.
                if pred in DIGITS:
                    print(pred, sep="", end="", flush=True)
        except (KeyboardInterrupt, SystemExit):
            stream.stop_stream()
            stream.close()
    # def detect_keyword(self, fbank):
    #     """
    #     Detect hotword presence in current window
    #     :param fbank: Log Mel filterbank energies
    #     :return: Prediction
    #     """
    #     fbank = np.expand_dims(fbank, axis=0)
    #     feature_embeddings = self.feature_extractor.predict(fbank)
    #     feature_embeddings_scaled = self.pca.transform(feature_embeddings)
    #     prediction = self.marvin_svm.predict(feature_embeddings_scaled)
    #     return prediction
    def callback(self, in_data, frame_count, time_info, status):
        """
        Obtain the data from buffer and load it to queue
        :param in_data: Data buffer
        :param frame_count: Frame count
        :param time_info: Time information
        :param status: Status
        :return: The input buffer and a paContinue flag
        """
        data0 = np.frombuffer(in_data, dtype="int16")
        if np.abs(data0).mean() < self.silence_threshold:
            print(".", sep="", end="", flush=True)
        else:
            print("-", sep="", end="", flush=True)
        # Maintain a sliding window over the most recent window_samples samples
        self.data = np.append(self.data, data0)
        if len(self.data) > self.window_samples:
            self.data = self.data[-self.window_samples:]
        self.queue.put(self.data)
        return in_data, pyaudio.paContinue
    def plotter(self, data, pred):
        """
        Plot waveform and hotword presence
        :param data: Audio data array
        :param pred: Prediction
        :return: None
        """
        plt.clf()

        # Wave
        plt.subplot(311)
        plt.plot(data[-len(data) // 2:])
        plt.gca().xaxis.set_major_locator(plt.NullLocator())
        plt.ylabel("Amplitude")

        # Filterbank energies
        # plt.subplot(312)
        # plt.imshow(fbank[-fbank.shape[0] // 2:, :].T, aspect="auto")
        # plt.gca().xaxis.set_major_locator(plt.NullLocator())
        # plt.gca().invert_yaxis()
        # plt.ylim(0, 40)
        # plt.ylabel("$\log \, E_{m}$")

        # Hotword detection
        plt.subplot(313)
        ax = plt.gca()
        # get_prediction returns the digit as a string, so a single membership
        # test replaces the original per-digit comparisons.
        if pred in DIGITS:
            self.change_bkg = True
        if self.change_bkg and self.change_bkg_counter < self.change_bkg_frames:
            ax.set_facecolor("lightgreen")
            ax.text(
                x=0.5,
                y=0.5,
                s=f"{pred}",
                horizontalalignment="center",
                verticalalignment="center",
                fontsize=30,
                color="red",
                fontweight="bold",
                transform=ax.transAxes,
            )
            self.change_bkg_counter += 1
        else:
            ax.set_facecolor("salmon")
            self.change_bkg = False
            self.change_bkg_counter = 0
        plt.tight_layout()
        plt.pause(0.01)
if __name__ == "__main__":
audio_stream = StreamPrediction("./saved_model")
audio_stream.start_stream()
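# If the default microphone is not picked up, a device index can be passed via
# input_device_index (commented out in start_stream above). A minimal sketch to
# list available input devices (illustrative only):
#
#   pa = pyaudio.PyAudio()
#   for i in range(pa.get_device_count()):
#       info = pa.get_device_info_by_index(i)
#       if info.get("maxInputChannels", 0) > 0:
#           print(i, info["name"])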
import numpy as np
import scipy.signal as sps
from python_speech_features import mfcc
from tensorflow.keras.models import load_model
modelfilepath = './saved_model'
datafilepath = './data'
size = 48
DIGITS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
model = load_model(modelfilepath, compile=True)
def transform_audio(rate, sig):
    # Resample the audio signal to 16 kHz
    number_of_samples = round(len(sig) * float(16000) / rate)
    sig = sps.resample(sig, number_of_samples)
    # Compute MFCC features from the audio signal; the signal has just been
    # resampled, so the MFCCs are computed at 16 kHz rather than the original rate
    mfcc_feat = mfcc(sig, 16000, nfft=2048)
    # Encode the features as a fixed 48x13 matrix for the model input
    mfcc_feat = np.resize(mfcc_feat, (size, 13))
    return mfcc_feat
def get_prediction(X):
    # The model expects input of shape (batch, 48, 13, 1)
    pred = model.predict(X.reshape(-1, size, 13, 1))
    prediction = DIGITS[np.argmax(pred)]
    print("\n\033[1mPredicted digit sound: %.0f" % pred.argmax(), "\033[0m \n")
    print("Predicted probability array:")
    print(pred)
    return prediction
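# A minimal offline usage sketch of the pipeline (illustrative only; the file
# path below is an assumption, not part of the original commit):
#
#   from scipy.io import wavfile
#   rate, sig = wavfile.read('/datasets/live_recordings/one.wav')
#   features = transform_audio(rate, sig)
#   digit = get_prediction(features)
#   print("Predicted digit:", digit)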