Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
2022-073
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
2022-073
2022-073
Commits
9c1e53da
Commit
9c1e53da
authored
Oct 09, 2022
by
kulvinu
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Added Backend
parent
7de86a70
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
367 additions
and
0 deletions
+367
-0
web-app/backend/AudioPreprocessing.py
web-app/backend/AudioPreprocessing.py
+75
-0
web-app/backend/LiveSpeechRecorder.py
web-app/backend/LiveSpeechRecorder.py
+239
-0
web-app/backend/mlengine.py
web-app/backend/mlengine.py
+53
-0
No files found.
web-app/backend/AudioPreprocessing.py
0 → 100644
View file @
9c1e53da
import
numpy
as
np
import
librosa
import
librosa.display
import
IPython.display
as
ipd
import
matplotlib.pyplot
as
plt
def read_file(file_name, sample_rate):
    """Read a 16-bit PCM WAV file and return the samples of its first channel.

    :param file_name: Path of the WAV file to read.
    :param sample_rate: Frame rate (in Hz) the file is required to have.
    :return: Tuple of integer samples. For multi-channel files only every
        ``channels``-th sample starting at index 0 (the first channel) is kept.
    :raises ValueError: If the file's frame rate differs from ``sample_rate``.
    """
    # Imported here because the module's import section does not provide them.
    import struct
    import wave

    # The context manager closes the file even when the rate check raises.
    with wave.open(file_name, mode="rb") as wav_file:
        channels = wav_file.getnchannels()
        num_frames = wav_file.getnframes()
        actual_rate = wav_file.getframerate()
        if actual_rate != sample_rate:
            raise ValueError(
                "Audio file should have a sample rate of %d. got %d"
                % (sample_rate, actual_rate)
            )
        samples = wav_file.readframes(num_frames)

    # WAV sample data is little-endian; "h" decodes signed 16-bit values.
    # NOTE(review): files with another sample width fail here with struct.error.
    frames = struct.unpack("<%dh" % (num_frames * channels), samples)

    if channels == 2:
        print("Picovoice processes single-channel audio but stereo file is provided. Processing left channel only.")

    # Samples are interleaved per frame, so a stride of `channels` keeps channel 0.
    return frames[::channels]
## Loading audio
dataset_dir = '/datasets/live_recordings/'
audio_name = 'one.wav'

# librosa.load resamples to its default rate unless `sr` is given.
y, sample_rate = librosa.load(dataset_dir + audio_name, res_type='kaiser_fast')

# Play the original audio
print("Original audio - downsampled by librosa")
# NOTE(review): a bare ipd.Audio(...) expression presumably only renders a
# player inside a notebook cell; in a plain script it has no visible effect.
ipd.Audio(y, rate=sample_rate)

#------------------------------------------------------------------------------------
## Trim the beginning and ending silence
y_trimmed, _ = librosa.effects.trim(y)

# Keyword arguments: newer librosa releases no longer accept `y` positionally.
print("Original duration: ", librosa.get_duration(y=y, sr=sample_rate))
print("Trimmed duration: ", librosa.get_duration(y=y_trimmed, sr=sample_rate))

# Newer librosa releases provide `waveshow` in place of `waveplot`;
# use whichever one the installed version offers.
plot_wave = getattr(librosa.display, "waveshow", None) or librosa.display.waveplot

figure = plt.figure()

# Trimmed audio - without silence
trimmed = figure.add_subplot(2, 1, 2)
plot_wave(y_trimmed, sr=sample_rate, color='r')
plt.title('Trimmed')

# Original audio - with silence at the end
original = figure.add_subplot(2, 1, 1, sharex=trimmed)
plot_wave(y, sr=sample_rate)
plt.title('Original')

plt.tight_layout()
plt.show()

# Play the trimmed audio
print("Trimmed audio")
ipd.Audio(y_trimmed, rate=sample_rate)

###Audio Segmentation into windows
import os

from pydub import AudioSegment
from pydub.silence import split_on_silence

# NOTE(review): this reads "one.wav" from the working directory, not from
# `dataset_dir` as the librosa section does - confirm which file is intended.
sound_file = AudioSegment.from_wav("one.wav")
audio_chunks = split_on_silence(sound_file, min_silence_len=500, silence_thresh=-40)
print("AudioChunks", audio_chunks)

# Make sure the export directory exists before writing chunks into it.
os.makedirs("./a", exist_ok=True)

for i, chunk in enumerate(audio_chunks):
    # The chunk index is part of the file name so chunks do not overwrite
    # each other (the previous format string had no placeholder).
    out_file = "./a/chunk{0}.wav".format(i)
    print("exporting", out_file)
    chunk.export(out_file, format="wav")
web-app/backend/LiveSpeechRecorder.py
0 → 100644
View file @
9c1e53da
import
os
os
.
environ
[
"CUDA_VISIBLE_DEVICES"
]
=
"-1"
import
tensorflow
as
tf
from
tensorflow.keras.models
import
Model
,
load_model
from
mlengine
import
transform_audio
,
get_prediction
import
pickle
import
pyaudio
import
numpy
as
np
from
queue
import
Queue
import
matplotlib.pyplot
as
plt
from
python_speech_features
import
logfbank
class StreamPrediction:
    """
    Predict spoken digits from a live microphone stream.

    A PyAudio callback keeps a rolling buffer of the most recent
    ``window_samples`` samples and queues it; ``start_stream`` consumes the
    queue, runs ``transform_audio`` / ``get_prediction`` on each window and
    plots the result.

    NOTE(review): the original docstring said this was "heavily adapted from
    the implementation:" without naming the source - add the reference.
    """

    # Labels that count as a detected digit.
    DIGIT_LABELS = tuple(str(digit) for digit in range(10))

    def __init__(self, model_path):
        """
        Load the model and set up recording, buffering and plotting state.
        :param model_path: Path to model directory
        """
        # Load model
        self.model = None
        # Kept as None: the code that filled these attributes is disabled.
        self.feature_extractor = None
        self.pca = None
        self.marvin_svm = None
        self.load_models(model_path)

        # Recording parameters
        self.sr = 16000
        self.chunk_duration = 0.75
        self.chunk_samples = int(self.sr * self.chunk_duration)
        self.window_duration = 1
        self.window_samples = int(self.sr * self.window_duration)
        # Chunks whose mean absolute amplitude is below this print "." in callback.
        self.silence_threshold = 100

        # Data structures and buffers
        self.queue = Queue()
        self.data = np.zeros(self.window_samples, dtype="int16")

        # Plotting parameters
        # Number of consecutive plots that keep the "detected" background.
        self.change_bkg_frames = 2
        self.change_bkg_counter = 0
        self.change_bkg = False

    @staticmethod
    def _as_digit(pred):
        """
        Normalise a prediction to its digit label.
        :param pred: Prediction (a string label or an integer)
        :return: The label as a string if it is one of "0".."9", else None
        """
        label = str(pred)
        return label if label in StreamPrediction.DIGIT_LABELS else None

    def load_models(self, model_path):
        """
        Loads the model from disk and stores it on ``self.model``
        :param model_path: Path to model directory
        :return: None
        """
        # Keep a reference instead of discarding the loaded model.
        # NOTE(review): predictions in start_stream go through
        # mlengine.get_prediction, not through this attribute.
        self.model = load_model(model_path, compile=True)
        print("Loaded models from disk")

    def start_stream(self):
        """
        Start audio data streaming from microphone and predict on every
        queued window until interrupted
        :return: None
        """
        audio = pyaudio.PyAudio()
        try:
            stream = audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sr,
                input=True,
                frames_per_buffer=self.chunk_samples,
                # input_device_index=6,
                stream_callback=self.callback,
            )
            try:
                stream.start_stream()
                while True:
                    data = self.queue.get()
                    tensor = transform_audio(self.sr, data)
                    pred = get_prediction(tensor)
                    self.plotter(data, pred)

                    # Echo the recognised digit; predictions may arrive as
                    # string labels, so compare on the normalised label.
                    label = self._as_digit(pred)
                    if label is not None:
                        print(label, sep="", end="", flush=True)
            except (KeyboardInterrupt, SystemExit):
                # Interrupting is how the loop is stopped; cleanup is below.
                pass
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            # Release the PyAudio instance whichever way the loop ended.
            audio.terminate()

    def callback(self, in_data, frame_count, time_info, status):
        """
        Obtain the data from buffer and load it to queue
        :param in_data: Data buffer
        :param frame_count: Frame count
        :param time_info: Time information
        :param status: Status
        :return: Tuple of the unchanged input buffer and pyaudio.paContinue
        """
        data0 = np.frombuffer(in_data, dtype="int16")

        # Console activity indicator: "." for a quiet chunk, "-" otherwise.
        if np.abs(data0).mean() < self.silence_threshold:
            print(".", sep="", end="", flush=True)
        else:
            print("-", sep="", end="", flush=True)

        self.data = np.append(self.data, data0)

        # Keep only the newest window and hand it to the consumer loop.
        if len(self.data) > self.window_samples:
            self.data = self.data[-self.window_samples:]
            self.queue.put(self.data)

        return in_data, pyaudio.paContinue

    def plotter(self, data, pred):
        """
        Plot the waveform and the detection state
        :param data: Audio data array
        :param pred: Prediction
        :return: None
        """
        plt.clf()

        # Wave
        plt.subplot(311)
        plt.plot(data[-len(data) // 2:])
        plt.gca().xaxis.set_major_locator(plt.NullLocator())
        plt.ylabel("Amplitude")

        # Detection panel
        plt.subplot(313)
        ax = plt.gca()

        if self._as_digit(pred) is not None:
            self.change_bkg = True

        if self.change_bkg and self.change_bkg_counter < self.change_bkg_frames:
            ax.set_facecolor("lightgreen")
            ax.text(
                x=0.5,
                y=0.5,
                # Show the predicted value itself, not the literal "{pred}".
                s="{}".format(pred),
                horizontalalignment="center",
                verticalalignment="center",
                fontsize=30,
                color="red",
                fontweight="bold",
                transform=ax.transAxes,
            )
            self.change_bkg_counter += 1
        else:
            ax.set_facecolor("salmon")
            self.change_bkg = False
            self.change_bkg_counter = 0

        plt.tight_layout()
        plt.pause(0.01)
if __name__ == "__main__":
    # Script entry point: build the predictor from the saved model directory,
    # then run the microphone loop.
    model_dir = "./saved_model"
    audio_stream = StreamPrediction(model_dir)
    audio_stream.start_stream()
\ No newline at end of file
web-app/backend/mlengine.py
0 → 100644
View file @
9c1e53da
import
io
import
tensorflow
as
tf
from
tensorflow
import
keras
import
os
import
pandas
as
pd
from
sklearn.model_selection
import
train_test_split
import
pickle
import
numpy
as
np
from
scipy.io
import
wavfile
as
wav
import
scipy
import
scipy.signal
as
sps
from
python_speech_features
import
mfcc
from
python_speech_features
import
logfbank
from
tensorflow.keras.models
import
Sequential
,
save_model
,
load_model
# Directory the Keras model is loaded from.
modelfilepath = './saved_model'
# Data directory path.
datafilepath = './data'
# Number of rows in each MFCC feature matrix (see transform_audio).
size = 48
# Class labels, looked up by the index of the model's highest output.
DIGITS = [str(digit) for digit in range(10)]

# The model is loaded once, when this module is imported.
model = load_model(modelfilepath, compile=True)
def transform_audio(rate, sig, target_rate=16000):
    """Convert an audio signal into a fixed-size MFCC feature matrix.

    :param rate: Sample rate (Hz) of ``sig``.
    :param sig: One-dimensional array of audio samples.
    :param target_rate: Sample rate (Hz) the signal is resampled to before
        feature extraction. Defaults to 16000.
    :return: Array of shape ``(size, 13)`` holding the MFCC features.
    """
    # Resample the signal to the target rate.
    number_of_samples = round(len(sig) * float(target_rate) / rate)
    sig = sps.resample(sig, number_of_samples)

    # Compute MFCC features. The signal now has `target_rate` samples per
    # second, so that rate (not the input rate) must be given to mfcc.
    mfcc_feat = mfcc(sig, target_rate, nfft=2048)

    # Force a (size, 13) matrix: np.resize truncates extra values and
    # repeats the data when there are too few.
    mfcc_feat = np.resize(mfcc_feat, (size, 13))

    return mfcc_feat
def get_prediction(X):
    """Predict the digit label for one feature matrix.

    :param X: MFCC feature array; it is reshaped to ``(-1, size, 13, 1)``
        before being passed to the model.
    :return: The predicted label, one of the strings in ``DIGITS``.
    """
    pred = model.predict(X.reshape(-1, size, 13, 1))

    # Take the argmax of the first sample only. An argmax over the whole
    # (flattened) batch output could exceed the range of DIGITS when X
    # holds more than one feature matrix.
    best_index = int(np.argmax(pred[0]))
    prediction = DIGITS[best_index]

    # The \033[1m / \033[0m escape codes switch bold text on and off.
    print("\n\033[1mPredicted digit sound: %.0f" % best_index, "\033[0m\n")
    print("Predicted probability array:")
    print(pred)
    return prediction
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment