Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
2
21_22-J-02
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Analytics
Analytics
CI / CD
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
21_22-J-02
21_22-J-02
Commits
880ba0ee
Commit
880ba0ee
authored
Jan 08, 2022
by
Shehara AKGH - IT18205152
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
feat: Audio Signal Processing
parent
7ebe4680
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
163 additions
and
0 deletions
+163
-0
BE-Pronunciation/Speech Signal Processing.py
BE-Pronunciation/Speech Signal Processing.py
+163
-0
No files found.
BE-Pronunciation/Speech Signal Processing.py
0 → 100644
View file @
880ba0ee
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import
numpy
as
np
from
scipy.io
import
wavfile
from
scipy.fftpack
import
dct
from
matplotlib
import
pyplot
as
plt
# Load the recording and keep at most its first 10 seconds of samples.
sample_rate, signal = wavfile.read('speech.wav')
signal = signal[0:int(10 * sample_rate)]

# Time stamp (in seconds) of every retained sample: sample k is taken at
# k / sample_rate.  np.linspace(0, N / sample_rate, num=N) includes the
# endpoint, which stretches the axis so the last sample lands on
# N / sample_rate instead of (N - 1) / sample_rate.
Time = np.arange(len(signal)) / sample_rate
plt.plot(Time, signal)
# In[2]:
# Pre-emphasis filter: y[0] = x[0] and y[t] = x[t] - pre_emphasis * x[t - 1].
pre_emphasis = 0.97
emphasized_signal = np.concatenate(
    (
        np.ravel(signal[0]),
        np.ravel(signal[1:] - pre_emphasis * signal[:-1]),
    )
)
# In[3]:
# Show the waveform produced by the pre-emphasis step.  Plotting `signal`
# here would only repeat the figure of the raw recording drawn earlier.
plt.plot(Time, emphasized_signal)
# In[4]:
frame_size = 0.025   # frame duration, in seconds
frame_stride = 0.01  # distance between consecutive frame starts, in seconds

# Convert from seconds to samples
frame_length = int(round(frame_size * sample_rate))
frame_step = int(round(frame_stride * sample_rate))
signal_length = len(emphasized_signal)

# There is always one frame; each further frame starts frame_step samples
# later, until the last frame reaches or passes the final sample.  Counting
# only ceil((signal_length - frame_length) / frame_step) frames would leave
# the trailing samples (and all of the zero padding) outside every frame, and
# would yield zero frames when the signal is exactly one frame long.
num_frames = 1 + int(
    np.ceil(float(max(signal_length - frame_length, 0)) / frame_step)
)

# Pad Signal to make sure that all frames have equal number of samples
# without truncating any samples from the original signal
pad_signal_length = (num_frames - 1) * frame_step + frame_length
z = np.zeros(pad_signal_length - signal_length)
pad_signal = np.append(emphasized_signal, z)

# indices[i, j] is the position in pad_signal of sample j of frame i.
indices = (
    np.arange(frame_length)[np.newaxis, :]
    + (frame_step * np.arange(num_frames))[:, np.newaxis]
)
frames = pad_signal[indices.astype(np.int32, copy=False)]
# In[5]:
# Taper every frame, in place, with a Hamming window of frame_length points.
# Written out, the window is
#     w[n] = 0.54 - 0.46 * cos(2 * pi * n / (frame_length - 1))
# for n = 0 .. frame_length - 1.
window = np.hamming(frame_length)
frames *= window
# In[6]:
# FFT size: 512 points, raised to the next power of two when a frame holds
# more than 512 samples.  np.fft.rfft(a, n) crops its input to n samples when
# n is smaller than the input length, so a fixed 512 would silently drop the
# tail of every frame at higher sample rates.  Shorter frames are zero-padded
# up to NFFT by rfft itself.
NFFT = max(512, 1 << (int(frames.shape[-1]) - 1).bit_length())

mag_frames = np.absolute(np.fft.rfft(frames, NFFT))  # Magnitude of the FFT
pow_frames = (1.0 / NFFT) * (mag_frames ** 2)  # Power Spectrum
# Filter Banks
# In[7]:
nfilt = 40  # number of triangular filters
low_freq_mel = 0
high_freq_mel = 2595 * np.log10(1 + (sample_rate / 2) / 700)  # Convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
hz_points = 700 * (10 ** (mel_points / 2595) - 1)  # Convert Mel to Hz

# FFT-bin index of each of the nfilt + 2 band edges.  Named bin_points rather
# than `bin` so the builtin bin() is not shadowed, and cast to int once
# instead of on every loop iteration.
bin_points = np.floor((NFFT + 1) * hz_points / sample_rate).astype(int)

# One row per filter, one column per rfft output bin (NFFT // 2 + 1 of them).
fbank = np.zeros((nfilt, NFFT // 2 + 1))
for m in range(1, nfilt + 1):
    left = bin_points[m - 1]
    center = bin_points[m]
    right = bin_points[m + 1]

    # Rising edge: 0 at `left`, approaching 1 at `center`.
    for k in range(left, center):
        fbank[m - 1, k] = (k - left) / (center - left)
    # Falling edge: 1 at `center`, approaching 0 at `right`.
    for k in range(center, right):
        fbank[m - 1, k] = (right - k) / (right - center)

filter_banks = np.dot(pow_frames, fbank.T)
# Numerical Stability: replace exact zeros so log10 never sees 0.
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
# NOTE(review): pow_frames is a power quantity, for which the usual dB
# factor is 10 rather than 20.  The factor is left unchanged here so the
# scale of existing features does not shift -- confirm which is intended.
filter_banks = 20 * np.log10(filter_banks)  # dB
# In[8]:
# Heat map of filter_banks, transposed so that its first axis runs
# horizontally and its second axis runs vertically (origin at the bottom).
fig, ax = plt.subplots(figsize=(20, 4))
cax = ax.matshow(
    filter_banks.T,
    interpolation="nearest",
    aspect="auto",
    cmap=plt.cm.afmhot_r,
    origin="lower",
)
fig.colorbar(cax)
plt.title("Mel compression Spectrogram")
plt.show()
# Mel-frequency Cepstral Coefficients (MFCCs)
# In[9]:
num_ceps = 12
# Orthonormal type-II DCT along axis 1 of filter_banks.  Column 0 is dropped
# and the following num_ceps columns are kept (coefficients 2-13 when
# counting from 1).
kept_columns = slice(1, num_ceps + 1)
mfcc = dct(filter_banks, type=2, axis=1, norm="ortho")[:, kept_columns]
# In[10]:
# Sinusoidal liftering: scale column n of mfcc, in place, by
#     1 + (cep_lifter / 2) * sin(pi * n / cep_lifter)
cep_lifter = 22
nframes, ncoeff = mfcc.shape
n = np.arange(ncoeff)
lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
mfcc *= lift

# Heat map of the liftered coefficients, transposed so that the first axis of
# mfcc runs horizontally (origin at the bottom).
fig, ax = plt.subplots(figsize=(20, 4))
cax = ax.matshow(
    mfcc.T,
    interpolation="nearest",
    aspect="auto",
    cmap=plt.cm.afmhot_r,
    origin="lower",
)
fig.colorbar(cax)
plt.title("MFCC Spectrogram")
plt.show()
# Mean Normalization
# In[11]:
## To balance the spectrum and improve the signal-to-noise ratio (SNR),
## subtract from every frame the mean of each coefficient (taken over
## axis 0), in place.
filter_bank_means = filter_banks.mean(axis=0)
filter_banks -= filter_bank_means + 1e-8
## And similarly for the MFCCs:
mfcc_means = mfcc.mean(axis=0)
mfcc -= mfcc_means + 1e-8
# In[ ]:
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment