feat: Audio Signal Processing

#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
from scipy.io import wavfile
from scipy.fftpack import dct
from matplotlib import pyplot as plt
sample_rate, signal = wavfile.read('speech.wav')  # file assumed to be in the current directory
signal = signal[0:int(10 * sample_rate)]  # keep only the first 10 seconds
Time = np.linspace(0, len(signal) / sample_rate, num=len(signal))
plt.plot(Time, signal)
plt.show()
# In[2]:
pre_emphasis = 0.97
emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
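# Equivalent formulation (a cross-check sketch, not part of the original pipeline):
# pre-emphasis is the first-order FIR filter y[t] = x[t] - 0.97 * x[t-1], which
# scipy.signal.lfilter applies directly; both versions pass the first sample through unchanged.
from scipy.signal import lfilter
emphasized_check = lfilter([1.0, -pre_emphasis], [1.0], signal)
assert np.allclose(emphasized_signal, emphasized_check)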
# In[3]:
plt.plot(Time, emphasized_signal)  # plot the pre-emphasized signal for comparison
plt.show()
# In[4]:
frame_size = 0.025   # 25 ms frames
frame_stride = 0.01  # 10 ms stride (15 ms overlap between adjacent frames)
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame
pad_signal_length = num_frames * frame_step + frame_length
z = np.zeros((pad_signal_length - signal_length))
pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[indices.astype(np.int32, copy=False)]
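# Sanity check (illustrative, not in the original): after the fancy indexing,
# frames should be a (num_frames, frame_length) matrix, one row per 25 ms window.
assert frames.shape == (num_frames, frame_length)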
# In[5]:
frames *= np.hamming(frame_length)
# frames *= 0.54 - 0.46 * np.cos((2 * np.pi * n) / (frame_length - 1)) # Explicit Implementation
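# Quick check (illustrative, not in the original): np.hamming implements exactly
# the cosine formula in the commented-out line above.
n_w = np.arange(frame_length)
assert np.allclose(np.hamming(frame_length), 0.54 - 0.46 * np.cos(2 * np.pi * n_w / (frame_length - 1)))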
# In[6]:
NFFT = 512
mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum
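# For reference (added note): rfft returns NFFT // 2 + 1 bins per frame; the
# corresponding frequency axis in Hz can be recovered with np.fft.rfftfreq.
freqs = np.fft.rfftfreq(NFFT, d=1.0 / sample_rate)  # 0 Hz .. sample_rate / 2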
# Filter Banks
# In[7]:
nfilt = 40
low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz
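# Worked example (for intuition): the mel scale is anchored near 1 kHz —
# 2595 * np.log10(1 + 1000 / 700) evaluates to roughly 1000, so 1000 Hz ≈ 1000 mel,
# while higher frequencies are compressed relative to the linear Hz axis.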
bins = np.floor((NFFT + 1) * hz_points / sample_rate)  # FFT bin index of each mel point (renamed to avoid shadowing the built-in `bin`)
fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
    f_m_minus = int(bins[m - 1])  # left
    f_m = int(bins[m])            # center
    f_m_plus = int(bins[m + 1])   # right
    for k in range(f_m_minus, f_m):
        fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
    for k in range(f_m, f_m_plus):
        fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])
filter_banks = np.dot(pow_frames, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * np.log10(filter_banks) # dB
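# Optional visual check (a sketch, not in the original): plot the triangular
# filters themselves; each rises to ~1 at its centre bin and overlaps its neighbours.
hz_axis = np.fft.rfftfreq(NFFT, d=1.0 / sample_rate)
for m in range(nfilt):
    plt.plot(hz_axis, fbank[m])
plt.xlabel("Frequency (Hz)")
plt.ylabel("Filter weight")
plt.show()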
# In[8]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
cax = ax.matshow(
np.transpose(filter_banks),
interpolation="nearest",
aspect="auto",
cmap=plt.cm.afmhot_r,
origin="lower",
)
fig.colorbar(cax)
plt.title("Mel compression Spectrogram")
plt.show()
# Mel-frequency Cepstral Coefficients (MFCCs)
# In[9]:
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm="ortho")[:, 1:(num_ceps + 1)]  # keep coefficients 2-13
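# Note (illustrative, not in the original): the slice above drops coefficient 0,
# which mostly tracks overall log-energy; what remains is one row of num_ceps
# cepstral coefficients per frame.
assert mfcc.shape == (num_frames, num_ceps)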
# In[10]:
cep_lifter = 22
(nframes, ncoeff) = mfcc.shape
n = np.arange(ncoeff)
lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)  # sinusoidal liftering
mfcc *= lift
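# For intuition (added note): with cep_lifter = 22 the lift weights rise from
# 1 at n = 0 to 1 + 11 * sin(pi * 11 / 22) = 12 at n = 11, boosting the
# higher-order coefficients, whose magnitudes decay naturally.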
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
cax = ax.matshow(
np.transpose(mfcc),
interpolation="nearest",
aspect="auto",
cmap=plt.cm.afmhot_r,
origin="lower",
)
fig.colorbar(cax)
plt.title("MFCC Spectrogram")
plt.show()
# Mean Normalization
# In[11]:
## To balance the spectrum and improve the signal-to-noise ratio (SNR),
## we can simply subtract the mean of each coefficient from all frames:
filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
## and similarly for the MFCCs:
mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
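# A common variant (a sketch, not part of the original): cepstral mean and variance
# normalization (CMVN) also divides by the per-coefficient standard deviation;
# applied after the mean subtraction above, only the variance scaling remains.
mfcc_cmvn = (mfcc - np.mean(mfcc, axis=0)) / (np.std(mfcc, axis=0) + 1e-8)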