Commit 30091170 authored by Shehara AKGH - IT18205152

Merge branch 'it18205152' into 'develop'

It18205152

See merge request !8
parents fd04e6d3 880ba0ee
#!/usr/bin/env python
# coding: utf-8
# In[1]:
get_ipython().system('pip install librosa')
# In[2]:
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
import warnings
warnings.filterwarnings('ignore')
path = 'D:/Presently/test.wav'  # use forward slashes to avoid accidental backslash escapes
y, sr = librosa.load(path, duration=10)
y_filt = librosa.effects.preemphasis(y)
# In[3]:
# Plot the original and pre-emphasized spectrograms for comparison
S_orig = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max, top_db=None)
S_preemph = librosa.amplitude_to_db(np.abs(librosa.stft(y_filt)), ref=np.max, top_db=None)
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
librosa.display.specshow(S_orig, y_axis='log', x_axis='time', ax=ax[0])
ax[0].set(title='Original signal')
ax[0].label_outer()
img = librosa.display.specshow(S_preemph, y_axis='log', x_axis='time', ax=ax[1])
ax[1].set(title='Pre-emphasized signal')
fig.colorbar(img, ax=ax, format="%+2.f dB")
# Apply pre-emphasis in pieces for block streaming.
# Note that the second block initializes zi with the final state zf returned by the first call.
# In[4]:
y_filt_1, zf = librosa.effects.preemphasis(y[:1000], return_zf=True)
y_filt_2, zf = librosa.effects.preemphasis(y[1000:], zi=zf, return_zf=True)
print(np.allclose(y_filt, np.concatenate([y_filt_1, y_filt_2])))  # True: block-wise filtering matches the full signal
# Framing and windowing of voice signals
# In[6]:
import numpy as np
def framing(sig, fs=16000, win_len=0.025, win_hop=0.01):
    """
    Transform a signal into a series of overlapping frames.

    Args:
        sig (array)     : a mono audio signal (Nx1) from which to compute features.
        fs (int)        : the sampling frequency of the signal we are working with.
                          Default is 16000.
        win_len (float) : window length in seconds. Default is 0.025.
        win_hop (float) : step between successive windows in seconds. Default is 0.01.

    Returns:
        array of frames, with shape (num_frames, frame_length).
    """
    # compute frame length and frame step (convert from seconds to samples)
    frame_length = win_len * fs
    frame_step = win_hop * fs
    signal_length = len(sig)
    frames_overlap = frame_length - frame_step

    # make sure that we have at least 1 frame
    num_frames = np.abs(signal_length - frames_overlap) // np.abs(frame_length - frames_overlap)
    rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap)

    # pad the signal so that all frames have an equal number of samples,
    # without truncating any samples from the original signal
    if rest_samples != 0:
        pad_signal_length = int(frame_step - rest_samples)
        z = np.zeros((pad_signal_length))
        pad_signal = np.append(sig, z)
        num_frames += 1
    else:
        pad_signal = sig

    # make sure to use integers as indices
    frame_length = int(frame_length)
    frame_step = int(frame_step)
    num_frames = int(num_frames)

    # compute the sample indices of every frame
    idx1 = np.tile(np.arange(0, frame_length), (num_frames, 1))
    idx2 = np.tile(np.arange(0, num_frames * frame_step, frame_step),
                   (frame_length, 1)).T
    indices = idx1 + idx2
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    return frames
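# Illustrative usage sketch (not in the original notebook): frame the signal
# loaded with librosa above and apply a Hamming window to each frame.
# Assumes `y` and `sr` are still in scope from the earlier cells.
frames = framing(y, fs=sr, win_len=0.025, win_hop=0.01)
windowed_frames = frames * np.hamming(frames.shape[1])
print(frames.shape)  # (num_frames, frame_length), with frame_length = int(0.025 * sr)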
# In[ ]:
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
from scipy.io import wavfile
from scipy.fftpack import dct
from matplotlib import pyplot as plt
sample_rate, signal = wavfile.read('speech.wav')
signal = signal[0:int(10 * sample_rate)]  # keep only the first 10 seconds
Time = np.linspace(0, len(signal) / sample_rate, num=len(signal))
plt.plot(Time, signal)
# In[2]:
pre_emphasis = 0.97
emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
# In[3]:
plt.plot(Time, emphasized_signal)  # plot the pre-emphasized signal for comparison
# In[4]:
frame_size = 0.025
frame_stride = 0.01
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame
pad_signal_length = num_frames * frame_step + frame_length
z = np.zeros((pad_signal_length - signal_length))
pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[indices.astype(np.int32, copy=False)]
# In[5]:
frames *= np.hamming(frame_length)
# frames *= 0.54 - 0.46 * np.cos((2 * np.pi * n) / (frame_length - 1))  # explicit implementation
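# Quick check (added for illustration): NumPy's Hamming window is exactly the
# cosine formula written out in the comment above.
n = np.arange(frame_length)
explicit_hamming = 0.54 - 0.46 * np.cos((2 * np.pi * n) / (frame_length - 1))
assert np.allclose(np.hamming(frame_length), explicit_hamming)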
# In[6]:
NFFT = 512
mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum
# Filter Banks
# In[7]:
nfilt = 40
low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz
bins = np.floor((NFFT + 1) * hz_points / sample_rate)  # FFT bin numbers of the Mel points
fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
    f_m_minus = int(bins[m - 1])  # left
    f_m = int(bins[m])            # center
    f_m_plus = int(bins[m + 1])   # right
    for k in range(f_m_minus, f_m):
        fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
    for k in range(f_m, f_m_plus):
        fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])
filter_banks = np.dot(pow_frames, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * np.log10(filter_banks) # dB
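# Sanity check (illustrative, not from the original code): the Hz<->Mel
# conversion used above should round-trip; 1000 Hz maps to roughly 1000 mel.
mel_1k = 2595 * np.log10(1 + 1000 / 700)     # ~= 1000 mel
hz_back = 700 * (10 ** (mel_1k / 2595) - 1)  # ~= 1000 Hz
print(mel_1k, hz_back)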
# In[8]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
cax = ax.matshow(
    np.transpose(filter_banks),
    interpolation="nearest",
    aspect="auto",
    cmap=plt.cm.afmhot_r,
    origin="lower",
)
fig.colorbar(cax)
plt.title("Mel compression Spectrogram")
plt.show()
# Mel-frequency cepstral coefficients (MFCCs)
# In[9]:
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm="ortho")[:, 1:(num_ceps + 1)]  # keep coefficients 2-13
# In[10]:
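# Sinusoidal liftering de-emphasizes the higher MFCCs, which has been
# reported to improve speech recognition in noisy signals.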
cep_lifter = 22
(nframes, ncoeff) = mfcc.shape
n = np.arange(ncoeff)
lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
mfcc *= lift
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
cax = ax.matshow(
    np.transpose(mfcc),
    interpolation="nearest",
    aspect="auto",
    cmap=plt.cm.afmhot_r,
    origin="lower",
)
fig.colorbar(cax)
plt.title("MFCC Spectrogram")
plt.show()
# Mean Normalization
# In[11]:
## To balance the spectrum and improve the signal-to-noise ratio (SNR),
## we can simply subtract the mean of each coefficient from all frames,
filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
## and similarly for the MFCCs:
mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
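# Illustrative check (not in the original): after mean normalization the
# per-coefficient means should be near zero - here exactly -1e-8 because of
# the small stability constant added above.
print(np.abs(np.mean(mfcc, axis=0)).max())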
# In[ ]:
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "D:/Presently/key-file.json"
# In[2]:
from google.cloud import texttospeech
# In[3]:
# Instantiates a client
client = texttospeech.TextToSpeechClient()
# In[4]:
# Set the text input to be synthesized
synthesis_input = texttospeech.SynthesisInput(text="That night, as the Dursleys are falling asleep, Albus Dumbledore, a wizard and the head of the Hogwarts wizardry academy, appears on their street. He shuts off all the streetlights and approaches a cat that is soon revealed to be a woman named Professor McGonagall (who also teaches at Hogwarts) in disguise. They discuss the disappearance of You-Know-Who, otherwise known as Voldemort. Dumbledore tells McGonagall that Voldemort killed the Potter parents the previous night and tried to kill their son, Harry, as well, but was unable to. Dumbledore adds that Voldemort’s power apparently began to wane after his failed attempt to kill Harry and that he retreated. Dumbledore adds that the baby Harry can be left on the Dursleys’ doorstep. McGonagall protests that Harry cannot be brought up by the Dursleys. But Dumbledore insists that there is no one else to take care of the child. He says that when Harry is old enough, he will be told of his fate. A giant named Hagrid, who is carrying a bundle of blankets with the baby Harry inside, then falls out of the sky on a motorcycle. Dumbledore takes Harry and places him on the Dursley’s doorstep with an explanatory letter he has written to the Dursleys, and the three part ways.")
# In[5]:
# Build the voice request, select the language code ("en-IN") and the ssml
# voice gender ("neutral")
voice = texttospeech.VoiceSelectionParams(
    language_code="en-IN", ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
)
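# Hedged alternative (illustrative, not from the original code): a specific
# voice can also be requested by name instead of gender; the name below is
# only an example, and available voices vary by project and region.
# voice = texttospeech.VoiceSelectionParams(
#     language_code="en-IN", name="en-IN-Wavenet-A"
# )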
# In[6]:
# Select the type of audio file you want returned
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3
)
# In[7]:
# Perform the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(
    input=synthesis_input, voice=voice, audio_config=audio_config
)
# In[8]:
# The response's audio_content is binary.
with open("output.mp3", "wb") as out:
    # Write the response to the output file.
    out.write(response.audio_content)
    print('Audio content written to file "output.mp3"')