Commit 30091170 authored by Shehara AKGH - IT18205152

Merge branch 'it18205152' into 'develop'

It18205152

See merge request !8
parents fd04e6d3 880ba0ee
#!/usr/bin/env python
# coding: utf-8
# In[1]:
get_ipython().system('pip install librosa')
# In[2]:
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
import warnings
warnings.filterwarnings('ignore')
path = 'D:/Presently/test.wav'  # use forward slashes to avoid accidental backslash escapes
y, sr = librosa.load(path, duration=10)
y_filt = librosa.effects.preemphasis(y)
# In[3]:
# Plot the original and pre-emphasized spectrograms for comparison
S_orig = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max, top_db=None)
S_preemph = librosa.amplitude_to_db(np.abs(librosa.stft(y_filt)), ref=np.max, top_db=None)
fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
librosa.display.specshow(S_orig, y_axis='log', x_axis='time', ax=ax[0])
ax[0].set(title='Original signal')
ax[0].label_outer()
img = librosa.display.specshow(S_preemph, y_axis='log', x_axis='time', ax=ax[1])
ax[1].set(title='Pre-emphasized signal')
fig.colorbar(img, ax=ax, format="%+2.f dB")
# Apply pre-emphasis in pieces for block streaming.
# Note that the second block initializes zi with the final state zf returned by the first call.
# In[4]:
y_filt_1, zf = librosa.effects.preemphasis(y[:1000], return_zf=True)
y_filt_2, zf = librosa.effects.preemphasis(y[1000:], zi=zf, return_zf=True)
print(np.allclose(y_filt, np.concatenate([y_filt_1, y_filt_2])))  # True: block-wise filtering matches the full signal
# Framing and windowing of voice signals
# In[6]:
import numpy as np
def framing(sig, fs=16000, win_len=0.025, win_hop=0.01):
    """
    Transform a signal into a series of overlapping frames.

    Args:
        sig (array)     : a mono audio signal (Nx1) from which to compute features.
        fs (int)        : the sampling frequency of the signal we are working with.
                          Default is 16000.
        win_len (float) : window length in seconds. Default is 0.025.
        win_hop (float) : step between successive windows in seconds. Default is 0.01.

    Returns:
        array of frames, with shape (num_frames, frame_length).
    """
    # compute frame length and frame step (convert from seconds to samples)
    frame_length = win_len * fs
    frame_step = win_hop * fs
    signal_length = len(sig)
    frames_overlap = frame_length - frame_step

    # make sure that we have at least 1 frame
    num_frames = np.abs(signal_length - frames_overlap) // np.abs(frame_length - frames_overlap)
    rest_samples = np.abs(signal_length - frames_overlap) % np.abs(frame_length - frames_overlap)

    # pad the signal so that all frames have an equal number of samples,
    # without truncating any samples from the original signal
    if rest_samples != 0:
        pad_signal_length = int(frame_step - rest_samples)
        z = np.zeros((pad_signal_length))
        pad_signal = np.append(sig, z)
        num_frames += 1
    else:
        pad_signal = sig

    # make sure to use integers as indices
    frame_length = int(frame_length)
    frame_step = int(frame_step)
    num_frames = int(num_frames)

    # compute the sample indices of every frame
    idx1 = np.tile(np.arange(0, frame_length), (num_frames, 1))
    idx2 = np.tile(np.arange(0, num_frames * frame_step, frame_step),
                   (frame_length, 1)).T
    indices = idx1 + idx2
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    return frames
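# Illustrative usage sketch (not in the original notebook): frame the signal
# loaded with librosa above and apply a Hamming window to each frame.
# Assumes `y` and `sr` are still in scope from the earlier cells.
frames = framing(y, fs=sr, win_len=0.025, win_hop=0.01)
windowed_frames = frames * np.hamming(frames.shape[1])
print(frames.shape)  # (num_frames, frame_length), with frame_length = int(0.025 * sr)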
# In[ ]:
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
from scipy.io import wavfile
from scipy.fftpack import dct
from matplotlib import pyplot as plt
sample_rate, signal = wavfile.read('speech.wav')
signal = signal[0:int(10 * sample_rate)]  # keep only the first 10 seconds
Time = np.linspace(0, len(signal) / sample_rate, num=len(signal))
plt.plot(Time, signal)
# In[2]:
pre_emphasis = 0.97
emphasized_signal = np.append(signal[0], signal[1:] - pre_emphasis * signal[:-1])
# In[3]:
plt.plot(Time, emphasized_signal)  # plot the pre-emphasized signal for comparison
# In[4]:
frame_size = 0.025
frame_stride = 0.01
frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate # Convert from seconds to samples
signal_length = len(emphasized_signal)
frame_length = int(round(frame_length))
frame_step = int(round(frame_step))
num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step)) # Make sure that we have at least 1 frame
pad_signal_length = num_frames * frame_step + frame_length
z = np.zeros((pad_signal_length - signal_length))
pad_signal = np.append(emphasized_signal, z) # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
frames = pad_signal[indices.astype(np.int32, copy=False)]
# In[5]:
frames *= np.hamming(frame_length)
# frames *= 0.54 - 0.46 * np.cos((2 * np.pi * n) / (frame_length - 1))  # explicit implementation
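# Quick check (added for illustration): NumPy's Hamming window is exactly the
# cosine formula written out in the comment above.
n = np.arange(frame_length)
explicit_hamming = 0.54 - 0.46 * np.cos((2 * np.pi * n) / (frame_length - 1))
assert np.allclose(np.hamming(frame_length), explicit_hamming)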
# In[6]:
NFFT = 512
mag_frames = np.absolute(np.fft.rfft(frames, NFFT)) # Magnitude of the FFT
pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2)) # Power Spectrum
# Filter Banks
# In[7]:
nfilt = 40
low_freq_mel = 0
high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700)) # Convert Hz to Mel
mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2) # Equally spaced in Mel scale
hz_points = (700 * (10**(mel_points / 2595) - 1)) # Convert Mel to Hz
bins = np.floor((NFFT + 1) * hz_points / sample_rate)  # FFT bin numbers of the Mel points
fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
for m in range(1, nfilt + 1):
    f_m_minus = int(bins[m - 1])  # left
    f_m = int(bins[m])            # center
    f_m_plus = int(bins[m + 1])   # right
    for k in range(f_m_minus, f_m):
        fbank[m - 1, k] = (k - bins[m - 1]) / (bins[m] - bins[m - 1])
    for k in range(f_m, f_m_plus):
        fbank[m - 1, k] = (bins[m + 1] - k) / (bins[m + 1] - bins[m])
filter_banks = np.dot(pow_frames, fbank.T)
filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks) # Numerical Stability
filter_banks = 20 * np.log10(filter_banks) # dB
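# Sanity check (illustrative, not from the original code): the Hz<->Mel
# conversion used above should round-trip; 1000 Hz maps to roughly 1000 mel.
mel_1k = 2595 * np.log10(1 + 1000 / 700)     # ~= 1000 mel
hz_back = 700 * (10 ** (mel_1k / 2595) - 1)  # ~= 1000 Hz
print(mel_1k, hz_back)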
# In[8]:
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
cax = ax.matshow(
    np.transpose(filter_banks),
    interpolation="nearest",
    aspect="auto",
    cmap=plt.cm.afmhot_r,
    origin="lower",
)
fig.colorbar(cax)
plt.title("Mel compression Spectrogram")
plt.show()
# Mel-frequency cepstral coefficients (MFCCs)
# In[9]:
num_ceps = 12
mfcc = dct(filter_banks, type=2, axis=1, norm="ortho")[:, 1:(num_ceps + 1)]  # keep coefficients 2-13
# In[10]:
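# Sinusoidal liftering de-emphasizes the higher MFCCs, which has been
# reported to improve speech recognition in noisy signals.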
cep_lifter = 22
(nframes, ncoeff) = mfcc.shape
n = np.arange(ncoeff)
lift = 1 + (cep_lifter / 2) * np.sin(np.pi * n / cep_lifter)
mfcc *= lift
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 4))
cax = ax.matshow(
    np.transpose(mfcc),
    interpolation="nearest",
    aspect="auto",
    cmap=plt.cm.afmhot_r,
    origin="lower",
)
fig.colorbar(cax)
plt.title("MFCC Spectrogram")
plt.show()
# Mean Normalization
# In[11]:
## To balance the spectrum and improve the signal-to-noise ratio (SNR),
## we can simply subtract the mean of each coefficient from all frames,
filter_banks -= (np.mean(filter_banks, axis=0) + 1e-8)
## and similarly for the MFCCs:
mfcc -= (np.mean(mfcc, axis=0) + 1e-8)
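# Illustrative check (not in the original): after mean normalization the
# per-coefficient means should be near zero - here exactly -1e-8 because of
# the small stability constant added above.
print(np.abs(np.mean(mfcc, axis=0)).max())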
# In[ ]:
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "D:/Presently/key-file.json"
# In[2]:
from google.cloud import texttospeech
# In[3]:
# Instantiates a client
client = texttospeech.TextToSpeechClient()
# In[4]:
# Set the text input to be synthesized
synthesis_input = texttospeech.SynthesisInput(text="That night, as the Dursleys are falling asleep, Albus Dumbledore, a wizard and the head of the Hogwarts wizardry academy, appears on their street. He shuts off all the streetlights and approaches a cat that is soon revealed to be a woman named Professor McGonagall (who also teaches at Hogwarts) in disguise. They discuss the disappearance of You-Know-Who, otherwise known as Voldemort. Dumbledore tells McGonagall that Voldemort killed the Potter parents the previous night and tried to kill their son, Harry, as well, but was unable to. Dumbledore adds that Voldemort’s power apparently began to wane after his failed attempt to kill Harry and that he retreated. Dumbledore adds that the baby Harry can be left on the Dursleys’ doorstep. McGonagall protests that Harry cannot be brought up by the Dursleys. But Dumbledore insists that there is no one else to take care of the child. He says that when Harry is old enough, he will be told of his fate. A giant named Hagrid, who is carrying a bundle of blankets with the baby Harry inside, then falls out of the sky on a motorcycle. Dumbledore takes Harry and places him on the Dursley’s doorstep with an explanatory letter he has written to the Dursleys, and the three part ways.")
# In[5]:
# Build the voice request, select the language code ("en-IN") and the ssml
# voice gender ("neutral")
voice = texttospeech.VoiceSelectionParams(
    language_code="en-IN", ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL
)
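# Hedged alternative (illustrative, not from the original code): a specific
# voice can also be requested by name instead of gender; the name below is
# only an example, and available voices vary by project and region.
# voice = texttospeech.VoiceSelectionParams(
#     language_code="en-IN", name="en-IN-Wavenet-A"
# )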
# In[6]:
# Select the type of audio file you want returned
audio_config = texttospeech.AudioConfig(
    audio_encoding=texttospeech.AudioEncoding.MP3
)
# In[7]:
# Perform the text-to-speech request on the text input with the selected
# voice parameters and audio file type
response = client.synthesize_speech(
    input=synthesis_input, voice=voice, audio_config=audio_config
)
# In[8]:
# The response's audio_content is binary.
with open("output.mp3", "wb") as out:
    # Write the response to the output file.
    out.write(response.audio_content)
    print('Audio content written to file "output.mp3"')