Commit 6da59127 authored by Emika Chamodi

Voice analyzer

parent 9f96db15
DataSet: https://drive.google.com/file/d/1wWsrN2Ep7x6lWqOXfr4rpKGYrJhWc8z7/view
"""
A utility script used for converting audio samples to be
suitable for feature extraction
"""
import os
def convert_audio(audio_path, target_path, remove=False):
"""This function sets the audio `audio_path` to:
- 16000Hz Sampling rate
- one audio channel ( mono )
Params:
audio_path (str): the path of audio wav file you want to convert
target_path (str): target path to save your new converted wav file
remove (bool): whether to remove the old file after converting
Note that this function requires ffmpeg installed in your system."""
os.system(f"ffmpeg -i {audio_path} -ac 1 -ar 16000 {target_path}")
# os.system(f"ffmpeg -i {audio_path} -ac 1 {target_path}")
if remove:
os.remove(audio_path)
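# Illustrative alternative (not part of the original commit): the same ffmpeg
# conversion through subprocess.run, which passes the paths as an argument list so
# that filenames containing spaces do not break the command. Assumes ffmpeg is on
# the PATH; the function name is hypothetical.
import subprocess
def convert_audio_subprocess(audio_path, target_path, remove=False):
    # -y overwrites an existing target, -ac 1 forces mono, -ar 16000 sets 16000Hz
    subprocess.run(["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", target_path], check=True)
    if remove:
        os.remove(audio_path)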
def convert_audios(path, target_path, remove=False):
"""Converts a path of wav files to:
- 16000Hz Sampling rate
- one audio channel ( mono )
and then put them into a new folder called `target_path`
Params:
audio_path (str): the path of audio wav file you want to convert
target_path (str): target path to save your new converted wav file
remove (bool): whether to remove the old file after converting
Note that this function requires ffmpeg installed in your system."""
for dirpath, dirnames, filenames in os.walk(path):
for dirname in dirnames:
dirname = os.path.join(dirpath, dirname)
target_dir = dirname.replace(path, target_path)
if not os.path.isdir(target_dir):
os.mkdir(target_dir)
for dirpath, _, filenames in os.walk(path):
for filename in filenames:
file = os.path.join(dirpath, filename)
if file.endswith(".wav"):
# it is a wav file
target_file = file.replace(path, target_path)
convert_audio(file, target_file, remove=remove)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="""Convert (compress) wav files to a 16000Hz (16kHz) sampling rate and a mono audio channel (1 channel).
This utility helps with compressing wav files for training and testing""")
parser.add_argument("audio_path", help="Folder that contains wav files you want to convert")
parser.add_argument("target_path", help="Folder to save new wav files")
parser.add_argument("-r", "--remove", type=bool, help="Whether to remove the old wav file after converting", default=False)
args = parser.parse_args()
audio_path = args.audio_path
target_path = args.target_path
if os.path.isdir(audio_path):
if not os.path.isdir(target_path):
os.makedirs(target_path)
convert_audios(audio_path, target_path, remove=args.remove)
elif os.path.isfile(audio_path) and audio_path.endswith(".wav"):
if not target_path.endswith(".wav"):
target_path += ".wav"
convert_audio(audio_path, target_path, remove=args.remove)
else:
raise TypeError("The audio_path file you specified isn't appropriate for this operation")
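# Example invocation (illustrative; the script filename below is an assumption):
#   python convert_wavs.py data_raw/ data_16k/ --remove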
import soundfile
import numpy as np
import librosa
import glob
import os
from sklearn.model_selection import train_test_split
EMOTIONS = {
"01": "neutral",
"02": "calm",
"03": "happy",
"04": "sad",
"05": "angry",
"06": "fearful",
"07": "disgust",
"08": "surprised"
}
AVAILABLE_EMOTIONS = {
"angry",
"sad",
"neutral",
"happy"
}
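# Extracts the features requested via keyword flags (mfcc, chroma, mel, contrast,
# tonnetz) from a sound file and returns them concatenated into a single 1-D vector
# of per-frame means.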
def extract_feature(file_name, **kwargs):
mfcc = kwargs.get("mfcc")
chroma = kwargs.get("chroma")
mel = kwargs.get("mel")
contrast = kwargs.get("contrast")
tonnetz = kwargs.get("tonnetz")
with soundfile.SoundFile(file_name) as sound_file:
X = sound_file.read(dtype="float32")
sample_rate = sound_file.samplerate
if chroma or contrast:
stft = np.abs(librosa.stft(X))
result = np.array([])
if mfcc:
mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
result = np.hstack((result, mfccs))
if chroma:
chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
result = np.hstack((result, chroma))
if mel:
mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
result = np.hstack((result, mel))
if contrast:
contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
result = np.hstack((result, contrast))
if tonnetz:
tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
result = np.hstack((result, tonnetz))
return result
# update random_state=9
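# Builds the dataset from the data/Actor_*/ wav files: the emotion label is decoded
# from the third dash-separated field of each filename, files whose emotion is not
# in AVAILABLE_EMOTIONS are skipped, and a train/test split is returned.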
def load_data(test_size=0.2, random_state=7):
X, y = [], []
for file in glob.glob("data/Actor_*/*.wav"):
basename = os.path.basename(file)
emotion = EMOTIONS[basename.split("-")[2]]
if emotion not in AVAILABLE_EMOTIONS:
continue
features = extract_feature(file, mfcc=True, chroma=True, mel=True)
X.append(features)
y.append(emotion)
return train_test_split(np.array(X), y, test_size=test_size, random_state=random_state)
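# Illustrative usage (not part of the original file): with librosa's defaults the
# feature vector from extract_feature(mfcc=True, chroma=True, mel=True) holds
# 40 MFCC + 12 chroma + 128 mel = 180 values per sample.
if __name__ == "__main__":
    X_train, X_test, y_train, y_test = load_data(test_size=0.2)
    print("feature vector length:", X_train.shape[1])  # expected: 180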
import os, glob, pickle
import numpy as np
import soundfile
import librosa
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
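# Reduced variant of extract_feature above: takes explicit mfcc/chroma/mel flags
# and skips the contrast and tonnetz features.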
def extract_feature_2(file_name, mfcc, chroma, mel):
with soundfile.SoundFile(file_name) as sound_file:
X = sound_file.read(dtype="float32")
sample_rate=sound_file.samplerate
if chroma:
stft=np.abs(librosa.stft(X))
result=np.array([])
if mfcc:
mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
result=np.hstack((result, mfccs))
if chroma:
chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
result=np.hstack((result, chroma))
if mel:
mel=np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T,axis=0)
result=np.hstack((result, mel))
return result
import pyaudio
import os
import wave
import pickle
from sys import byteorder
from array import array
from struct import pack
from sklearn.neural_network import MLPClassifier
from lib import extract_feature
THRESHOLD = 500
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 16000
SILENCE = 30
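# Returns True when the loudest sample in the chunk is below the silence THRESHOLD.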
def is_silent(snd_data):
return max(snd_data) < THRESHOLD
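# Scales the recording so that its peak amplitude becomes MAXIMUM (16384).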
def normalize(snd_data):
MAXIMUM = 16384
times = float(MAXIMUM)/max(abs(i) for i in snd_data)
r = array('h')
for i in snd_data:
r.append(int(i*times))
return r
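# Removes leading and trailing silence by trimming samples below THRESHOLD from both ends.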
def trim(snd_data):
def _trim(snd_data):
snd_started = False
r = array('h')
for i in snd_data:
if not snd_started and abs(i)>THRESHOLD:
snd_started = True
r.append(i)
elif snd_started:
r.append(i)
return r
snd_data = _trim(snd_data)
snd_data.reverse()
snd_data = _trim(snd_data)
snd_data.reverse()
return snd_data
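# Pads the recording with `seconds` of silence at the start and at the end.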
def add_silence(snd_data, seconds):
r = array('h', [0 for i in range(int(seconds*RATE))])
r.extend(snd_data)
r.extend([0 for i in range(int(seconds*RATE))])
return r
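# Records from the default microphone in CHUNK_SIZE blocks until a stretch of
# silence is detected after speech has started, then normalizes, trims and pads
# the result with 0.5 seconds of silence on each side.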
def record():
p = pyaudio.PyAudio()
stream = p.open(format=FORMAT, channels=1, rate=RATE,
input=True, output=True,
frames_per_buffer=CHUNK_SIZE)
num_silent = 0
snd_started = False
r = array('h')
while 1:
# little endian, signed short
snd_data = array('h', stream.read(CHUNK_SIZE))
if byteorder == 'big':
snd_data.byteswap()
r.extend(snd_data)
silent = is_silent(snd_data)
if silent and snd_started:
num_silent += 1
elif not silent and not snd_started:
snd_started = True
if snd_started and num_silent > SILENCE:
break
sample_width = p.get_sample_size(FORMAT)
stream.stop_stream()
stream.close()
p.terminate()
r = normalize(r)
r = trim(r)
r = add_silence(r, 0.5)
return sample_width, r
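# Records a phrase from the microphone and writes it to `path` as a 16-bit mono wav at RATE Hz.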
def record_to_file(path):
sample_width, data = record()
data = pack('<' + ('h'*len(data)), *data)
wf = wave.open(path, 'wb')
wf.setnchannels(1)
wf.setsampwidth(sample_width)
wf.setframerate(RATE)
wf.writeframes(data)
wf.close()
if __name__ == "__main__":
model = pickle.load(open("result/mlp_classifier.model", "rb"))
print("Please talk")
filename = "test.wav"
record_to_file(filename)
features = extract_feature(filename, mfcc=True, chroma=True, mel=True).reshape(1, -1)
result = model.predict(features)[0]
print("result:", result)
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from lib import load_data
import os
import pickle
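# Train an MLP classifier on the extracted audio features and persist it under
# result/ so that the prediction script above can load it.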
X_train, X_test, y_train, y_test = load_data(test_size=0.25)
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500)
print("Training the model...")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))
if not os.path.isdir("result"):
os.mkdir("result")
pickle.dump(model, open("result/mlp_classifier.model", "wb"))
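# Optional sanity check (illustrative, not part of the original commit): reload the
# pickled model and confirm the saved file reproduces the accuracy printed above.
loaded_model = pickle.load(open("result/mlp_classifier.model", "rb"))
print("Reloaded accuracy: {:.2f}%".format(accuracy_score(y_true=y_test, y_pred=loaded_model.predict(X_test)) * 100))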
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"
[packages]
vosk = "*"
pydub = "*"
transformers = "*"
torch = "*"
pyaudio = "*"
regex = "*"
ipywidgets = "*"
spacy = "*"
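# Note (assumption): the emotion-recognition scripts in this commit also import
# librosa, soundfile, scikit-learn and numpy, which are not listed here.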
[dev-packages]
[requires]
python_version = "3.9"
Pretrained models:
English : https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip or https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
Punctuation : https://alphacephei.com/vosk/models/vosk-recasepunc-en-0.22.zip
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
from transformers import pipeline
import json
import subprocess
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest
FRAME_RATE = 16000
CHANNELS=1
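# Transcribes an mp3 file in 45-second chunks with the Vosk en-us model, then
# restores casing and punctuation via the recasepunc checkpoint.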
def voice_recognition(filename):
model = Model(model_name="vosk-model-en-us-0.22")
rec = KaldiRecognizer(model, FRAME_RATE)
rec.SetWords(True)
mp3 = AudioSegment.from_mp3(filename)
mp3 = mp3.set_channels(CHANNELS)
mp3 = mp3.set_frame_rate(FRAME_RATE)
step = 45000
transcript = ""
for i in range(0, len(mp3), step):
print(f"Progress: {i/len(mp3)}")
segment = mp3[i:i+step]
rec.AcceptWaveform(segment.raw_data)
result = rec.Result()
text = json.loads(result)["text"]
transcript += text + " "
cased = subprocess.check_output('python recasepunc/recasepunc.py predict recasepunc/checkpoint', shell=True, text=True, input=transcript)
return cased
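# Extractive summarization: scores each sentence by the normalized frequencies of
# its non-stop-word tokens and returns the top `per` fraction of sentences.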
def summarize(text, per):
nlp = spacy.load('en_core_web_sm')
doc= nlp(text)
tokens=[token.text for token in doc]
word_frequencies={}
for word in doc:
if word.text.lower() not in list(STOP_WORDS):
if word.text.lower() not in punctuation:
if word.text not in word_frequencies.keys():
word_frequencies[word.text] = 1
else:
word_frequencies[word.text] += 1
max_frequency=max(word_frequencies.values())
for word in word_frequencies.keys():
word_frequencies[word]=word_frequencies[word]/max_frequency
sentence_tokens= [sent for sent in doc.sents]
sentence_scores = {}
for sent in sentence_tokens:
for word in sent:
if word.text.lower() in word_frequencies.keys():
if sent not in sentence_scores.keys():
sentence_scores[sent]=word_frequencies[word.text.lower()]
else:
sentence_scores[sent]+=word_frequencies[word.text.lower()]
select_length=int(len(sentence_tokens)*per)
summary=nlargest(select_length, sentence_scores,key=sentence_scores.get)
final_summary=[word.text for word in summary]
summary=' '.join(final_summary)
return summary
transcript = voice_recognition("sample_voice.mp3")
summary = summarize(transcript, 0.05)
print(summary)