22_23 - J 36 / Easy Quest - Smart Recruitment Tool with AI - Backend

Commit 6da59127, authored Jan 20, 2023 by Emika Chamodi
Voice analyzer
parent 9f96db15
Showing 10 changed files with 1838 additions and 0 deletions.
voice_analyzer/Voice_Emotion/ReadMe.md (+1 -0)
voice_analyzer/Voice_Emotion/convert_wavs.py (+72 -0)
voice_analyzer/Voice_Emotion/lib.py (+96 -0)
voice_analyzer/Voice_Emotion/main (+117 -0)
voice_analyzer/Voice_Emotion/train.py (+26 -0)
voice_analyzer/Voice_recognizer/Pipfile (+19 -0)
voice_analyzer/Voice_recognizer/Pipfile.lock (+1435 -0)
voice_analyzer/Voice_recognizer/ReadMe.md (+4 -0)
voice_analyzer/Voice_recognizer/main.py (+68 -0)
voice_analyzer/Voice_recognizer/requirements.txt (+0 -0)
voice_analyzer/Voice_Emotion/ReadMe.md  0 → 100644
DataSet: https://drive.google.com/file/d/1wWsrN2Ep7x6lWqOXfr4rpKGYrJhWc8z7/view
voice_analyzer/Voice_Emotion/convert_wavs.py  0 → 100644
"""
A utility script used for converting audio samples to be
suitable for feature extraction
"""
import
os
def
convert_audio
(
audio_path
,
target_path
,
remove
=
False
):
"""This function sets the audio `audio_path` to:
- 16000Hz Sampling rate
- one audio channel ( mono )
Params:
audio_path (str): the path of audio wav file you want to convert
target_path (str): target path to save your new converted wav file
remove (bool): whether to remove the old file after converting
Note that this function requires ffmpeg installed in your system."""
os
.
system
(
f
"ffmpeg -i {audio_path} -ac 1 -ar 16000 {target_path}"
)
# os.system(f"ffmpeg -i {audio_path} -ac 1 {target_path}")
if
remove
:
os
.
remove
(
audio_path
)
def
convert_audios
(
path
,
target_path
,
remove
=
False
):
"""Converts a path of wav files to:
- 16000Hz Sampling rate
- one audio channel ( mono )
and then put them into a new folder called `target_path`
Params:
audio_path (str): the path of audio wav file you want to convert
target_path (str): target path to save your new converted wav file
remove (bool): whether to remove the old file after converting
Note that this function requires ffmpeg installed in your system."""
for
dirpath
,
dirnames
,
filenames
in
os
.
walk
(
path
):
for
dirname
in
dirnames
:
dirname
=
os
.
path
.
join
(
dirpath
,
dirname
)
target_dir
=
dirname
.
replace
(
path
,
target_path
)
if
not
os
.
path
.
isdir
(
target_dir
):
os
.
mkdir
(
target_dir
)
for
dirpath
,
_
,
filenames
in
os
.
walk
(
path
):
for
filename
in
filenames
:
file
=
os
.
path
.
join
(
dirpath
,
filename
)
if
file
.
endswith
(
".wav"
):
# it is a wav file
target_file
=
file
.
replace
(
path
,
target_path
)
convert_audio
(
file
,
target_file
,
remove
=
remove
)
if
__name__
==
"__main__"
:
import
argparse
parser
=
argparse
.
ArgumentParser
(
description
=
"""Convert ( compress ) wav files to 16MHz and mono audio channel ( 1 channel )
This utility helps for compressing wav files for training and testing"""
)
parser
.
add_argument
(
"audio_path"
,
help
=
"Folder that contains wav files you want to convert"
)
parser
.
add_argument
(
"target_path"
,
help
=
"Folder to save new wav files"
)
parser
.
add_argument
(
"-r"
,
"--remove"
,
type
=
bool
,
help
=
"Whether to remove the old wav file after converting"
,
default
=
False
)
args
=
parser
.
parse_args
()
audio_path
=
args
.
audio_path
target_path
=
args
.
target_path
if
os
.
path
.
isdir
(
audio_path
):
if
not
os
.
path
.
isdir
(
target_path
):
os
.
makedirs
(
target_path
)
convert_audios
(
audio_path
,
target_path
,
remove
=
args
.
remove
)
elif
os
.
path
.
isfile
(
audio_path
)
and
audio_path
.
endswith
(
".wav"
):
if
not
target_path
.
endswith
(
".wav"
):
target_path
+=
".wav"
convert_audio
(
audio_path
,
target_path
,
remove
=
args
.
remove
)
else
:
raise
TypeError
(
"The audio_path file you specified isn't appropriate for this operation"
)
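For reference, a minimal usage sketch of the converter above (the file and folder names are hypothetical; ffmpeg must be on the PATH):

from convert_wavs import convert_audio, convert_audios

# single file: write a 16 kHz mono copy
convert_audio("interview.wav", "interview_16k.wav")

# whole folder: mirror the tree under a new root
convert_audios("data/raw", "data/converted")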
voice_analyzer/Voice_Emotion/lib.py  0 → 100644
import soundfile
import numpy as np
import librosa
import glob
import os
from sklearn.model_selection import train_test_split

# Emotion codes embedded in the dataset file names.
EMOTIONS = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# Only these four classes are used for training.
AVAILABLE_EMOTIONS = {"angry", "sad", "neutral", "happy"}


def extract_feature(file_name, **kwargs):
    """Extract the requested features (mfcc, chroma, mel, contrast, tonnetz)
    from `file_name` and stack their per-frame means into one 1-D vector."""
    mfcc = kwargs.get("mfcc")
    chroma = kwargs.get("chroma")
    mel = kwargs.get("mel")
    contrast = kwargs.get("contrast")
    tonnetz = kwargs.get("tonnetz")
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma or contrast:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            # keyword argument y= is required by recent librosa releases
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
        if contrast:
            contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, contrast))
        if tonnetz:
            tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T, axis=0)
            result = np.hstack((result, tonnetz))
    return result


# update random_state=9
def load_data(test_size=0.2, random_state=7):
    """Load the dataset, extract features and split into train/test sets."""
    X, y = [], []
    for file in glob.glob("data/Actor_*/*.wav"):
        basename = os.path.basename(file)
        # the third dash-separated field of the file name encodes the emotion
        emotion = EMOTIONS[basename.split("-")[2]]
        if emotion not in AVAILABLE_EMOTIONS:
            continue
        features = extract_feature(file, mfcc=True, chroma=True, mel=True)
        X.append(features)
        y.append(emotion)
    return train_test_split(np.array(X), y, test_size=test_size, random_state=random_state)


import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


def extract_feature_2(file_name, mfcc, chroma, mel):
    """Variant of extract_feature with explicit boolean parameters."""
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
    return result
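load_data above keys each clip's label off its file name: the third dash-separated field is looked up in EMOTIONS. The codes follow the RAVDESS naming scheme, so a hypothetical file name parses like this:

basename = "03-01-05-01-02-01-12.wav"  # hypothetical RAVDESS-style name
code = basename.split("-")[2]          # -> "05"
print(EMOTIONS[code])                  # -> "angry", which is in AVAILABLE_EMOTIONS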
voice_analyzer/Voice_Emotion/main  0 → 100644
import pyaudio
import os
import wave
import pickle
from sys import byteorder
from array import array
from struct import pack

from sklearn.neural_network import MLPClassifier

from lib import extract_feature

THRESHOLD = 500
CHUNK_SIZE = 1024
FORMAT = pyaudio.paInt16
RATE = 16000
SILENCE = 30


def is_silent(snd_data):
    "Returns True if the chunk is below the silence threshold."
    return max(snd_data) < THRESHOLD


def normalize(snd_data):
    "Scale the samples so the loudest one sits at MAXIMUM."
    MAXIMUM = 16384
    times = float(MAXIMUM) / max(abs(i) for i in snd_data)
    r = array('h')
    for i in snd_data:
        r.append(int(i * times))
    return r


def trim(snd_data):
    "Trim the silent stretches at the start and end of the recording."
    def _trim(snd_data):
        snd_started = False
        r = array('h')
        for i in snd_data:
            if not snd_started and abs(i) > THRESHOLD:
                snd_started = True
                r.append(i)
            elif snd_started:
                r.append(i)
        return r

    # trim the left side, then reverse and trim the right side
    snd_data = _trim(snd_data)
    snd_data.reverse()
    snd_data = _trim(snd_data)
    snd_data.reverse()
    return snd_data


def add_silence(snd_data, seconds):
    "Pad `seconds` of silence onto the start and end of `snd_data`."
    r = array('h', [0 for i in range(int(seconds * RATE))])
    r.extend(snd_data)
    r.extend([0 for i in range(int(seconds * RATE))])
    return r


def record():
    """Record from the microphone until a stretch of silence follows speech,
    then normalize, trim and pad the captured samples."""
    p = pyaudio.PyAudio()
    stream = p.open(format=FORMAT, channels=1, rate=RATE,
                    input=True, output=True,
                    frames_per_buffer=CHUNK_SIZE)

    num_silent = 0
    snd_started = False
    r = array('h')

    while 1:
        # little endian, signed short
        snd_data = array('h', stream.read(CHUNK_SIZE))
        if byteorder == 'big':
            snd_data.byteswap()
        r.extend(snd_data)

        silent = is_silent(snd_data)
        if silent and snd_started:
            num_silent += 1
        elif not silent and not snd_started:
            snd_started = True
        if snd_started and num_silent > SILENCE:
            break

    sample_width = p.get_sample_size(FORMAT)
    stream.stop_stream()
    stream.close()
    p.terminate()

    r = normalize(r)
    r = trim(r)
    r = add_silence(r, 0.5)
    return sample_width, r


def record_to_file(path):
    "Record from the microphone and write the result to a wav file at `path`."
    sample_width, data = record()
    data = pack('<' + ('h' * len(data)), *data)

    wf = wave.open(path, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()


if __name__ == "__main__":
    model = pickle.load(open("result/mlp_classifier.model", "rb"))
    print("Please talk")
    filename = "test.wav"
    record_to_file(filename)
    features = extract_feature(filename, mfcc=True, chroma=True, mel=True).reshape(1, -1)
    result = model.predict(features)[0]
    print("result:", result)
\ No newline at end of file
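The __main__ block above records live from the microphone, but the same classifier can score an existing recording. A minimal sketch, assuming train.py has already produced result/mlp_classifier.model and that clip.wav (a hypothetical file) is 16 kHz mono as emitted by convert_wavs.py:

import pickle
from lib import extract_feature

model = pickle.load(open("result/mlp_classifier.model", "rb"))
features = extract_feature("clip.wav", mfcc=True, chroma=True, mel=True).reshape(1, -1)
print(model.predict(features)[0])  # one of: angry, sad, neutral, happy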
voice_analyzer/Voice_Emotion/train.py  0 → 100644
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from lib import load_data
import os
import pickle

# Load the dataset and split it: 75% training, 25% testing.
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

# A single-hidden-layer MLP (300 units) with an adaptive learning rate.
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08,
                      hidden_layer_sizes=(300,), learning_rate='adaptive',
                      max_iter=500)

print("Training the model...")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

# Persist the trained classifier for `main` to load at prediction time.
if not os.path.isdir("result"):
    os.mkdir("result")
pickle.dump(model, open("result/mlp_classifier.model", "wb"))
\ No newline at end of file
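train.py takes no paths itself; it relies on lib.load_data, whose glob pattern is "data/Actor_*/*.wav". The dataset linked in the Voice_Emotion ReadMe therefore needs to be unpacked into a layout like this (file names illustrative):

data/
    Actor_01/
        03-01-05-01-02-01-01.wav
        ...
    Actor_02/
        ...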
voice_analyzer/Voice_recognizer/Pipfile  0 → 100644
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
vosk = "*"
pydub = "*"
transformers = "*"
torch = "*"
pyaudio = "*"
regex = "*"
ipywidgets = "*"
spacy = "*"

[dev-packages]

[requires]
python_version = "3.9"
voice_analyzer/Voice_recognizer/Pipfile.lock  0 → 100644
This diff is collapsed (generated Pipfile.lock, 1,435 lines not shown).
voice_analyzer/Voice_recognizer/ReadMe.md  0 → 100644
Pretrained models:

- English: https://alphacephei.com/vosk/models/vosk-model-en-us-0.22.zip or https://alphacephei.com/vosk/models/vosk-model-small-en-us-0.15.zip
- Punctuation: https://alphacephei.com/vosk/models/vosk-recasepunc-en-0.22.zip
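main.py below constructs its recognizer with Model(model_name="vosk-model-en-us-0.22"). Recent vosk releases can resolve and cache that name themselves; alternatively, unpack the zip linked above next to the script and pass the folder path. A minimal sketch:

from vosk import Model

model = Model(model_name="vosk-model-en-us-0.22")   # resolved/cached by vosk
# model = Model("vosk-model-en-us-0.22")            # or a locally unpacked folder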
voice_analyzer/Voice_recognizer/main.py  0 → 100644
from vosk import Model, KaldiRecognizer
from pydub import AudioSegment
from transformers import pipeline
import json
import subprocess
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
from heapq import nlargest

FRAME_RATE = 16000
CHANNELS = 1


def voice_recognition(filename):
    """Transcribe an mp3 file with vosk, then restore casing and
    punctuation with the recasepunc model."""
    model = Model(model_name="vosk-model-en-us-0.22")
    rec = KaldiRecognizer(model, FRAME_RATE)
    rec.SetWords(True)

    mp3 = AudioSegment.from_mp3(filename)
    mp3 = mp3.set_channels(CHANNELS)
    mp3 = mp3.set_frame_rate(FRAME_RATE)

    # Feed the audio to the recognizer in 45-second chunks.
    step = 45000
    transcript = ""
    for i in range(0, len(mp3), step):
        print(f"Progress: {i/len(mp3)}")
        segment = mp3[i:i + step]
        rec.AcceptWaveform(segment.raw_data)
        result = rec.Result()
        text = json.loads(result)["text"]
        # separate chunks with a space so words don't merge at boundaries
        transcript += text + " "

    # Restore casing/punctuation with the recasepunc checkpoint.
    cased = subprocess.check_output(
        'python recasepunc/recasepunc.py predict recasepunc/checkpoint',
        shell=True, text=True, input=transcript)
    return cased


def summarize(text, per):
    """Extractive summary: score sentences by normalized word frequency
    and keep the top `per` fraction of them."""
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    tokens = [token.text for token in doc]
    word_frequencies = {}
    for word in doc:
        if word.text.lower() not in list(STOP_WORDS):
            if word.text.lower() not in punctuation:
                # key by the lowercased token so the scoring lookups
                # below find every occurrence regardless of case
                if word.text.lower() not in word_frequencies:
                    word_frequencies[word.text.lower()] = 1
                else:
                    word_frequencies[word.text.lower()] += 1
    max_frequency = max(word_frequencies.values())
    for word in word_frequencies.keys():
        word_frequencies[word] = word_frequencies[word] / max_frequency

    sentence_tokens = [sent for sent in doc.sents]
    sentence_scores = {}
    for sent in sentence_tokens:
        for word in sent:
            if word.text.lower() in word_frequencies.keys():
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word.text.lower()]
                else:
                    sentence_scores[sent] += word_frequencies[word.text.lower()]

    select_length = int(len(sentence_tokens) * per)
    summary = nlargest(select_length, sentence_scores, key=sentence_scores.get)
    final_summary = [word.text for word in summary]
    # join with spaces so sentences don't run together
    summary = ' '.join(final_summary)
    return summary


transcript = voice_recognition("sample_voice.mp3")
summary = summarize(transcript, 0.05)
print(summary)
\ No newline at end of file
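Two notes on the script above: spacy.load('en_core_web_sm') needs the small English pipeline installed first (python -m spacy download en_core_web_sm), and the subprocess call assumes the recasepunc checkpoint from the ReadMe is unpacked under recasepunc/. The `per` argument is the fraction of sentences kept, so this hypothetical call returns a longer summary than the 5% used above:

summary = summarize(transcript, 0.2)  # keep roughly the top 20% of sentences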
voice_analyzer/Voice_recognizer/requirements.txt  0 → 100644 (empty file)