Commit e29e7ea8 authored by Tandin Wangchen's avatar Tandin Wangchen

trying out initial implementation

parent d51f7fd3
from scipy.io.wavfile import read
import numpy as np
import matplotlib.pyplot as plt
# Read the Audiofile
samplerate, data = read('/home/tandin/notebook/Research/videosrc/myvoice.wav')
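# Hedged note (not in the original commit): for a stereo recording,
# scipy's read() returns a 2-D array of shape (samples, channels);
# keeping only the first channel keeps the plot below one-dimensional.
if data.ndim > 1:
    data = data[:, 0]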
# Sample rate of the audio
print(samplerate)
# Duration of the audio in Seconds
duration = len(data)/samplerate
print("Duration of Audio in Seconds", duration)
print("Duration of Audio in Minutes", duration/60)
# Time axis with exactly one point per sample
time = np.arange(len(data)) / samplerate
# Plotting the Graph using Matplotlib
plt.plot(time,data)
plt.xlabel('Time [s]')
plt.ylabel('Amplitude')
plt.title('myvoice.wav')
plt.show()
# Import packages
from pydub import AudioSegment
from pydub.playback import play
# Play audio
playaudio = AudioSegment.from_file("/home/tandin/notebook/Research/videosrc/IT4010ResearchProject34384144_full_video.mp3", format="mp3")
play(playaudio)
from os import path
from pydub import AudioSegment
# files
src = "/home/tandin/notebook/Research/videosrc/try1.mp3"
dst = "/home/tandin/notebook/Research/videosrc/myvoice.wav"
# convert mp3 to wav
sound = AudioSegment.from_mp3(src)
sound.export(dst, format="wav")
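The VAD example later in this commit asserts mono, 16-bit PCM at 8/16/32/48 kHz, so here is a minimal sketch (a suggestion, not part of the original script) that forces the exported file into that shape with pydub:

# Hedged sketch: coerce the output to mono, 16-bit samples at 16 kHz so it
# passes the asserts in read_wave() below. Reuses the src/dst paths above.
vad_ready = AudioSegment.from_mp3(src)
vad_ready = vad_ready.set_channels(1).set_sample_width(2).set_frame_rate(16000)
vad_ready.export(dst, format="wav")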
import collections
import contextlib
import sys
import wave
import webrtcvad
def read_wave(path):
    """Reads a .wav file.

    Takes the path, and returns (PCM audio data, sample rate).
    """
    with contextlib.closing(wave.open(path, 'rb')) as wf:
        num_channels = wf.getnchannels()
        assert num_channels == 1
        sample_width = wf.getsampwidth()
        assert sample_width == 2
        sample_rate = wf.getframerate()
        assert sample_rate in (8000, 16000, 32000, 48000)
        pcm_data = wf.readframes(wf.getnframes())
        return pcm_data, sample_rate


def write_wave(path, audio, sample_rate):
    """Writes a .wav file.

    Takes path, PCM audio data, and sample rate.
    """
    with contextlib.closing(wave.open(path, 'wb')) as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(sample_rate)
        wf.writeframes(audio)


class Frame(object):
    """Represents a "frame" of audio data."""
    def __init__(self, bytes, timestamp, duration):
        self.bytes = bytes
        self.timestamp = timestamp
        self.duration = duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.

    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate.

    Yields Frames of the requested duration.
    """
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    offset = 0
    timestamp = 0.0
    duration = (float(n) / sample_rate) / 2.0
    while offset + n < len(audio):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration
        offset += n


def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames):
    """Filters out non-voiced audio frames.

    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.

    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins yielding
    audio frames. Then the collector waits until 90% of the frames in
    the window are unvoiced to detrigger.

    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.

    Arguments:

    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad.
    frames - a source of audio frames (sequence or generator).

    Returns: A generator that yields PCM audio data.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    for frame in frames:
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        sys.stdout.write('1' if is_speech else '0')
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = len([f for f, speech in ring_buffer if speech])
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                for f, s in ring_buffer:
                    voiced_frames.append(f)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = len([f for f, speech in ring_buffer if not speech])
            # If more than 90% of the frames in the ring buffer are
            # unvoiced, then enter NOTTRIGGERED and yield whatever
            # audio we've collected.
            if num_unvoiced > 0.9 * ring_buffer.maxlen:
                sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join([f.bytes for f in voiced_frames])
                ring_buffer.clear()
                voiced_frames = []
    if triggered:
        sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    sys.stdout.write('\n')
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join([f.bytes for f in voiced_frames])


def main(args):
    if len(args) != 2:
        sys.stderr.write(
            'Usage: example.py <aggressiveness> <path to wav file>\n')
        sys.exit(1)
    audio, sample_rate = read_wave(args[1])
    vad = webrtcvad.Vad(int(args[0]))
    frames = frame_generator(30, audio, sample_rate)
    frames = list(frames)
    segments = vad_collector(sample_rate, 30, 300, vad, frames)
    for i, segment in enumerate(segments):
        path = 'chunk-%002d.wav' % (i,)
        print(' Writing %s' % (path,))
        write_wave(path, segment, sample_rate)


if __name__ == '__main__':
    main(sys.argv[1:])
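A minimal usage sketch for the script above (the wav path is the converted recording from earlier in this commit; aggressiveness 3 is an assumed choice, webrtcvad.Vad accepts 0-3):

# Hedged usage sketch: prints a 0/1 trace of the per-frame VAD decisions and
# writes the voiced segments as chunk-00.wav, chunk-01.wav, ...
main(['3', '/home/tandin/notebook/Research/videosrc/myvoice.wav'])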
from pydub import AudioSegment
from pydub.silence import split_on_silence
# Variables for the audio file
file_path = "/home/tandin/notebook/Research/splitAudioFiles/splitaudio1/chunk0.mp4"
file_name = file_path.split('/')[-1]
audio_format = "mp4"
# Reading and splitting the audio file into chunks
sound = AudioSegment.from_file(file_path, format=audio_format)
audio_chunks = split_on_silence(
    sound,
    min_silence_len=100,
    silence_thresh=-45,
    keep_silence=50,
)
# Putting the file back together
combined = AudioSegment.empty()
for chunk in audio_chunks:
    combined += chunk
combined.export(f'/home/tandin/notebook/Research/silenceremover{file_name}', format = audio_format)
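The silence thresholds above are rough values that usually need tuning per recording; a small sketch (assumed values, using pydub's detect_silence) to preview which ranges would be cut before exporting:

from pydub.silence import detect_silence
# Hedged sketch: list the silent ranges (in milliseconds) that the current
# settings would remove, so min_silence_len / silence_thresh can be tuned.
silent_ranges = detect_silence(sound, min_silence_len=100, silence_thresh=-45)
print("Silent ranges (ms):", silent_ranges)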
from pydub import AudioSegment
import os
if not os.path.isdir("splitaudio1"):
    os.mkdir("splitaudio1")
audio = AudioSegment.from_file("/home/tandin/notebook/Research/videosrc/IT4010ResearchProject34384144_full_video.mp4")
lengthaudio = len(audio)
print("Length of audio file in milliseconds:", lengthaudio)
start = 0
# In milliseconds; 300000 ms = 5 minutes per chunk (1 sec = 1000 ms)
threshold = 300000
end = 0
counter = 0
while start < len(audio):
    end += threshold
    print(start, end)
    chunk = audio[start:end]
    filename = f'splitaudio1/chunk{counter}.mp4'
    chunk.export(filename, format="mp4")
    counter += 1
    start += threshold
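A quick arithmetic check (not in the original commit) of how many chunks the loop above produces:

import math
# Each loop iteration exports one chunk of up to `threshold` ms,
# so the chunk count is the audio length divided by the threshold, rounded up.
print("Expected number of chunks:", math.ceil(lengthaudio / threshold))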
import subprocess
import os
import sys
def convert_video_to_audio_ffmpeg(video_file, output_ext="mp3"):
    """Converts video to audio directly using the `ffmpeg` command
    with the help of the subprocess module."""
    filename, ext = os.path.splitext(video_file)
    subprocess.call(["ffmpeg", "-y", "-i", video_file, f"{filename}.{output_ext}"],
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.STDOUT)


if __name__ == "__main__":
    vf = sys.argv[1]
    convert_video_to_audio_ffmpeg(vf)
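A hedged usage sketch for the converter above (the input path reuses the full lecture video from the splitting step; calling the function directly instead of via the CLI):

# Hedged sketch: ffmpeg writes IT4010ResearchProject34384144_full_video.mp3
# next to the input video.
convert_video_to_audio_ffmpeg("/home/tandin/notebook/Research/videosrc/IT4010ResearchProject34384144_full_video.mp4")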