some nltk pre-processing steps are added

e31c9649 · Pamal-Ranasinghe · 8212e2c2 · e31c9649 · e31c9649 · 8212e2c2
Commit e31c9649 authored Apr 15, 2022 by Pamal-Ranasinghe
7 changed files
--- a/.env.example
+++ b/.env.example
+CONVERTED_AUDIO_PATH = '<file path for the extracted audio>'
+CONVERTED_AUDIO_FILE_NAME = '<extracted audio file name>'
+NORMAL_LANGUAGE = '<english>'
\ No newline at end of file
--- a/app.py
+++ b/app.py
 #import flask module
 from flask import Flask
 from flask_restful import Api
-from flask import Flask
 from resources.routes import initialize_routes

--- a/assets/converted_wav/converted.wav
+++ b/assets/converted_wav/converted.wav
--- a/resources/__pycache__/routes.cpython-38.pyc
+++ b/resources/__pycache__/routes.cpython-38.pyc
--- a/resources/__pycache__/speechExtraction.cpython-38.pyc
+++ b/resources/__pycache__/speechExtraction.cpython-38.pyc
--- a/resources/speechExtraction.py
+++ b/resources/speechExtraction.py
 from flask_restful import Resource
 from loguru import logger
+from .wordsProcessModel import WordModel
 import speech_recognition as sr
 import moviepy.editor as mp
 import json
+import os
 class SpeechExtraction(Resource):
@@ -11,6 +13,7 @@ class SpeechExtraction(Resource):
    # params: self
    # return: json
    # author: Pamal Ranasinghe
    def get(self):
        try:
            # Check the endpoint execution
@@ -26,12 +29,23 @@ class SpeechExtraction(Resource):
            result = r.recognize_google(audio_file)
-            # Create a dict object which includes the result
-            value = {"text" : result}
+            # Calling word pre processor model
+            wm = WordModel(result)
+            processed_words = json.dumps(wm.word_pre_processor())
+            value = {
+                "text" : result,
+                "tokens" : json.loads(processed_words)["tokens"],
+                "functional_words": json.loads(processed_words)["filtered_words"],
+                }
+            # Remove converted.wav to free up space on the server
+            os.remove(os.path.join(os.getenv('CONVERTED_AUDIO_PATH'), os.getenv('CONVERTED_AUDIO_FILE_NAME')))
             # Return the JSON object containing the converted speech
            return json.loads(json.dumps(value)), 200
        except Exception as e:
            logger.error(str(e))
-            return json.loads(json.dumps({"message" : "Something went wrong"})) , 500
+            return json.loads(json.dumps({"message" : str(e)})) , 500
\ No newline at end of file
--- a/resources/wordsProcessModel.py
+++ b/resources/wordsProcessModel.py
+from nltk.tokenize import word_tokenize
+from loguru import logger
+import os
+import json
+class WordModel:
+    """Tokenize a piece of text and separate out its non-stop-word tokens."""
+
+    def __init__(self, para):
+        # Raw text that word_pre_processor() will tokenize.
+        self.para = para
+
+    # author: Pamal Ranasinghe
+    def word_pre_processor(self):
+        """Tokenize ``self.para`` and drop its stop words.
+
+        The stop-word language is read from the ``NORMAL_LANGUAGE``
+        environment variable.
+
+        Returns:
+            On success, a dict with two keys: ``"tokens"`` (every token
+            produced by ``word_tokenize``) and ``"filtered_words"`` (the
+            tokens whose lower-cased form is not a stop word).
+            On failure, a ``(dict, 500)`` tuple whose dict carries the
+            error text under ``"message"``.
+        """
+        try:
+            logger.info('word_pre_process - hits')
+
+            # Identify all the tokens
+            para_tokenize = word_tokenize(self.para)
+            # loguru renders extra arguments through "{}" placeholders;
+            # without one the token list never reaches the log line.
+            logger.info('Tokenized Words : {}', para_tokenize)
+
+            # Kept as a function-level import, as in the original code.
+            from nltk.corpus import stopwords
+            stop_words = set(stopwords.words(os.getenv('NORMAL_LANGUAGE')))
+
+            # Remove the stop words from the text. Tokens are lower-cased
+            # for the lookup only, so capitalised stop words are filtered
+            # as well while the kept tokens retain their original casing.
+            # (The previous code computed this and then overwrote it with
+            # a case-sensitive pass.)
+            filtered_sentence = [
+                w for w in para_tokenize if w.lower() not in stop_words
+            ]
+
+            return {
+                "filtered_words": filtered_sentence,
+                "tokens": para_tokenize,
+            }
+        except Exception as e:
+            logger.error(str(e))
+            # NOTE(review): this returns a (dict, status) tuple while the
+            # success path returns a plain dict; a caller that indexes the
+            # result by key will fail on it. Consider raising instead once
+            # all callers are confirmed to handle exceptions.
+            return {"message": str(e)}, 500