Commit 49a6d367 authored by Prabuddha Gimhan

Text-to-speech and speech-to-text model import and accuracy scoring function developed

parent 9684ed1e
{"cells":[{"cell_type":"code","execution_count":1,"metadata":{"id":"q0z2LwMJx0AH","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1706014199735,"user_tz":-330,"elapsed":47007,"user":{"displayName":"Kaushi Gihan","userId":"11214181140146971518"}},"outputId":"c7aabc39-9ae6-4ee3-b4a3-90baf25c2e43"},"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.4/8.4 MB\u001b[0m \u001b[31m18.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m630.6/630.6 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m10.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m13.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.4/116.4 kB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m526.7/526.7 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h"]}],"source":["#!pip install --upgrade pip -q\n","!pip install --upgrade transformers -q\n","#!pip install --upgrade transformers sentencepiece datasets[audio] -q\n","#!pip install sentencepiece -q\n","#!pip install git+https://github.com/huggingface/transformers.git -q\n","!pip install datasets soundfile speechbrain -q"]},{"cell_type":"code","execution_count":2,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":13946,"status":"ok","timestamp":1706014213659,"user":{"displayName":"Kaushi Gihan","userId":"11214181140146971518"},"user_tz":-330},"id":"8iiX_8Kxtc50","outputId":"220c03db-dfbf-469d-f5ee-a8e05858a3cc"},"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data] Downloading package punkt to /root/nltk_data...\n","[nltk_data] Unzipping tokenizers/punkt.zip.\n"]},{"output_type":"execute_result","data":{"text/plain":["True"]},"metadata":{},"execution_count":2}],"source":["import librosa\n","import torch\n","import IPython.display as display\n","import transformers\n","import numpy as np\n","import os\n","import nltk\n","import soundfile as sf\n","import torchaudio\n","\n","\n","from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan\n","from datasets import load_dataset\n","from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor\n","from nltk.tokenize import sent_tokenize, word_tokenize\n","nltk.download('punkt')"]},{"cell_type":"code","source":["!pip show torchaudio"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"C4bGnxeTFQ5T","executionInfo":{"status":"ok","timestamp":1702704285027,"user_tz":-330,"elapsed":9046,"user":{"displayName":"Kaushi 
Gihan","userId":"11214181140146971518"}},"outputId":"a3d58ae9-514c-486e-c891-027fb2dd0be7"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Name: torchaudio\n","Version: 2.1.0+cu121\n","Summary: An audio package for PyTorch\n","Home-page: https://github.com/pytorch/audio\n","Author: Soumith Chintala, David Pollack, Sean Naren, Peter Goldsborough, Moto Hira, Caroline Chen, Jeff Hwang, Zhaoheng Ni, Xiaohui Zhang\n","Author-email: soumith@pytorch.org\n","License: \n","Location: /usr/local/lib/python3.10/dist-packages\n","Requires: torch\n","Required-by: speechbrain\n"]}]},{"cell_type":"code","source":["!pip freeze > requirements.txt\n"],"metadata":{"id":"wMQqSObSg1cr"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["!pip show nltk.tokeniz"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"UsiQbZzIgUWQ","executionInfo":{"status":"ok","timestamp":1699558022797,"user_tz":-330,"elapsed":2469,"user":{"displayName":"KAUSHI GIHAN","userId":"17781977202246029874"}},"outputId":"d3061cd8-377d-4a62-a13a-0bca83a53dff"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[33mWARNING: Package(s) not found: nltk.tokenize\u001b[0m\u001b[33m\n","\u001b[0m"]}]},{"cell_type":"markdown","metadata":{"id":"Iq8O-Tf3x_0g"},"source":["**Speech to text**"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"YV4kqwgcyCxY"},"outputs":[],"source":["def speech_to_text(audio_file):\n","\n"," # Load pretrained model and processor\n"," #model = Wav2Vec2ForCTC.from_pretrained(\"facebook/wav2vec2-base-960h\")\n"," ##processor = Wav2Vec2Processor.from_pretrained(\"facebook/wav2vec2-base-960h\")\n","\n","\n"," #model.save_pretrained(\"Wav2Vec2ForCTC\")\n"," #processor.save_pretrained(\"Wav2Vec2Processor\")\n","\n"," # Load pretrained model and processor\n"," model_stt= Wav2Vec2ForCTC.from_pretrained(\"/content/drive/MyDrive/Work_space/Silverline_IT/Project/Learn_Joy/API/app/service03/fun03_model/Wav2Vec2ForCTC\")\n"," processor_stt = Wav2Vec2Processor.from_pretrained(\"/content/drive/MyDrive/Work_space/Silverline_IT/Project/Learn_Joy/API/app/service03/fun03_model/Wav2Vec2Processor\")\n","\n","\n"," # Process audio input with specified sampling rate\n"," audio_input, _ = torchaudio.load(audio_file, normalize=True)\n"," sampling_rate = 16000 # Replace with the actual sampling rate of your audio file\n"," input_values = processor_stt(audio_input.squeeze().numpy(), return_tensors=\"pt\", sampling_rate=sampling_rate).input_values\n","\n"," # Perform inference\n"," with torch.no_grad():\n"," logits = model_stt(input_values).logits\n","\n"," predicted_ids = torch.argmax(logits, dim=-1)\n"," transcription = processor_stt.batch_decode(predicted_ids)[0]\n","\n"," return transcription\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"oN5i7q2B0kDU"},"outputs":[],"source":["test1=speech_to_text(\"/content/drive/MyDrive/Work_space/Data set/Learn_Joy/function3/predict/i am good boy.i like mango and banana.wav\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":492,"status":"ok","timestamp":1702707513344,"user":{"displayName":"Kaushi Gihan","userId":"11214181140146971518"},"user_tz":-330},"id":"6q0ehPZc1ATC","outputId":"b4a67051-061b-4050-aa2e-db09d3295ad8"},"outputs":[{"output_type":"stream","name":"stdout","text":["I AM GOOD BOY THEY LIKE MANGO AND VANANA\n"]}],"source":["print(test1)"]},{"cell_type":"code","source":["from 
**scoring**

```python
def scoring(words, transcriptions):

    words = words.lower()
    transcriptions = transcriptions.lower()

    unwanted = [".", ",", "/", "?", "-", ";", ":", "`", "@", "&", "%", "*"]

    clean_words = []
    clean_voices = []

    # remove unwanted symbols from the expected text and the transcription
    clean_word = nltk.word_tokenize(words)
    clean_voice = nltk.word_tokenize(transcriptions)

    for i in clean_word:
        if i not in unwanted:
            clean_words.append(i)

    for i in clean_voice:
        if i not in unwanted:
            clean_voices.append(i)

    #### technique 01: sentence-by-sentence comparison

    # tokenize into sentences
    words_sent = nltk.sent_tokenize(words)
    voice_sent = nltk.sent_tokenize(transcriptions)

    # collect the sentences and words that were read correctly
    write_sentences = []
    write_word = []
    missing_voice = []

    for i, j in enumerate(words_sent):
        for k, l in enumerate(voice_sent):
            if i == k:
                # clean sentence j (expected text)
                i_token = nltk.word_tokenize(j)
                clean_word = [a for a in i_token if a not in unwanted]
                j = " ".join(clean_word)

                # clean sentence l (transcription)
                k_token = nltk.word_tokenize(l)
                clean_word2 = [b for b in k_token if b not in unwanted]
                l = " ".join(clean_word2)

                # compare j and l
                if j == l:
                    write_sentences.append(l)
                else:
                    # sentence differs: compare word by word at the same positions
                    text_words = nltk.word_tokenize(j)
                    voice_words = nltk.word_tokenize(l)

                    for q, w in enumerate(text_words):
                        for d, f in enumerate(voice_words):
                            if q == d:
                                if w == f:
                                    write_word.append(w)
                                else:
                                    missing_voice.append(w)

    # add the words of every fully correct sentence
    for i in write_sentences:
        for j in nltk.word_tokenize(i):
            write_word.append(j)

    # technique 01 final scores
    sentences_score1 = len(write_sentences) / len(words_sent) * 100
    word_score1 = len(write_word) / len(clean_words) * 100

    #### technique 02: word-level comparison

    write_sentences2 = []
    write_word2 = []
    missing_voice2 = []

    # expected words matched at the same position in the transcription
    for i, j in enumerate(clean_words):
        for k, l in enumerate(clean_voices):
            if i == k and j == l:
                write_sentences2.append(j)

    # expected words that appear anywhere in the transcription
    for i in clean_words:
        for j in clean_voices:
            if i == j:
                write_word2.append(i)

    # expected words that were never spoken
    for i in clean_words:
        if i not in write_word2:
            missing_voice2.append(i)

    # technique 02 final scores
    sentences_score2 = len(write_sentences2) / len(clean_words) * 100
    word_score2 = len(set(write_word2)) / len(set(clean_words)) * 100

    ### function final score: keep the better result of the two techniques
    if sentences_score1 >= sentences_score2:
        final_sent_score = sentences_score1
    else:
        final_sent_score = sentences_score2

    if word_score1 >= word_score2:
        final_word_score = word_score1
    else:
        final_word_score = word_score2

    return final_sent_score, final_word_score, missing_voice2
```

```python
test3 = scoring("i like apple", "i ee like ee ")
print(test3)
# (33.33333333333333, 66.66666666666666, ['apple'])
```

**Text to speech**

```python
def text_to_speech(text, return_tensors="pt"):

    # The base checkpoints were downloaded once with:
    #   processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    #   model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    #   vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    # and saved locally with processor.save_pretrained("SpeechT5Processor"),
    # model.save_pretrained("SpeechT5model") and vocoder.save_pretrained("SpeechT5vocoder").

    # Load the saved models from Google Drive
    processor = SpeechT5Processor.from_pretrained("/content/drive/MyDrive/Work_space/Silverline_IT/Project/Learn_Joy/API/app/service03/fun03_model/SpeechT5_TTS-model/SpeechT5Processor", local_files_only=True)
    model = SpeechT5ForTextToSpeech.from_pretrained("/content/drive/MyDrive/Work_space/Silverline_IT/Project/Learn_Joy/API/app/service03/fun03_model/SpeechT5_TTS-model/SpeechT5model", local_files_only=True)
    vocoder = SpeechT5HifiGan.from_pretrained("/content/drive/MyDrive/Work_space/Silverline_IT/Project/Learn_Joy/API/app/service03/fun03_model/SpeechT5_TTS-model/SpeechT5vocoder", local_files_only=True)

    inputs = processor(text=text, return_tensors=return_tensors)

    # load an x-vector containing the speaker's voice characteristics from a dataset
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    # sf.write saves the waveform to speech.wav and returns None
    audio_out = sf.write("speech.wav", speech.numpy(), samplerate=16000)

    return audio_out
```

```python
test = text_to_speech(" cat ")
# Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
print(test)
# None
```
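`text_to_speech` writes its output to `speech.wav`, so the quickest way to check the synthesized audio in Colab is to play that file back with the already-imported `IPython.display` module:

```python
import IPython.display as display

# Render an inline audio player for the generated file
display.Audio("speech.wav")
```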
audio_out"]},{"cell_type":"code","execution_count":20,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":5969,"status":"ok","timestamp":1706022123233,"user":{"displayName":"Kaushi Gihan","userId":"11214181140146971518"},"user_tz":-330},"id":"jR_WDvgQIf6G","outputId":"106650cb-cb0b-4e70-f18a-05c54f3c22c6"},"outputs":[{"output_type":"stream","name":"stderr","text":["Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"]}],"source":["test=text_to_speech(\" cat \")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":532,"status":"ok","timestamp":1699511408591,"user":{"displayName":"KAUSHI GIHAN","userId":"17781977202246029874"},"user_tz":-330},"id":"iHRb3RPeIf8z","outputId":"69d037f4-2c6e-4ddf-e18e-bd34b1b94d74"},"outputs":[{"name":"stdout","output_type":"stream","text":["None\n"]}],"source":["print(test)"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"DZ2sB9dCg8Es"},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{"id":"wM59vZgakeWU"},"source":["# ***Main function***"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"rR9AoOt9Dw21"},"outputs":[],"source":["import numpy as np"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"PVeTMhpJki-O"},"outputs":[],"source":["def main_fun03(words:str,voice):\n","\n"," #auto speech recognition\n"," transcriptions=speech_to_text(voice)\n","\n"," #get the prediction score\n"," final_sent_score,final_word_score,missing_voice2=scoring(words,transcriptions)\n","\n","\n"," #return score\n"," voice_score=''\n"," voice_sent=''\n","\n","\n"," if int(final_sent_score) >= int(final_word_score):\n"," voice_score=np.round(final_sent_score)\n","\n"," else :\n"," voice_score=np.round(final_word_score)\n","\n","\n"," if int(voice_score)==100:\n"," voice_sent=f\"WOW !!!! YOU WIN !!!, You got hundred percent score\"\n","\n"," else:\n"," voice_sent=f\"You got less than hundred percent score, Please try these words {missing_voice2}\"\n","\n"," #text to speech\n"," Voice_out=text_to_speech(voice_sent)\n","\n","\n"," return Voice_out,{\"final_sent_score\":np.round(final_sent_score), \"final_word_score\":np.round(final_word_score), \"missing_voice2\":missing_voice2}\n"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":17374,"status":"ok","timestamp":1702708078906,"user":{"displayName":"Kaushi Gihan","userId":"11214181140146971518"},"user_tz":-330},"id":"NxI5zxSalyDe","outputId":"489d7fae-d8ec-440e-f65c-4aad64de4c9d"},"outputs":[{"output_type":"stream","name":"stderr","text":["Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"]}],"source":["test=main_fun03(\"i am good boy. 
i like mango and banana\",\"/content/drive/MyDrive/Work_space/Data set/Learn_Joy/function3/predict/i am good boy..wav\")"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":463,"status":"ok","timestamp":1702708082227,"user":{"displayName":"Kaushi Gihan","userId":"11214181140146971518"},"user_tz":-330},"id":"SPcIrc2sDTp7","outputId":"35b9b241-e71a-4a3a-fa13-ff4b07a0fcf5"},"outputs":[{"output_type":"stream","name":"stdout","text":["(None, {'final_sent_score': 50.0, 'final_word_score': 50.0, 'missing_voice2': ['like', 'mango', 'and', 'banana']})\n"]}],"source":["print(test)"]},{"cell_type":"markdown","source":[],"metadata":{"id":"Z79E1UdN07Hi"}}],"metadata":{"colab":{"provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.8"},"vscode":{"interpreter":{"hash":"b337b16e1f284c9fe7de692799556d56c1809887abe3f5a49ffeb9e7df151cfb"}}},"nbformat":4,"nbformat_minor":0}
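Because `sf.write` returns `None`, the first element of `main_fun03`'s result is always `None` (as the printed output above shows). If a caller needs the generated audio, one option is to hand back the output path instead; a minimal sketch that wraps the existing function (`text_to_speech_file` is a hypothetical helper, not part of the notebook):

```python
def text_to_speech_file(text, output_path="speech.wav"):
    """Synthesize with text_to_speech() and return the path of the written file (illustrative)."""
    text_to_speech(text)   # writes speech.wav and returns None
    return output_path     # return the known output location so callers can load or serve the file

# Usage inside main_fun03 (hypothetical):
#   Voice_out = text_to_speech_file(voice_sent)
```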