Commit 78da6b02 authored by Prabuddha Gimhan

API developed for dyslexia

parent 65affb14
import librosa
import torch
#import IPython.display as display
#import transformers
import numpy as np
import os
import nltk
import torchaudio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from nltk.tokenize import sent_tokenize, word_tokenize
from transformers import pipeline
# Fetch the NLTK 'punkt' resource when this module is imported.
# NOTE(review): presumably required by the nltk word/sentence tokenizers
# used in scoring() -- confirm; this call runs on every import.
nltk.download('punkt')
#### Speech to text ####
# Directory containing this file; all model paths below are resolved
# relative to it so loading does not depend on the working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Absolute paths to the saved Wav2Vec2 processor and CTC model.
processor_path_stt= os.path.join(current_dir,"fun03_model/Wav2Vec2Processor")
model_path_stt= os.path.join(current_dir,"fun03_model/Wav2Vec2ForCTC")
# Load the saved processor and model for speech to text once, at import
# time. local_files_only=True restricts loading to the directories above.
processor_stt = Wav2Vec2Processor.from_pretrained(processor_path_stt, local_files_only=True)
model_stt = Wav2Vec2ForCTC.from_pretrained(model_path_stt, local_files_only=True)
# Absolute paths to the saved SpeechT5 processor, model and HiFi-GAN vocoder.
processor_path_tts= os.path.join(current_dir,"fun03_model/SpeechT5_TTS-model/SpeechT5Processor")
model_path_tts= os.path.join(current_dir,"fun03_model/SpeechT5_TTS-model/SpeechT5model")
vocoder_path_tts=os.path.join(current_dir,"fun03_model/SpeechT5_TTS-model/SpeechT5vocoder")
# Load the saved processor, model and vocoder for text to speech once, at
# import time, again from local files only.
processor_tts = SpeechT5Processor.from_pretrained(processor_path_tts, local_files_only=True)
model_tts = SpeechT5ForTextToSpeech.from_pretrained(model_path_tts, local_files_only=True)
vocoder_tts = SpeechT5HifiGan.from_pretrained(vocoder_path_tts, local_files_only=True)
def speech_to_text(audio_file):
    """Transcribe an audio file with the module-level Wav2Vec2 CTC model.

    The recording is mixed down to a single channel and resampled to
    16 kHz before it is passed to ``processor_stt``, so files recorded at
    other sampling rates or in stereo are handled as well.

    Args:
        audio_file: Path or file-like object accepted by ``torchaudio.load``.

    Returns:
        str: The greedy (arg-max) CTC transcription of the recording.
    """
    # Rate passed to processor_stt below; the audio is resampled to match.
    target_sampling_rate = 16000

    waveform, source_sampling_rate = torchaudio.load(audio_file, normalize=True)

    # torchaudio.load returns a channels-first 2-D tensor by default.
    # Averaging over the channel axis yields a 1-D mono signal; for a
    # single-channel file this is the same as dropping the channel axis.
    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    # Use the real sampling rate of the file instead of assuming 16 kHz.
    if source_sampling_rate != target_sampling_rate:
        waveform = torchaudio.functional.resample(
            waveform,
            orig_freq=source_sampling_rate,
            new_freq=target_sampling_rate,
        )

    input_values = processor_stt(
        waveform.numpy(),
        return_tensors="pt",
        sampling_rate=target_sampling_rate,
    ).input_values

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model_stt(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    return processor_stt.batch_decode(predicted_ids)[0]
#########scoring#############
# Stand-alone punctuation tokens that must not count as words.
_UNWANTED_TOKENS = frozenset(
    [".", ",", "/", "?", "-", ";", ":", "`", "@", "&", "%", "*"]
)


def _clean_tokens(text):
    """Word-tokenize *text* and drop stand-alone punctuation tokens."""
    return [
        token
        for token in nltk.word_tokenize(text)
        if token not in _UNWANTED_TOKENS
    ]


def scoring(words, transcriptions):
    """Score how closely a spoken transcription matches a reference text.

    Both texts are lower-cased and two techniques are evaluated; the
    better result of each pair is returned.

    * Technique 1 aligns the texts sentence by sentence. A sentence is
      correct when its cleaned tokens equal those of the spoken sentence
      at the same index; otherwise its words are compared position by
      position.
    * Technique 2 ignores sentence boundaries. The "sentence" score is the
      share of reference words matched at the same position, and the word
      score is the share of distinct reference words that occur anywhere
      in the transcription.

    Args:
        words: The reference text the speaker was asked to read.
        transcriptions: The recognised text of what was actually spoken.

    Returns:
        tuple: ``(sentence_score, word_score, missing_words)`` where both
        scores are percentages and ``missing_words`` lists, in order and
        with repeats, the reference words that never occur in the
        transcription. ``(0.0, 0.0, [])`` is returned when the reference
        text contains no words.
    """
    words = words.lower()
    transcriptions = transcriptions.lower()

    clean_words = _clean_tokens(words)
    clean_voices = _clean_tokens(transcriptions)
    words_sent = nltk.sent_tokenize(words)

    # Nothing to score against: avoid dividing by zero below.
    if not clean_words or not words_sent:
        return 0.0, 0.0, []

    # ---- technique 1: sentence-aligned comparison ----
    voice_sent = nltk.sent_tokenize(transcriptions)
    correct_sentences = 0
    correct_words = 0
    # zip() pairs sentences by index and stops at the shorter text.
    for reference_sentence, spoken_sentence in zip(words_sent, voice_sent):
        reference_tokens = _clean_tokens(reference_sentence)
        spoken_tokens = _clean_tokens(spoken_sentence)
        if reference_tokens == spoken_tokens:
            correct_sentences += 1
            # Every word of a fully correct sentence counts as correct.
            correct_words += len(spoken_tokens)
        else:
            correct_words += sum(
                expected == heard
                for expected, heard in zip(reference_tokens, spoken_tokens)
            )

    sentences_score1 = correct_sentences / len(words_sent) * 100
    word_score1 = correct_words / len(clean_words) * 100

    # ---- technique 2: whole-text comparison ----
    positional_matches = sum(
        expected == heard
        for expected, heard in zip(clean_words, clean_voices)
    )
    spoken_vocabulary = set(clean_voices)
    matched_vocabulary = {
        word for word in clean_words if word in spoken_vocabulary
    }
    missing_voice2 = [
        word for word in clean_words if word not in spoken_vocabulary
    ]

    sentences_score2 = positional_matches / len(clean_words) * 100
    word_score2 = len(matched_vocabulary) / len(set(clean_words)) * 100

    # ---- final score: best of both techniques ----
    final_sent_score = max(sentences_score1, sentences_score2)
    final_word_score = max(word_score1, word_score2)
    return final_sent_score, final_word_score, missing_voice2
##################scoring letter###################
# Accepted transcriptions for each spoken letter name. All entries are
# lower-case because the transcription is lower-cased before the lookup.
_LETTER_PRONUNCIATIONS = {
    "a": ("ah", "a", "aa", "ae"),
    "b": ("b", "be", "bhe", "bee", "e"),
    "c": ("c", "cee", "see", "s"),
    "d": ("d", "de", "dee", "the", "tha"),
    "e": ("e", "ae", "ee"),
    "f": ("af", "f", "ahf"),
    "g": ("g", "gee", "jee"),
    "h": ("h", "ah", "ag", "age"),
    "i": ("i", "ai", "ii"),
    "j": ("j", "ja", "jee"),
    "k": ("k", "kha", "k`"),
    "l": ("l", "al", "el"),
    "m": ("am", "m", "em", "eam"),
    "n": ("n", "en", "an"),
    "o": ("o`", "oo", "o", "oh"),
    "p": ("p", "pe", "pee", "pi", "phi", "phe"),
    "q": ("q", "que", "queue"),
    "r": ("r", "ar", "aer", "er"),
    "s": ("as", "s", "es"),
    "t": ("t", "tee", "tea", "ti"),
    "u": ("u", "you", "yuu", "yu"),
    "v": ("v", "ve", "we", "wee"),
    "w": ("w", "dabluev"),
    "x": ("x", "ax", "ex", "xe"),
    "y": ("y", "why", "whe"),
    "z": ("z", "ezed", "esed", "zed", "sed"),
}


def scoring_letter(words, transcriptions):
    """Score whether a transcription is an accepted way of saying a letter.

    The comparison ignores case and surrounding whitespace.

    Args:
        words: The expected letter, e.g. ``"a"``.
        transcriptions: The recognised text of what was spoken.

    Returns:
        int: ``100`` when the transcription is one of the accepted
        pronunciations of the letter, otherwise ``0``. A value of ``words``
        that is not a single letter a-z also scores ``0``.
    """
    letter = words.strip().lower()
    heard = transcriptions.strip().lower()
    # Unknown letters have no accepted pronunciations and therefore score 0.
    accepted = _LETTER_PRONUNCIATIONS.get(letter, ())
    return 100 if heard in accepted else 0
#########Text to speech#####
# Speaker x-vector used for synthesis; loaded lazily on the first call and
# then reused, because it is the same for every request.
_speaker_embeddings_cache = None


def _get_speaker_embeddings():
    """Return the speaker x-vector tensor, loading the dataset only once."""
    global _speaker_embeddings_cache
    if _speaker_embeddings_cache is None:
        embeddings_dataset = load_dataset(
            "Matthijs/cmu-arctic-xvectors", split="validation"
        )
        # unsqueeze(0) adds the leading batch axis.
        _speaker_embeddings_cache = torch.tensor(
            embeddings_dataset[7306]["xvector"]
        ).unsqueeze(0)
    return _speaker_embeddings_cache


def text_to_speech(text, return_tensors="pt"):
    """Synthesize speech for *text* with the module-level SpeechT5 models.

    Args:
        text: The text to speak.
        return_tensors: Tensor format requested from ``processor_tts``;
            defaults to ``"pt"``.

    Returns:
        A 1-D NumPy array holding the generated waveform samples.
    """
    inputs = processor_tts(text=text, return_tensors=return_tensors)
    speech = model_tts.generate_speech(
        inputs["input_ids"],
        _get_speaker_embeddings(),
        vocoder=vocoder_tts,
    )
    # Ensure the result is a 1-D NumPy array.
    return speech.numpy().flatten()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment