working demo_2
app.py CHANGED
@@ -12,7 +12,7 @@ title = "Cascaded STST"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Hindi.
 
-Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-
+Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation to English,
 then MarianMT's [opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi) model for translation to Hindi,
 and finally microsoft/speechT5 fine-tuned for Hindi on IndicTTS dataset for text-to-speech.
 [SpeechT5 TTS](https://huggingface.co/navodit17/speecht5_finetuned_indic_tts_hi) model for text-to-speech:
@@ -26,9 +26,10 @@ The model might give poor result for very short sentences (1-2 words or so). Try
 """
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print(f"device: {device}")
 
 # load speech translation checkpoint
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi")
@@ -51,16 +52,16 @@ def translate_en_hi(text):
 
 def translate(audio):
     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
-
+    print(f"Translated text - English: {outputs['text']}")
     translated_text = translate_en_hi(outputs["text"])
-
+    print(f"Translated text - Hindi: {translated_text}")
     return translated_text
 
 def synthesise(text):
     text = normalizer(transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS))
-
+    print(f"Normalized Text: {text}")
     inputs = processor(text=text, return_tensors="pt")
-
+    print(f"Inputs: {inputs['input_ids'].shape}")
     speech = model.generate_speech(input_ids=inputs["input_ids"].to(device), speaker_embeddings=speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 
@@ -68,6 +69,7 @@ def synthesise(text):
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
+    print(f"Generated speech shape: {synthesised_speech.shape}")
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
 
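For context, the changed hunks call a translate_en_hi helper and the SpeechT5 model, vocoder, and speaker_embeddings objects whose definitions sit outside the diff. Below is a minimal sketch of how the English-to-Hindi step could look with the MarianMT [opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi) checkpoint named in the demo description; the function body and variable names are assumptions for illustration, not the actual contents of app.py.

```python
# Sketch only: translate_en_hi is referenced in the diff but not shown there.
# Assumes it wraps the Helsinki-NLP/opus-mt-en-hi MarianMT checkpoint mentioned
# in the demo description; names below are illustrative.
import torch
from transformers import MarianMTModel, MarianTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

mt_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
mt_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi").to(device)

def translate_en_hi(text):
    # Tokenize the English text, generate Hindi tokens, and decode them
    batch = mt_tokenizer([text], return_tensors="pt", truncation=True).to(device)
    generated = mt_model.generate(**batch, max_new_tokens=256)
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)
```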