working demo_2
app.py CHANGED
@@ -12,7 +12,7 @@ title = "Cascaded STST"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Hindi.
 
-Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-
+Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation to English,
 then MarianMT's [opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi) model for translation to Hindi,
 and finally microsoft/speechT5 fine-tuned for Hindi on IndicTTS dataset for text-to-speech.
 [SpeechT5 TTS](https://huggingface.co/navodit17/speecht5_finetuned_indic_tts_hi) model for text-to-speech:
@@ -26,9 +26,10 @@ The model might give poor result for very short sentences (1-2 words or so). Try
 """
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print(f"device: {device}")
 
 # load speech translation checkpoint
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi")
@@ -51,16 +52,16 @@ def translate_en_hi(text):
 
 def translate(audio):
     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
-
+    print(f"Translated text - English: {outputs['text']}")
     translated_text = translate_en_hi(outputs["text"])
-
+    print(f"Translated text - Hindi: {translated_text}")
     return translated_text
 
 def synthesise(text):
     text = normalizer(transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS))
-
+    print(f"Normalized Text: {text}")
     inputs = processor(text=text, return_tensors="pt")
-
+    print(f"Inputs: {inputs['input_ids'].shape}")
     speech = model.generate_speech(input_ids=inputs["input_ids"].to(device), speaker_embeddings=speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 
@@ -68,6 +69,7 @@ def synthesise(text):
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
+    print(f"Generated speech shape: {synthesised_speech.shape}")
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
 
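For context, the changed hunks call a translate_en_hi helper and the SpeechT5 model, vocoder, and speaker_embeddings objects whose definitions sit outside the diff. Below is a minimal sketch of how the English-to-Hindi step could look with the MarianMT [opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi) checkpoint named in the demo description; the function body and variable names are assumptions for illustration, not the actual contents of app.py.

```python
# Sketch only: translate_en_hi is referenced in the diff but not shown there.
# Assumes it wraps the Helsinki-NLP/opus-mt-en-hi MarianMT checkpoint mentioned
# in the demo description; names below are illustrative.
import torch
from transformers import MarianMTModel, MarianTokenizer

device = "cuda:0" if torch.cuda.is_available() else "cpu"

mt_tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi")
mt_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-hi").to(device)

def translate_en_hi(text):
    # Tokenize the English text, generate Hindi tokens, and decode them
    batch = mt_tokenizer([text], return_tensors="pt", truncation=True).to(device)
    generated = mt_model.generate(**batch, max_new_tokens=256)
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)
```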