navodit17 committed
Commit 7133195 · Parent: f3b5f91

working demo_2

Files changed (1):
  1. app.py (+8 -6)
app.py CHANGED

```diff
@@ -12,7 +12,7 @@ title = "Cascaded STST"
 description = """
 Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Hindi.
 
-Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-small) model for speech translation to English,
+Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation to English,
 then MarianMT's [opus-mt-en-hi](https://huggingface.co/Helsinki-NLP/opus-mt-en-hi) model for translation to Hindi,
 and finally microsoft/speechT5 fine-tuned for Hindi on IndicTTS dataset for text-to-speech.
 [SpeechT5 TTS](https://huggingface.co/navodit17/speecht5_finetuned_indic_tts_hi) model for text-to-speech:
@@ -26,9 +26,10 @@ The model might give poor result for very short sentences (1-2 words or so). Try
 """
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
+print(f"device: {device}")
 
 # load speech translation checkpoint
-asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small", device=device)
+asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi")
@@ -51,16 +52,16 @@ def translate_en_hi(text):
 
 def translate(audio):
     outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
-    # print(f"Translated text - English: {outputs['text']}")
+    print(f"Translated text - English: {outputs['text']}")
     translated_text = translate_en_hi(outputs["text"])
-    # print(f"Translated text - Hindi: {translated_text}")
+    print(f"Translated text - Hindi: {translated_text}")
     return translated_text
 
 def synthesise(text):
     text = normalizer(transliterate(text, sanscript.DEVANAGARI, sanscript.ITRANS))
-    # print(f"Normalized Text: {text}")
+    print(f"Normalized Text: {text}")
     inputs = processor(text=text, return_tensors="pt")
-    # print(f"Inputs: {inputs['input_ids'].shape}")
+    print(f"Inputs: {inputs['input_ids'].shape}")
     speech = model.generate_speech(input_ids=inputs["input_ids"].to(device), speaker_embeddings=speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 
@@ -68,6 +69,7 @@ def synthesise(text):
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
+    print(f"Generated speech shape: {synthesised_speech.shape}")
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech
```
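The description in the diff outlines a three-stage cascade: Whisper maps source speech to English text via its built-in "translate" task, opus-mt-en-hi maps English text to Hindi, and the fine-tuned SpeechT5 synthesises Hindi speech. The hunks never show the body of `translate_en_hi`, the `normalizer`, or how the TTS model, vocoder, and speaker embeddings are loaded, so the sketch below fills those gaps with labeled assumptions: a `transformers` translation pipeline standing in for `translate_en_hi`, the processor's repo reused for the `SpeechT5ForTextToSpeech` checkpoint, `microsoft/speecht5_hifigan` as the vocoder, and a zero x-vector as a placeholder speaker embedding. It is a minimal sketch of the cascade outside Gradio, not the app's exact code.

```python
import torch
from transformers import (SpeechT5ForTextToSpeech, SpeechT5HifiGan,
                          SpeechT5Processor, pipeline)
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Stage 1: source speech (any language) -> English text,
# using Whisper's built-in "translate" task.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

# Stage 2: English -> Hindi. Illustrative stand-in for translate_en_hi,
# whose body is elided from the diff.
mt_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-hi", device=device)

def translate_en_hi(text):
    return mt_pipe(text)[0]["translation_text"]

# Stage 3: Hindi text -> speech. The model/vocoder checkpoint names are
# assumptions; speaker_embeddings is a placeholder (1, 512) x-vector.
processor = SpeechT5Processor.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi")
model = SpeechT5ForTextToSpeech.from_pretrained("navodit17/speecht5_finetuned_indic_tts_hi").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
speaker_embeddings = torch.zeros(1, 512)

def speech_to_speech(audio_path):
    english = asr_pipe(audio_path, max_new_tokens=256,
                       generate_kwargs={"task": "translate"})["text"]
    hindi = translate_en_hi(english)
    # The app also applies a `normalizer` after transliteration; it is not
    # shown in the diff, so it is skipped here.
    itrans = transliterate(hindi, sanscript.DEVANAGARI, sanscript.ITRANS)
    inputs = processor(text=itrans, return_tensors="pt")
    speech = model.generate_speech(input_ids=inputs["input_ids"].to(device),
                                   speaker_embeddings=speaker_embeddings.to(device),
                                   vocoder=vocoder)
    return speech.cpu()
```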
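Two smaller details in the diff are worth a note. `synthesise` transliterates Devanagari to ITRANS before tokenization, which (as the code implies) matches the romanized text the fine-tuned SpeechT5 tokenizer was trained on. And `speech_to_speech_translation` scales the float waveform in [-1, 1] by 32767 and casts to int16, since Gradio's numpy audio output expects 16-bit PCM alongside an explicit sample rate (16000 here). A small illustration of both, with the example string being mine, not from the app:

```python
import numpy as np
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate

# Devanagari -> ITRANS romanization, matching the call in synthesise().
print(transliterate("नमस्ते दुनिया", sanscript.DEVANAGARI, sanscript.ITRANS))
# -> "namaste duniyA" (approximately; exact output depends on library version)

# Float waveform in [-1, 1] -> 16-bit PCM, as in speech_to_speech_translation().
wave = np.array([0.0, 0.5, -1.0], dtype=np.float32)
pcm16 = (wave * 32767).astype(np.int16)
print(pcm16)  # [0, 16383, -32767]
```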