speech-to-speech-translation

Sleeping

App Files Community

ihanif committed on Aug 24, 2023

Commit

35fee4b

1 Parent(s): 8ed9d92

Clean up text with LT alphabets

Browse files

Files changed (1) hide show

app.py +31 -4

app.py CHANGED Viewed

@@ -14,7 +14,8 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base",
 # load text-to-speech checkpoint and speaker embeddings
 #processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 #"ihanif/speecht5_finetuned_voxpopuli_lt"
-model_id = "sanchit-gandhi/speecht5_tts_vox_nl"
 processor = SpeechT5Processor.from_pretrained(model_id)
 #model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
@@ -26,12 +27,38 @@ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validat
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "nl"})
     return outputs["text"]
 def synthesise(text):
     inputs = processor(text=text, return_tensors="pt")
     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
@@ -57,7 +84,7 @@ demo = gr.Blocks()
 mic_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
     title=title,
     description=description,
 )
@@ -65,7 +92,7 @@ mic_translate = gr.Interface(
 file_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
     examples=[["./example.wav"]],
     title=title,
     description=description,

 # load text-to-speech checkpoint and speaker embeddings
 #processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
 #"ihanif/speecht5_finetuned_voxpopuli_lt"
+#model_id = "sanchit-gandhi/speecht5_tts_vox_nl"
+model_id = "ihanif/speecht5_finetuned_voxpopuli_lt"
 processor = SpeechT5Processor.from_pretrained(model_id)
 #model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# (source, target) pairs: accented / Lithuanian lower-case letters and the
# plain ASCII letter each one is folded to before text is synthesised.
replacements = [
    ("à", "a"),
    ("ą", "a"),
    ("ç", "c"),
    ("č", "c"),
    ("è", "e"),
    ("ë", "e"),
    ("ė", "e"),
    ("ę", "e"),
    ("í", "i"),
    ("ï", "i"),
    ("į", "i"),
    ("ö", "o"),
    ("š", "s"),
    ("ü", "u"),
    ("ū", "u"),
    ("ų", "u"),
    ("ž", "z"),
]

# Translation table derived once from ``replacements``. The upper-case form
# of every pair is added as well, so capitalised letters (sentence starts,
# proper nouns) are folded instead of slipping through unchanged.
_CLEANUP_TABLE = str.maketrans(
    {
        **{src.upper(): dst.upper() for src, dst in replacements},
        **{src: dst for src, dst in replacements},
    }
)


def cleanup_text(text):
    """Return *text* with the letters listed in ``replacements`` folded to ASCII.

    Lower-case letters map exactly as given in ``replacements``; their
    upper-case forms map to the corresponding upper-case ASCII letter
    (for example ``"Š"`` becomes ``"S"``). Every other character is left
    untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string of the same length with the mapped letters replaced.
    """
    # One pass over the string instead of one ``str.replace`` per pair.
    return text.translate(_CLEANUP_TABLE)
 def translate(audio):
     """Run the ASR pipeline on *audio* and return the text it produces.

     The pipeline is called with the ``translate`` task and the language
     code ``"lt"``, and generation is capped at 256 new tokens.
     """
     # NOTE(review): Whisper documents the "translate" task as producing
     # English text; confirm this task/language pair yields the text the
     # text-to-speech model is meant to receive.
     generation_options = {"task": "translate", "language": "lt"}
     result = asr_pipe(audio, max_new_tokens=256, generate_kwargs=generation_options)
     return result["text"]
 def synthesise(text):
     """Turn *text* into speech and return the generated audio on the CPU.

     The text is first passed through ``cleanup_text``, then tokenised by
     ``processor`` and fed to ``model.generate_speech`` together with the
     module-level speaker embeddings and vocoder.
     """
     cleaned = cleanup_text(text)
     encoded = processor(text=cleaned, return_tensors="pt")
     # Move both model inputs to the device the model runs on.
     token_ids = encoded["input_ids"].to(device)
     waveform = model.generate_speech(token_ids, speaker_embeddings.to(device), vocoder=vocoder)
     return waveform.cpu()
 # Interface that feeds microphone recordings (passed as a file path) to
 # speech_to_speech_translation and shows an audio player plus a text box.
 mic_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="microphone", type="filepath"),
     # gr.Textbox() is the component-level replacement for the deprecated
     # gr.outputs.Textbox() wrapper; both create a plain text output.
     outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.Textbox()],
     title=title,
     description=description,
 )
 file_translate = gr.Interface(
     fn=speech_to_speech_translation,
     inputs=gr.Audio(source="upload", type="filepath"),
+    outputs=[gr.Audio(label="Generated Speech", type="numpy"), gr.outputs.Textbox()],
     examples=[["./example.wav"]],
     title=title,
     description=description,