speech-to-speech-translation-rus

Runtime error

App Files Community

voxxer committed on Aug 25, 2023

Commit

daf595f

1 Parent(s): 844211b

Changed to Russian language

Browse files

Files changed (1) hide show

app.py +16 -2

app.py CHANGED Viewed

@@ -2,6 +2,7 @@ import gradio as gr
 import numpy as np
 import torch
 from datasets import load_dataset
 from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
@@ -14,7 +15,7 @@ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base",
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
-model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
@@ -22,7 +23,7 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze
 def translate(audio):
-    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
     return outputs["text"]
@@ -31,9 +32,22 @@ def synthesise(text):
     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech

 import numpy as np
 import torch
 from datasets import load_dataset
+from transliterate import translit
 from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
 # load text-to-speech checkpoint and speaker embeddings
 processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+model = SpeechT5ForTextToSpeech.from_pretrained("voxxer/speecht5_finetuned_commonvoice_ru_translit").to(device)
 vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
 embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
 def translate(audio):
+    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate", "language": "ru"})
     return outputs["text"]
     speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
     return speech.cpu()
+def cleanup_text(inputs):
+    replacements = [('«', '"'),
+                     ('»', '"'),
+                     ('‑', '-'),
+                     ('–', '-'),
+                     ('−', '-'),
+                     ('…', '...'),
+                    ]
+    for src, dst in replacements:
+        inputs = translit(inputs.replace(src, dst).lower(), 'ru', reversed=True)
+    return inputs
 def speech_to_speech_translation(audio):
     translated_text = translate(audio)
+    translated_text = cleanup_text(translated_text)
     synthesised_speech = synthesise(translated_text)
     synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
     return 16000, synthesised_speech