artyomboyko committed on
Commit
c1990e6
1 Parent(s): c9c650d

Update app.py

Browse files

Add the code debugged on the local machine

Files changed (1) hide show
  1. app.py +12 -6
app.py CHANGED
@@ -5,13 +5,16 @@ from datasets import load_dataset
5
 
6
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
 
 
 
8
 
9
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
10
 
11
- # load speech translation checkpoint
12
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
 
13
 
14
- # load text-to-speech checkpoint and speaker embeddings
15
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
16
 
17
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
@@ -22,11 +25,15 @@ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze
22
 
23
 
24
  def translate(audio):
25
- outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
26
- return outputs["text"]
 
 
 
27
 
28
 
29
  def synthesise(text):
 
30
  inputs = processor(text=text, return_tensors="pt")
31
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
32
  return speech.cpu()
@@ -43,7 +50,6 @@ title = "Cascaded STST"
43
  description = """
44
  Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
45
  [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
46
-
47
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
48
  """
49
 
@@ -69,4 +75,4 @@ file_translate = gr.Interface(
69
  with demo:
70
  gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
71
 
72
- demo.launch()
 
5
 
6
  from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor, pipeline
7
 
8
+ from transliterate import translit
9
+
10
 
11
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
12
 
13
+ # загрузить контрольную точку модели транскрибации и перевода речи
14
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
15
+ translate_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")
16
 
17
+ # Загрузим контрольную точку преобразования текста в речь и эмбеддинги дикторов
18
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
19
 
20
  model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
 
25
 
26
 
27
  def translate(audio):
28
+ transcription = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "transcribe", "language": "english"})
29
+ translation = translate_pipe(transcription["text"])
30
+ result = translit(translation[0]['translation_text'], "ru", reversed=True)
31
+ print(result)
32
+ return result
33
 
34
 
35
  def synthesise(text):
36
+ print(text)
37
  inputs = processor(text=text, return_tensors="pt")
38
  speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
39
  return speech.cpu()
 
50
  description = """
51
  Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in English. Demo uses OpenAI's [Whisper Base](https://huggingface.co/openai/whisper-base) model for speech translation, and Microsoft's
52
  [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model for text-to-speech:
 
53
  ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
54
  """
55
 
 
75
  with demo:
76
  gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
77
 
78
+ demo.launch()