vladelesin committed on
Commit
2dcbe47
·
1 Parent(s): 7aec40a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +32 -22
app.py CHANGED
@@ -1,44 +1,54 @@
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import numpy as np
3
  import torch
4
 
5
- from transformers import AutoTokenizer, VitsModel
6
- from transformers import pipeline
7
 
8
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
9
 
10
- # Translate audio to russian text
11
- asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=device)
12
- translator_to_ru = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")
 
 
13
 
14
def translate(audio, translator_to_ru: pipeline = translator_to_ru):
    """Turn input speech into Russian text.

    The audio is first passed to ``asr_pipe`` with the "translate" task,
    and the resulting text is then run through ``translator_to_ru``.

    Args:
        audio: Audio input, forwarded unchanged to ``asr_pipe``.
        translator_to_ru: Callable used for the final translation step;
            defaults to the module-level translation pipeline.

    Returns:
        The ``translation_text`` field of the first translation result.
    """
    recognised = asr_pipe(
        audio,
        max_new_tokens=256,
        generate_kwargs={"task": "translate"},
    )
    candidates = translator_to_ru(recognised["text"])
    first_candidate = candidates[0]
    return first_candidate["translation_text"]
17
 
18
# Russian text-to-speech: the model and its tokenizer come from one checkpoint.
_TTS_CHECKPOINT = "facebook/mms-tts-rus"
model = VitsModel.from_pretrained(_TTS_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_TTS_CHECKPOINT)
21
 
22
def synthesise(text: str, tokenizer: AutoTokenizer = tokenizer, model: VitsModel = model):
    """Generate a speech waveform for ``text``.

    Args:
        text: Text to be spoken.
        tokenizer: Tokenizer used to encode ``text`` as PyTorch tensors;
            defaults to the module-level tokenizer.
        model: Model whose output ``waveform`` is returned; defaults to
            the module-level model.

    Returns:
        The model's ``waveform`` tensor, moved to the CPU.
    """
    encoded = tokenizer(text, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = model(**encoded)
    waveform = result.waveform
    return waveform.cpu()
28
-
29
 
30
  def speech_to_speech_translation(audio):
31
- translated_text = translate(audio)
32
  synthesised_speech = synthesise(translated_text)
33
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
34
  return 16000, synthesised_speech[0]
35
 
36
-
37
  title = "Cascaded STST"
38
  description = """
39
- Demo for cascaded speech-to-speech translation (STST), mapping from source speech in multi language to target speech in Russian. Demo uses OpenAI's [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) model for speech translation, and Facebook's
40
- [mms-tts-rus](https://huggingface.co/acebook/mms-tts-rus) model for text-to-speech:
41
- ![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
42
  """
43
 
44
  demo = gr.Blocks()
@@ -55,7 +65,7 @@ file_translate = gr.Interface(
55
  fn=speech_to_speech_translation,
56
  inputs=gr.Audio(source="upload", type="filepath"),
57
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
58
- examples=[["./test_2.wav"]],
59
  title=title,
60
  description=description,
61
  )
 
1
+ # -*- coding: utf-8 -*-
2
+ """app.ipynb
3
+ Automatically generated by Colaboratory.
4
+ Original file is located at
5
+ https://colab.research.google.com/drive/16MxXQeF3O0htL9eQ61aa6ZxnApGg9TKN
6
+ """
7
+
8
  import gradio as gr
9
  import numpy as np
10
  import torch
11
 
12
+ from transformers import pipeline, VitsModel, VitsTokenizer, FSMTForConditionalGeneration, FSMTTokenizer
 
13
 
14
  device = "cuda:0" if torch.cuda.is_available() else "cpu"
15
 
16
+ #eng audio to text transformation
17
+ asr_pipe = pipeline("automatic-speech-recognition", model="asapp/sew-d-tiny-100k-ft-ls100h", device=device)
18
+
19
+ #eng text to rus text translation
20
+ translation_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-en-ru")
21
 
22
+ #rus text to rus speech transformation
23
+ vits_model = VitsModel.from_pretrained("facebook/mms-tts-rus")
24
+ vits_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-rus")
25
+
26
def transform_audio_to_speech_en(audio):
    """Transcribe ``audio`` to text with the module-level ASR pipeline.

    Despite the name, the result is text, not speech.

    Args:
        audio: Audio input, forwarded unchanged to ``asr_pipe``.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    # No generation arguments are passed: `max_new_tokens` and the
    # "translate" task are options for generative (Whisper-style) models,
    # while `asr_pipe` is built from a SEW-D CTC checkpoint.
    outputs = asr_pipe(audio)
    return outputs["text"]
29
 
30
def translator(text):
    """Translate ``text`` with the module-level translation pipeline.

    Args:
        text: Input forwarded unchanged to ``translation_pipe``.

    Returns:
        The ``translation_text`` field of the first pipeline result.
    """
    first_result = translation_pipe(text)[0]
    return first_result["translation_text"]
33
 
34
def synthesise(translated_text):
    """Translate the given text, then synthesise speech for the translation.

    Args:
        translated_text: Text to process. Note that it is passed through
            ``translator`` here before synthesis, so callers supply the
            text that has not been translated yet.

    Returns:
        The ``waveform`` tensor produced by ``vits_model``, moved to the CPU.
    """
    russian_text = translator(translated_text)
    encoded = vits_tokenizer(russian_text, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = vits_model(**encoded)
    waveform = result.waveform
    return waveform.cpu()
 
40
 
41
  def speech_to_speech_translation(audio):
42
+ translated_text = transform_audio_to_speech_en(audio)
43
  synthesised_speech = synthesise(translated_text)
44
  synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
45
  return 16000, synthesised_speech[0]
46
 
 
47
  title = "Cascaded STST"
48
  description = """
49
+ В Демо используется модель SEW-D-tiny(https://huggingface.co/asapp/sew-d-tiny-100k-ft-ls100h),
50
+ распознающая английскую речь и преобразующая ее в строку. Затем с помощью модели Helsinki-NLP/opus-mt-en-ru(https://huggingface.co/Helsinki-NLP/opus-mt-en-ru) текст
51
+ переводится на русский язык и преобразуется в русскую речь с помощью модели facebook/mms-tts-rus(https://huggingface.co/facebook/mms-tts-rus).
52
  """
53
 
54
  demo = gr.Blocks()
 
65
  fn=speech_to_speech_translation,
66
  inputs=gr.Audio(source="upload", type="filepath"),
67
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
68
+ examples=[["./example.wav"]],
69
  title=title,
70
  description=description,
71
  )