gitgato committed on
Commit
4492d96
verified
1 Parent(s): d697ccc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -29
app.py CHANGED
@@ -1,60 +1,54 @@
1
  import torch
2
  from transformers import pipeline
3
  from datasets import load_dataset
 
 
 
4
 
 
5
  pipe = pipeline(
6
  "automatic-speech-recognition", model="openai/whisper-base"
7
  )
 
 
8
  def translate(audio):
9
  outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
10
  return outputs["text"]
11
 
12
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
13
-
14
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
15
-
16
- model = SpeechT5ForTextToSpeech.from_pretrained("gitgato/mabama")
17
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
18
 
 
19
  embeddings_dataset = load_dataset("ovieyra21/mabama-v5", split="train")
20
- speaker_embeddings = torch.tensor("gitgato/mabama").unsqueeze(0)
21
 
 
22
  def synthesise(text):
23
  inputs = processor(text=text, return_tensors="pt")
24
- speech = model.generate_speech(
25
- inputs["input_ids"], speaker_embeddings, vocoder=vocoder
26
- )
27
- return speech.cpu()
28
-
29
- import numpy as np
30
 
 
31
  target_dtype = np.int16
32
  max_range = np.iinfo(target_dtype).max
33
 
34
-
35
  def speech_to_speech_translation(audio):
36
  translated_text = translate(audio)
37
  synthesised_speech = synthesise(translated_text)
38
- synthesised_speech = (synthesised_speech.numpy() * max_range).astype(np.int16)
39
  return 16000, synthesised_speech
40
 
41
- import gradio as gr
42
-
43
- demo = gr.Blocks()
44
-
45
- mic_translate = gr.Interface(
46
- fn=speech_to_speech_translation,
47
- inputs=gr.Audio(sources=["microphone"], type="filepath"),
48
- outputs=gr.Audio(label="Generated Speech", type="numpy"),
49
- )
50
-
51
- file_translate = gr.Interface(
52
  fn=speech_to_speech_translation,
53
- inputs=gr.Audio(sources=["upload"], type="filepath"),
54
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
 
 
55
  )
56
 
57
- with demo:
58
- gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])
59
-
60
- demo.launch(debug=True)
 
1
  import torch
2
  from transformers import pipeline
3
  from datasets import load_dataset
4
+ from transformers import SpeechT5Processor, SpeechT5ForConditionalGeneration, SpeechT5HifiGan
5
+ import numpy as np
6
+ import gradio as gr
7
 
8
+ # Configurar el pipeline de reconocimiento automático de voz
9
  pipe = pipeline(
10
  "automatic-speech-recognition", model="openai/whisper-base"
11
  )
12
+
13
+ # Función para traducir texto
14
  def translate(audio):
15
  outputs = pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
16
  return outputs["text"]
17
 
18
+ # Cargar el procesador y el modelo de SpeechT5
 
19
  processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
20
+ model = SpeechT5ForConditionalGeneration.from_pretrained("gitgato/mabama")
 
21
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
22
 
23
+ # Cargar los datos de embeddings del hablante
24
  embeddings_dataset = load_dataset("ovieyra21/mabama-v5", split="train")
25
+ speaker_embeddings = torch.tensor(embeddings_dataset[0]["speaker_embedding"]).unsqueeze(0)
26
 
27
+ # Función para sintetizar el habla
28
  def synthesise(text):
29
  inputs = processor(text=text, return_tensors="pt")
30
+ speech = model.generate(inputs["input_ids"], speaker_embedding=speaker_embeddings, vocoder=vocoder)
31
+ return speech.numpy()
 
 
 
 
32
 
33
+ # Configuración para el tipo de audio de salida
34
  target_dtype = np.int16
35
  max_range = np.iinfo(target_dtype).max
36
 
37
+ # Función para traducción de habla a habla
38
  def speech_to_speech_translation(audio):
39
  translated_text = translate(audio)
40
  synthesised_speech = synthesise(translated_text)
41
+ synthesised_speech = (synthesised_speech * max_range).astype(np.int16)
42
  return 16000, synthesised_speech
43
 
44
+ # Interfaz de Gradio
45
+ demo = gr.Interface(
 
 
 
 
 
 
 
 
 
46
  fn=speech_to_speech_translation,
47
+ inputs=gr.Audio(sources=["microphone"], type="file", label="Input Audio"),
48
  outputs=gr.Audio(label="Generated Speech", type="numpy"),
49
+ title="Speech-to-Speech Translation",
50
+ description="Translate speech input to synthesized speech output."
51
  )
52
 
53
+ # Lanzar la interfaz
54
+ demo.launch(debug=True)