File size: 4,664 Bytes
d347764
 
 
08375d8
d347764
08375d8
d347764
 
08375d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d347764
 
 
08375d8
 
d347764
08375d8
d347764
08375d8
 
 
 
d347764
 
08375d8
d347764
08375d8
d347764
08375d8
 
 
 
417f0b6
08375d8
417f0b6
08375d8
 
d347764
08375d8
417f0b6
 
 
 
08375d8
 
 
 
 
d347764
 
 
 
 
 
 
f805e49
 
08375d8
f805e49
 
 
 
c737803
 
 
d347764
08375d8
d347764
f805e49
 
d347764
c737803
 
 
08375d8
c737803
 
 
 
 
 
 
3946ba6
c737803
d347764
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
import numpy as np
import torch
from transformers import pipeline, VitsModel, AutoTokenizer, AutoTokenizer

# Pick the inference device: the first CUDA GPU when torch reports one,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"


# Two-letter codes of the source languages that can be translated to Spanish.
# The order here is the iteration order of ``translation_models``.
_SOURCE_LANGUAGE_CODES = (
    "en",  # English
    "fr",  # French
    "de",  # German
    "it",  # Italian
    "pt",  # Portuguese
    "nl",  # Dutch
    "fi",  # Finnish
    "sv",  # Swedish
    "da",  # Danish
    "no",  # Norwegian
    "ru",  # Russian
    "pl",  # Polish
    "cs",  # Czech
    "tr",  # Turkish
    "zh",  # Chinese
    "ja",  # Japanese
    "ar",  # Arabic
    "ro",  # Romanian
    "el",  # Greek
    "bg",  # Bulgarian
    "uk",  # Ukrainian
    "he",  # Hebrew
    "lt",  # Lithuanian
    "et",  # Estonian
    "hr",  # Croatian
    "hu",  # Hungarian
    "lv",  # Latvian
    "sl",  # Slovenian
    "sk",  # Slovak
    "sr",  # Serbian
    "fa",  # Persian
)

# Maps a source-language code to the name of the model used to translate that
# language into Spanish. Every entry follows the same naming scheme:
# "Helsinki-NLP/opus-mt-<code>-es".
translation_models = {
    code: f"Helsinki-NLP/opus-mt-{code}-es" for code in _SOURCE_LANGUAGE_CODES
}

# Speech recogniser: turns the input audio into text on the selected device.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)

# Spanish text-to-speech model and its tokenizer. They are left on the default
# device because `synthesise` feeds them tensors that are never moved.
vist_model = VitsModel.from_pretrained("facebook/mms-tts-spa")
vist_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-spa")

# Text classifier whose top label is used as the language code of the
# transcript. It runs on the same device as the speech recogniser.
lang_detector = pipeline(
    "text-classification",
    model="papluca/xlm-roberta-base-language-detection",
    device=device,
)

def language_detector(text):
    """Return the label of the first prediction the classifier gives for *text*.

    Callers use the returned label as a key into ``translation_models``.
    """
    return lang_detector(text)[0]['label']

# Upper bound on the number of translation pipelines kept in memory at once.
_MAX_CACHED_TRANSLATORS = 4

# Translation pipelines that have already been built, keyed by language code.
_translator_cache = {}


def _get_translator(codigo_idioma):
    """Return the translation pipeline for *codigo_idioma*, building it once.

    Building a pipeline loads a whole model, so the result is cached instead
    of being recreated on every request. When the cache is full the oldest
    entry is dropped first.
    """
    translator = _translator_cache.get(codigo_idioma)
    if translator is None:
        if len(_translator_cache) >= _MAX_CACHED_TRANSLATORS:
            # Dicts preserve insertion order, so the first key is the oldest.
            _translator_cache.pop(next(iter(_translator_cache)))
        translator = pipeline(
            "translation",
            model=translation_models[codigo_idioma],
            device=device,
        )
        _translator_cache[codigo_idioma] = translator
    return translator


def translate(audio):
    """Transcribe *audio* and translate the transcript to Spanish.

    Args:
        audio: Input passed straight to the speech-recognition pipeline.

    Returns:
        The translation pipeline's output (a list whose first item has a
        ``'translation_text'`` key, as consumed by ``synthesise``) when a
        model exists for the detected language; otherwise the untranslated
        transcript as a string.
    """
    texto = asr_pipe(audio, max_new_tokens=256)['text']
    codigo_idioma = language_detector(texto)

    if codigo_idioma not in translation_models:
        # No model for this language: fall back to the raw transcript.
        print(f"No hay un modelo de traducción disponible para el idioma detectado {codigo_idioma}")
        return texto

    return _get_translator(codigo_idioma)(texto)

def synthesise(text):
    """Generate a speech waveform for *text* with the text-to-speech model.

    Args:
        text: Either a plain string, or a list whose first element is a
            mapping with a ``'translation_text'`` entry (the shape returned
            by ``translate`` when a translation was produced).

    Returns:
        The first waveform of the model output.
    """
    spoken = text[0]['translation_text'] if isinstance(text, list) else text
    print(spoken)
    tokens = vist_tokenizer(spoken, return_tensors="pt")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        result = vist_model(**tokens)
    return result.waveform[0]

def speech_to_speech_translation(audio):
    """Translate spoken *audio* and return the result as synthesised speech.

    Args:
        audio: Input forwarded unchanged to ``translate``.

    Returns:
        A ``(sampling_rate, samples)`` tuple, where ``samples`` is a NumPy
        ``int16`` array.
    """
    translated_text = translate(audio)
    # `.cpu()` is a no-op for CPU tensors and keeps `.numpy()` valid otherwise.
    waveform = synthesise(translated_text).cpu().numpy()
    # Clip before scaling: samples outside [-1, 1] would otherwise wrap around
    # when cast to int16 and produce loud clicks.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)
    # Take the rate from the model config rather than hard-coding it.
    return vist_model.config.sampling_rate, pcm

# Heading and Markdown description shown on both tabs.
title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping from source speech in any language to target speech in Spanish.

![Cascaded STST](https://huggingface.co/datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()


def _build_interface(source, examples=None):
    """Build one speech-to-speech interface that takes audio from *source*.

    Each call creates its own input and output audio components, so the two
    tabs do not share component instances.
    """
    return gr.Interface(
        fn=speech_to_speech_translation,
        inputs=gr.Audio(sources=source, type="filepath"),
        outputs=gr.Audio(label="Generated Speech", type="numpy"),
        examples=examples,
        title=title,
        description=description,
    )


# One tab records from the microphone, the other accepts an uploaded file and
# offers a bundled example clip.
mic_translate = _build_interface("microphone")
file_translate = _build_interface("upload", examples=[["./example.wav"]])

with demo:
    tab_interfaces = [mic_translate, file_translate]
    tab_names = ["Microphone", "Audio File"]
    gr.TabbedInterface(tab_interfaces, tab_names)

demo.launch()