File size: 8,906 Bytes
a3199db
694f93a
a3199db
 
694f93a
c5093bb
694f93a
 
c5093bb
694f93a
 
8f59a41
df609a3
007d6a1
c55c408
 
df609a3
007d6a1
e35f365
7fdbed5
2b71965
 
 
7fdbed5
 
2b71965
7fdbed5
1851c8f
7fdbed5
2b71965
 
 
007d6a1
2b71965
e35f365
 
9703cbc
 
 
 
 
 
 
 
e35f365
86a050b
007d6a1
694f93a
55c7d23
 
 
 
9703cbc
 
 
 
 
 
 
 
 
694f93a
e35f365
 
86a050b
 
 
007d6a1
694f93a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d2e9f55
fe4ae7f
694f93a
 
 
 
 
 
 
 
 
df609a3
007d6a1
df609a3
694f93a
 
 
df609a3
694f93a
 
 
5653d92
694f93a
 
 
 
 
 
 
 
 
 
007d6a1
694f93a
 
 
 
 
c55c408
694f93a
 
 
 
 
 
 
7fdbed5
 
694f93a
 
 
 
7fdbed5
d2e9f55
694f93a
8f59a41
 
 
694f93a
8f59a41
694f93a
8f59a41
 
 
694f93a
d2e9f55
694f93a
2b71965
8f59a41
9703cbc
 
2b71965
 
8f59a41
 
2b71965
007d6a1
 
df609a3
d2e9f55
fe4ae7f
d2e9f55
694f93a
c55c408
 
8f59a41
007d6a1
 
fe4ae7f
8f59a41
007d6a1
694f93a
8f59a41
 
 
 
 
 
 
 
 
2b71965
007d6a1
694f93a
8f59a41
694f93a
007d6a1
8f59a41
2b71965
 
8f59a41
007d6a1
694f93a
a3199db
 
7fdbed5
694f93a
3b1a6b5
694f93a
 
7fdbed5
 
 
 
c55c408
 
d2e9f55
694f93a
7fdbed5
694f93a
 
df609a3
a3199db
 
 
007d6a1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import subprocess
from langdetect import detect_langs
import os
import warnings
from transformers import logging
import math
import json
import tempfile

# Suppress all Python warnings and restrict the transformers logger to errors
# (`logging` here is transformers.logging, imported above, not the stdlib module).
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

# Candidate model identifiers keyed by detected language code.
# combined_interface uses the first entry of the selected list, and
# detect_and_select_model falls back to the "en" list for any other language.
# NOTE(review): the identifiers are presumably Hugging Face Hub model ids —
# confirm each one still resolves before relying on the non-default entries.
MODELS = {
    "es": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-xlsr-53-spanish",
        "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
    ],
    "en": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-960h",
        "microsoft/wav2vec2-base-960h"
    ],
    "pt": [
        "facebook/wav2vec2-large-xlsr-53-portuguese",
        "openai/whisper-medium",
        "jonatasgrosman/wav2vec2-xlsr-53-portuguese"
    ]
}

# Verify that ffmpeg is installed and can be executed
def verify_ffmpeg_installation():
    """Ensure the ``ffmpeg`` executable is available and runs.

    Runs ``ffmpeg -version`` with its output captured and discarded.

    Raises:
        OSError: If the executable cannot be started; a missing binary
            surfaces as ``FileNotFoundError``.
        subprocess.CalledProcessError: If ``ffmpeg -version`` exits with a
            non-zero status.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError):
        # A missing binary raises FileNotFoundError (an OSError) rather than
        # CalledProcessError, so both must be caught for the diagnostic to be
        # printed in the "not installed" case. The original exception is
        # re-raised unchanged for the caller.
        print("ffmpeg no está instalado o no se puede ejecutar correctamente.")
        raise

def convert_audio_to_wav(audio_path):
    """Convert an audio file to a 16 kHz mono WAV file using ffmpeg.

    The result is always written to ``converted_audio.wav`` in the current
    working directory, overwriting any previous conversion (``-y``).

    Args:
        audio_path: Path of the audio file to convert.

    Returns:
        The path of the converted file, ``"converted_audio.wav"``.

    Raises:
        ValueError: If ``audio_path`` is empty, is a directory, does not
            point to an existing file, or ffmpeg exits with a non-zero status.
    """
    # Validate the input up front so the caller gets a clear message instead
    # of a TypeError from os.path or an opaque ffmpeg failure.
    if not audio_path:
        raise ValueError("No se proporcionó ningún archivo de audio.")
    if os.path.isdir(audio_path):
        raise ValueError(f"La ruta proporcionada es un directorio, no un archivo: {audio_path}")
    if not os.path.isfile(audio_path):
        raise ValueError(f"El archivo de audio no existe: {audio_path}")

    wav_path = "converted_audio.wav"

    # '-y' overwrites an existing output file without prompting;
    # '-ac 1 -ar 16000' produces mono audio at 16 kHz.
    command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
    process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Decode once and tolerate bytes that are not valid UTF-8, so a decoding
    # failure can never mask the real conversion result.
    stdout_text = process.stdout.decode(errors="replace")
    stderr_text = process.stderr.decode(errors="replace")

    # Echo ffmpeg's output for debugging.
    print(stdout_text)
    print(stderr_text)

    if process.returncode != 0:
        raise ValueError(f"Error al convertir el archivo de audio a wav: {stderr_text}")

    return wav_path

def _resolve_language(langs):
    """Pick a language code from langdetect candidates, preferring Spanish on es/pt near-ties."""
    best = max(langs, key=lambda candidate: candidate.prob)

    # Only disambiguate Spanish/Portuguese when one of them is actually the
    # top candidate. Without this guard, audio in any other language has
    # es == pt == 0, the difference is 0, and the result is always 'es'.
    if best.lang in ("es", "pt"):
        es_confidence = next((c.prob for c in langs if c.lang == "es"), 0)
        pt_confidence = next((c.prob for c in langs if c.lang == "pt"), 0)
        if abs(es_confidence - pt_confidence) < 0.2:
            return "es"

    return best.lang


def detect_language(audio_path):
    """Detect the spoken language of an audio file.

    Loads up to the first 30 seconds of the file at 16 kHz, transcribes them
    with ``openai/whisper-base`` and runs ``langdetect`` on the resulting
    text. When Spanish or Portuguese is the top candidate and their
    confidences differ by less than 0.2, Spanish is returned.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        The language code reported by langdetect (for example ``"es"``).

    Raises:
        ValueError: If the audio cannot be loaded, or if no language can be
            detected from the transcription (for example when it is empty).
    """
    # Imported locally so the block stays self-contained; the file already
    # depends on langdetect.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        speech, _ = librosa.load(audio_path, sr=16000, duration=30)
    except Exception as e:
        raise ValueError(f"Error al cargar el archivo de audio con librosa: {e}") from e

    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

    input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    try:
        langs = detect_langs(transcription)
    except LangDetectException as e:
        # langdetect raises when the text has no usable features, e.g. an
        # empty transcription produced from silent audio.
        raise ValueError(f"No se pudo detectar el idioma del audio: {e}") from e

    return _resolve_language(langs)

def transcribe_audio_stream(audio, model_name):
    """Transcribe an audio file chunk by chunk, yielding cumulative results.

    The file is converted to WAV, loaded at 16 kHz and split into fixed-size
    chunks: 30 seconds when ``model_name`` contains ``"whisper"`` (handled
    with ``WhisperProcessor`` / ``WhisperForConditionalGeneration``), and 10
    seconds otherwise (handled with an ``automatic-speech-recognition``
    pipeline).

    Args:
        audio: Path of the audio file to transcribe.
        model_name: Identifier of the model to load.

    Yields:
        ``(transcriptions, progress)`` after each chunk. ``transcriptions`` is
        the cumulative list (the same list object every time) of dicts with
        the keys ``"start_time"``, ``"end_time"`` (seconds) and ``"text"``;
        ``progress`` is a percentage between 0 and 100.
    """
    wav_audio = convert_audio_to_wav(audio)
    speech, rate = librosa.load(wav_audio, sr=16000)
    duration = len(speech) / rate

    if "whisper" in model_name:
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        chunk_duration = 30  # seconds

        def transcribe_chunk(chunk):
            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
            predicted_ids = model.generate(input_features)
            return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    else:
        transcriber = pipeline("automatic-speech-recognition", model=model_name)
        chunk_duration = 10  # seconds

        def transcribe_chunk(chunk):
            return transcriber(chunk)["text"]

    # Trailing fragments shorter than this are skipped.
    # NOTE(review): chosen on the assumption that such a sliver holds no usable
    # speech and that very short inputs may be rejected by some models — confirm.
    min_chunk_seconds = 0.1

    transcriptions = []

    # math.ceil (rather than int) keeps the fractional tail of the recording:
    # with int(), a 30.5 s file lost its last 0.5 s and a clip shorter than one
    # second produced no chunks at all.
    for start in range(0, math.ceil(duration), chunk_duration):
        end = min(start + chunk_duration, duration)
        if end - start < min_chunk_seconds:
            break

        chunk = speech[int(start * rate):int(end * rate)]
        text = transcribe_chunk(chunk)

        progress = min(100, (end / duration) * 100)
        transcriptions.append({
            "start_time": start,
            "end_time": end,
            "text": text
        })
        yield transcriptions, progress

def detect_and_select_model(audio):
    """Detect the language of ``audio`` and look up its candidate models.

    Args:
        audio: Path of the audio file to analyse.

    Returns:
        A ``(language, model_options)`` tuple, where ``model_options`` is the
        ``MODELS`` entry for the detected language, or the ``"en"`` entry when
        the language has no entry of its own.
    """
    # Language detection runs on the converted WAV copy of the input.
    language = detect_language(convert_audio_to_wav(audio))

    if language in MODELS:
        return language, MODELS[language]
    return language, MODELS["en"]

def save_transcription(transcriptions, file_format):
    """Write the transcription to a new temporary file and return its path.

    Args:
        transcriptions: List of dicts with the keys ``"start_time"``,
            ``"end_time"`` and ``"text"``.
        file_format: ``"JSON"`` to dump the list as indented JSON, or
            ``"TXT"`` to write one ``start,end,text`` line per entry with the
            times formatted to two decimals.

    Returns:
        The path of the created file. The file is not deleted automatically
        (``delete=False``); removing it is left to the caller.

    Raises:
        ValueError: If ``file_format`` is neither ``"JSON"`` nor ``"TXT"``.
    """
    if file_format == "JSON":
        suffix = ".json"
    elif file_format == "TXT":
        suffix = ".txt"
    else:
        # Previously an unknown (or unselected, i.e. None) format fell through
        # both branches and failed with UnboundLocalError on file_path.
        raise ValueError(f"Formato de salida no soportado: {file_format!r}")

    # Text mode with an explicit UTF-8 encoding: json.dump writes str, so the
    # default binary mode ('w+b') made the JSON branch fail with TypeError.
    # newline="\n" keeps the TXT output byte-identical to the previous
    # binary-mode writes on every platform.
    with tempfile.NamedTemporaryFile(
        mode="w",
        encoding="utf-8",
        newline="\n",
        delete=False,
        suffix=suffix,
    ) as tmp:
        if file_format == "JSON":
            json.dump(transcriptions, tmp, ensure_ascii=False, indent=4)
        else:
            tmp.writelines(
                f"{entry['start_time']:.2f},{entry['end_time']:.2f},{entry['text']}\n"
                for entry in transcriptions
            )
        file_path = tmp.name

    print(f"Archivo de transcripción guardado en: {file_path}")
    return file_path

def combined_interface(audio, file_format):
    """Run the full detect / transcribe / export pipeline for the Gradio UI.

    Generator that streams UI updates while the audio is transcribed.

    Args:
        audio: Path of the uploaded audio file.
        file_format: Output format for the download, ``"JSON"`` or ``"TXT"``.

    Yields:
        7-tuples matching the interface outputs in order: detected language,
        update for the models dropdown (choices and selected value), selected
        model, transcription so far, progress (0-100), status message, and
        the path of the download file (``None`` until the transcription is
        complete). On failure a single tuple is yielded with the error
        message in the first position and the status ``"Error"``.
    """
    converted_wav = "converted_audio.wav"  # written by convert_audio_to_wav

    try:
        print(f"Ruta del archivo de audio subido: {audio}")

        # Fail fast with a clear message instead of discovering the problem
        # after (or deep inside) a long-running transcription.
        if not audio:
            raise ValueError("No se proporcionó ningún archivo de audio.")
        if file_format not in ("JSON", "TXT"):
            raise ValueError("Seleccione un formato de salida (JSON o TXT) antes de transcribir.")

        verify_ffmpeg_installation()  # raises if ffmpeg cannot be executed

        language, model_options = detect_and_select_model(audio)
        selected_model = model_options[0]
        print(f"Idioma detectado: {language}")
        print(f"Modelos disponibles: {model_options}")

        # The dropdown is created with no choices, so send an update carrying
        # the choices rather than passing the raw list as its value.
        models_update = gr.update(choices=model_options, value=selected_model)

        # First update: no download file yet.
        yield language, models_update, selected_model, "", 0, "Initializing...", None

        transcriptions = []
        # Initialised here so the final yield works even if no chunk is produced.
        full_transcription = ""
        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
            transcriptions = partial_transcriptions
            full_transcription = " ".join([t["text"] for t in transcriptions])
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            print(f"Progreso: {progress_int}%")
            # The download file stays None until the transcription is complete.
            yield language, models_update, selected_model, full_transcription.strip(), progress_int, status, None

        print("Guardando transcripción.")
        file_path = save_transcription(transcriptions, file_format)
        print(f"Transcripción guardada en: {file_path}")

        # Sanity checks on the exported file before handing it to the UI.
        if os.path.isdir(file_path):
            raise ValueError(f"El archivo de transcripción debería ser un archivo, pero es un directorio: {file_path}")
        if not os.path.isfile(file_path):
            raise ValueError(f"El archivo de transcripción no existe: {file_path}")

        # Final update, now carrying the download file.
        yield language, models_update, selected_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path

    except Exception as e:
        # Top-level UI boundary: report the failure in the interface instead
        # of letting it propagate. The error update also carries 7 values,
        # with None (not "") for the file output because there is no file.
        print(f"Error: {e}")
        yield str(e), gr.update(choices=[], value=None), "", "An error occurred during processing.", 0, "Error", None

    finally:
        # Best-effort cleanup of the intermediate WAV on success and failure
        # alike; it may not exist if the pipeline failed before conversion.
        try:
            os.remove(converted_wav)
        except FileNotFoundError:
            pass
        except OSError as cleanup_error:
            print(f"Error: {cleanup_error}")
        else:
            print("Archivos temporales limpiados.")

# Gradio UI definition: an audio upload and an output-format choice go in; the
# seven output components match, in order, the 7-tuples yielded by
# combined_interface.
iface = gr.Interface(
    fn=combined_interface,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio File"),
        gr.Radio(choices=["JSON", "TXT"], label="Choose output format")
    ],
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
        gr.Textbox(label="Status"),
        gr.File(label="Download Transcription")
    ],
    title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
    description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
    # NOTE(review): live=True presumably re-runs the function whenever an input
    # changes — confirm that is intended for a long-running transcription.
    live=True
)

# Start the app only when this file is run as a script.
# NOTE(review): queue() is presumably needed for the generator (streaming)
# function — confirm against the Gradio version in use.
if __name__ == "__main__":
    iface.queue().launch()