import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import subprocess
from langdetect import detect_langs
import os
import warnings
from transformers import logging
import math
import json
from pyannote.audio import Pipeline

# Suppress warnings
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

# Read the Hugging Face token from the environment variable
HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")

# Updated models by language
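# The first entry per language is used as the default (see detect_and_select_model)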
MODELS = {
    "es": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-xlsr-53-spanish",
        "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
    ],
    "en": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-960h",
        "microsoft/wav2vec2-base-960h"
    ],
    "pt": [
        "facebook/wav2vec2-large-xlsr-53-portuguese",
        "openai/whisper-medium",
        "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
    ]
}

def convert_audio_to_wav(audio_path):
    try:
        print("Converting audio to WAV format...")
        wav_path = "converted_audio.wav"
        command = ["ffmpeg", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
        subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print(f"Audio converted to {wav_path}")
        return wav_path
    except Exception as e:
        print(f"Error converting audio to WAV: {e}")
        raise RuntimeError(f"Error converting audio to WAV: {e}")

def detect_language(audio_path):
    try:
        print("Detecting language...")
        speech, _ = librosa.load(audio_path, sr=16000, duration=30)
        
        processor = WhisperProcessor.from_pretrained("openai/whisper-base")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
        
        input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        
        langs = detect_langs(transcription)
        
        es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
        pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
        
        # Spanish and Portuguese are easily confused; when their scores are within
        # 0.2 of each other, default to Spanish
        if abs(es_confidence - pt_confidence) < 0.2:
            print("Detected language: Spanish")
            return 'es'
        
        detected_language = max(langs, key=lambda x: x.prob).lang
        print(f"Detected language: {detected_language}")
        return detected_language
    except Exception as e:
        print(f"Error detecting language: {e}")
        raise RuntimeError(f"Error detecting language: {e}")

def diarize_audio(wav_audio):
    try:
        print("Performing diarization...")
        pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HUGGINGFACE_TOKEN)
        diarization = pipeline(wav_audio)
        print("Diarization complete.")
        return diarization
    except Exception as e:
        print(f"Error in diarization: {e}")
        raise RuntimeError(f"Error in diarization: {e}")

def transcribe_audio_stream(audio, model_name):
    try:
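        # This function is a generator: it yields (transcriptions_so_far, progress)
        # after each chunk so the Gradio UI (live=True) can refresh mid-run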
        wav_audio = convert_audio_to_wav(audio)
        speech, rate = librosa.load(wav_audio, sr=16000)
        duration = len(speech) / rate
        
        transcriptions = []
        
        if "whisper" in model_name:
            processor = WhisperProcessor.from_pretrained(model_name)
            model = WhisperForConditionalGeneration.from_pretrained(model_name)
            
            # Whisper is trained on 30-second windows, so chunk at that size
            chunk_duration = 30  # seconds
            
            for i in range(0, int(duration), chunk_duration):
                end = min(i + chunk_duration, duration)
                chunk = speech[int(i * rate):int(end * rate)]
                
                input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
                predicted_ids = model.generate(input_features)
                transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
                
                progress = min(100, (end / duration) * 100)
                timestamp = i
                transcriptions.append((timestamp, transcription, progress))
                yield transcriptions, progress
        else:
            transcriber = pipeline("automatic-speech-recognition", model=model_name)
            
            # shorter chunks keep memory use modest and progress updates frequent
            chunk_duration = 10  # seconds
            
            for i in range(0, int(duration), chunk_duration):
                end = min(i + chunk_duration, duration)
                chunk = speech[int(i * rate):int(end * rate)]
                result = transcriber(chunk)
                
                progress = min(100, (end / duration) * 100)
                timestamp = i
                transcriptions.append((timestamp, result["text"], progress))
                yield transcriptions, progress
    except Exception as e:
        print(f"Error in transcription: {e}")
        raise RuntimeError(f"Error in transcription: {e}")

def merge_diarization_with_transcription(transcriptions, diarization):
    try:
        print("Merging diarization with transcription...")
        speaker_transcriptions = []
        # itertracks yields (segment, track, label); segment bounds are already in seconds
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            start_time = turn.start
            end_time = turn.end
            text_segment = ""
            for ts, text, _ in transcriptions:
                if start_time <= ts <= end_time:
                    text_segment += text + " "
            speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
        print("Merge complete.")
        return speaker_transcriptions
    except Exception as e:
        print(f"Error merging diarization with transcription: {e}")
        raise RuntimeError(f"Error merging diarization with transcription: {e}")

def detect_and_select_model(audio):
    try:
        print("Detecting and selecting model...")
        wav_audio = convert_audio_to_wav(audio)
        language = detect_language(wav_audio)
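        # Fall back to the English model list for any unsupported language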
        model_options = MODELS.get(language, MODELS["en"])
        print(f"Selected model: {model_options[0]}")
        return language, model_options
    except Exception as e:
        print(f"Error detecting and selecting model: {e}")
        raise RuntimeError(f"Error detecting and selecting model: {e}")

def save_transcription(transcriptions, file_format):
    try:
        print(f"Saving transcription to {file_format} format...")
        if file_format == "txt":
            file_path = "/tmp/transcription.txt"
            with open(file_path, "w") as f:
                for start, end, speaker, text in transcriptions:
                    f.write(f"[{start:.2f}-{end:.2f}] {speaker}: {text}\n")
            print(f"Transcription saved to {file_path}")
            return file_path
        elif file_format == "json":
            file_path = "/tmp/transcription.json"
            with open(file_path, "w") as f:
                json.dump(transcriptions, f)
            print(f"Transcription saved to {file_path}")
            return file_path
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
    except Exception as e:
        print(f"Error saving transcription: {e}")
        raise RuntimeError(f"Error saving transcription: {e}")

def combined_interface(audio):
    try:
        print("Starting combined interface...")
        language, model_options = detect_and_select_model(audio)
        selected_model = model_options[0]
        
        # Each yield supplies one value per Gradio output component (8 in total)
        yield language, model_options, selected_model, "", 0, "Initializing...", None, None
        
        wav_audio = convert_audio_to_wav(audio)
        diarization = diarize_audio(wav_audio)
        transcriptions = []
        
        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
            transcriptions = partial_transcriptions
            transcriptions_text = "\n".join([f"[{ts}s] {text}" for ts, text, _ in transcriptions])
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            yield language, model_options, selected_model, transcriptions_text, progress_int, status, None, None
        
        speaker_transcriptions = merge_diarization_with_transcription(transcriptions, diarization)
        transcriptions_text = "\n".join([f"[{start:.2f}-{end:.2f}] {speaker}: {text}" for start, end, speaker, text in speaker_transcriptions])
        
        txt_file_path = save_transcription(speaker_transcriptions, "txt")
        json_file_path = save_transcription(speaker_transcriptions, "json")

        # Clean up the temporary WAV now that exports are complete
        os.remove(wav_audio)
        
        yield language, model_options, selected_model, transcriptions_text, 100, "Transcription complete!", txt_file_path, json_file_path
    except Exception as e:
        print(f"Error in combined interface: {e}")
        yield str(e), [], "", "An error occurred during processing.", 0, "Error", None, None

iface = gr.Interface(
    fn=combined_interface,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
        gr.Textbox(label="Status"),
        gr.File(label="Download Transcription (TXT)", type="filepath"),
        gr.File(label="Download Transcription (JSON)", type="filepath")
    ],
    title="Multilingual Audio Transcriber with Real-time Display, Timestamps, and Speaker Diarization",
    description="Upload an audio file to detect the language, select the transcription model, and get the transcription with timestamps and speaker labels in real-time. Download the transcription as TXT or JSON. Optimized for Spanish, English, and Portuguese.",
    live=True
)

if __name__ == "__main__":
    iface.queue().launch()
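
# Example local run (assuming this file is saved as app.py, HF_TOKEN is set,
# and ffmpeg is available on PATH):
#   HF_TOKEN=hf_... python app.py
# Gradio prints a local URL; open it and upload an audio file to transcribe.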