File size: 10,248 Bytes
a3199db df609a3 a3199db e35f365 2b71965 7fdbed5 df609a3 c55c408 d2e9f55 b7fce90 df609a3 c55c408 df609a3 b7fce90 40aba40 2b71965 e35f365 7fdbed5 2b71965 7fdbed5 2b71965 7fdbed5 1851c8f 7fdbed5 2b71965 e35f365 5653d92 b7fce90 5653d92 b7fce90 5653d92 b7fce90 5653d92 e35f365 5653d92 b7fce90 5653d92 b7fce90 5653d92 b7fce90 5653d92 b7fce90 5653d92 a3199db d2e9f55 5653d92 b7fce90 5653d92 b7fce90 5653d92 b7fce90 5653d92 d2e9f55 fe4ae7f 5653d92 df609a3 5653d92 df609a3 5653d92 df609a3 5653d92 df609a3 5653d92 c55c408 5653d92 b7fce90 5653d92 d2e9f55 fe4ae7f 5653d92 b7fce90 5653d92 b7fce90 5653d92 b7fce90 5653d92 7fdbed5 5653d92 b7fce90 5653d92 b7fce90 5653d92 b7fce90 5653d92 7fdbed5 d2e9f55 5653d92 b7fce90 5653d92 b7fce90 5653d92 b7fce90 5653d92 b7fce90 5653d92 d2e9f55 7fdbed5 2b71965 b7fce90 2b71965 fe4ae7f df609a3 d2e9f55 fe4ae7f d2e9f55 fe4ae7f c55c408 fe4ae7f 2b71965 fe4ae7f 2b71965 fe4ae7f 2b71965 b7fce90 fe4ae7f a3199db 7fdbed5 3526f5d 7fdbed5 c55c408 d2e9f55 fe4ae7f 7fdbed5 d2e9f55 df609a3 a3199db d2e9f55 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 |
import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import subprocess
from langdetect import detect_langs
import os
import warnings
from transformers import logging
import math
import json
from pyannote.audio import Pipeline
import numpy as np # Asegúrate de importar numpy
# Suppress noisy third-party warnings so the console shows only app output.
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

# BUG FIX: removed `np._import_array()`. `import_array()` is a numpy C-API
# macro, not a Python-level function; numpy exposes no `_import_array`
# attribute, so the call raised AttributeError and crashed the app at import.
# No Python-side initialization of numpy is needed — importing it suffices.

# Read the Hugging Face token from the environment variable (required for
# gated models such as pyannote/speaker-diarization).
HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")
# Candidate ASR checkpoints per ISO-639-1 language code, best-first:
# index 0 is the default model selected for that language.
_SPANISH_MODELS = [
    "openai/whisper-large-v3",
    "facebook/wav2vec2-large-xlsr-53-spanish",
    "jonatasgrosman/wav2vec2-xls-r-1b-spanish",
]
_ENGLISH_MODELS = [
    "openai/whisper-large-v3",
    "facebook/wav2vec2-large-960h",
    "microsoft/wav2vec2-base-960h",
]
_PORTUGUESE_MODELS = [
    "facebook/wav2vec2-large-xlsr-53-portuguese",
    "openai/whisper-medium",
    "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese",
]

MODELS = {
    "es": _SPANISH_MODELS,
    "en": _ENGLISH_MODELS,
    "pt": _PORTUGUESE_MODELS,
}
def convert_audio_to_wav(audio_path):
    """Convert any ffmpeg-readable audio file to 16 kHz mono WAV.

    Args:
        audio_path: Path to the input audio file.

    Returns:
        Path to the converted file ("converted_audio.wav" in the CWD).

    Raises:
        RuntimeError: If ffmpeg is not installed or the conversion fails.
    """
    try:
        print("Converting audio to WAV format...")
        wav_path = "converted_audio.wav"
        # -y: overwrite any leftover output file instead of hanging on
        # ffmpeg's interactive overwrite prompt (stdin is not connected);
        # -ac 1 -ar 16000 produces the 16 kHz mono input the models expect.
        command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
        # check=True turns a non-zero ffmpeg exit status into an exception;
        # without it a failed conversion silently returned a missing or
        # stale wav_path.
        subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
        print(f"Audio converted to {wav_path}")
        return wav_path
    except Exception as e:
        print(f"Error converting audio to WAV: {e}")
        raise RuntimeError(f"Error converting audio to WAV: {e}")
def detect_language(audio_path):
    """Detect the spoken language of an audio file.

    Transcribes the first 30 seconds with whisper-base, then runs
    langdetect over the text. Spanish and Portuguese are easily confused,
    so when their confidences are within 0.2 of each other the function
    defaults to Spanish.

    Returns:
        An ISO-639-1 language code string.

    Raises:
        RuntimeError: If loading, transcription, or detection fails.
    """
    try:
        print("Detecting language...")
        # A 30 s prefix is enough for a reliable language guess.
        waveform, _ = librosa.load(audio_path, sr=16000, duration=30)
        processor = WhisperProcessor.from_pretrained("openai/whisper-base")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
        features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
        token_ids = model.generate(features)
        text = processor.batch_decode(token_ids, skip_special_tokens=True)[0]

        candidates = detect_langs(text)
        confidence = {candidate.lang: candidate.prob for candidate in candidates}
        es_confidence = confidence.get('es', 0)
        pt_confidence = confidence.get('pt', 0)
        # es/pt tie-break: prefer Spanish when the two scores are close.
        if abs(es_confidence - pt_confidence) < 0.2:
            print("Detected language: Spanish")
            return 'es'
        detected_language = max(candidates, key=lambda c: c.prob).lang
        print(f"Detected language: {detected_language}")
        return detected_language
    except Exception as e:
        print(f"Error detecting language: {e}")
        raise RuntimeError(f"Error detecting language: {e}")
def diarize_audio(wav_audio):
    """Run pyannote speaker diarization over a WAV file.

    Args:
        wav_audio: Path to a WAV file.

    Returns:
        The pyannote diarization result (speaker turns with labels).

    Raises:
        RuntimeError: If the gated pipeline cannot be loaded (e.g. missing
            or invalid HF token) or diarization fails.
    """
    try:
        print("Performing diarization...")
        # Renamed from `pipeline`: the original local shadowed the
        # `pipeline` factory imported from transformers at module level.
        diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization", use_auth_token=HUGGINGFACE_TOKEN
        )
        diarization = diarization_pipeline(wav_audio)
        print("Diarization complete.")
        return diarization
    except Exception as e:
        print(f"Error in diarization: {e}")
        raise RuntimeError(f"Error in diarization: {e}")
def transcribe_audio_stream(audio, model_name):
    """Transcribe audio chunk by chunk, yielding partial results.

    Args:
        audio: Path to the input audio file (any ffmpeg-readable format).
        model_name: HF model id. Ids containing "whisper" use the seq2seq
            Whisper API (30 s chunks); anything else goes through the ASR
            pipeline (10 s chunks).

    Yields:
        (transcriptions, progress): `transcriptions` is the list of
        (timestamp_seconds, text, progress_percent) tuples accumulated so
        far; `progress` is a float percentage in [0, 100].

    Raises:
        RuntimeError: On any conversion, model-loading, or inference error.
    """
    try:
        wav_audio = convert_audio_to_wav(audio)
        speech, rate = librosa.load(wav_audio, sr=16000)
        duration = len(speech) / rate
        # Round UP so the fractional tail second is not dropped and audio
        # shorter than 1 s still produces one chunk (the original
        # `range(0, int(duration), ...)` yielded nothing for sub-second
        # audio and truncated the final partial second of longer files).
        total = max(int(math.ceil(duration)), 1)
        transcriptions = []
        if "whisper" in model_name:
            processor = WhisperProcessor.from_pretrained(model_name)
            model = WhisperForConditionalGeneration.from_pretrained(model_name)
            chunk_duration = 30  # seconds per Whisper window
            for i in range(0, total, chunk_duration):
                end = min(i + chunk_duration, duration)
                chunk = speech[int(i * rate):int(end * rate)]
                input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
                predicted_ids = model.generate(input_features)
                transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
                # Guard against division by zero on an empty audio file.
                progress = min(100, (end / duration) * 100) if duration else 100
                transcriptions.append((i, transcription, progress))
                yield transcriptions, progress
        else:
            transcriber = pipeline("automatic-speech-recognition", model=model_name)
            chunk_duration = 10  # seconds per wav2vec2-style chunk
            for i in range(0, total, chunk_duration):
                end = min(i + chunk_duration, duration)
                chunk = speech[int(i * rate):int(end * rate)]
                result = transcriber(chunk)
                progress = min(100, (end / duration) * 100) if duration else 100
                transcriptions.append((i, result["text"], progress))
                yield transcriptions, progress
    except Exception as e:
        print(f"Error in transcription: {e}")
        raise RuntimeError(f"Error in transcription: {e}")
def merge_diarization_with_transcription(transcriptions, diarization, rate):
    """Attach speaker labels to transcription chunks.

    Args:
        transcriptions: List of (timestamp_seconds, text, progress) tuples
            as produced by transcribe_audio_stream.
        diarization: Diarization result whose itertracks(yield_label=True)
            yields (segment, track, speaker) triples, with segment.start and
            segment.end already expressed in seconds (pyannote convention).
        rate: Unused; kept so existing callers that pass a sample rate keep
            working. Diarization timestamps are already in seconds.

    Returns:
        List of (start_seconds, end_seconds, speaker, text) tuples, one per
        diarization turn, with the texts of all chunks whose timestamp falls
        inside that turn concatenated.

    Raises:
        RuntimeError: If merging fails.
    """
    try:
        print("Merging diarization with transcription...")
        speaker_transcriptions = []
        # BUG FIX: itertracks(yield_label=True) yields (segment, track,
        # label). The original unpacked that triple as (start, end, speaker)
        # — so `start` was a Segment object and `end` a track id — and then
        # divided pyannote's second-based timestamps by the sample rate,
        # producing ranges that never matched any transcription timestamp.
        for segment, _, speaker in diarization.itertracks(yield_label=True):
            start_time = segment.start
            end_time = segment.end
            text_segment = ""
            for ts, text, _ in transcriptions:
                if start_time <= ts <= end_time:
                    text_segment += text + " "
            speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
        print("Merge complete.")
        return speaker_transcriptions
    except Exception as e:
        print(f"Error merging diarization with transcription: {e}")
        raise RuntimeError(f"Error merging diarization with transcription: {e}")
def detect_and_select_model(audio):
    """Detect the audio's language and return the matching model list.

    Returns:
        (language, options): detected ISO-639-1 code and the ordered list
        of candidate model ids (English models serve as the fallback for
        unsupported languages).

    Raises:
        RuntimeError: If conversion or language detection fails.
    """
    try:
        print("Detecting and selecting model...")
        language = detect_language(convert_audio_to_wav(audio))
        # Unknown languages fall back to the English model list.
        options = MODELS.get(language, MODELS["en"])
        print(f"Selected model: {options[0]}")
        return language, options
    except Exception as exc:
        print(f"Error detecting and selecting model: {exc}")
        raise RuntimeError(f"Error detecting and selecting model: {exc}")
def save_transcription(transcriptions, file_format):
    """Write speaker-labelled transcriptions to /tmp as TXT or JSON.

    Args:
        transcriptions: List of (start, end, speaker, text) tuples.
        file_format: "txt" or "json".

    Returns:
        Path of the written file.

    Raises:
        RuntimeError: On I/O failure or an unsupported format (the original
            silently returned None for unknown formats).
    """
    try:
        print(f"Saving transcription to {file_format} format...")
        if file_format == "txt":
            file_path = "/tmp/transcription.txt"
            # Explicit UTF-8 so accented Spanish/Portuguese text round-trips
            # regardless of the platform's default encoding.
            with open(file_path, "w", encoding="utf-8") as f:
                for start, end, speaker, text in transcriptions:
                    f.write(f"[{start:.2f}-{end:.2f}] {speaker}: {text}\n")
        elif file_format == "json":
            file_path = "/tmp/transcription.json"
            with open(file_path, "w", encoding="utf-8") as f:
                # ensure_ascii=False keeps non-ASCII characters readable.
                json.dump(transcriptions, f, ensure_ascii=False)
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
        print(f"Transcription saved to {file_path}")
        return file_path
    except Exception as e:
        print(f"Error saving transcription: {e}")
        raise RuntimeError(f"Error saving transcription: {e}")
def combined_interface(audio):
    """Gradio driver: detect language, diarize, transcribe, stream updates.

    Yields 8-tuples matching the interface outputs: (language, model
    options, selected model, transcription text, progress %, status,
    TXT file path, JSON file path). On error, yields a single tuple with
    the error message in the language slot and "Error" status.
    """
    try:
        print("Starting combined interface...")
        language, model_options = detect_and_select_model(audio)
        selected_model = model_options[0]
        yield language, model_options, selected_model, "", 0, "Initializing...", None, None

        wav_audio = convert_audio_to_wav(audio)
        diarization = diarize_audio(wav_audio)
        transcriptions = []
        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
            transcriptions = partial_transcriptions
            # BUG FIX: each entry is (timestamp, text, progress). The
            # original unpacked it as (start, end, text), which rendered the
            # text inside the bracket range and the progress float as the
            # displayed transcription.
            transcriptions_text = "\n".join(f"[{ts}] {text}" for ts, text, _ in transcriptions)
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            yield language, model_options, selected_model, transcriptions_text, progress_int, status, None, None

        rate = librosa.get_samplerate(wav_audio)
        speaker_transcriptions = merge_diarization_with_transcription(transcriptions, diarization, rate)
        transcriptions_text = "\n".join(
            f"[{start:.2f}-{end:.2f}] {speaker}: {text}"
            for start, end, speaker, text in speaker_transcriptions
        )
        txt_file_path = save_transcription(speaker_transcriptions, "txt")
        json_file_path = save_transcription(speaker_transcriptions, "json")
        # Clean up the intermediate WAV once the final results are ready.
        os.remove(wav_audio)
        yield language, model_options, selected_model, transcriptions_text, 100, "Transcription complete!", txt_file_path, json_file_path
    except Exception as e:
        print(f"Error in combined interface: {e}")
        yield str(e), [], "", "An error occurred during processing.", 0, "Error", None, None
# Gradio interface wiring: one audio-file input, eight outputs matching the
# 8-tuples yielded by combined_interface (in order).
iface = gr.Interface(
    fn=combined_interface,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Detected Language"),
        # NOTE(review): choices starts empty and combined_interface yields a
        # plain list for this slot — confirm the Dropdown actually updates
        # at runtime in this Gradio version.
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
        gr.Textbox(label="Status"),
        gr.File(label="Download Transcription (TXT)", type="filepath"),
        gr.File(label="Download Transcription (JSON)", type="filepath")
    ],
    title="Multilingual Audio Transcriber with Real-time Display, Timestamps, and Speaker Diarization",
    description="Upload an audio file to detect the language, select the transcription model, and get the transcription with timestamps and speaker labels in real-time. Download the transcription as TXT or JSON. Optimized for Spanish, English, and Portuguese.",
    # live=True re-runs on input change; streaming updates come from the
    # generator's yields.
    live=True
)
# queue() is required so the generator's intermediate yields are streamed
# to the client instead of only the final value.
if __name__ == "__main__":
    iface.queue().launch()
|