# Wav2Txt / app.py
# (Hugging Face Space header: Merlintxu, commit 55c7d23 verified, 8.91 kB)
import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import subprocess
from langdetect import detect_langs
import os
import warnings
from transformers import logging
import math
import json
import tempfile
# Suprimir advertencias
warnings.filterwarnings("ignore")
logging.set_verbosity_error()
# Candidate ASR checkpoints per ISO 639-1 language code, ordered by
# preference: index 0 is the default model picked for that language.
MODELS = {
    "es": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-xlsr-53-spanish",
        "jonatasgrosman/wav2vec2-xls-r-1b-spanish",
    ],
    "en": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-960h",
        "microsoft/wav2vec2-base-960h",
    ],
    "pt": [
        "facebook/wav2vec2-large-xlsr-53-portuguese",
        "openai/whisper-medium",
        "jonatasgrosman/wav2vec2-xlsr-53-portuguese",
    ],
}
# Check that ffmpeg is available before any audio conversion is attempted.
def verify_ffmpeg_installation():
    """Verify that the ffmpeg executable exists and runs.

    Raises:
        RuntimeError: if the ffmpeg binary is not on PATH at all
            (subprocess raises FileNotFoundError in that case, which the
            original only-CalledProcessError handler let escape unexplained).
        subprocess.CalledProcessError: if ffmpeg exists but exits non-zero.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
    except FileNotFoundError as e:
        # Missing binary raises FileNotFoundError, not CalledProcessError.
        print("ffmpeg no está instalado o no se puede ejecutar correctamente.")
        raise RuntimeError("ffmpeg executable not found on PATH") from e
    except subprocess.CalledProcessError as e:
        print("ffmpeg no está instalado o no se puede ejecutar correctamente.")
        raise e
def convert_audio_to_wav(audio_path):
    """Convert any ffmpeg-readable audio file to 16 kHz mono WAV.

    Args:
        audio_path: path to the input audio file.

    Returns:
        Path of the converted file (always "converted_audio.wav").

    Raises:
        ValueError: if audio_path is a directory or ffmpeg fails.
    """
    if os.path.isdir(audio_path):
        raise ValueError(f"La ruta proporcionada es un directorio, no un archivo: {audio_path}")
    # Fixed name on purpose: combined_interface() deletes this exact file
    # during cleanup, so do not change it independently.
    wav_path = "converted_audio.wav"
    # '-y' overwrites an existing output file without prompting.
    command = ["ffmpeg", "-y", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
    process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    # Decode once (the original decoded stderr twice) and echo for debugging.
    stderr_text = process.stderr.decode()
    print(process.stdout.decode())
    print(stderr_text)
    if process.returncode != 0:
        raise ValueError(f"Error al convertir el archivo de audio a wav: {stderr_text}")
    return wav_path
def detect_language(audio_path):
    """Detect the spoken language of an audio file.

    Transcribes up to the first 30 seconds with whisper-base, then runs
    langdetect over the resulting text. Because Spanish and Portuguese are
    easily confused, a near-tie (confidence gap below 0.2) resolves to 'es'.

    Returns:
        Two-letter language code (e.g. 'es', 'en', 'pt').

    Raises:
        ValueError: if librosa cannot load the audio file.
    """
    try:
        samples, _ = librosa.load(audio_path, sr=16000, duration=30)
    except Exception as e:
        raise ValueError(f"Error al cargar el archivo de audio con librosa: {e}")

    processor = WhisperProcessor.from_pretrained("openai/whisper-base")
    model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
    features = processor(samples, sampling_rate=16000, return_tensors="pt").input_features
    generated_ids = model.generate(features)
    text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

    candidates = detect_langs(text)
    confidence = {candidate.lang: candidate.prob for candidate in candidates}
    # Prefer Spanish when es/pt confidences are too close to trust langdetect.
    if abs(confidence.get('es', 0) - confidence.get('pt', 0)) < 0.2:
        return 'es'
    return max(candidates, key=lambda candidate: candidate.prob).lang
def transcribe_audio_stream(audio, model_name):
    """Transcribe `audio` chunk by chunk, yielding progressive results.

    Args:
        audio: path to the input audio file (any ffmpeg-readable format).
        model_name: HF model id; Whisper checkpoints take the seq2seq path,
            everything else goes through the ASR pipeline.

    Yields:
        (transcriptions, progress): the list of segment dicts
        ({'start_time', 'end_time', 'text'}) accumulated so far, and the
        completion percentage in [0, 100].
    """
    wav_audio = convert_audio_to_wav(audio)
    speech, rate = librosa.load(wav_audio, sr=16000)
    duration = len(speech) / rate
    if duration == 0:
        # Empty audio: nothing to transcribe (also avoids dividing by zero
        # in the progress computation below).
        return
    transcriptions = []
    if "whisper" in model_name:
        processor = WhisperProcessor.from_pretrained(model_name)
        model = WhisperForConditionalGeneration.from_pretrained(model_name)
        chunk_duration = 30  # seconds per Whisper chunk
        # math.ceil keeps clips shorter than 1 s from being skipped entirely
        # (the original range(0, int(duration), ...) was empty for them).
        for i in range(0, math.ceil(duration), chunk_duration):
            end = min(i + chunk_duration, duration)
            chunk = speech[int(i * rate):int(end * rate)]
            input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
            predicted_ids = model.generate(input_features)
            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
            progress = min(100, (end / duration) * 100)
            transcriptions.append({
                "start_time": i,
                "end_time": end,
                "text": transcription
            })
            yield transcriptions, progress
    else:
        transcriber = pipeline("automatic-speech-recognition", model=model_name)
        chunk_duration = 10  # seconds per wav2vec2 chunk
        for i in range(0, math.ceil(duration), chunk_duration):
            end = min(i + chunk_duration, duration)
            chunk = speech[int(i * rate):int(end * rate)]
            result = transcriber(chunk)
            progress = min(100, (end / duration) * 100)
            transcriptions.append({
                "start_time": i,
                "end_time": end,
                "text": result["text"]
            })
            yield transcriptions, progress
def detect_and_select_model(audio):
    """Return (language_code, candidate_model_names) for an audio file."""
    converted = convert_audio_to_wav(audio)
    detected = detect_language(converted)
    # Unsupported languages fall back to the English model list.
    return detected, MODELS.get(detected, MODELS["en"])
def save_transcription(transcriptions, file_format):
    """Write transcription segments to a temporary file and return its path.

    Args:
        transcriptions: list of dicts with 'start_time', 'end_time', 'text'.
        file_format: "JSON" (pretty-printed array) or "TXT"
            (one "start,end,text" line per segment).

    Returns:
        Path of the temporary file created (caller is responsible for it).

    Raises:
        ValueError: if file_format is neither "JSON" nor "TXT" (the original
            left file_path unbound and raised NameError instead).
    """
    if file_format == "JSON":
        # mode="w" + encoding is required: NamedTemporaryFile defaults to
        # binary mode, which made json.dump (which writes str) raise TypeError.
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8",
                                         delete=False, suffix=".json") as tmp:
            json.dump(transcriptions, tmp, ensure_ascii=False, indent=4)
            file_path = tmp.name
    elif file_format == "TXT":
        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8",
                                         delete=False, suffix=".txt") as tmp:
            for entry in transcriptions:
                tmp.write(f"{entry['start_time']:.2f},{entry['end_time']:.2f},{entry['text']}\n")
            file_path = tmp.name
    else:
        raise ValueError(f"Formato de archivo no soportado: {file_format}")
    print(f"Archivo de transcripción guardado en: {file_path}")
    return file_path
def combined_interface(audio, file_format):
    """Gradio generator: detect language, transcribe incrementally, save file.

    Yields 7-tuples matching the interface outputs, in order:
    (language, model_options, selected_model, transcription_text,
     progress_percent, status_message, download_file_path_or_None).
    """
    try:
        print(f"Ruta del archivo de audio subido: {audio}")
        verify_ffmpeg_installation()  # fail fast if ffmpeg is unavailable
        language, model_options = detect_and_select_model(audio)
        selected_model = model_options[0]
        print(f"Idioma detectado: {language}")
        print(f"Modelos disponibles: {model_options}")
        # First yield: seventh output (download file) is not available yet.
        yield language, model_options, selected_model, "", 0, "Initializing...", None
        transcriptions = []
        # Pre-bind so the final yield is safe even if the stream yields
        # nothing (the original raised NameError in that case).
        full_transcription = ""
        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
            transcriptions = partial_transcriptions
            full_transcription = " ".join(t["text"] for t in transcriptions)
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            print(f"Progreso: {progress_int}%")
            # Keep the download slot None until transcription is complete.
            yield language, model_options, selected_model, full_transcription.strip(), progress_int, status, None
        print("Guardando transcripción.")
        file_path = save_transcription(transcriptions, file_format)
        print(f"Transcripción guardada en: {file_path}")
        # Sanity checks on the saved transcription file.
        if os.path.isdir(file_path):
            raise ValueError(f"El archivo de transcripción debería ser un archivo, pero es un directorio: {file_path}")
        if not os.path.isfile(file_path):
            raise ValueError(f"El archivo de transcripción no existe: {file_path}")
        # Remove the intermediate WAV produced by convert_audio_to_wav();
        # guard so a missing file does not turn success into an error.
        if os.path.exists("converted_audio.wav"):
            os.remove("converted_audio.wav")
        print("Archivos temporales limpiados.")
        # Final yield carries the downloadable file path.
        yield language, model_options, selected_model, full_transcription.strip(), 100, "Transcription complete! Download the file below.", file_path
    except Exception as e:
        print(f"Error: {e}")
        # The error yield must also be a 7-tuple; None (not "") is the
        # correct empty value for the gr.File output.
        yield str(e), [], "", "An error occurred during processing.", 0, "Error", None
# Gradio interface wiring: combined_interface's 7-tuple yields map onto the
# seven output components below, in the same order.
iface = gr.Interface(
    fn=combined_interface,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio File"),
        gr.Radio(choices=["JSON", "TXT"], label="Choose output format")
    ],
    outputs=[
        gr.Textbox(label="Detected Language"),
        # Choices start empty; the detected language's model list is shown
        # via the generator's yielded values.
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
        gr.Textbox(label="Status"),
        gr.File(label="Download Transcription")
    ],
    title="Multilingual Audio Transcriber with Real-time Display and Progress Indicator",
    description="Upload an audio file to detect the language, select the transcription model, and get the transcription in real-time. Optimized for Spanish, English, and Portuguese.",
    live=True
)
# queue() is required for generator (streaming) functions in Gradio.
if __name__ == "__main__":
    iface.queue().launch()