import gradio as gr
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa
import subprocess
from langdetect import detect_langs
import os
import warnings
from transformers import logging
import math
import json
from pyannote.audio import Pipeline

# Suppress warnings
warnings.filterwarnings("ignore")
logging.set_verbosity_error()

# Read the Hugging Face token from the environment variable
HUGGINGFACE_TOKEN = os.getenv("HF_TOKEN")

# Updated models by language
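# The first entry per language is used as the default (see detect_and_select_model)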
MODELS = {
    "es": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-xlsr-53-spanish",
        "jonatasgrosman/wav2vec2-xls-r-1b-spanish"
    ],
    "en": [
        "openai/whisper-large-v3",
        "facebook/wav2vec2-large-960h",
        "microsoft/wav2vec2-base-960h"
    ],
    "pt": [
        "facebook/wav2vec2-large-xlsr-53-portuguese",
        "openai/whisper-medium",
        "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
    ]
}

def convert_audio_to_wav(audio_path):
    try:
        print("Converting audio to WAV format...")
        wav_path = "converted_audio.wav"
        command = ["ffmpeg", "-i", audio_path, "-ac", "1", "-ar", "16000", wav_path]
        subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        print(f"Audio converted to {wav_path}")
        return wav_path
    except Exception as e:
        print(f"Error converting audio to WAV: {e}")
        raise RuntimeError(f"Error converting audio to WAV: {e}")

def detect_language(audio_path):
    try:
        print("Detecting language...")
        speech, _ = librosa.load(audio_path, sr=16000, duration=30)
        
        processor = WhisperProcessor.from_pretrained("openai/whisper-base")
        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
        
        input_features = processor(speech, sampling_rate=16000, return_tensors="pt").input_features
        predicted_ids = model.generate(input_features)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
        
        langs = detect_langs(transcription)
        
        es_confidence = next((lang.prob for lang in langs if lang.lang == 'es'), 0)
        pt_confidence = next((lang.prob for lang in langs if lang.lang == 'pt'), 0)
        
        # Spanish and Portuguese are easily confused; when their scores are within
        # 0.2 of each other, default to Spanish
        if abs(es_confidence - pt_confidence) < 0.2:
            print("Detected language: Spanish")
            return 'es'
        
        detected_language = max(langs, key=lambda x: x.prob).lang
        print(f"Detected language: {detected_language}")
        return detected_language
    except Exception as e:
        print(f"Error detecting language: {e}")
        raise RuntimeError(f"Error detecting language: {e}")

def diarize_audio(wav_audio):
    try:
        print("Performing diarization...")
        pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HUGGINGFACE_TOKEN)
        diarization = pipeline(wav_audio)
        print("Diarization complete.")
        return diarization
    except Exception as e:
        print(f"Error in diarization: {e}")
        raise RuntimeError(f"Error in diarization: {e}")

def transcribe_audio_stream(audio, model_name):
    try:
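        # This function is a generator: it yields (transcriptions_so_far, progress)
        # after each chunk so the Gradio UI (live=True) can refresh mid-run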
        wav_audio = convert_audio_to_wav(audio)
        speech, rate = librosa.load(wav_audio, sr=16000)
        duration = len(speech) / rate
        
        transcriptions = []
        
        if "whisper" in model_name:
            processor = WhisperProcessor.from_pretrained(model_name)
            model = WhisperForConditionalGeneration.from_pretrained(model_name)
            
            # Whisper is trained on 30-second windows, so chunk at that size
            chunk_duration = 30  # seconds
            
            for i in range(0, int(duration), chunk_duration):
                end = min(i + chunk_duration, duration)
                chunk = speech[int(i * rate):int(end * rate)]
                
                input_features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features
                predicted_ids = model.generate(input_features)
                transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
                
                progress = min(100, (end / duration) * 100)
                timestamp = i
                transcriptions.append((timestamp, transcription, progress))
                yield transcriptions, progress
        else:
            transcriber = pipeline("automatic-speech-recognition", model=model_name)
            
            # shorter chunks keep memory use modest and progress updates frequent
            chunk_duration = 10  # seconds
            
            for i in range(0, int(duration), chunk_duration):
                end = min(i + chunk_duration, duration)
                chunk = speech[int(i * rate):int(end * rate)]
                result = transcriber(chunk)
                
                progress = min(100, (end / duration) * 100)
                timestamp = i
                transcriptions.append((timestamp, result["text"], progress))
                yield transcriptions, progress
    except Exception as e:
        print(f"Error in transcription: {e}")
        raise RuntimeError(f"Error in transcription: {e}")

def merge_diarization_with_transcription(transcriptions, diarization):
    try:
        print("Merging diarization with transcription...")
        speaker_transcriptions = []
        # itertracks yields (segment, track, label); segment bounds are already in seconds
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            start_time = turn.start
            end_time = turn.end
            text_segment = ""
            for ts, text, _ in transcriptions:
                if start_time <= ts <= end_time:
                    text_segment += text + " "
            speaker_transcriptions.append((start_time, end_time, speaker, text_segment.strip()))
        print("Merge complete.")
        return speaker_transcriptions
    except Exception as e:
        print(f"Error merging diarization with transcription: {e}")
        raise RuntimeError(f"Error merging diarization with transcription: {e}")

def detect_and_select_model(audio):
    try:
        print("Detecting and selecting model...")
        wav_audio = convert_audio_to_wav(audio)
        language = detect_language(wav_audio)
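        # Fall back to the English model list for any unsupported language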
        model_options = MODELS.get(language, MODELS["en"])
        print(f"Selected model: {model_options[0]}")
        return language, model_options
    except Exception as e:
        print(f"Error detecting and selecting model: {e}")
        raise RuntimeError(f"Error detecting and selecting model: {e}")

def save_transcription(transcriptions, file_format):
    try:
        print(f"Saving transcription to {file_format} format...")
        if file_format == "txt":
            file_path = "/tmp/transcription.txt"
            with open(file_path, "w") as f:
                for start, end, speaker, text in transcriptions:
                    f.write(f"[{start:.2f}-{end:.2f}] {speaker}: {text}\n")
            print(f"Transcription saved to {file_path}")
            return file_path
        elif file_format == "json":
            file_path = "/tmp/transcription.json"
            with open(file_path, "w") as f:
                json.dump(transcriptions, f)
            print(f"Transcription saved to {file_path}")
            return file_path
        else:
            raise ValueError(f"Unsupported file format: {file_format}")
    except Exception as e:
        print(f"Error saving transcription: {e}")
        raise RuntimeError(f"Error saving transcription: {e}")

def combined_interface(audio):
    try:
        print("Starting combined interface...")
        language, model_options = detect_and_select_model(audio)
        selected_model = model_options[0]
        
        # Each yield supplies one value per Gradio output component (8 in total)
        yield language, model_options, selected_model, "", 0, "Initializing...", None, None
        
        wav_audio = convert_audio_to_wav(audio)
        diarization = diarize_audio(wav_audio)
        transcriptions = []
        
        for partial_transcriptions, progress in transcribe_audio_stream(audio, selected_model):
            transcriptions = partial_transcriptions
            transcriptions_text = "\n".join([f"[{ts}s] {text}" for ts, text, _ in transcriptions])
            progress_int = math.floor(progress)
            status = f"Transcribing... {progress_int}% complete"
            yield language, model_options, selected_model, transcriptions_text, progress_int, status, None, None
        
        speaker_transcriptions = merge_diarization_with_transcription(transcriptions, diarization)
        transcriptions_text = "\n".join([f"[{start:.2f}-{end:.2f}] {speaker}: {text}" for start, end, speaker, text in speaker_transcriptions])
        
        txt_file_path = save_transcription(speaker_transcriptions, "txt")
        json_file_path = save_transcription(speaker_transcriptions, "json")

        # Clean up the temporary WAV now that exports are complete
        os.remove(wav_audio)
        
        yield language, model_options, selected_model, transcriptions_text, 100, "Transcription complete!", txt_file_path, json_file_path
    except Exception as e:
        print(f"Error in combined interface: {e}")
        yield str(e), [], "", "An error occurred during processing.", 0, "Error", None, None

iface = gr.Interface(
    fn=combined_interface,
    inputs=gr.Audio(type="filepath"),
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Dropdown(label="Available Models", choices=[]),
        gr.Textbox(label="Selected Model"),
        gr.Textbox(label="Transcription", lines=10),
        gr.Slider(minimum=0, maximum=100, label="Progress", interactive=False),
        gr.Textbox(label="Status"),
        gr.File(label="Download Transcription (TXT)", type="filepath"),
        gr.File(label="Download Transcription (JSON)", type="filepath")
    ],
    title="Multilingual Audio Transcriber with Real-time Display, Timestamps, and Speaker Diarization",
    description="Upload an audio file to detect the language, select the transcription model, and get the transcription with timestamps and speaker labels in real-time. Download the transcription as TXT or JSON. Optimized for Spanish, English, and Portuguese.",
    live=True
)

if __name__ == "__main__":
    iface.queue().launch()
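
# Example local run (assuming this file is saved as app.py, HF_TOKEN is set,
# and ffmpeg is available on PATH):
#   HF_TOKEN=hf_... python app.py
# Gradio prints a local URL; open it and upload an audio file to transcribe.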