# Hugging Face Space: avfranco/audioqna
# Running on Zero
# File size: 2,077 Bytes

import functools
import os

import gradio as gr
from pydub import AudioSegment

def audio_converter(audio_file: str) -> str:
    """Convert an M4A audio file to WAV and return the WAV file's path.

    The input is decoded as ``m4a`` and the result is written alongside
    the original, using the same path with its extension replaced by
    ``.wav``.

    Args:
        audio_file: Path of the M4A file to convert.

    Returns:
        Path of the exported WAV file.
    """
    segment = AudioSegment.from_file(audio_file, format='m4a')
    wav_path = f"{os.path.splitext(audio_file)[0]}.wav"
    segment.export(wav_path, format='wav')
    return wav_path

_ASR_MODEL_ID = "openai/whisper-small"


@functools.lru_cache(maxsize=1)
def _load_asr_pipeline():
    """Build the speech-recognition pipeline once and reuse it afterwards.

    Loading the model is by far the most expensive step, so the pipeline
    is cached for the lifetime of the process instead of being rebuilt on
    every upload.

    Returns:
        The ``automatic-speech-recognition`` pipeline for ``_ASR_MODEL_ID``,
        placed on CUDA (float16) when available, otherwise on CPU (float32).
    """
    # Imported lazily so the heavy ML stack is only loaded when the first
    # transcription is actually requested.
    from transformers import pipeline
    import torch

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # Half precision is only used on GPU; CPU inference stays in float32.
    torch_dtype = torch.float16 if use_cuda else torch.float32

    pipe = pipeline(
        "automatic-speech-recognition",
        model=_ASR_MODEL_ID,
        torch_dtype=torch_dtype,
        device=device,
    )
    # NOTE(review): to_bettertransformer() presumably needs the `optimum`
    # package in the deployment environment -- confirm it is installed.
    pipe.model = pipe.model.to_bettertransformer()
    return pipe


def asr_transcriber(audio_file):
    """Transcribe an uploaded M4A audio file and return the recognised text.

    The file is first converted to WAV with ``audio_converter``, then run
    through the cached speech-recognition pipeline in 30-second chunks.
    The task is ``transcribe`` and no language is forced
    (``language=None``).

    Args:
        audio_file: Path of the uploaded M4A file.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    audio_file_wav = audio_converter(audio_file)
    try:
        result = _load_asr_pipeline()(
            audio_file_wav,
            chunk_length_s=30,
            batch_size=8,
            generate_kwargs={"task": "transcribe", "language": None},
            return_timestamps=False,
        )
    finally:
        # The WAV copy is only an intermediate artefact; remove it so that
        # uploads do not pile up on disk. Never delete the caller's own file.
        if audio_file_wav != audio_file:
            try:
                os.remove(audio_file_wav)
            except OSError:
                # Best-effort cleanup: a failed delete must not hide the
                # transcription result or mask an error raised above.
                pass

    return result["text"]

# Upload-and-transcribe UI: a single file input whose upload event feeds the
# file to asr_transcriber and shows the returned text in a copyable textbox.
with gr.Blocks() as transcriberUI:
    # The banner names the model that asr_transcriber actually loads.
    gr.Markdown(
    """
    # Ola Xara & Solange!
    Clicar no botao abaixo para selecionar o Audio a ser transcrito!
    Ambiente Demo disponivel 24x7. Running on CPU Upgrade with openai/whisper-small
    """)
    # Extension filters need a leading dot; dot-less entries are interpreted
    # by Gradio as file categories such as "audio" or "image".
    inp = gr.File(label="Arquivo de Audio", show_label=True, file_count="single", file_types=[".m4a"])
    transcribe = gr.Textbox(label="Transcricao", show_label=True, show_copy_button=True)
    inp.upload(asr_transcriber, inp, transcribe)

# Only start the web server when run as a script, not when imported.
if __name__ == "__main__":
    transcriberUI.launch()