audioqna / app.py
avfranco's picture
back to whisper-tiny
a1439b8
raw
history blame
2.08 kB
import gradio as gr
import os
from pydub import AudioSegment
def audio_converter(audio_file: str, source_format: str = "m4a") -> str:
    """Convert an audio file to WAV and return the path of the WAV file.

    The WAV file is written next to the input file, using the same base
    name with a ``.wav`` extension.

    Args:
        audio_file: Path of the audio file to convert.
        source_format: Format name handed to ``AudioSegment.from_file`` when
            decoding ``audio_file``. Defaults to ``"m4a"``, the value this
            function previously hard-coded, so existing callers are
            unaffected.

    Returns:
        Path of the exported WAV file.
    """
    audio_input = AudioSegment.from_file(audio_file, format=source_format)
    # Swap only the extension; the directory and base name are kept.
    wav_path = f"{os.path.splitext(audio_file)[0]}.wav"
    audio_input.export(wav_path, format="wav")
    return wav_path
# Loaded ASR pipelines keyed by model id, so the model is loaded and
# optimized once per process rather than on every upload.
_ASR_PIPELINES = {}


def _load_asr_pipeline(model_id):
    """Return the speech-recognition pipeline for *model_id*, building it once."""
    if model_id in _ASR_PIPELINES:
        return _ASR_PIPELINES[model_id]

    # Heavy imports stay function-local so importing this module remains cheap.
    from transformers import pipeline
    import torch

    # Use the GPU with half precision when CUDA is available, else CPU/float32.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    torch_dtype = torch.float16 if use_cuda else torch.float32
    # Mac runtime: use torch.device("mps") with torch.float16 instead.

    flash = False

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        torch_dtype=torch_dtype,
        device=device,
    )

    # Compare the device *type*: a torch.device never equals the string "mps".
    if device.type == "mps":
        torch.mps.empty_cache()
    elif not flash:
        pipe.model = pipe.model.to_bettertransformer()

    _ASR_PIPELINES[model_id] = pipe
    return pipe


def asr_transcriber(audio_file):
    """Transcribe an uploaded audio file and return the recognized text.

    The file is first converted to WAV with ``audio_converter``, then run
    through the ``openai/whisper-tiny`` speech-recognition pipeline in
    30-second chunks. The intermediate WAV file is deleted afterwards.

    Args:
        audio_file: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    audio_file_wav = audio_converter(audio_file)
    try:
        pipe = _load_asr_pipeline("openai/whisper-tiny")
        json_output = pipe(
            audio_file_wav,
            chunk_length_s=30,
            batch_size=8,
            # language=None is passed through unchanged from the original call.
            generate_kwargs={"task": "transcribe", "language": None},
            return_timestamps=False,
        )
        return json_output["text"]
    finally:
        # Drop the intermediate WAV so repeated uploads do not fill the disk.
        # Never delete the caller's own file if the two paths coincide.
        if audio_file_wav != audio_file:
            try:
                os.remove(audio_file_wav)
            except OSError:
                # Best-effort cleanup: a failed delete must not mask the
                # transcription result or its error.
                pass
# Gradio UI: a single-file upload wired to the transcriber, with the result
# shown in a copyable textbox.
with gr.Blocks() as transcriberUI:
    # The Markdown literal is kept flush-left so its runtime text is unchanged.
    gr.Markdown(
"""
# Ola Xara & Solange!
Clicar no botao abaixo para selecionar o Audio a ser transcrito!
Ambiente Demo disponivel 24x7. Running on CPU Upgrade with openai/whisper-tiny
""")
    # file_types entries are either a category name or a dotted extension,
    # so the extension filter is written as ".m4a".
    inp = gr.File(
        label="Arquivo de Audio",
        show_label=True,
        file_count="single",
        file_types=[".m4a"],
    )
    transcribe = gr.Textbox(label="Transcricao", show_label=True, show_copy_button=True)
    # Run the transcription as soon as a file has been uploaded.
    inp.upload(asr_transcriber, inp, transcribe)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    transcriberUI.launch()