asr-inference / app.py
mmarimon's picture
Update app.py
1faae08 verified
raw
history blame
1.64 kB
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile
import os
MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
device = 0 if torch.cuda.is_available() else "cpu"
pipe = pipeline(
task="automatic-speech-recognition",
model=MODEL_NAME,
chunk_length_s=30,
device=device,
)
def transcribe(inputs, task):
if inputs is None:
raise gr.Error("Cap fitxer d'脿udio introduit! Si us plau pengeu un fitxer "\
"o enregistreu un 脿udio abans d'enviar la vostra sol路licitud")
text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
return text
description_string = "Transcripci贸 autom脿tica de micr貌fon o de fitxers d'脿udio.\n Aquest demostrador s'ha desenvolupat per"\
" comprovar els models de reconeixement de parla per a m贸bils. Per ara utilitza el checkpoint "\
f"[{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) i la llibreria de 馃 Transformers per a la transcripci贸."
file_transcribe = gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
],
outputs="text",
title="Transcripci贸 autom脿tica d'脿udio",
description=(description_string),
allow_flagging="never",
)
demo = gr.TabbedInterface([file_transcribe], ["Fitxer"])
if __name__ == "__main__":
demo.launch()