Spaces:

mozilla-ai
/

transcribe

Running on Zero

File size: 3,962 Bytes

ff71374
8bbb796
d0aba40
8bbb796
 
ff71374
8bbb796
5e6ab01
d0aba40
 
 
 
 
 
 
8bbb796
 
 
29a8a9a
5e6ab01
 
 
 
 
 
 
 
d0aba40
5e6ab01
 
 
c318bd7
 
 
 
 
 
 
 
 
 
5e6ab01
 
29a8a9a
c318bd7
 
 
 
 
 
 
5e6ab01
 
8370c97
29a8a9a
 
 
 
 
 
5e6ab01
d0aba40
29a8a9a
5e6ab01
29a8a9a
5e6ab01
29a8a9a
8bbb796
29a8a9a
c318bd7
 
 
8bbb796
 
 
 
 
 
 
 
c318bd7
 
 
8bbb796
 
d0aba40
5e6ab01
 
 
 
 
 
 
 
 
 
 
ff71374
5e6ab01
 
 
 
 
8bbb796
 
b669864
 
 
 
 
8bbb796
 
 
 
 
29a8a9a
 
 
8bbb796

import os
import gradio as gr
import spaces
from transformers import pipeline, Pipeline

# Raw value of the IS_HF_SPACE environment variable (None when unset).
# Within this file it is only tested for truthiness, so any non-empty
# string counts as "running on a Space".
is_hf_space = os.environ.get("IS_HF_SPACE")

# Choices offered in the model dropdown, written as "<repo id> (<language>)".
# `transcribe` strips the " (<language>)" suffix before loading the model.
model_ids = [
    # Falsy placeholder: lets the dropdown be left "unselected".
    "",
    # Single-language checkpoints under the mozilla-ai namespace.
    "mozilla-ai/whisper-small-gl (Galician)",
    "mozilla-ai/whisper-small-el (Greek)",
    # Multilingual checkpoints under the openai namespace.
    "openai/whisper-tiny (Multilingual)",
    "openai/whisper-small (Multilingual)",
    "openai/whisper-medium (Multilingual)",
    "openai/whisper-large-v3 (Multilingual)",
    "openai/whisper-large-v3-turbo (Multilingual)",
]


def _load_local_model(model_dir: str) -> "Pipeline | str":
    """Build a speech-recognition pipeline from a Whisper model directory.

    Args:
        model_dir: Location handed to every ``from_pretrained`` call below
            (processor, tokenizer, feature extractor and model).

    Returns:
        The ready-to-use pipeline on success. On any failure the exception
        message is returned as a ``str`` instead of being raised; the caller
        (``transcribe``) detects this with ``isinstance(pipe, str)`` and shows
        the message to the user.
    """
    from transformers import (
        WhisperFeatureExtractor,
        WhisperForConditionalGeneration,
        WhisperProcessor,
        WhisperTokenizer,
    )

    try:
        # Loading happens inside the ``try`` too: a wrong or incomplete
        # ``model_dir`` fails in ``from_pretrained``, before ``pipeline`` is
        # ever reached, and must be reported the same way as pipeline errors.
        processor = WhisperProcessor.from_pretrained(model_dir)
        tokenizer = WhisperTokenizer.from_pretrained(model_dir, task="transcribe")
        feature_extractor = WhisperFeatureExtractor.from_pretrained(model_dir)
        model = WhisperForConditionalGeneration.from_pretrained(model_dir)

        return pipeline(
            task="automatic-speech-recognition",
            model=model,
            processor=processor,
            tokenizer=tokenizer,
            feature_extractor=feature_extractor,
        )
    except Exception as e:
        # Deliberately broad: this is a UI boundary, and every failure is
        # surfaced as text rather than propagated.
        return str(e)


def _load_hf_model(model_repo_id: str) -> Pipeline:
    """Create a speech-recognition pipeline for the given model id.

    Args:
        model_repo_id: Value forwarded as ``model=`` to ``pipeline``.

    Returns:
        The pipeline on success. Despite the annotation, when construction
        raises, the exception message is returned as a ``str`` so the caller
        can display it.
    """
    try:
        asr_pipeline = pipeline(
            task="automatic-speech-recognition",
            model=model_repo_id,
        )
    except Exception as error:
        # Report the failure as text instead of raising.
        return str(error)
    else:
        return asr_pipeline


@spaces.GPU(duration=30)
def transcribe(
    dropdown_model_id: str,
    hf_model_id: str,
    local_model_id: str,
    audio: "str | None",
) -> str:
    """Transcribe an audio file with the single model the user picked.

    Exactly one of the three model arguments must be non-empty.

    Args:
        dropdown_model_id: Dropdown choice of the form
            ``"<repo id> (<language>)"``; only the repo id part is used.
        hf_model_id: Model id typed by the user, loaded via ``_load_hf_model``.
        local_model_id: Local model directory, loaded via
            ``_load_local_model``.
        audio: Path to the audio file (the audio component in this file is
            configured with ``type="filepath"``). Empty/``None`` when the user
            has not recorded or uploaded anything.

    Returns:
        The transcribed text, or a human-readable message when the model
        selection is invalid, no audio was provided, or the model failed to
        load.
    """
    # Decide which loader to use, but do not load anything yet.
    if dropdown_model_id and not hf_model_id and not local_model_id:
        load_model = _load_hf_model
        # Drop the " (<language>)" suffix to get the bare repo id.
        model_id = dropdown_model_id.split(" (")[0]
    elif hf_model_id and not local_model_id and not dropdown_model_id:
        load_model = _load_hf_model
        model_id = hf_model_id
    elif local_model_id and not hf_model_id and not dropdown_model_id:
        load_model = _load_local_model
        model_id = local_model_id
    else:
        return "️️⚠️ Please select or fill at least and only one of the options above"

    # Check the audio before the (expensive) model load; without this guard
    # the pipeline would be called with no input and raise.
    if not audio:
        return "⚠️ Please record or upload an audio file before transcribing"

    pipe = load_model(model_id)
    if isinstance(pipe, str):
        # The loaders return the exception message as a string on failure.
        return pipe
    return pipe(audio)["text"]


def setup_gradio_demo():
    """Build the transcription UI and launch it.

    The page consists of an instructions header, a row with the three
    model-selection inputs, an audio input, a "Transcribe" button and an
    output text box. Clicking the button calls ``transcribe`` with the three
    model inputs and the audio, and writes the result to the output box.
    The local-path column is hidden when ``is_hf_space`` is truthy.
    """
    show_local_option = not is_hf_space

    with gr.Blocks() as demo:
        gr.Markdown(
            """ # 🗣️ Speech-to-Text Transcription
            ### 1. Select which model to use from one of the options below.
            ### 2. Record a message or upload an audio file.
            ### 3. Click Transcribe to see the transcription generated by the model.
            """
        )

        # --- Model selection: three alternative ways to name a model ---
        with gr.Row():
            with gr.Column():
                model_dropdown = gr.Dropdown(
                    label="Option 1: Select a model",
                    choices=model_ids,
                )
            with gr.Column():
                hf_id_box = gr.Textbox(
                    placeholder="my-username/my-whisper-tiny",
                    label="Option 2: Paste HF model id",
                )
            with gr.Column(visible=show_local_option):
                local_path_box = gr.Textbox(
                    placeholder="artifacts/my-whisper-tiny",
                    label="Option 3: Paste local path to model directory",
                )

        # --- Transcription: audio in, text out ---
        recording = gr.Audio(
            label="Record a message / Upload audio file",
            type="filepath",
            sources=["microphone", "upload"],
            max_length=30,
            show_download_button=True,
        )
        run_button = gr.Button("Transcribe")
        result_box = gr.Text(label="Output")

        # Input order must match the parameter order of ``transcribe``.
        run_button.click(
            fn=transcribe,
            inputs=[model_dropdown, hf_id_box, local_path_box, recording],
            outputs=result_box,
        )

    demo.launch()


# Build and launch the demo only when this file is executed as a script.
if __name__ == "__main__":
    setup_gradio_demo()