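"""Gradio demo for speech-to-text transcription with Whisper models.

Pick a fine-tuned model from the mozilla-ai Common Voice collection, paste any
Hugging Face model id, or (when not running on a Space) point to a local model
directory, then record or upload audio and transcribe it.
"""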
import os
import gradio as gr
import spaces
from huggingface_hub import get_collection, HfApi
from transformers import pipeline, Pipeline

# Set in the Hugging Face Space environment; used below to hide the
# local-model option when the demo runs on a Space.
is_hf_space = os.getenv("IS_HF_SPACE")


def get_dropdown_model_ids():
    mozilla_ai_model_ids = []
    # Fetch model ids from the collection and append each model's language
    # in parentheses, read from the model card metadata.
    for model_i in get_collection(
        "mozilla-ai/common-voice-whisper-67b847a74ad7561781aa10fd"
    ).items:
        model_metadata = HfApi().model_info(model_i.item_id)
        # The card's model_name is expected to end with "on <Language>"
        language = model_metadata.card_data.model_name.split("on ")[1]
        mozilla_ai_model_ids.append(model_i.item_id + f" ({language})")

    return (
        [""]
        + mozilla_ai_model_ids
        + [
            "openai/whisper-tiny (Multilingual)",
            "openai/whisper-small (Multilingual)",
            "openai/whisper-medium (Multilingual)",
            "openai/whisper-large-v3 (Multilingual)",
            "openai/whisper-large-v3-turbo (Multilingual)",
        ]
    )
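
# Illustrative shape of the returned choices (the mozilla-ai repo id below is
# hypothetical; the openai ids are the real ones listed above):
#   ["", "mozilla-ai/whisper-small-gl (Galician)", ..., "openai/whisper-tiny (Multilingual)"]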


def _load_local_model(model_dir: str) -> Pipeline | str:
    from transformers import WhisperProcessor, WhisperForConditionalGeneration

    try:
        processor = WhisperProcessor.from_pretrained(model_dir)
        model = WhisperForConditionalGeneration.from_pretrained(model_dir)
        return pipeline(
            task="automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,  # max input duration for whisper
        )
    except Exception as e:
        # Return the error message so the UI can display it
        return str(e)


def _load_hf_model(model_repo_id: str) -> Pipeline | str:
    try:
        return pipeline(
            "automatic-speech-recognition",
            model=model_repo_id,
            chunk_length_s=30,  # max input duration for whisper
        )
    except Exception as e:
        return str(e)


# Copied from https://github.com/openai/whisper/blob/517a43ecd132a2089d85f4ebc044728a71d49f6e/whisper/utils.py#L50
def format_timestamp(
    seconds: float, always_include_hours: bool = False, decimal_marker: str = "."
):
    assert seconds >= 0, "non-negative timestamp expected"
    milliseconds = round(seconds * 1000.0)

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return (
        f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    )
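
# Worked example: format_timestamp(3661.5) == "01:01:01.500"
# (3,661,500 ms -> 1 h, 1 min, 1 s, 500 ms; hours are shown since hours > 0).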


@spaces.GPU(duration=30)
def transcribe(
    dropdown_model_id: str,
    hf_model_id: str,
    local_model_id: str,
    audio: str,  # filepath from the gr.Audio component (type="filepath")
    show_timestamps: bool,
) -> str:
    if audio is None:
        return "⚠️ Error: Please record or upload an audio file first"
    if dropdown_model_id and not hf_model_id and not local_model_id:
        # Strip the " (Language)" suffix shown in the dropdown
        dropdown_model_id = dropdown_model_id.split(" (")[0]
        pipe = _load_hf_model(dropdown_model_id)
    elif hf_model_id and not local_model_id and not dropdown_model_id:
        pipe = _load_hf_model(hf_model_id)
    elif local_model_id and not hf_model_id and not dropdown_model_id:
        pipe = _load_local_model(local_model_id)
    else:
        return "⚠️ Error: Please select or fill exactly one of the model options above"
    if isinstance(pipe, str):
        # Exception raised when loading
        return f"⚠️ Error: {pipe}"

    output = pipe(
        audio,
        generate_kwargs={"task": "transcribe"},
        batch_size=16,
        return_timestamps=show_timestamps,
    )
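    # With return_timestamps=True, the pipeline output also carries a "chunks"
    # list of {"timestamp": (start_s, end_s), "text": ...} entries.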
    text = output["text"]
    if show_timestamps:
        text = "\n".join(
            f"[{format_timestamp(chunk['timestamp'][0])} -> {format_timestamp(chunk['timestamp'][1])}] {chunk['text']}"
            for chunk in output["chunks"]
        )
    return text


def setup_gradio_demo():
    with gr.Blocks() as demo:
        gr.Markdown(
            """ # 🗣️ Speech-to-Text Transcription
            ### 1. Select which model to use from one of the options below.
            ### 2. Record a message or upload an audio file.
            ### 3. Click Transcribe to see the transcription generated by the model.
            """
        )
        ### Model selection ###
        model_ids = get_dropdown_model_ids()
        with gr.Row():
            with gr.Column():
                dropdown_model = gr.Dropdown(
                    choices=model_ids, label="Option 1: Select a model"
                )
            with gr.Column():
                user_model = gr.Textbox(
                    label="Option 2: Paste HF model id",
                    placeholder="my-username/my-whisper-tiny",
                )
            with gr.Column(visible=not is_hf_space):
                local_model = gr.Textbox(
                    label="Option 3: Paste local path to model directory",
                    placeholder="artifacts/my-whisper-tiny",
                )

        ### Transcription ###
        with gr.Group():
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record a message / Upload audio file",
                show_download_button=True,
            )
            timestamps_check = gr.Checkbox(label="Show timestamps")

        transcribe_button = gr.Button("Transcribe")
        transcribe_output = gr.Text(label="Output")

        transcribe_button.click(
            fn=transcribe,
            inputs=[
                dropdown_model,
                user_model,
                local_model,
                audio_input,
                timestamps_check,
            ],
            outputs=transcribe_output,
        )

    demo.launch()


if __name__ == "__main__":
    setup_gradio_demo()