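"""Gradio demo app for speech-to-text transcription with Whisper models.

A model can be picked from a dropdown of official openai/whisper checkpoints,
pulled from any Hugging Face repo id, or loaded from a local fine-tuned
model directory, then used to transcribe microphone recordings.
"""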
from pathlib import Path
from typing import Iterator, Tuple

import gradio as gr
from transformers import pipeline, Pipeline
from huggingface_hub import repo_exists

from speech_to_text_finetune.config import LANGUAGES_NAME_TO_ID

languages = LANGUAGES_NAME_TO_ID.keys()
model_ids = [
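    # The empty first entry lets the dropdown start without a selection.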
    "",
    "openai/whisper-tiny",
    "openai/whisper-small",
    "openai/whisper-medium",
    "openai/whisper-large-v3",
    "openai/whisper-large-v3-turbo",
]


def _load_local_model(model_dir: str, language: str) -> Tuple[Pipeline | None, str]:
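    """Load a fine-tuned Whisper model from a local directory into an ASR pipeline.

    Returns the pipeline on success, or None plus a warning message if the
    directory doesn't exist.
    """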
    if not Path(model_dir).is_dir():
        return None, f"⚠️ Couldn't find local model directory: {model_dir}"
    from transformers import (
        WhisperProcessor,
        WhisperTokenizer,
        WhisperFeatureExtractor,
        WhisperForConditionalGeneration,
    )

    processor = WhisperProcessor.from_pretrained(model_dir)
    tokenizer = WhisperTokenizer.from_pretrained(
        model_dir, language=language, task="transcribe"
    )
    feature_extractor = WhisperFeatureExtractor.from_pretrained(model_dir)
    model = WhisperForConditionalGeneration.from_pretrained(model_dir)

    return pipeline(
        task="automatic-speech-recognition",
        model=model,
        processor=processor,
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
    ), f"✅ Local model has been loaded from {model_dir}."


def _load_hf_model(model_repo_id: str, language: str) -> Tuple[Pipeline | None, str]:
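    """Load a Whisper model from the Hugging Face Hub into an ASR pipeline.

    Returns the pipeline on success, or None plus a warning message if the repo
    can't be found.
    """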
    if not repo_exists(model_repo_id):
        return (
            None,
            f"⚠️ Couldn't find {model_repo_id} on Hugging Face. If its a private repo, make sure you are logged in locally.",
        )
    return pipeline(
        "automatic-speech-recognition",
        model=model_repo_id,
        generate_kwargs={"language": language},
    ), f"✅ HF Model {model_repo_id} has been loaded."


def load_model(
    language: str, dropdown_model_id: str, hf_model_id: str, local_model_id: str
) -> Iterator[Tuple[Pipeline | None, str]]:
    """Load the model chosen via exactly one of the three UI options.

    Implemented as a generator so Gradio can show an interim "Loading..." status
    before the final (pipeline, message) result replaces it.
    """
    if not language:
        yield None, "⚠️ Please select a language from the dropdown"
        return
    if dropdown_model_id and not hf_model_id and not local_model_id:
        yield None, f"Loading {dropdown_model_id}..."
        yield _load_hf_model(dropdown_model_id, language)
    elif hf_model_id and not local_model_id and not dropdown_model_id:
        yield None, f"Loading {hf_model_id}..."
        yield _load_hf_model(hf_model_id, language)
    elif local_model_id and not hf_model_id and not dropdown_model_id:
        yield None, f"Loading {local_model_id}..."
        yield _load_local_model(local_model_id, language)
    else:
        yield (
            None,
            "⚠️ Please select or fill in exactly one of the three options above",
        )


def transcribe(pipe: Pipeline | None, audio: str) -> str:
    """Transcribe the recorded audio file with the loaded pipeline."""
    # The Audio component uses type="filepath", so `audio` is a path string.
    if pipe is None:
        return "⚠️ Please load a model first by clicking the Load model button."
    return pipe(audio)["text"]


def setup_gradio_demo():
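    """Build and launch the Gradio UI: language/model selection, model loading and transcription."""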
    with gr.Blocks() as demo:
        gr.Markdown(
            """ # 🗣️ Speech-to-Text Transcription
            ### 1. Select a language from the dropdown menu.
            ### 2. Select which model to load from one of the three options below.
            ### 3. Load the model by clicking the Load model button.
            ### 4. Record a message and click Transcribe to see the transcription.
            """
        )
        ### Language & Model selection ###

        selected_lang = gr.Dropdown(
            choices=list(languages), value=None, label="Select a language"
        )

        with gr.Row():
            with gr.Column():
                dropdown_model = gr.Dropdown(
                    choices=model_ids, label="Option 1: Select a model"
                )
            with gr.Column():
                user_model = gr.Textbox(
                    label="Option 2: Paste HF model id",
                    placeholder="my-username/my-whisper-tiny",
                )
            with gr.Column():
                local_model = gr.Textbox(
                    label="Option 3: Paste local path to model directory",
                    placeholder="artifacts/my-whisper-tiny",
                )

        load_model_button = gr.Button("Load model")
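        # Markdown area that displays the status messages yielded by load_model.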
        model_loaded = gr.Markdown()

        ### Transcription ###
        audio_input = gr.Audio(
            sources=["microphone"], type="filepath", label="Record a message"
        )
        transcribe_button = gr.Button("Transcribe")
        transcribe_output = gr.Text(label="Output")

        ### Event listeners ###
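        # gr.State keeps the loaded pipeline across events without rendering it.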
        model = gr.State()
        load_model_button.click(
            fn=load_model,
            inputs=[selected_lang, dropdown_model, user_model, local_model],
            outputs=[model, model_loaded],
        )

        transcribe_button.click(
            fn=transcribe, inputs=[model, audio_input], outputs=transcribe_output
        )

    demo.launch()


if __name__ == "__main__":
    setup_gradio_demo()