import time

import gradio as gr
import whisper

# Load the base Whisper model once at startup.
model = whisper.load_model("base")


def transcribe(audio, state, delay=0.2, lang=None, translate=False):
    # `state` is the session dict supplied by gr.State below; it accumulates
    # the running transcription and translation across streamed chunks.
    # Throttle how often streamed chunks are processed.
    time.sleep(delay)

    # Transcribe the latest chunk; let Whisper auto-detect the language
    # when "auto" is selected.
    transcription = model.transcribe(
        audio,
        language=lang if lang != "auto" else None,
    )
    state["transcription"] += transcription["text"] + " "

    if translate:
        # Re-load the audio and build a log-Mel spectrogram, then decode
        # with the "translate" task (speech -> English text).
        x = whisper.load_audio(audio)
        x = whisper.pad_or_trim(x)
        mel = whisper.log_mel_spectrogram(x).to(model.device)
        options = whisper.DecodingOptions(task="translate")
        translation = whisper.decode(model, mel, options)
        state["translation"] += translation.text + " "

    return (
        state["transcription"],
        state["translation"],
        state,
        f"detected language: {transcription['language']}",
    )


title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model."

delay_slider = gr.Slider(minimum=0, maximum=5, value=0.2, label="Rate of transcription (1 sec + this value)")
lang_dropdown = gr.Dropdown(
    choices=[
        "auto", "english", "afrikaans", "albanian", "amharic", "arabic", "armenian", "assamese",
        "azerbaijani", "bashkir", "basque", "belarusian", "bengali", "bosnian", "breton", "bulgarian",
        "catalan", "chinese", "croatian", "czech", "danish", "dutch", "estonian", "faroese", "finnish",
        "french", "galician", "georgian", "german", "greek", "gujarati", "haitian creole", "hausa",
        "hawaiian", "hebrew", "hindi", "hungarian", "icelandic", "indonesian", "italian", "japanese",
        "javanese", "kannada", "kazakh", "khmer", "korean", "kyrgyz", "lao", "latin", "latvian",
        "lingala", "lithuanian", "luxembourgish", "macedonian", "malagasy", "malay", "malayalam",
        "maltese", "maori", "marathi", "mongolian", "myanmar", "nepali", "norwegian", "nyanja",
        "nynorsk", "occitan", "oriya", "pashto", "persian", "polish", "portuguese", "punjabi",
        "romanian", "russian", "sanskrit", "sardinian", "serbian", "shona", "sindhi", "sinhala",
        "slovak", "slovenian", "somali", "spanish", "sundanese", "swahili", "swedish", "tagalog",
        "tajik", "tamil", "tatar", "telugu", "thai", "tigrinya", "tibetan", "turkish", "turkmen",
        "ukrainian", "urdu", "uzbek", "vietnamese", "welsh", "xhosa", "yiddish", "yoruba",
    ],
    value="auto",
    type="value",
    label="Language",
)
translate_checkbox = gr.Checkbox(label="Translate to English", value=False)
transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.HTML(label="Detected Language")
state = gr.State({"transcription": "", "translation": ""})

gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        state,
        delay_slider,
        lang_dropdown,
        translate_checkbox,
    ],
    outputs=[
        transcription_tb,
        translation_tb,
        state,
        detected_lang,
    ],
    live=True,
    allow_flagging="never",
    title=title,
    description=description,
).launch(
    # enable_queue=True,
    # debug=True
)
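
# A minimal sketch of how transcribe() could be exercised offline, without the
# Gradio UI. It is left commented out because launch() above blocks the main
# thread. "sample.wav" is a hypothetical local recording (not shipped with this
# demo), and the state dict mirrors the one created by gr.State.
#
# text, translation, session_state, lang_info = transcribe(
#     "sample.wav",
#     {"transcription": "", "translation": ""},
#     delay=0,
#     lang="auto",
#     translate=True,
# )
# print(lang_info)
# print(text)
# print(translation)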