File size: 4,755 Bytes
fd585a0
 
 
 
 
 
 
91a1a24
fd585a0
 
91a1a24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f1ce4a
 
 
91a1a24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fd585a0
 
 
 
 
91a1a24
 
 
 
fd585a0
 
91a1a24
 
 
 
fd585a0
 
0f1ce4a
 
91a1a24
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
import os
import gradio as gr
import whisper
import time

# Load the Whisper "base" checkpoint once at import time; transcribe() below
# uses this module-level model for every request.
model = whisper.load_model("base")

def transcribe(audio, state=None, delay=0.2, lang=None, translate=False):
    """Transcribe one audio chunk and append the text to the running state.

    Args:
        audio: Path of the audio file to process. ``None`` is treated as
            "no new audio": the accumulated state is returned unchanged.
        state: Dict holding the accumulated text under the keys
            ``"transcription"`` and ``"translation"``. A fresh dict is
            created when omitted, and missing keys are initialised to ``""``.
        delay: Seconds to sleep before processing the chunk.
        lang: Language name forwarded to Whisper. ``"auto"`` or ``None``
            lets Whisper detect the language itself.
        translate: When true, additionally decode an English translation of
            the chunk and append it to ``state["translation"]``.

    Returns:
        A 4-tuple ``(transcription, translation, state, message)`` where the
        first two items are the accumulated texts, ``state`` is the updated
        dict and ``message`` reports the language Whisper detected (empty
        when there was no audio to process).
    """
    # A mutable default ({}) would be shared between calls and would lack the
    # keys used below, so build/complete the state dict explicitly instead.
    if state is None:
        state = {}
    state.setdefault('transcription', "")
    state.setdefault('translation', "")

    # Nothing to transcribe: hand back what has been accumulated so far.
    if audio is None:
        return state['transcription'], state['translation'], state, ""

    time.sleep(delay)

    transcription = model.transcribe(
        audio,
        language=lang if lang != "auto" else None,
    )
    state['transcription'] += transcription['text'] + " "

    if translate:
        # Decode the same clip a second time with the translate task.
        samples = whisper.load_audio(audio)
        samples = whisper.pad_or_trim(samples)
        mel = whisper.log_mel_spectrogram(samples).to(model.device)

        # Whisper documents the task values as "transcribe" and "translate".
        options = whisper.DecodingOptions(task="translate")
        translation = whisper.decode(model, mel, options)

        state['translation'] += translation.text + " "

    return state['transcription'], state['translation'], state, f"detected language: {transcription['language']}"


# Heading and description text passed to gr.Interface below.
title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model."

# --- Input widgets ---------------------------------------------------------
# Built with the top-level component classes (gr.Slider, gr.Dropdown, ...)
# and ``value=`` rather than the deprecated ``gr.inputs`` / ``gr.outputs``
# shims, so every component in this file uses the same API.

# Extra pause (in seconds) that transcribe() sleeps before handling a chunk.
delay_slider = gr.Slider(
    minimum=0,
    maximum=5,
    value=0.2,
    label="Rate of transcription (1 sec + this value)",
)

# Language choice forwarded to transcribe(); "auto" means no language is
# forced. NOTE(review): this list has not been checked against Whisper's
# supported language names (e.g. "kyrgyz", "nyanja", "oriya", "sardinian",
# "tigrinya", "xhosa") - confirm against whisper.tokenizer.LANGUAGES.
lang_dropdown = gr.Dropdown(
    choices=[
        "auto", "english", "afrikaans",
        "albanian", "amharic", "arabic",
        "armenian", "assamese", "azerbaijani",
        "bashkir", "basque", "belarusian",
        "bengali", "bosnian", "breton",
        "bulgarian", "catalan", "chinese",
        "croatian", "czech", "danish",
        "dutch", "estonian", "faroese",
        "finnish", "french", "galician",
        "georgian", "german", "greek",
        "gujarati", "haitian creole", "hausa",
        "hawaiian", "hebrew", "hindi",
        "hungarian", "icelandic", "indonesian",
        "italian", "japanese", "javanese",
        "kannada", "kazakh", "khmer",
        "korean", "kyrgyz", "lao",
        "latin", "latvian", "lingala",
        "lithuanian", "luxembourgish", "macedonian",
        "malagasy", "malay", "malayalam",
        "maltese", "maori", "marathi",
        "mongolian", "myanmar", "nepali",
        "norwegian", "nyanja", "nynorsk",
        "occitan", "oriya", "pashto",
        "persian", "polish", "portuguese",
        "punjabi", "romanian", "russian",
        "sanskrit", "sardinian", "serbian",
        "shona", "sindhi", "sinhala",
        "slovak", "slovenian", "somali",
        "spanish", "sundanese", "swahili",
        "swedish", "tagalog", "tajik",
        "tamil", "tatar", "telugu",
        "thai", "tigrinya", "tibetan",
        "turkish", "turkmen", "ukrainian",
        "urdu", "uzbek", "vietnamese",
        "welsh", "xhosa", "yiddish",
        "yoruba",
    ],
    label="Language",
    value="auto",
    type="value",
)

# Toggles the additional English translation pass in transcribe().
translate_checkbox = gr.Checkbox(label="Translate to English", value=False)


# --- Output widgets --------------------------------------------------------
transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.HTML(label="Detected Language")

# State holding the text accumulated so far; the keys match the ones that
# transcribe() reads and appends to.
state = gr.State({"transcription": "", "translation": ""})

# Inputs are listed in the order of transcribe()'s parameters:
# audio, state, delay, lang, translate.
demo_inputs = [
    gr.Audio(source="microphone", type="filepath", streaming=True),
    state,
    delay_slider,
    lang_dropdown,
    translate_checkbox,
]

# Outputs are listed in the order of transcribe()'s return tuple:
# transcription text, translation text, updated state, detected-language note.
demo_outputs = [transcription_tb, translation_tb, state, detected_lang]

demo = gr.Interface(
    fn=transcribe,
    inputs=demo_inputs,
    outputs=demo_outputs,
    live=True,
    allow_flagging='never',
    title=title,
    description=description,
)

# Launched with default options (no explicit queueing or debug flags).
demo.launch()