Spaces:
Runtime error
Runtime error
ui changes + translation
Browse files
app.py
CHANGED
@@ -5,24 +5,82 @@ import time
|
|
5 |
|
6 |
model = whisper.load_model("base")
|
7 |
|
8 |
-
def transcribe(audio, state=
|
9 |
time.sleep(delay)
|
10 |
-
result = model.transcribe(audio, language="english")
|
11 |
-
state += result['text'] + " "
|
12 |
-
# return f"Language: {result['language']}\
|
13 |
-
# \n\nText: {state}"
|
14 |
-
return state, state
|
15 |
|
16 |
-
|
17 |
-
print(whisper.load_audio(audio).shape)
|
18 |
-
state += str(whisper.load_audio(audio))
|
19 |
-
# print(state)
|
20 |
-
return state, state
|
21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
|
23 |
-
delay_slider = gr.inputs.Slider(minimum=0, maximum=10, default=0.2, label="Delay (seconds). The rate of transcription (1 sec + delay).")
|
24 |
|
25 |
title = "OpenAI's Whisper Real-time Demo"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
gr.Interface(
|
28 |
fn=transcribe,
|
@@ -30,14 +88,22 @@ gr.Interface(
|
|
30 |
inputs=[
|
31 |
# gr.Audio(source="upload", type="filepath"),
|
32 |
gr.Audio(source="microphone", type="filepath", streaming=True),
|
33 |
-
|
34 |
-
delay_slider
|
|
|
|
|
35 |
],
|
36 |
outputs=[
|
37 |
-
|
38 |
-
|
|
|
|
|
39 |
],
|
40 |
live=True,
|
41 |
allow_flagging='never',
|
42 |
title=title,
|
43 |
-
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
model = whisper.load_model("base")
|
7 |
|
8 |
+
def transcribe(audio, state=None, delay=0.2, lang=None, translate=False):
    """Transcribe one audio chunk and append it to the running text.

    Args:
        audio: The audio chunk to process; it is handed to
            ``model.transcribe`` and ``whisper.load_audio``. ``None`` means
            there is nothing to process for this call.
        state: Dict holding the accumulated ``"transcription"`` and
            ``"translation"`` strings. It is updated in place; a fresh dict
            is created when omitted.
        delay: Seconds to sleep before processing the chunk.
        lang: Language passed to Whisper. ``"auto"`` is mapped to ``None``
            before being passed on.
        translate: When true, additionally decode the chunk with Whisper's
            translate task and append the result to the translation text.

    Returns:
        A 4-tuple ``(transcription, translation, state, detected_language)``
        where the first two items are the accumulated strings and the last
        is a ``"detected language: ..."`` message (empty when no audio was
        processed).
    """
    # A ``{}`` default would be shared between calls; build a new dict instead.
    if state is None:
        state = {}
    # Guarantee both accumulators exist so the "+=" updates below cannot
    # raise KeyError on an empty or partially filled state dict.
    state.setdefault("transcription", "")
    state.setdefault("translation", "")

    time.sleep(delay)

    if audio is None:
        # Nothing to transcribe: hand back the accumulated text unchanged.
        return state["transcription"], state["translation"], state, ""

    transcription = model.transcribe(
        audio,
        language=lang if lang != "auto" else None,
    )
    state["transcription"] += transcription["text"] + " "

    if translate:
        samples = whisper.pad_or_trim(whisper.load_audio(audio))
        mel = whisper.log_mel_spectrogram(samples).to(model.device)

        # Whisper's task names are "transcribe" and "translate".
        options = whisper.DecodingOptions(task="translate")
        translation = whisper.decode(model, mel, options)

        state["translation"] += translation.text + " "

    return (
        state["transcription"],
        state["translation"],
        state,
        f"detected language: {transcription['language']}",
    )
|
30 |
|
|
|
31 |
|
32 |
title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model."

# Input widgets. These use the top-level Gradio components (gr.Slider,
# gr.Dropdown, gr.Checkbox) instead of the deprecated ``gr.inputs`` namespace,
# consistent with the gr.Textbox / gr.Audio / gr.State components used
# elsewhere in this file; the initial value is passed as ``value``.
delay_slider = gr.Slider(
    minimum=0,
    maximum=5,
    value=0.2,
    label="Rate of transcription (1 sec + this value)",
)
lang_dropdown = gr.Dropdown(
    choices=[
        "auto", "english", "afrikaans",
        "albanian", "amharic", "arabic",
        "armenian", "assamese", "azerbaijani",
        "bashkir", "basque", "belarusian",
        "bengali", "bosnian", "breton",
        "bulgarian", "catalan", "chinese",
        "croatian", "czech", "danish",
        "dutch", "estonian", "faroese",
        "finnish", "french", "galician",
        "georgian", "german", "greek",
        "gujarati", "haitian creole", "hausa",
        "hawaiian", "hebrew", "hindi",
        "hungarian", "icelandic", "indonesian",
        "italian", "japanese", "javanese",
        "kannada", "kazakh", "khmer",
        "korean", "kyrgyz", "lao",
        "latin", "latvian", "lingala",
        "lithuanian", "luxembourgish", "macedonian",
        "malagasy", "malay", "malayalam",
        "maltese", "maori", "marathi",
        "mongolian", "myanmar", "nepali",
        "norwegian", "nyanja", "nynorsk",
        "occitan", "oriya", "pashto",
        "persian", "polish", "portuguese",
        "punjabi", "romanian", "russian",
        "sanskrit", "sardinian", "serbian",
        "shona", "sindhi", "sinhala",
        "slovak", "slovenian", "somali",
        "spanish", "sundanese", "swahili",
        "swedish", "tagalog", "tajik",
        "tamil", "tatar", "telugu",
        "thai", "tigrinya", "tibetan",
        "turkish", "turkmen", "ukrainian",
        "urdu", "uzbek", "vietnamese",
        "welsh", "xhosa", "yiddish",
        "yoruba",
    ],
    label="Language",
    value="auto",
    type="value",
)

# Checkbox selecting whether the audio is also translated to English.
translate_checkbox = gr.Checkbox(label="Translate to English", value=False)

# Output widgets.
transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.HTML(label="Detected Language")

# State component seeded with empty transcription/translation accumulators.
state = gr.State({"transcription": "", "translation": ""})
|
84 |
|
85 |
gr.Interface(
|
86 |
fn=transcribe,
|
|
|
88 |
inputs=[
|
89 |
# gr.Audio(source="upload", type="filepath"),
|
90 |
gr.Audio(source="microphone", type="filepath", streaming=True),
|
91 |
+
state,
|
92 |
+
delay_slider,
|
93 |
+
lang_dropdown,
|
94 |
+
translate_checkbox
|
95 |
],
|
96 |
outputs=[
|
97 |
+
transcription_tb,
|
98 |
+
translation_tb,
|
99 |
+
state,
|
100 |
+
detected_lang
|
101 |
],
|
102 |
live=True,
|
103 |
allow_flagging='never',
|
104 |
title=title,
|
105 |
+
description=description,
|
106 |
+
).launch(
|
107 |
+
# enable_queue=True,
|
108 |
+
# debug=True
|
109 |
+
)
|