anzorq committed
Commit 91a1a24 · Parent: 0f1ce4a

ui changes + translation

Files changed (1):
  1. app.py +83 -17
app.py CHANGED
@@ -5,24 +5,82 @@ import time
 
 model = whisper.load_model("base")
 
-def transcribe(audio, state="", delay=0.2):
+def transcribe(audio, state={}, delay=0.2, lang=None, translate=False):
     time.sleep(delay)
-    result = model.transcribe(audio, language="english")
-    state += result['text'] + " "
-    # return f"Language: {result['language']}\
-    # \n\nText: {state}"
-    return state, state
-
-def debug(audio, state="", delay=0.2):
-    print(whisper.load_audio(audio).shape)
-    state += str(whisper.load_audio(audio))
-    # print(state)
-    return state, state
-
-delay_slider = gr.inputs.Slider(minimum=0, maximum=10, default=0.2, label="Delay (seconds). The rate of transcription (1 sec + delay).")
+
+    # state = {"transcription": "", "translation": ""}
+
+    transcription = model.transcribe(
+        audio,
+        language=lang if lang != "auto" else None
+    )
+    state['transcription'] += transcription['text'] + " "
+
+    if translate:
+        x = whisper.load_audio(audio)
+        x = whisper.pad_or_trim(x)
+        mel = whisper.log_mel_spectrogram(x).to(model.device)
+
+        options = whisper.DecodingOptions(task="translate")  # "translate" is Whisper's X->English task name
+        translation = whisper.decode(model, mel, options)
+
+        state['translation'] += translation.text + " "
+
+    return state['transcription'], state['translation'], state, f"detected language: {transcription['language']}"
 
 title = "OpenAI's Whisper Real-time Demo"
+description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model."
+
+delay_slider = gr.inputs.Slider(minimum=0, maximum=5, default=0.2, label="Rate of transcription (1 sec + this value)")
+lang_dropdown = gr.inputs.Dropdown(choices=["auto", "english", "afrikaans",
+                                            "albanian", "amharic", "arabic",
+                                            "armenian", "assamese", "azerbaijani",
+                                            "bashkir", "basque", "belarusian",
+                                            "bengali", "bosnian", "breton",
+                                            "bulgarian", "catalan", "chinese",
+                                            "croatian", "czech", "danish",
+                                            "dutch", "estonian", "faroese",
+                                            "finnish", "french", "galician",
+                                            "georgian", "german", "greek",
+                                            "gujarati", "haitian creole", "hausa",
+                                            "hawaiian", "hebrew", "hindi",
+                                            "hungarian", "icelandic", "indonesian",
+                                            "italian", "japanese", "javanese",
+                                            "kannada", "kazakh", "khmer",
+                                            "korean", "kyrgyz", "lao",
+                                            "latin", "latvian", "lingala",
+                                            "lithuanian", "luxembourgish", "macedonian",
+                                            "malagasy", "malay", "malayalam",
+                                            "maltese", "maori", "marathi",
+                                            "mongolian", "myanmar", "nepali",
+                                            "norwegian", "nyanja", "nynorsk",
+                                            "occitan", "oriya", "pashto",
+                                            "persian", "polish", "portuguese",
+                                            "punjabi", "romanian", "russian",
+                                            "sanskrit", "sardinian", "serbian",
+                                            "shona", "sindhi", "sinhala",
+                                            "slovak", "slovenian", "somali",
+                                            "spanish", "sundanese", "swahili",
+                                            "swedish", "tagalog", "tajik",
+                                            "tamil", "tatar", "telugu",
+                                            "thai", "tigrinya", "tibetan",
+                                            "turkish", "turkmen", "ukrainian",
+                                            "urdu", "uzbek", "vietnamese",
+                                            "welsh", "xhosa", "yiddish",
+                                            "yoruba"],
+                                   label="Language", default="auto", type="value")
+
+# checkbox: whether to translate the transcription to English
+translate_checkbox = gr.inputs.Checkbox(label="Translate to English", default=False)
+
+transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
+translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
+detected_lang = gr.outputs.HTML(label="Detected Language")
+
+state = gr.State({"transcription": "", "translation": ""})
 
 gr.Interface(
     fn=transcribe,
@@ -30,14 +88,22 @@ gr.Interface(
     inputs=[
         # gr.Audio(source="upload", type="filepath"),
         gr.Audio(source="microphone", type="filepath", streaming=True),
-        "state",
-        delay_slider
+        state,
+        delay_slider,
+        lang_dropdown,
+        translate_checkbox
     ],
     outputs=[
-        gr.Textbox(label="Transcription", lines=10, max_lines=20),
-        "state"
+        transcription_tb,
+        translation_tb,
+        state,
+        detected_lang
     ],
     live=True,
     allow_flagging='never',
     title=title,
-).launch(enable_queue=True)
+    description=description,
+).launch(
+    # enable_queue=True,
+    # debug=True
+)
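
For context on what the new translate branch does, here is a minimal standalone sketch of the same low-level Whisper decode path, outside Gradio. This is an illustration, not part of the commit: it assumes whisper is installed, and "sample.wav" is a hypothetical path to a short audio clip.

import whisper

model = whisper.load_model("base")

# Prepare the audio the same way transcribe() above does when translate=True.
audio = whisper.load_audio("sample.wav")   # 16 kHz float32 waveform
audio = whisper.pad_or_trim(audio)         # pad/trim to Whisper's 30-second window
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# task="translate" decodes straight to English, whatever the source language.
options = whisper.DecodingOptions(task="translate")
result = whisper.decode(model, mel, options)

print(result.text)  # English translation of the clip

Note that whisper.decode operates on a single 30-second window, so the streaming app above translates each microphone chunk independently and concatenates the results in state['translation'].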