terry-li-hm committed
Commit 57d9268 · 1 Parent(s): af0fecd

Remove `html_content`

Files changed (1)
  1. app.py +179 -161
app.py CHANGED
@@ -1,84 +1,83 @@
 # coding=utf-8
 
-import os
-import librosa
 import base64
 import io
-import gradio as gr
+import os
 import re
 
+import gradio as gr
+import librosa
 import numpy as np
+import spaces
 import torch
 import torchaudio
-
-import spaces
-
 from funasr import AutoModel
 
 model = "FunAudioLLM/SenseVoiceSmall"
-model = AutoModel(model=model,
-                  vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  vad_kwargs={"max_single_segment_time": 30000},
-                  hub="hf",
-                  device="cuda"
-                  )
+model = AutoModel(
+    model=model,
+    vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+    vad_kwargs={"max_single_segment_time": 30000},
+    hub="hf",
+    device="cuda",
+)
 
 import re
 
 emo_dict = {
-	"<|HAPPY|>": "😊",
-	"<|SAD|>": "😔",
-	"<|ANGRY|>": "😡",
-	"<|NEUTRAL|>": "",
-	"<|FEARFUL|>": "😰",
-	"<|DISGUSTED|>": "🤢",
-	"<|SURPRISED|>": "😮",
+    "<|HAPPY|>": "😊",
+    "<|SAD|>": "😔",
+    "<|ANGRY|>": "😡",
+    "<|NEUTRAL|>": "",
+    "<|FEARFUL|>": "😰",
+    "<|DISGUSTED|>": "🤢",
+    "<|SURPRISED|>": "😮",
 }
 
 event_dict = {
-	"<|BGM|>": "🎼",
-	"<|Speech|>": "",
-	"<|Applause|>": "👏",
-	"<|Laughter|>": "😀",
-	"<|Cry|>": "😭",
-	"<|Sneeze|>": "🤧",
-	"<|Breath|>": "",
-	"<|Cough|>": "🤧",
+    "<|BGM|>": "🎼",
+    "<|Speech|>": "",
+    "<|Applause|>": "👏",
+    "<|Laughter|>": "😀",
+    "<|Cry|>": "😭",
+    "<|Sneeze|>": "🤧",
+    "<|Breath|>": "",
+    "<|Cough|>": "🤧",
 }
 
 emoji_dict = {
-	"<|nospeech|><|Event_UNK|>": "❓",
-	"<|zh|>": "",
-	"<|en|>": "",
-	"<|yue|>": "",
-	"<|ja|>": "",
-	"<|ko|>": "",
-	"<|nospeech|>": "",
-	"<|HAPPY|>": "😊",
-	"<|SAD|>": "😔",
-	"<|ANGRY|>": "😡",
-	"<|NEUTRAL|>": "",
-	"<|BGM|>": "🎼",
-	"<|Speech|>": "",
-	"<|Applause|>": "👏",
-	"<|Laughter|>": "😀",
-	"<|FEARFUL|>": "😰",
-	"<|DISGUSTED|>": "🤢",
-	"<|SURPRISED|>": "😮",
-	"<|Cry|>": "😭",
-	"<|EMO_UNKNOWN|>": "",
-	"<|Sneeze|>": "🤧",
-	"<|Breath|>": "",
-	"<|Cough|>": "😷",
-	"<|Sing|>": "",
-	"<|Speech_Noise|>": "",
-	"<|withitn|>": "",
-	"<|woitn|>": "",
-	"<|GBG|>": "",
-	"<|Event_UNK|>": "",
+    "<|nospeech|><|Event_UNK|>": "❓",
+    "<|zh|>": "",
+    "<|en|>": "",
+    "<|yue|>": "",
+    "<|ja|>": "",
+    "<|ko|>": "",
+    "<|nospeech|>": "",
+    "<|HAPPY|>": "😊",
+    "<|SAD|>": "😔",
+    "<|ANGRY|>": "😡",
+    "<|NEUTRAL|>": "",
+    "<|BGM|>": "🎼",
+    "<|Speech|>": "",
+    "<|Applause|>": "👏",
+    "<|Laughter|>": "😀",
+    "<|FEARFUL|>": "😰",
+    "<|DISGUSTED|>": "🤢",
+    "<|SURPRISED|>": "😮",
+    "<|Cry|>": "😭",
+    "<|EMO_UNKNOWN|>": "",
+    "<|Sneeze|>": "🤧",
+    "<|Breath|>": "",
+    "<|Cough|>": "😷",
+    "<|Sing|>": "",
+    "<|Speech_Noise|>": "",
+    "<|withitn|>": "",
+    "<|woitn|>": "",
+    "<|GBG|>": "",
+    "<|Event_UNK|>": "",
 }
 
-lang_dict = {
+lang_dict = {
     "<|zh|>": "<|lang|>",
     "<|en|>": "<|lang|>",
     "<|yue|>": "<|lang|>",
@@ -88,98 +87,111 @@ lang_dict = {
 }
 
 emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
-event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷",}
+event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷"}
+

 def format_str(s):
-	for sptk in emoji_dict:
-		s = s.replace(sptk, emoji_dict[sptk])
-	return s
+    for sptk in emoji_dict:
+        s = s.replace(sptk, emoji_dict[sptk])
+    return s


 def format_str_v2(s):
-	sptk_dict = {}
-	for sptk in emoji_dict:
-		sptk_dict[sptk] = s.count(sptk)
-		s = s.replace(sptk, "")
-	emo = "<|NEUTRAL|>"
-	for e in emo_dict:
-		if sptk_dict[e] > sptk_dict[emo]:
-			emo = e
-	for e in event_dict:
-		if sptk_dict[e] > 0:
-			s = event_dict[e] + s
-	s = s + emo_dict[emo]
-
-	for emoji in emo_set.union(event_set):
-		s = s.replace(" " + emoji, emoji)
-		s = s.replace(emoji + " ", emoji)
-	return s.strip()
+    sptk_dict = {}
+    for sptk in emoji_dict:
+        sptk_dict[sptk] = s.count(sptk)
+        s = s.replace(sptk, "")
+    emo = "<|NEUTRAL|>"
+    for e in emo_dict:
+        if sptk_dict[e] > sptk_dict[emo]:
+            emo = e
+    for e in event_dict:
+        if sptk_dict[e] > 0:
+            s = event_dict[e] + s
+    s = s + emo_dict[emo]
+
+    for emoji in emo_set.union(event_set):
+        s = s.replace(" " + emoji, emoji)
+        s = s.replace(emoji + " ", emoji)
+    return s.strip()
+

 def format_str_v3(s):
-	def get_emo(s):
-		return s[-1] if s[-1] in emo_set else None
-	def get_event(s):
-		return s[0] if s[0] in event_set else None
-
-	s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
-	for lang in lang_dict:
-		s = s.replace(lang, "<|lang|>")
-	s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
-	new_s = " " + s_list[0]
-	cur_ent_event = get_event(new_s)
-	for i in range(1, len(s_list)):
-		if len(s_list[i]) == 0:
-			continue
-		if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
-			s_list[i] = s_list[i][1:]
-		#else:
-		cur_ent_event = get_event(s_list[i])
-		if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
-			new_s = new_s[:-1]
-		new_s += s_list[i].strip().lstrip()
-	new_s = new_s.replace("The.", " ")
-	return new_s.strip()
+    def get_emo(s):
+        return s[-1] if s[-1] in emo_set else None
+
+    def get_event(s):
+        return s[0] if s[0] in event_set else None
+
+    s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
+    for lang in lang_dict:
+        s = s.replace(lang, "<|lang|>")
+    s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
+    new_s = " " + s_list[0]
+    cur_ent_event = get_event(new_s)
+    for i in range(1, len(s_list)):
+        if len(s_list[i]) == 0:
+            continue
+        if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
+            s_list[i] = s_list[i][1:]
+        # else:
+        cur_ent_event = get_event(s_list[i])
+        if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
+            new_s = new_s[:-1]
+        new_s += s_list[i].strip().lstrip()
+    new_s = new_s.replace("The.", " ")
+    return new_s.strip()
+

 @spaces.GPU
 def model_inference(input_wav, language, fs=16000):
-	# task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
-	language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue", "ja": "ja", "ko": "ko",
-	                 "nospeech": "nospeech"}
-
-	# task = "Speech Recognition" if task is None else task
-	language = "auto" if len(language) < 1 else language
-	selected_language = language_abbr[language]
-	# selected_task = task_abbr.get(task)
-
-	# print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
-
-	if isinstance(input_wav, tuple):
-		fs, input_wav = input_wav
-		input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
-		if len(input_wav.shape) > 1:
-			input_wav = input_wav.mean(-1)
-		if fs != 16000:
-			print(f"audio_fs: {fs}")
-			resampler = torchaudio.transforms.Resample(fs, 16000)
-			input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
-			input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
-
-
-	merge_vad = True #False if selected_task == "ASR" else True
-	print(f"language: {language}, merge_vad: {merge_vad}")
-	text = model.generate(input=input_wav,
-	                      cache={},
-	                      language=language,
-	                      use_itn=True,
-	                      batch_size_s=500, merge_vad=merge_vad)
-
-	print(text)
-	text = text[0]["text"]
-	text = format_str_v3(text)
-
-	print(text)
-
-	return text
+    # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
+    language_abbr = {
+        "auto": "auto",
+        "zh": "zh",
+        "en": "en",
+        "yue": "yue",
+        "ja": "ja",
+        "ko": "ko",
+        "nospeech": "nospeech",
+    }
+
+    # task = "Speech Recognition" if task is None else task
+    language = "auto" if len(language) < 1 else language
+    selected_language = language_abbr[language]
+    # selected_task = task_abbr.get(task)
+
+    # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
+
+    if isinstance(input_wav, tuple):
+        fs, input_wav = input_wav
+        input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
+        if len(input_wav.shape) > 1:
+            input_wav = input_wav.mean(-1)
+        if fs != 16000:
+            print(f"audio_fs: {fs}")
+            resampler = torchaudio.transforms.Resample(fs, 16000)
+            input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
+            input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
+
+    merge_vad = True  # False if selected_task == "ASR" else True
+    print(f"language: {language}, merge_vad: {merge_vad}")
+    text = model.generate(
+        input=input_wav,
+        cache={},
+        language=language,
+        use_itn=True,
+        batch_size_s=500,
+        merge_vad=merge_vad,
+    )
+
+    print(text)
+    text = text[0]["text"]
+    text = format_str_v3(text)
+
+    print(text)
+
+    return text


 audio_examples = [
@@ -200,28 +212,34 @@ audio_examples = [


 def launch():
-	with gr.Blocks(theme=gr.themes.Soft()) as demo:
-		# gr.Markdown(description)
-		gr.HTML(html_content)
-		with gr.Row():
-			with gr.Column():
-				audio_inputs = gr.Audio(label="Upload audio or use the microphone")
-
-				with gr.Accordion("Configuration"):
-					language_inputs = gr.Dropdown(choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
-					                              value="auto",
-					                              label="Language")
-				fn_button = gr.Button("Start", variant="primary")
-				text_outputs = gr.Textbox(label="Results")
-			gr.Examples(examples=audio_examples, inputs=[audio_inputs, language_inputs], examples_per_page=20)
-
-		fn_button.click(model_inference, inputs=[audio_inputs, language_inputs], outputs=text_outputs)
-
-		demo.launch()
+    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        with gr.Row():
+            with gr.Column():
+                audio_inputs = gr.Audio(label="Upload audio or use the microphone")
+
+                with gr.Accordion("Configuration"):
+                    language_inputs = gr.Dropdown(
+                        choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
+                        value="auto",
+                        label="Language",
+                    )
+                fn_button = gr.Button("Start", variant="primary")
+                text_outputs = gr.Textbox(label="Results")
+            gr.Examples(
+                examples=audio_examples,
+                inputs=[audio_inputs, language_inputs],
+                examples_per_page=20,
+            )
+
+        fn_button.click(
+            model_inference,
+            inputs=[audio_inputs, language_inputs],
+            outputs=text_outputs,
+        )
+
+        demo.launch()


 if __name__ == "__main__":
-	# iface.launch()
-	launch()
-
-
+    # iface.launch()
+    launch()
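
For reference, a minimal sketch of the input-normalization path that the reformatted `model_inference` applies to Gradio's `(sample_rate, int16 waveform)` tuples: int16-to-float32 scaling, stereo downmix, and resampling to the 16 kHz that SenseVoiceSmall expects. The `preprocess` helper and the synthetic test tone below are illustrative only and not part of the commit; they assume numpy, torch, and torchaudio are installed.

# Illustrative sketch, not part of the commit: mirrors the preprocessing
# inside model_inference without loading the model.
import numpy as np
import torch
import torchaudio


def preprocess(input_wav, fs):
    # int16 PCM -> float32 in roughly [-1, 1], as in model_inference
    input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
    # stereo (samples, channels) -> mono by averaging channels
    if len(input_wav.shape) > 1:
        input_wav = input_wav.mean(-1)
    # resample anything that is not already 16 kHz
    if fs != 16000:
        resampler = torchaudio.transforms.Resample(fs, 16000)
        input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
        input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
    return input_wav


# Example: a one-second 440 Hz tone at 44.1 kHz, shaped like a Gradio microphone capture.
sr = 44100
tone = (np.sin(2 * np.pi * 440 * np.arange(sr) / sr) * 32767).astype(np.int16)
out = preprocess(tone, sr)
print(out.shape, out.dtype)  # approximately (16000,) float32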