abreza committed on
Commit
a4e2823
·
1 Parent(s): 5f64318

Add asset and model downloading functionality to app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -23
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import os
2
  import gradio as gr
3
  import spaces
 
 
4
  import dolphin
5
  from dolphin.languages import LANGUAGE_CODES, LANGUAGE_REGION_CODES
6
 
@@ -16,6 +18,17 @@ MODELS = {
16
  "small (372M)": "small",
17
  }
18
 
 
 
 
 
 
 
 
 
 
 
 
19
  language_to_regions = {}
20
  for lang_region, names in LANGUAGE_REGION_CODES.items():
21
  if "-" in lang_region:
@@ -25,40 +38,77 @@ for lang_region, names in LANGUAGE_REGION_CODES.items():
25
  language_to_regions[lang].append((f"{region}: {names[0]}", region))
26
 
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  def update_regions(language):
29
  if language and language in language_to_regions:
30
  regions = language_to_regions[language]
31
  regions.sort(key=lambda x: x[0])
32
- return gr.Dropdown.update(choices=regions, value=regions[0][1], visible=True)
33
- return gr.Dropdown.update(choices=[], value=None, visible=False)
34
 
35
 
36
  @spaces.GPU
37
  def transcribe_audio(audio_file, model_name, language, region, predict_timestamps, padding_speech):
38
- model_key = MODELS[model_name]
39
- model = dolphin.load_model(model_key, MODEL_DIR, "cuda")
40
 
41
- waveform = dolphin.load_audio(audio_file)
 
42
 
43
- kwargs = {
44
- "predict_time": predict_timestamps,
45
- "padding_speech": padding_speech
46
- }
47
 
48
- if language:
49
- kwargs["lang_sym"] = language
50
- if region:
51
- kwargs["region_sym"] = region
52
 
53
- result = model(waveform, **kwargs)
 
 
 
54
 
55
- output_text = result.text
56
- language_detected = f"{result.language}"
57
- region_detected = f"{result.region}"
 
58
 
59
- detected_info = f"Detected language: {result.language}" + \
60
- (f", region: {result.region}" if result.region else "")
61
- return output_text, detected_info
 
 
 
 
 
 
 
 
62
 
63
 
64
  with gr.Blocks(title="Dolphin Speech Recognition") as demo:
@@ -115,7 +165,7 @@ with gr.Blocks(title="Dolphin Speech Recognition") as demo:
115
  language_dropdown.change(
116
  fn=update_regions,
117
  inputs=[language_dropdown],
118
- outputs=[region_dropdown]
119
  )
120
 
121
  transcribe_button.click(
@@ -132,13 +182,13 @@ with gr.Blocks(title="Dolphin Speech Recognition") as demo:
132
  )
133
 
134
  gr.Markdown("""
135
-
136
  - The model supports 40 Eastern languages and 22 Chinese dialects
137
  - You can let the model auto-detect language or specify language and region
138
  - Timestamps can be included in the output
139
  - Speech can be padded to 30 seconds for better processing
140
 
141
-
142
  - Model: [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin)
143
  - Paper: [Dolphin: A Multilingual Model for Eastern Languages](https://arxiv.org/abs/2503.20212)
144
  """)
 
1
  import os
2
  import gradio as gr
3
  import spaces
4
+ import urllib.request
5
+ import shutil
6
  import dolphin
7
  from dolphin.languages import LANGUAGE_CODES, LANGUAGE_REGION_CODES
8
 
 
18
  "small (372M)": "small",
19
  }
20
 
21
# Direct-download URLs for each model checkpoint, keyed by the short model
# name used as the values of MODELS (see ensure_model_downloaded).
MODEL_URLS = {
    "base": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/base.pt",
    "small": "https://huggingface.co/DataoceanAI/dolphin-small/resolve/main/small.pt",
}

# Auxiliary assets (tokenizer model, config, feature statistics) required at
# inference time; all sizes share these files, served from the dolphin-base repo
# (see ensure_assets_downloaded).
ASSET_URLS = {
    "bpe.model": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/bpe.model",
    "config.yaml": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/config.yaml",
    "feats_stats.npz": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/feats_stats.npz",
}
32
  language_to_regions = {}
33
  for lang_region, names in LANGUAGE_REGION_CODES.items():
34
  if "-" in lang_region:
 
38
  language_to_regions[lang].append((f"{region}: {names[0]}", region))
39
 
40
 
41
def download_file(url, dest_path):
    """Download *url* to *dest_path* unless the file already exists.

    The payload is streamed to a temporary "<dest_path>.part" file and then
    atomically renamed into place with os.replace, so an interrupted download
    can never leave a truncated file that a later run would mistake for a
    complete one (the old code wrote directly to dest_path).

    Raises:
        urllib.error.URLError: if the URL cannot be opened.
        OSError: on filesystem errors.
    """
    if os.path.exists(dest_path):
        print(f"File already exists: {dest_path}")
        return

    print(f"Downloading {url} to {dest_path}")
    tmp_path = dest_path + ".part"
    try:
        with urllib.request.urlopen(url) as response, open(tmp_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        os.replace(tmp_path, dest_path)  # atomic on POSIX
    finally:
        # If the download or rename failed, drop the partial file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print(f"Downloaded {dest_path}")
49
+
50
+
51
def ensure_assets_downloaded():
    """Fetch every file in ASSET_URLS into <this script's dir>/dolphin/assets.

    Files already present on disk are left untouched (download_file skips
    them), so repeated calls are cheap.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    assets_dir = os.path.join(base_dir, "dolphin", "assets")
    os.makedirs(assets_dir, exist_ok=True)

    for asset_name, asset_url in ASSET_URLS.items():
        target = os.path.join(assets_dir, asset_name)
        download_file(asset_url, target)
58
+
59
+
60
def ensure_model_downloaded(model_key):
    """Return the local path to the checkpoint for *model_key*, downloading
    it into MODEL_DIR on first use.

    Args:
        model_key: short model name ("base" or "small"), a value of MODELS.

    Returns:
        Path to "<MODEL_DIR>/<model_key>.pt".

    Raises:
        ValueError: if *model_key* has no entry in MODEL_URLS.
    """
    if model_key not in MODEL_URLS:
        raise ValueError(f"Unknown model: {model_key}")

    # The old code assumed MODEL_DIR already existed; create it so the very
    # first download does not fail with FileNotFoundError.
    os.makedirs(MODEL_DIR, exist_ok=True)

    model_path = os.path.join(MODEL_DIR, f"{model_key}.pt")
    if not os.path.exists(model_path):
        download_file(MODEL_URLS[model_key], model_path)

    return model_path
69
+
70
+
71
def update_regions(language):
    """Gradio change-handler: refresh the region dropdown for *language*.

    Returns a single gr.update() for the region dropdown — visible with the
    language's regions (first one preselected, choices sorted by label) when
    regions exist, hidden and cleared otherwise.

    The previous version returned a bare 3-tuple wired to the same component
    three times, which never actually updated choices/visibility; a component
    update object is the supported mechanism (replacing the removed
    gr.Dropdown.update API).
    """
    if language and language in language_to_regions:
        regions = language_to_regions[language]
        # NOTE: sorts the shared list in language_to_regions in place.
        regions.sort(key=lambda x: x[0])
        return gr.update(choices=regions, value=regions[0][1], visible=True)
    return gr.update(choices=[], value=None, visible=False)
77
 
78
 
79
@spaces.GPU
def transcribe_audio(audio_file, model_name, language, region, predict_timestamps, padding_speech):
    """Run Dolphin ASR on an uploaded audio file.

    Args:
        audio_file: path to the audio file provided by the Gradio Audio input.
        model_name: display name, a key of MODELS (e.g. "small (372M)").
        language: optional language symbol; empty/None lets the model detect.
        region: optional region symbol, only meaningful with *language*.
        predict_timestamps: include timestamps in the transcript.
        padding_speech: pad speech (per the UI, to 30 seconds) before decoding.

    Returns:
        (transcript_text, detected_info) on success; on any failure returns
        (error message, "Transcription failed") so the UI reports the problem
        instead of crashing.
    """
    try:
        ensure_assets_downloaded()

        model_key = MODELS[model_name]
        ensure_model_downloaded(model_key)

        # Loaded per request: @spaces.GPU only grants the GPU for this call.
        model = dolphin.load_model(model_key, MODEL_DIR, "cuda")
        waveform = dolphin.load_audio(audio_file)

        kwargs = {
            "predict_time": predict_timestamps,
            "padding_speech": padding_speech,
        }
        if language:
            kwargs["lang_sym"] = language
        if region:
            kwargs["region_sym"] = region

        result = model(waveform, **kwargs)

        # (Removed dead locals that formatted result.language/region but were
        # never used.)
        detected_info = f"Detected language: {result.language}" + (
            f", region: {result.region}" if result.region else "")
        return result.text, detected_info
    except Exception as e:
        # UI boundary: surface the error as output rather than raising.
        return f"Error: {str(e)}", "Transcription failed"
112
 
113
 
114
  with gr.Blocks(title="Dolphin Speech Recognition") as demo:
 
165
  language_dropdown.change(
166
  fn=update_regions,
167
  inputs=[language_dropdown],
168
+ outputs=[region_dropdown, region_dropdown, region_dropdown]
169
  )
170
 
171
  transcribe_button.click(
 
182
  )
183
 
184
    # Static footer: usage notes and project credits.
    gr.Markdown("""
    ## Usage Notes
    - The model supports 40 Eastern languages and 22 Chinese dialects
    - You can let the model auto-detect language or specify language and region
    - Timestamps can be included in the output
    - Speech can be padded to 30 seconds for better processing

    ## Credits
    - Model: [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin)
    - Paper: [Dolphin: A Multilingual Model for Eastern Languages](https://arxiv.org/abs/2503.20212)
    """)