# Hugging Face Space: Dolphin ASR demo (runs on ZeroGPU hardware).
import os
import gradio as gr
import spaces
import urllib.request
import shutil
import dolphin
from dolphin.languages import LANGUAGE_CODES, LANGUAGE_REGION_CODES
# Directory where model checkpoints are cached, next to this script.
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)

# Dropdown choices as ("code: Name", code) pairs, sorted by label, with an
# auto-detect entry (value None) placed first.
language_options = [("Auto-detect", None)] + sorted(
    ((f"{code}: {names[0]}", code) for code, names in LANGUAGE_CODES.items()),
    key=lambda option: option[0],
)
# Display label -> internal model key used for checkpoint files and URLs.
MODELS = {
    "base (140M)": "base",
    "small (372M)": "small",
}
# Model key -> checkpoint download URL on the Hugging Face Hub.
MODEL_URLS = {
    "base": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/base.pt",
    "small": "https://huggingface.co/DataoceanAI/dolphin-small/resolve/main/small.pt",
}
# Shared tokenizer/config assets; filename -> download URL.
ASSET_URLS = {
    "bpe.model": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/bpe.model",
    "config.yaml": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/config.yaml",
    "feats_stats.npz": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/feats_stats.npz",
}
# Map each bare language code to its (label, region_code) dropdown options,
# derived from the "lang-region" keys of LANGUAGE_REGION_CODES.
language_to_regions = {}
for lang_region, names in LANGUAGE_REGION_CODES.items():
    if "-" not in lang_region:
        continue
    lang, region = lang_region.split("-", 1)
    language_to_regions.setdefault(lang, []).append(
        (f"{region}: {names[0]}", region))
def download_file(url, dest_path):
    """Download *url* to *dest_path* unless the file already exists.

    The payload is streamed to a temporary ``.part`` sibling first and then
    atomically moved into place with ``os.replace``. Previously an
    interrupted download left a truncated file at *dest_path*, which later
    runs mistook for a complete download and skipped.

    Args:
        url: source URL (any scheme urllib supports).
        dest_path: final path for the downloaded file.
    """
    if os.path.exists(dest_path):
        print(f"File already exists: {dest_path}")
        return
    print(f"Downloading {url} to {dest_path}")
    tmp_path = dest_path + ".part"
    try:
        with urllib.request.urlopen(url) as response, open(tmp_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        os.replace(tmp_path, dest_path)  # atomic rename on POSIX and Windows
    except BaseException:
        # Remove the partial file so a retry starts clean; re-raise so the
        # caller sees the failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
    print(f"Downloaded {dest_path}")
def ensure_assets_downloaded():
    """Fetch the shared tokenizer/config assets into dolphin/assets if missing."""
    base_dir = os.path.dirname(os.path.abspath(__file__))
    assets_dir = os.path.join(base_dir, "dolphin", "assets")
    os.makedirs(assets_dir, exist_ok=True)
    for asset_name, asset_url in ASSET_URLS.items():
        download_file(asset_url, os.path.join(assets_dir, asset_name))
def ensure_model_downloaded(model_key):
    """Return the local checkpoint path for *model_key*, downloading it if absent.

    Args:
        model_key: internal model key (a value of the MODELS mapping).

    Returns:
        Path to the ``<model_key>.pt`` file inside MODEL_DIR.

    Raises:
        ValueError: if *model_key* has no entry in MODEL_URLS.
    """
    try:
        url = MODEL_URLS[model_key]
    except KeyError:
        raise ValueError(f"Unknown model: {model_key}") from None
    model_path = os.path.join(MODEL_DIR, f"{model_key}.pt")
    if not os.path.exists(model_path):
        download_file(url, model_path)
    return model_path
def update_regions(language):
    """Return (region_choices, default_region, visible) for the region dropdown.

    Args:
        language: bare language code selected in the UI, or None/"" when
            auto-detect is chosen.

    Returns:
        A tuple of the sorted (label, region_code) choices, the first region
        code (or None when there are no regions), and whether the region
        dropdown should be visible.
    """
    if not language:
        return [], None, False
    regions = language_to_regions.get(language)
    if not regions:
        return [], None, False
    # Sort a copy: the previous in-place .sort() silently mutated the shared
    # language_to_regions entry on every UI callback.
    sorted_regions = sorted(regions, key=lambda option: option[0])
    return sorted_regions, sorted_regions[0][1], True
@spaces.GPU
def transcribe_audio(audio_file, model_name, language, region, predict_timestamps, padding_speech):
    """Run Dolphin ASR on an uploaded or recorded audio file (on GPU).

    Args:
        audio_file: filesystem path supplied by the gr.Audio component,
            or None/"" when nothing was provided.
        model_name: display label into MODELS (e.g. "small (372M)").
        language: language code to force, or None for auto-detect.
        region: region code to force, or None.
        predict_timestamps: whether to include timestamps in the output.
        padding_speech: whether to pad the audio to 30 s before inference.

    Returns:
        (transcript_text, detected_language_info). On failure both slots
        carry an error description instead of raising into Gradio.
    """
    try:
        if not audio_file:
            return "Please upload or record audio first", ""
        ensure_assets_downloaded()
        model_key = MODELS[model_name]
        ensure_model_downloaded(model_key)
        # NOTE(review): the model is reloaded on every request; caching it
        # per model_key would speed up repeat transcriptions.
        model = dolphin.load_model(model_key, MODEL_DIR, "cuda")
        waveform = dolphin.load_audio(audio_file)
        kwargs = {
            "predict_time": predict_timestamps,
            "padding_speech": padding_speech
        }
        if language:
            kwargs["lang_sym"] = language
        if region:
            kwargs["region_sym"] = region
        result = model(waveform, **kwargs)
        detected_info = f"Detected language: {result.language}" + (
            f", region: {result.region}" if result.region else "")
        return result.text, detected_info
    except Exception as e:
        # Surface the failure in the UI rather than crashing the worker.
        print(f"Error in transcribe_audio: {str(e)}")
        return f"Error: {str(e)}", "Transcription failed"
# Build the Gradio UI: left column for inputs/options, right column for the
# transcript, plus event wiring (component variables are referenced below,
# so the layout and handlers must stay in this order).
with gr.Blocks(title="Dolphin Speech Recognition") as demo:
    gr.Markdown("# Dolphin ASR")
    gr.Markdown("""
A multilingual, multitask ASR model supporting 40 Eastern languages and 22 Chinese dialects.
This model is from [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin), for speech recognition in
Eastern languages including Chinese, Japanese, Korean, and many more.
""")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                type="filepath", label="Upload or Record Audio")
            with gr.Row():
                model_dropdown = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    value=list(MODELS.keys())[1],
                    label="Model Size"
                )
            with gr.Row():
                language_dropdown = gr.Dropdown(
                    choices=language_options,
                    value=language_options[0][1],
                    label="Language",
                    info="Default is auto-detect"
                )
                # Hidden until a language with known regions is selected
                # (see on_language_change below).
                region_dropdown = gr.Dropdown(
                    choices=[],
                    value=None,
                    label="Region",
                    visible=False
                )
            with gr.Row():
                timestamp_checkbox = gr.Checkbox(
                    value=True,
                    label="Include Timestamps"
                )
                padding_checkbox = gr.Checkbox(
                    value=False,
                    label="Pad Speech to 30s"
                )
            transcribe_button = gr.Button("Transcribe", variant="primary")
        with gr.Column():
            output_text = gr.Textbox(label="Transcription", lines=10)
            language_info = gr.Textbox(label="Detected Language", lines=1)

    def on_language_change(language):
        # Repopulate and show/hide the region dropdown for the chosen language.
        regions, default_value, is_visible = update_regions(language)
        return {
            region_dropdown: gr.update(
                choices=regions, value=default_value, visible=is_visible)
        }
    language_dropdown.change(
        fn=on_language_change,
        inputs=[language_dropdown],
        outputs=[region_dropdown]
    )
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[
            audio_input,
            model_dropdown,
            language_dropdown,
            region_dropdown,
            timestamp_checkbox,
            padding_checkbox
        ],
        outputs=[output_text, language_info]
    )
    gr.Markdown("""
## Usage Notes
- The model supports 40 Eastern languages and 22 Chinese dialects
- You can let the model auto-detect language or specify language and region
- Timestamps can be included in the output
- Speech can be padded to 30 seconds for better processing
## Credits
- Model: [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin)
- Paper: [Dolphin: A Multilingual Model for Eastern Languages](https://arxiv.org/abs/2503.20212)
""")
# Warm the caches at startup so the first request never blocks on downloads.
ensure_assets_downloaded()
for checkpoint_key in MODELS.values():
    ensure_model_downloaded(checkpoint_key)

demo.launch()