import os
import gradio as gr
import spaces
import urllib.request
import shutil
import dolphin
from dolphin.languages import LANGUAGE_CODES, LANGUAGE_REGION_CODES

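# Local directory where model checkpoints are cached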
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "models")
os.makedirs(MODEL_DIR, exist_ok=True)

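# Language dropdown choices built from Dolphin's language table, with auto-detect first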
language_options = [(f"{code}: {name[0]}", code)
                    for code, name in LANGUAGE_CODES.items()]
language_options.sort(key=lambda x: x[0])
language_options = [("Auto-detect", None)] + language_options

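# UI labels mapped to Dolphin model names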
MODELS = {
    "base (140M)": "base",
    "small (372M)": "small",
}

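# Checkpoint files hosted on the Hugging Face Hub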
MODEL_URLS = {
    "base": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/base.pt",
    "small": "https://huggingface.co/DataoceanAI/dolphin-small/resolve/main/small.pt",
}

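# Shared assets (BPE tokenizer, config, feature statistics) used by the dolphin package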
ASSET_URLS = {
    "bpe.model": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/bpe.model",
    "config.yaml": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/config.yaml",
    "feats_stats.npz": "https://huggingface.co/DataoceanAI/dolphin-base/resolve/main/feats_stats.npz",
}

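# Map each base language code to its available (label, region) choices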
language_to_regions = {}
for lang_region, names in LANGUAGE_REGION_CODES.items():
    if "-" in lang_region:
        lang, region = lang_region.split("-", 1)
        if lang not in language_to_regions:
            language_to_regions[lang] = []
        language_to_regions[lang].append((f"{region}: {names[0]}", region))


def download_file(url, dest_path):
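    """Download url to dest_path, skipping the download if the file already exists."""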
    if not os.path.exists(dest_path):
        print(f"Downloading {url} to {dest_path}")
        with urllib.request.urlopen(url) as response, open(dest_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        print(f"Downloaded {dest_path}")
    else:
        print(f"File already exists: {dest_path}")


def ensure_assets_downloaded():
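    """Download the shared Dolphin assets (BPE model, config, feature stats) into dolphin/assets."""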
    assets_dir = os.path.join(os.path.dirname(
        os.path.abspath(__file__)), "dolphin", "assets")
    os.makedirs(assets_dir, exist_ok=True)

    for filename, url in ASSET_URLS.items():
        download_file(url, os.path.join(assets_dir, filename))


def ensure_model_downloaded(model_key):
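    """Download the checkpoint for model_key into MODEL_DIR if needed and return its path."""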
    if model_key not in MODEL_URLS:
        raise ValueError(f"Unknown model: {model_key}")

    model_path = os.path.join(MODEL_DIR, f"{model_key}.pt")
    if not os.path.exists(model_path):
        download_file(MODEL_URLS[model_key], model_path)

    return model_path


def update_regions(language):
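    """Return (region choices, default region, visible) for the region dropdown of the given language."""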
    if not language:
        return [], None, False

    if language in language_to_regions:
        regions = language_to_regions[language]
        regions.sort(key=lambda x: x[0])
        default_value = regions[0][1] if regions else None
        return regions, default_value, True
    return [], None, False


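# Request a GPU for this call when running on a Hugging Face ZeroGPU Space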
@spaces.GPU
def transcribe_audio(audio_file, model_name, language, region, predict_timestamps, padding_speech):
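    """Run Dolphin ASR on audio_file and return (transcription, detected language/region info)."""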
    try:
        if not audio_file:
            return "Please upload or record audio first", ""
        ensure_assets_downloaded()

        model_key = MODELS[model_name]
        ensure_model_downloaded(model_key)

        model = dolphin.load_model(model_key, MODEL_DIR, "cuda")

        waveform = dolphin.load_audio(audio_file)

        kwargs = {
            "predict_time": predict_timestamps,
            "padding_speech": padding_speech
        }

        if language:
            kwargs["lang_sym"] = language
            if region:
                kwargs["region_sym"] = region

        result = model(waveform, **kwargs)

        output_text = result.text

        detected_info = f"Detected language: {result.language}" + (
            f", region: {result.region}" if result.region else "")
        return output_text, detected_info
    except Exception as e:
        print(f"Error in transcribe_audio: {str(e)}")
        return f"Error: {str(e)}", "Transcription failed"


with gr.Blocks(title="Dolphin Speech Recognition") as demo:
    gr.Markdown("# Dolphin ASR")
    gr.Markdown("""
    A multilingual, multitask ASR model supporting 40 Eastern languages and 22 Chinese dialects.
    
    This model is from [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin), for speech recognition in 
    Eastern languages including Chinese, Japanese, Korean, and many more.
    """)

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                type="filepath", label="Upload or Record Audio")

            with gr.Row():
                model_dropdown = gr.Dropdown(
                    choices=list(MODELS.keys()),
                    value=list(MODELS.keys())[1],
                    label="Model Size"
                )

            with gr.Row():
                language_dropdown = gr.Dropdown(
                    choices=language_options,
                    value=language_options[0][1],
                    label="Language",
                    info="Default is auto-detect"
                )
                region_dropdown = gr.Dropdown(
                    choices=[],
                    value=None,
                    label="Region",
                    visible=False
                )

            with gr.Row():
                timestamp_checkbox = gr.Checkbox(
                    value=True,
                    label="Include Timestamps"
                )
                padding_checkbox = gr.Checkbox(
                    value=False,
                    label="Pad Speech to 30s"
                )

            transcribe_button = gr.Button("Transcribe", variant="primary")

        with gr.Column():
            output_text = gr.Textbox(label="Transcription", lines=10)
            language_info = gr.Textbox(label="Detected Language", lines=1)

    def on_language_change(language):
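        """Populate and show the region dropdown when the selected language has regional variants."""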
        regions, default_value, is_visible = update_regions(language)
        return {
            region_dropdown: gr.update(
                choices=regions, value=default_value, visible=is_visible)
        }

    language_dropdown.change(
        fn=on_language_change,
        inputs=[language_dropdown],
        outputs=[region_dropdown]
    )

    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[
            audio_input,
            model_dropdown,
            language_dropdown,
            region_dropdown,
            timestamp_checkbox,
            padding_checkbox
        ],
        outputs=[output_text, language_info]
    )

    gr.Markdown("""
    ## Usage Notes
    - The model supports 40 Eastern languages and 22 Chinese dialects
    - You can let the model auto-detect language or specify language and region
    - Timestamps can be included in the output
    - Speech can be padded to 30 seconds for better processing
    
    ## Credits
    - Model: [DataoceanAI/Dolphin](https://github.com/DataoceanAI/Dolphin)
    - Paper: [Dolphin: A Multilingual Model for Eastern Languages](https://arxiv.org/abs/2503.20212)
    """)


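# Download assets and all model checkpoints up front, before launching the UI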
ensure_assets_downloaded()
for model_key in MODELS.values():
    ensure_model_downloaded(model_key)

demo.launch()