Commit 6502e85 · Irpan committed · Parent: 2b3019f

asr
app.py CHANGED
@@ -6,10 +6,14 @@ import util
 mms_transcribe = gr.Interface(
     fn=asr.transcribe,
     inputs=[
-        gr.Audio(
+        gr.Audio(
+            label="Record or Upload Uyghur Audio",
+            sources=["microphone", "upload"],
+            type="filepath",
+        ),
         gr.Dropdown(
             choices=[model for model in asr.models_info],
-            label="Select a Model
+            label="Select a Model",
             value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
             interactive=True
         ),
@@ -19,7 +23,7 @@ mms_transcribe = gr.Interface(
         gr.Textbox(label="Uyghur Latin Transcription"),
     ],
     examples=util.asr_examples,
-    title="Speech-
+    title="Speech-To-Text",
     description=(
         "Transcribe Uyghur speech audio from a microphone or input file."
     ),
@@ -32,7 +36,7 @@ mms_synthesize = gr.Interface(
         gr.Text(label="Input text"),
         gr.Dropdown(
             choices=[model for model in tts.models_info],
-            label="Select a Model
+            label="Select a Model",
             value="Meta-MMS",
             interactive=True
         )
@@ -41,7 +45,7 @@ mms_synthesize = gr.Interface(
         gr.Audio(label="Generated Audio"),
     ],
     examples=util.tts_examples,
-    title="Text-
+    title="Text-To-Speech",
     description=(
         "Generate audio from input Uyghur text."
     ),
@@ -50,7 +54,7 @@ mms_synthesize = gr.Interface(
 
 tabbed_interface = gr.TabbedInterface(
     [mms_transcribe, mms_synthesize],
-    ["Speech-
+    ["Speech-To-Text", "Text-To-Speech"],
 )
 
 with gr.Blocks() as demo:
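For context, a minimal runnable sketch of the interface this commit configures. This is an illustration, not the repo's full app.py: it assumes Gradio 4.x, where gr.Audio takes a `sources` list rather than the older `source` string, and it stubs out asr.transcribe and the model registry with placeholders:

import gradio as gr

# Hypothetical stand-in for asr.transcribe; with type="filepath", Gradio
# passes one path string for both microphone recordings and file uploads
# (or None when the user submits nothing).
def transcribe(audio_path, model_id):
    if audio_path is None:
        return "<<ERROR: Empty Audio Input>>"
    return f"would transcribe {audio_path} with {model_id}"

mms_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(
            label="Record or Upload Uyghur Audio",
            sources=["microphone", "upload"],
            type="filepath",
        ),
        # Placeholder choices; the real app reads asr.models_info.
        gr.Dropdown(choices=["model-a", "model-b"], value="model-a",
                    label="Select a Model", interactive=True),
    ],
    outputs=gr.Textbox(label="Uyghur Latin Transcription"),
    title="Speech-To-Text",
)

if __name__ == "__main__":
    mms_transcribe.launch()

Note that with type="filepath" the str branch of asr.transcribe handles every submission; the tuple branch would only fire if the component were switched to type="numpy", which delivers a (sampling_rate, int16 array) pair.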
asr.py CHANGED
@@ -58,36 +58,31 @@ models_info = {
     # return transcriptions
 
 def transcribe(audio_data, model_id) -> str:
-    # Load audio
-    if not audio_data:
-        return "<<ERROR: Empty Audio Input>>"
-
+    # Load user audio
     if isinstance(audio_data, tuple):
         # microphone
         sampling_rate, audio_input = audio_data
         audio_input = (audio_input / 32768.0).astype(np.float32)
-
     elif isinstance(audio_data, str):
         # file upload
         audio_input, sampling_rate = torchaudio.load(audio_data)
-
-    else:
+    else:
         return "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))
 
-
     model = models_info[model_id]["model"]
     processor = models_info[model_id]["processor"]
-    target_sr =
+    target_sr = processor.feature_extractor.sampling_rate
     ctc_model = models_info[model_id]["ctc_model"]
-
+    print(target_sr)
 
     # Resample if needed
     if sampling_rate != target_sr:
         resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
         audio_input = resampler(audio_input)
+        sampling_rate = target_sr
 
     # Preprocess the audio input
-    inputs = processor(audio_input.squeeze(), sampling_rate=
+    inputs = processor(audio_input.squeeze(), sampling_rate=sampling_rate, return_tensors="pt")
 
     # Move model to GPU if available
     device = "cuda" if torch.cuda.is_available() else "cpu"
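The substantive fix in this hunk is updating sampling_rate after resampling, so the processor is told the rate of the waveform it actually receives. A standalone sketch of that path, assuming a Hugging Face Wav2Vec2Processor-style object as the MMS checkpoints use (prepare_inputs is a hypothetical helper, not a function in the repo):

import torch
import torchaudio

def prepare_inputs(audio_input: torch.Tensor, sampling_rate: int, processor):
    # Rate the model was trained at (16 kHz for the MMS checkpoints).
    target_sr = processor.feature_extractor.sampling_rate
    if sampling_rate != target_sr:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
        audio_input = resampler(audio_input)
        # The commit's fix: without this line, the rate declared to the
        # processor below would no longer match the resampled audio.
        sampling_rate = target_sr
    return processor(
        audio_input.squeeze(),
        sampling_rate=sampling_rate,
        return_tensors="pt",
    )

One caveat the diff leaves open: the microphone branch produces a NumPy array, while torchaudio's Resample expects a torch.Tensor, so that branch would need a torch.from_numpy conversion before resampling; the print(target_sr) line looks like a debugging leftover.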