Irpan committed
Commit 6502e85 · 1 Parent(s): 2b3019f
Files changed (2)
  1. app.py +10 -6
  2. asr.py +6 -11
app.py CHANGED
@@ -6,10 +6,14 @@ import util
 mms_transcribe = gr.Interface(
     fn=asr.transcribe,
     inputs=[
-        gr.Audio(),
+        gr.Audio(
+            label="Record or Upload Uyghur Audio",
+            sources=["microphone", "upload"],
+            type="filepath",
+        ),
         gr.Dropdown(
             choices=[model for model in asr.models_info],
-            label="Select a Model for ASR",
+            label="Select a Model",
             value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
             interactive=True
         ),
@@ -19,7 +23,7 @@ mms_transcribe = gr.Interface(
         gr.Textbox(label="Uyghur Latin Transcription"),
     ],
     examples=util.asr_examples,
-    title="Speech-to-text",
+    title="Speech-To-Text",
     description=(
         "Transcribe Uyghur speech audio from a microphone or input file."
     ),
@@ -32,7 +36,7 @@ mms_synthesize = gr.Interface(
         gr.Text(label="Input text"),
         gr.Dropdown(
             choices=[model for model in tts.models_info],
-            label="Select a Model for TTS",
+            label="Select a Model",
             value="Meta-MMS",
             interactive=True
         )
@@ -41,7 +45,7 @@ mms_synthesize = gr.Interface(
         gr.Audio(label="Generated Audio"),
     ],
     examples=util.tts_examples,
-    title="Text-to-speech",
+    title="Text-To-Speech",
     description=(
         "Generate audio from input Uyghur text."
     ),
@@ -50,7 +54,7 @@ mms_synthesize = gr.Interface(
 
 tabbed_interface = gr.TabbedInterface(
     [mms_transcribe, mms_synthesize],
-    ["Speech-to-text", "Text-to-speech"],
+    ["Speech-To-Text", "Text-To-Speech"],
 )
 
 with gr.Blocks() as demo:
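
Note on the gr.Audio change: with type="filepath", Gradio saves microphone recordings to a temporary file as well, so asr.transcribe should now always receive a str path (or None when no audio is supplied), and the tuple branch in asr.py becomes unreachable. A minimal standalone sketch of that behavior, not part of this commit; describe_input and its wiring are illustrative only:

# Standalone sketch (illustrative, not from this repo): what a callback
# receives from gr.Audio(type="filepath"). Both microphone recordings and
# uploads arrive as a filepath string; a missing input arrives as None.
import gradio as gr

def describe_input(audio_path):
    if audio_path is None:
        return "No audio provided"
    return "Got a {}: {}".format(type(audio_path).__name__, audio_path)

demo = gr.Interface(
    fn=describe_input,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Textbox(),
)

if __name__ == "__main__":
    demo.launch()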
asr.py CHANGED
@@ -58,36 +58,31 @@ models_info = {
     # return transcriptions
 
 def transcribe(audio_data, model_id) -> str:
-    # Load audio file
-    if not audio_data:
-        return "<<ERROR: Empty Audio Input>>"
-
+    # Load user audio
     if isinstance(audio_data, tuple):
         # microphone
         sampling_rate, audio_input = audio_data
         audio_input = (audio_input / 32768.0).astype(np.float32)
-
     elif isinstance(audio_data, str):
         # file upload
         audio_input, sampling_rate = torchaudio.load(audio_data)
-
-    else:
+    else:
         return "<<ERROR: Invalid Audio Input Instance: {}>>".format(type(audio_data))
 
-
     model = models_info[model_id]["model"]
     processor = models_info[model_id]["processor"]
-    target_sr = 16000 #processor.feature_extractor.sampling_rate
+    target_sr = processor.feature_extractor.sampling_rate
     ctc_model = models_info[model_id]["ctc_model"]
-
+    print(target_sr)
 
     # Resample if needed
    if sampling_rate != target_sr:
         resampler = torchaudio.transforms.Resample(sampling_rate, target_sr)
         audio_input = resampler(audio_input)
+        sampling_rate = target_sr
 
     # Preprocess the audio input
-    inputs = processor(audio_input.squeeze(), sampling_rate=target_sr, return_tensors="pt")
+    inputs = processor(audio_input.squeeze(), sampling_rate=sampling_rate, return_tensors="pt")
 
     # Move model to GPU if available
     device = "cuda" if torch.cuda.is_available() else "cpu"
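
The asr.py change replaces the hardcoded 16 kHz target with the rate the processor actually expects and keeps sampling_rate in sync after resampling, so the later processor call cannot silently use a stale rate. A standalone sketch of that resample-then-preprocess path, assuming the checkpoint app.py already defaults to; "example.wav" is illustrative:

# Standalone sketch of the preprocessing step this commit touches.
# The checkpoint name is the one used in app.py; the input file is made up.
import torchaudio
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("ixxan/wav2vec2-large-mms-1b-uyghur-latin")
target_sr = processor.feature_extractor.sampling_rate  # 16000 for MMS-style checkpoints

waveform, sampling_rate = torchaudio.load("example.wav")  # shape: (channels, frames)
if sampling_rate != target_sr:
    waveform = torchaudio.transforms.Resample(sampling_rate, target_sr)(waveform)
    sampling_rate = target_sr  # keep the variable in sync, as the commit now does

inputs = processor(waveform.squeeze(), sampling_rate=sampling_rate, return_tensors="pt")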