Irpan committed on
Commit
81e83c9
·
1 Parent(s): 9db718b
Files changed (3) hide show
  1. app.py +76 -70
  2. asr.py +1 -0
  3. tts.py +1 -2
app.py CHANGED
@@ -4,90 +4,96 @@ import tts
4
  import util
5
 
6
  # Define the Speech-to-Text tab
7
- with gr.Blocks() as mms_transcribe:
8
- gr.Markdown("### Speech-To-Text")
9
- with gr.Row():
10
- audio_input = gr.Audio(
11
- label="Record or Upload Uyghur Audio",
12
- sources=["microphone", "upload"],
13
- type="filepath",
14
- )
15
- model_selection_stt = gr.Dropdown(
16
- choices=[model for model in asr.models_info],
17
- label="Select a Model",
18
- value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
19
- interactive=True
20
- )
21
- with gr.Row():
22
- arabic_output = gr.Textbox(label="Uyghur Arabic Transcription", interactive=False)
23
- latin_output = gr.Textbox(label="Uyghur Latin Transcription", interactive=False)
24
- with gr.Row():
25
- stt_submit_btn = gr.Button("Submit")
26
- stt_clear_btn = gr.Button("Clear")
 
27
 
28
- # Example button to load examples
29
- with gr.Row():
30
- stt_examples = gr.Examples(
31
- examples=util.asr_examples,
 
 
 
 
 
 
 
 
32
  inputs=[audio_input, model_selection_stt],
33
- outputs=[arabic_output, latin_output],
34
- label="Examples"
 
 
 
 
35
  )
36
 
37
- # Define button functionality
38
- stt_submit_btn.click(
39
- asr.transcribe,
40
- inputs=[audio_input, model_selection_stt],
41
- outputs=[arabic_output, latin_output]
42
- )
43
- stt_clear_btn.click(
44
- lambda: (None, None, None), # Clear inputs and outputs
45
- inputs=[],
46
- outputs=[audio_input, arabic_output, latin_output]
47
- )
48
 
49
  # Define the Text-to-Speech tab
50
- with gr.Blocks() as mms_synthesize:
51
- gr.Markdown("### Text-To-Speech")
52
- with gr.Row():
53
- input_text = gr.Text(label="Input text")
54
- model_selection_tts = gr.Dropdown(
55
- choices=[model for model in tts.models_info],
56
- label="Select a Model",
57
- value="Meta-MMS",
58
- interactive=True
59
- )
60
- with gr.Row():
61
- generated_audio = gr.Audio(label="Generated Audio", interactive=False)
62
- with gr.Row():
63
- tts_submit_btn = gr.Button("Submit")
64
- tts_clear_btn = gr.Button("Clear")
 
 
 
 
 
 
 
 
 
 
65
 
66
- # Example button to load examples
67
- with gr.Row():
68
- tts_examples = gr.Examples(
69
- examples=util.tts_examples,
70
  inputs=[input_text, model_selection_tts],
71
- outputs=[generated_audio],
72
- label="Examples"
 
 
 
 
73
  )
74
 
75
- # Define button functionality
76
- tts_submit_btn.click(
77
- tts.synthesize,
78
- inputs=[input_text, model_selection_tts],
79
- outputs=[generated_audio]
80
- )
81
- tts_clear_btn.click(
82
- lambda: (None, None), # Clear inputs and outputs
83
- inputs=[],
84
- outputs=[input_text, generated_audio]
85
- )
86
 
87
  # Combine tabs into a Tabbed Interface
88
  with gr.Blocks() as demo:
89
  gr.Markdown("### Uyghur Language Tools: STT and TTS")
90
- with gr.TabbedInterface([mms_transcribe, mms_synthesize], ["Speech-To-Text", "Text-To-Speech"]):
91
  pass
92
 
93
  # Run the app
 
4
  import util
5
 
6
  # Define the Speech-to-Text tab
7
+ def create_stt_tab():
8
+ with gr.Blocks() as mms_transcribe:
9
+ gr.Markdown("### Speech-To-Text")
10
+ with gr.Row():
11
+ audio_input = gr.Audio(
12
+ label="Record or Upload Uyghur Audio",
13
+ sources=["microphone", "upload"],
14
+ type="filepath",
15
+ )
16
+ model_selection_stt = gr.Dropdown(
17
+ choices=[model for model in asr.models_info],
18
+ label="Select a Model",
19
+ value="ixxan/wav2vec2-large-mms-1b-uyghur-latin",
20
+ interactive=True
21
+ )
22
+ with gr.Row():
23
+ arabic_output = gr.Textbox(label="Uyghur Arabic Transcription", interactive=False)
24
+ latin_output = gr.Textbox(label="Uyghur Latin Transcription", interactive=False)
25
+ with gr.Row():
26
+ stt_submit_btn = gr.Button("Submit")
27
+ stt_clear_btn = gr.Button("Clear")
28
 
29
+ # Example button to load examples
30
+ with gr.Row():
31
+ stt_examples = gr.Examples(
32
+ examples=util.asr_examples,
33
+ inputs=[audio_input, model_selection_stt],
34
+ outputs=[arabic_output, latin_output],
35
+ label="Examples"
36
+ )
37
+
38
+ # Define button functionality
39
+ stt_submit_btn.click(
40
+ asr.transcribe,
41
  inputs=[audio_input, model_selection_stt],
42
+ outputs=[arabic_output, latin_output]
43
+ )
44
+ stt_clear_btn.click(
45
+ lambda: (None, None, None), # Clear inputs and outputs
46
+ inputs=[],
47
+ outputs=[audio_input, arabic_output, latin_output]
48
  )
49
 
50
+ return mms_transcribe
 
 
 
 
 
 
 
 
 
 
51
 
52
  # Define the Text-to-Speech tab
53
+ def create_tts_tab():
54
+ with gr.Blocks() as mms_synthesize:
55
+ gr.Markdown("### Text-To-Speech")
56
+ with gr.Row():
57
+ input_text = gr.Text(label="Input text")
58
+ model_selection_tts = gr.Dropdown(
59
+ choices=[model for model in tts.models_info],
60
+ label="Select a Model",
61
+ value="Meta-MMS",
62
+ interactive=True
63
+ )
64
+ with gr.Row():
65
+ generated_audio = gr.Audio(label="Generated Audio", interactive=False)
66
+ with gr.Row():
67
+ tts_submit_btn = gr.Button("Submit")
68
+ tts_clear_btn = gr.Button("Clear")
69
+
70
+ # Example button to load examples
71
+ with gr.Row():
72
+ tts_examples = gr.Examples(
73
+ examples=util.tts_examples,
74
+ inputs=[input_text, model_selection_tts],
75
+ outputs=[generated_audio],
76
+ label="Examples"
77
+ )
78
 
79
+ # Define button functionality
80
+ tts_submit_btn.click(
81
+ tts.synthesize,
 
82
  inputs=[input_text, model_selection_tts],
83
+ outputs=[generated_audio]
84
+ )
85
+ tts_clear_btn.click(
86
+ lambda: (None, None), # Clear inputs and outputs
87
+ inputs=[],
88
+ outputs=[input_text, generated_audio]
89
  )
90
 
91
+ return mms_synthesize
 
 
 
 
 
 
 
 
 
 
92
 
93
  # Combine tabs into a Tabbed Interface
94
  with gr.Blocks() as demo:
95
  gr.Markdown("### Uyghur Language Tools: STT and TTS")
96
+ with gr.TabbedInterface([create_stt_tab(), create_tts_tab()], ["Speech-To-Text", "Text-To-Speech"]):
97
  pass
98
 
99
  # Run the app
asr.py CHANGED
@@ -109,4 +109,5 @@ def transcribe(audio_data, model_id) -> str:
109
  else: # Latin script output
110
  transcription_arabic = util.ug_latn_to_arab(transcription)
111
  transcription_latin = transcription
 
112
  return transcription_arabic, transcription_latin
 
109
  else: # Latin script output
110
  transcription_arabic = util.ug_latn_to_arab(transcription)
111
  transcription_latin = transcription
112
+ print(model_id, transcription_arabic, transcription_latin)
113
  return transcription_arabic, transcription_latin
tts.py CHANGED
@@ -43,6 +43,7 @@ text2speech = Text2Speech(
43
  text2speech.spc2wav = None ### disable griffin-lim
44
 
45
  def synthesize(text, model_id):
 
46
  if len(text) > 200:
47
  raise ValueError(f"Input text exceeds 200 characters. Please provide a shorter input text for faster processing.")
48
 
@@ -74,8 +75,6 @@ def synthesize_turkic_tts(text):
74
  wav = vocoder.inference(c_mel)
75
 
76
  output = wav.view(-1).cpu().numpy()
77
- print(output.shape)
78
-
79
  output_path = "tts_output.wav"
80
  scipy.io.wavfile.write(output_path, rate=22050, data=output)
81
 
 
43
  text2speech.spc2wav = None ### disable griffin-lim
44
 
45
  def synthesize(text, model_id):
46
+ print(text)
47
  if len(text) > 200:
48
  raise ValueError(f"Input text exceeds 200 characters. Please provide a shorter input text for faster processing.")
49
 
 
75
  wav = vocoder.inference(c_mel)
76
 
77
  output = wav.view(-1).cpu().numpy()
 
 
78
  output_path = "tts_output.wav"
79
  scipy.io.wavfile.write(output_path, rate=22050, data=output)
80