alibabasglab committed on
Commit
09a7355
·
verified ·
1 Parent(s): 8ca20ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -1
app.py CHANGED
@@ -6,6 +6,64 @@ from clearvoice import ClearVoice
6
  import os
7
  import random
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  @spaces.GPU
10
  def fn_clearvoice_sr(input_wav, apply_se):
11
  wavname = input_wav.split('/')[-1]
@@ -28,6 +86,70 @@ def fn_clearvoice_sr(input_wav, apply_se):
28
 
29
  demo = gr.Blocks()
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  sr_demo = gr.Interface(
32
  fn=fn_clearvoice_sr,
33
  inputs = [
@@ -52,6 +174,6 @@ sr_demo = gr.Interface(
52
  )
53
 
54
  with demo:
55
- gr.TabbedInterface([sr_demo], ["Task 4: Speech Super Resolution"])
56
 
57
  demo.launch()
 
6
  import os
7
  import random
8
 
9
@spaces.GPU
def fn_clearvoice_se(input_wav, sr):
    """Enhance a noisy speech recording with ClearVoice.

    Args:
        input_wav: Path to the input audio file.
        sr: Output sampling-rate choice; "16000 Hz" selects the
            FRCRN_SE_16K model, anything else selects MossFormer2_SE_48K.

    Returns:
        Path of the enhanced wav written to the working directory.
    """
    if sr == "16000 Hz":
        enhancer = ClearVoice(task='speech_enhancement', model_names=['FRCRN_SE_16K'])
        fs = 16000
    else:
        enhancer = ClearVoice(task='speech_enhancement', model_names=['MossFormer2_SE_48K'])
        fs = 48000
    result = enhancer(input_path=input_wav, online_write=False)
    # The model may return {model_name: waveform}; unwrap the first entry.
    if isinstance(result, dict):
        enhanced = result[next(iter(result))]
    else:
        enhanced = result
    sf.write('enhanced.wav', enhanced, fs)
    return 'enhanced.wav'
25
+
26
@spaces.GPU
def fn_clearvoice_ss(input_wav):
    """Separate a two-speaker mixture into individual speaker tracks.

    Args:
        input_wav: Path to the 16 kHz mixed-speech audio file.

    Returns:
        Tuple of paths to the two separated wav files.
    """
    myClearVoice = ClearVoice(task='speech_separation', model_names=['MossFormer2_SS_16K'])
    output_wav_dict = myClearVoice(input_path=input_wav, online_write=False)
    # The model may return {model_name: [s1, s2]}; unwrap to the list either way.
    # (Previously both branches duplicated the identical [0]/[1] indexing.)
    if isinstance(output_wav_dict, dict):
        output_wav_list = output_wav_dict[next(iter(output_wav_dict))]
    else:
        output_wav_list = output_wav_dict
    output_wav_s1 = output_wav_list[0]
    output_wav_s2 = output_wav_list[1]
    sf.write('separated_s1.wav', output_wav_s1, 16000)
    sf.write('separated_s2.wav', output_wav_s2, 16000)
    return "separated_s1.wav", "separated_s2.wav"
42
+
43
def find_mp4_files(directory):
    """Recursively collect estimated-speaker output videos under *directory*.

    Only files whose name starts with ``est`` and ends with ``.mp4`` are
    returned; other files (e.g. the copied input video) are skipped.

    Args:
        directory: Root directory to walk.

    Returns:
        List of matching file paths (each joined with its containing root).
    """
    mp4_files = []
    # Walk through the directory and its subdirectories.
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # 'est' prefix marks the model's estimated (extracted) tracks;
            # startswith/endswith replaces the brittle file[:3] slice compare.
            if file.startswith("est") and file.endswith(".mp4"):
                mp4_files.append(os.path.join(root, file))
    return mp4_files
54
+
55
+
56
@spaces.GPU()
def fn_clearvoice_tse(input_video):
    """Extract each on-screen speaker's voice track from a multi-speaker video.

    Args:
        input_video: Path to the input video file.

    Returns:
        List of per-speaker output video paths produced by the model.
    """
    myClearVoice = ClearVoice(task='target_speaker_extraction', model_names=['AV_MossFormer2_TSE_16K'])
    print(f'input_video: {input_video}')
    # The model writes its outputs to disk rather than returning them.
    myClearVoice(input_path=input_video, online_write=True, output_path='path_to_output_videos_tse')
    stem = os.path.basename(input_video).split(".")[0]
    return find_mp4_files(f'path_to_output_videos_tse/AV_MossFormer2_TSE_16K/{stem}/')
66
+
67
  @spaces.GPU
68
  def fn_clearvoice_sr(input_wav, apply_se):
69
  wavname = input_wav.split('/')[-1]
 
86
 
87
  demo = gr.Blocks()
88
 
89
# Tab 1: speech-enhancement UI (16 kHz or 48 kHz output).
se_demo = gr.Interface(
    fn=fn_clearvoice_se,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
        gr.Dropdown(
            ["16000 Hz", "48000 Hz"],
            value="16000 Hz",
            multiselect=False,
            info="Choose a sampling rate for your output.",
        ),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Speech Enhancement",
    description=(
        "ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts clear speech from background noise for enhanced speech quality. It supports both 16 kHz and 48 kHz audio outputs. "
        "To try it, simply upload your audio, or click one of the examples. "
    ),
    article=(
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2206.07293' target='_blank'>FRCRN: Boosting Feature Representation Using Frequency Recurrence for Monaural Speech Enhancement</a> </p>"
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"
    ),
    examples=[
        ["examples/mandarin_speech_16kHz.wav", "16000 Hz"],
        ["examples/english_speech_48kHz.wav", "48000 Hz"],
    ],
    cache_examples=True,
)
111
+
112
# Tab 2: speech-separation UI (16 kHz, two output streams).
ss_demo = gr.Interface(
    fn=fn_clearvoice_ss,
    inputs=[
        gr.Audio(label="Input Audio", type="filepath"),
    ],
    outputs=[
        gr.Audio(label="Output Audio", type="filepath"),
        gr.Audio(label="Output Audio", type="filepath"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Speech Separation",
    description=(
        "ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is powered by AI and separates individual speech from mixed audio. It supports 16 kHz and two output streams. "
        "To try it, simply upload your audio, or click one of the examples. "
    ),
    article=(
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> </p>"
        "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> </p>"
    ),
    examples=[
        ['examples/female_female_speech.wav'],
        ['examples/female_male_speech.wav'],
    ],
    cache_examples=True,
)
132
+
133
# Tab 4: audio-visual target-speaker extraction UI (video in, video gallery out).
tse_demo = gr.Interface(
    fn=fn_clearvoice_tse,
    inputs=[
        gr.Video(label="Input Video"),
    ],
    outputs=[
        gr.Gallery(label="Output Video List"),
    ],
    title="<a href='https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice' target='_blank'>ClearVoice<a/>: Audio-Visual Speaker Extraction",
    description=(
        "ClearVoice ([Github Repo](https://github.com/modelscope/ClearerVoice-Studio/tree/main/clearvoice)) is AI-powered and extracts each speaker's voice from a multi-speaker video using facial recognition. "
        "To try it, simply upload your video, or click one of the examples. "
    ),
    # article = ("<p style='text-align: center'><a href='https://arxiv.org/abs/2302.11824' target='_blank'>MossFormer: Pushing the Performance Limit of Monaural Speech Separation using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions</a> | <a href='https://github.com/alibabasglab/MossFormer' target='_blank'>Github Repo</a></p>"
    # "<p style='text-align: center'><a href='https://arxiv.org/abs/2312.11825' target='_blank'>MossFormer2: Combining Transformer and RNN-Free Recurrent Network for Enhanced Time-Domain Monaural Speech Separation</a> | <a href='https://github.com/alibabasglab/MossFormer2' target='_blank'>Github Repo</a></p>"),
    examples=[
        ['examples/001.mp4'],
        ['examples/002.mp4'],
    ],
    cache_examples=True,
)
152
+
153
  sr_demo = gr.Interface(
154
  fn=fn_clearvoice_sr,
155
  inputs = [
 
174
  )
175
 
176
with demo:
    # Assemble all four task UIs into one tabbed layout.
    gr.TabbedInterface(
        [se_demo, ss_demo, sr_demo, tse_demo],
        ["Task 1: Speech Enhancement", "Task 2: Speech Separation", "Task 3: Speech Super Resolution", "Task 4: Audio-Visual Speaker Extraction"],
    )

demo.launch()