kemuriririn committed
Commit 4e4b6f0 · 1 Parent(s): b1f350e

(wip)remove useless code

Files changed (1)
  1. app.py +36 -94
app.py CHANGED
@@ -63,9 +63,8 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
 
-inference_mode_list = ['3s Voice Clone', 'Instructed Voice Generation']
-instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button',
-                 'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
+inference_mode_list = ['3s Voice Clone']
+instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button'}
 stream_mode_list = [('No', False), ('Yes', True)]
 max_val = 0.8
 cosyvoice_instance = None
@@ -129,10 +128,6 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
     return speech
 
-
-def change_instruction(mode_checkbox_group):
-    return instruct_dict[mode_checkbox_group]
-
 @spaces.GPU
 def prompt_wav_recognition(prompt_wav):
     res = get_asr().generate(input=prompt_wav,
@@ -143,122 +138,69 @@ def prompt_wav_recognition(prompt_wav):
     return text
 
 @spaces.GPU
-def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
-                   seed, stream):
-    sft_dropdown, speed = '', 1.0
+def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream):
+    speed = 1.0
     if prompt_wav_upload is not None:
         prompt_wav = prompt_wav_upload
     elif prompt_wav_record is not None:
         prompt_wav = prompt_wav_record
     else:
         prompt_wav = None
-    # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
-    if mode_checkbox_group in ['Instructed Voice Generation']:
-        if instruct_text == '':
-            gr.Warning('You are using Instructed Voice Generation mode, please input the instruct.')
-            yield (target_sr, default_data)
-        if prompt_wav is None:
-            gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
-    # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
-    if mode_checkbox_group in ['Cross-lingual Clone']:
-        if get_cosyvoice().frontend.instruct is True:
-            gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
-            yield (target_sr, default_data)
-        if instruct_text != '':
-            gr.Info('You are using the cross-lingual Clone mode. The instruct text will be ignored.')
-        if prompt_wav is None:
-            gr.Warning('You are using the cross-lingual Clone mode. Please provide the prompt audio.')
-            yield (target_sr, default_data)
-        gr.Info('You are using the cross-lingual Clone mode. Please ensure that the synthesis text and prompt text are in different languages.')
-    # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
-    if mode_checkbox_group in ['3s Voice Clone', 'Cross-lingual Clone']:
-        if prompt_wav is None:
-            gr.Warning('Empty prompt found, please check the prompt text.')
-            yield (target_sr, default_data)
-        if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
-            gr.Warning('prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
-            yield (target_sr, default_data)
-    # sft mode only use sft_dropdown
-    if mode_checkbox_group in ['Pretrained Voice']:
-        if instruct_text != '' or prompt_wav is not None or prompt_text != '':
-            gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
-    # zero_shot mode only use prompt_wav prompt text
-    if mode_checkbox_group in ['3s Voice Clone']:
-        if prompt_text == '':
-            gr.Warning('Empty prompt found, please check the prompt text.')
-            yield (target_sr, default_data)
-        if instruct_text != '':
-            gr.Info('You are using 3s Voice Clone mode. Pretrained Voice/Instruct will be ingnored.')
-        info = torchaudio.info(prompt_wav)
-        if info.num_frames / info.sample_rate > 10:
-            gr.Warning('Please use prompt audio shorter than 10s.')
-            yield (target_sr, default_data)
 
-    if mode_checkbox_group == 'Pretrained Voice':
-        logging.info('get sft inference request')
-        set_all_random_seed(seed)
-        for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == '3s Voice Clone':
-        logging.info('get zero_shot inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
-        set_all_random_seed(seed)
-        for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == 'Cross-lingual Clone':
-        logging.info('get cross_lingual inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
-        set_all_random_seed(seed)
-        for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
-    else:
-        logging.info('get instruct inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
-        set_all_random_seed(seed)
-        for i in get_cosyvoice().inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
+    if prompt_text == '':
+        gr.Warning('Empty prompt found, please check the prompt text.')
+        yield (target_sr, default_data)
+        return
 
+    if prompt_wav is None:
+        gr.Warning('Empty prompt found, please upload or record audio.')
+        yield (target_sr, default_data)
+        return
+
+    info = torchaudio.info(prompt_wav)
+    if info.num_frames / info.sample_rate > 10:
+        gr.Warning('Please use prompt audio shorter than 10s.')
+        yield (target_sr, default_data)
+        return
+
+    if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
+        gr.Warning('Prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
+        yield (target_sr, default_data)
+        return
+
+    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
+    set_all_random_seed(seed)
+    for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
+        yield (target_sr, i['tts_speech'].numpy().flatten())
 
 def main():
     with gr.Blocks() as demo:
-        gr.Markdown("### Repo [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
-                    Pretrained Model [CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \
-                    [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
-                    [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
-                    [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
-        gr.Markdown("#### Please input the text to synthesize, choose inference mode and follow the controlling steps below.")
+        gr.Markdown("### 3s Voice Clone")
+        gr.Markdown("#### Clone any voice with just 3 seconds of audio. Upload or record audio, input transcription, and click 'Speech Synthesis'.")
 
-        tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities. CosyVoice迎来全面升级,提供更准、更稳、更快、更好的语音生成能力。")
+        tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.")
+        with gr.Row():
+            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
+            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
+        prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
         with gr.Row():
-            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Inference Mode', value=inference_mode_list[0])
-            instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
             stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
             with gr.Column(scale=0.25):
                 seed_button = gr.Button(value="\U0001F3B2")
                 seed = gr.Number(value=0, label="Random Seed")
 
-        with gr.Row():
-            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
-            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
-        prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
-        instruct_text = gr.Textbox(label="Instruct", lines=1, placeholder="Instruct transcription. e.g. A old sea captain, navigates life's storms with timeless wisdom and a heart of gold.", value='')
-
         generate_button = gr.Button("Speech Synthesis")
-
         audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
 
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,
-                              inputs=[tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
-                                      seed, stream],
+                              inputs=[tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream],
                               outputs=[audio_output])
-        mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
         prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
        prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])
 
     demo.launch(max_threads=4)
 
-
 if __name__ == '__main__':
     # sft_spk = cosyvoice.list_avaliable_spks()
     prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
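
For reference, the prompt-audio checks that the trimmed-down generate_audio keeps (prompt no longer than 10 s, sample rate of at least prompt_sr) can be exercised on their own with torchaudio. The snippet below is a minimal sketch, not part of app.py: validate_prompt_wav, PROMPT_SR and MAX_PROMPT_SECONDS are hypothetical names, and prompt_sr is assumed to be 16 kHz, matching the "sample rate >= 16kHz" label in the UI.

import torchaudio

PROMPT_SR = 16000          # assumed value of prompt_sr (UI label says 'sample rate >= 16kHz')
MAX_PROMPT_SECONDS = 10    # mirrors the 'shorter than 10s' warning in the diff

def validate_prompt_wav(path):
    """Return a list of problems with the prompt wav; an empty list means it is usable."""
    problems = []
    info = torchaudio.info(path)                    # reads only the header, not the samples
    duration = info.num_frames / info.sample_rate   # length in seconds
    if duration > MAX_PROMPT_SECONDS:
        problems.append('prompt audio is {:.1f}s, please use audio shorter than {}s'.format(duration, MAX_PROMPT_SECONDS))
    if info.sample_rate < PROMPT_SR:
        problems.append('prompt wav sample rate {}, lower than {}'.format(info.sample_rate, PROMPT_SR))
    return problems

if __name__ == '__main__':
    # example run against the sample prompt shipped with the app
    for issue in validate_prompt_wav('zero_shot_prompt.wav'):
        print(issue)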