Spaces:
Runtime error
Runtime error
Commit
·
4e4b6f0
1
Parent(s):
b1f350e
(wip)remove useless code
Browse files
app.py
CHANGED
@@ -63,9 +63,8 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
|
|
63 |
from cosyvoice.utils.file_utils import load_wav, logging
|
64 |
from cosyvoice.utils.common import set_all_random_seed
|
65 |
|
66 |
-
inference_mode_list = ['3s Voice Clone'
|
67 |
-
instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button'
|
68 |
-
'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
|
69 |
stream_mode_list = [('No', False), ('Yes', True)]
|
70 |
max_val = 0.8
|
71 |
cosyvoice_instance = None
|
@@ -129,10 +128,6 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440):
|
|
129 |
speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
|
130 |
return speech
|
131 |
|
132 |
-
|
133 |
-
def change_instruction(mode_checkbox_group):
|
134 |
-
return instruct_dict[mode_checkbox_group]
|
135 |
-
|
136 |
@spaces.GPU
|
137 |
def prompt_wav_recognition(prompt_wav):
|
138 |
res = get_asr().generate(input=prompt_wav,
|
@@ -143,122 +138,69 @@ def prompt_wav_recognition(prompt_wav):
|
|
143 |
return text
|
144 |
|
145 |
@spaces.GPU
|
146 |
-
def generate_audio(tts_text,
|
147 |
-
|
148 |
-
sft_dropdown, speed = '', 1.0
|
149 |
if prompt_wav_upload is not None:
|
150 |
prompt_wav = prompt_wav_upload
|
151 |
elif prompt_wav_record is not None:
|
152 |
prompt_wav = prompt_wav_record
|
153 |
else:
|
154 |
prompt_wav = None
|
155 |
-
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
|
156 |
-
if mode_checkbox_group in ['Instructed Voice Generation']:
|
157 |
-
if instruct_text == '':
|
158 |
-
gr.Warning('You are using Instructed Voice Generation mode, please input the instruct.')
|
159 |
-
yield (target_sr, default_data)
|
160 |
-
if prompt_wav is None:
|
161 |
-
gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
|
162 |
-
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
163 |
-
if mode_checkbox_group in ['Cross-lingual Clone']:
|
164 |
-
if get_cosyvoice().frontend.instruct is True:
|
165 |
-
gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
|
166 |
-
yield (target_sr, default_data)
|
167 |
-
if instruct_text != '':
|
168 |
-
gr.Info('You are using the cross-lingual Clone mode. The instruct text will be ignored.')
|
169 |
-
if prompt_wav is None:
|
170 |
-
gr.Warning('You are using the cross-lingual Clone mode. Please provide the prompt audio.')
|
171 |
-
yield (target_sr, default_data)
|
172 |
-
gr.Info('You are using the cross-lingual Clone mode. Please ensure that the synthesis text and prompt text are in different languages.')
|
173 |
-
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
|
174 |
-
if mode_checkbox_group in ['3s Voice Clone', 'Cross-lingual Clone']:
|
175 |
-
if prompt_wav is None:
|
176 |
-
gr.Warning('Empty prompt found, please check the prompt text.')
|
177 |
-
yield (target_sr, default_data)
|
178 |
-
if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
|
179 |
-
gr.Warning('prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
|
180 |
-
yield (target_sr, default_data)
|
181 |
-
# sft mode only use sft_dropdown
|
182 |
-
if mode_checkbox_group in ['Pretrained Voice']:
|
183 |
-
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
|
184 |
-
gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
|
185 |
-
# zero_shot mode only use prompt_wav prompt text
|
186 |
-
if mode_checkbox_group in ['3s Voice Clone']:
|
187 |
-
if prompt_text == '':
|
188 |
-
gr.Warning('Empty prompt found, please check the prompt text.')
|
189 |
-
yield (target_sr, default_data)
|
190 |
-
if instruct_text != '':
|
191 |
-
gr.Info('You are using 3s Voice Clone mode. Pretrained Voice/Instruct will be ingnored.')
|
192 |
-
info = torchaudio.info(prompt_wav)
|
193 |
-
if info.num_frames / info.sample_rate > 10:
|
194 |
-
gr.Warning('Please use prompt audio shorter than 10s.')
|
195 |
-
yield (target_sr, default_data)
|
196 |
|
197 |
-
if
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
202 |
-
elif mode_checkbox_group == '3s Voice Clone':
|
203 |
-
logging.info('get zero_shot inference request')
|
204 |
-
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
205 |
-
set_all_random_seed(seed)
|
206 |
-
for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
207 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
208 |
-
elif mode_checkbox_group == 'Cross-lingual Clone':
|
209 |
-
logging.info('get cross_lingual inference request')
|
210 |
-
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
211 |
-
set_all_random_seed(seed)
|
212 |
-
for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
|
213 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
214 |
-
else:
|
215 |
-
logging.info('get instruct inference request')
|
216 |
-
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
217 |
-
set_all_random_seed(seed)
|
218 |
-
for i in get_cosyvoice().inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
|
219 |
-
yield (target_sr, i['tts_speech'].numpy().flatten())
|
220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
|
222 |
def main():
|
223 |
with gr.Blocks() as demo:
|
224 |
-
gr.Markdown("###
|
225 |
-
|
226 |
-
[CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
|
227 |
-
[CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
|
228 |
-
[CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
|
229 |
-
gr.Markdown("#### Please input the text to synthesize, choose inference mode and follow the controlling steps below.")
|
230 |
|
231 |
-
tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.
|
|
|
|
|
|
|
|
|
232 |
with gr.Row():
|
233 |
-
mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Inference Mode', value=inference_mode_list[0])
|
234 |
-
instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
|
235 |
stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
|
236 |
with gr.Column(scale=0.25):
|
237 |
seed_button = gr.Button(value="\U0001F3B2")
|
238 |
seed = gr.Number(value=0, label="Random Seed")
|
239 |
|
240 |
-
with gr.Row():
|
241 |
-
prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
|
242 |
-
prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
|
243 |
-
prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
|
244 |
-
instruct_text = gr.Textbox(label="Instruct", lines=1, placeholder="Instruct transcription. e.g. A old sea captain, navigates life's storms with timeless wisdom and a heart of gold.", value='')
|
245 |
-
|
246 |
generate_button = gr.Button("Speech Synthesis")
|
247 |
-
|
248 |
audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
|
249 |
|
250 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
251 |
generate_button.click(generate_audio,
|
252 |
-
inputs=[tts_text,
|
253 |
-
seed, stream],
|
254 |
outputs=[audio_output])
|
255 |
-
mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
|
256 |
prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
|
257 |
prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])
|
258 |
|
259 |
demo.launch(max_threads=4)
|
260 |
|
261 |
-
|
262 |
if __name__ == '__main__':
|
263 |
# sft_spk = cosyvoice.list_avaliable_spks()
|
264 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
|
|
63 |
from cosyvoice.utils.file_utils import load_wav, logging
|
64 |
from cosyvoice.utils.common import set_all_random_seed
|
65 |
|
66 |
+
inference_mode_list = ['3s Voice Clone']
|
67 |
+
instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button'}
|
|
|
68 |
stream_mode_list = [('No', False), ('Yes', True)]
|
69 |
max_val = 0.8
|
70 |
cosyvoice_instance = None
|
|
|
128 |
speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
|
129 |
return speech
|
130 |
|
|
|
|
|
|
|
|
|
131 |
@spaces.GPU
|
132 |
def prompt_wav_recognition(prompt_wav):
|
133 |
res = get_asr().generate(input=prompt_wav,
|
|
|
138 |
return text
|
139 |
|
140 |
@spaces.GPU
def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream):
    """Validate the voice-clone prompt and stream zero-shot synthesis results.

    Generator used as the click handler of the "Speech Synthesis" button.
    Every yielded item is a ``(sample_rate, samples)`` tuple for the audio
    output component.

    Args:
        tts_text: Text to synthesize.
        prompt_text: Transcription of the prompt audio.
        prompt_wav_upload: Path of an uploaded prompt file, or ``None``.
        prompt_wav_record: Path of a microphone recording, or ``None``.
        seed: Value passed to ``set_all_random_seed`` before inference.
        stream: Forwarded to ``infer_zeroshot`` as its ``stream`` argument.

    Yields:
        ``(target_sr, default_data)`` once when validation fails (after a
        ``gr.Warning`` is raised for the user), otherwise one
        ``(target_sr, flattened samples)`` tuple per chunk produced by
        ``infer_zeroshot``.
    """
    speed = 1.0

    # An uploaded file takes precedence over a microphone recording when both
    # are present.
    if prompt_wav_upload is not None:
        prompt_wav = prompt_wav_upload
    elif prompt_wav_record is not None:
        prompt_wav = prompt_wav_record
    else:
        prompt_wav = None

    # Reject None and whitespace-only transcriptions as well as '': neither
    # gives the model a usable transcription of the prompt audio.
    if not prompt_text or not prompt_text.strip():
        gr.Warning('Empty prompt found, please check the prompt text.')
        yield (target_sr, default_data)
        return

    if prompt_wav is None:
        gr.Warning('Empty prompt found, please upload or record audio.')
        yield (target_sr, default_data)
        return

    # Probe the file once and reuse the metadata for both checks below.
    info = torchaudio.info(prompt_wav)
    if info.num_frames / info.sample_rate > 10:
        gr.Warning('Please use prompt audio shorter than 10s.')
        yield (target_sr, default_data)
        return

    if info.sample_rate < prompt_sr:
        gr.Warning('Prompt wav sample rate {}, lower than {}.'.format(info.sample_rate, prompt_sr))
        yield (target_sr, default_data)
        return

    # Load the prompt at prompt_sr and run it through postprocess() before
    # handing it to the model.
    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
    set_all_random_seed(seed)
    for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
        # presumably i['tts_speech'] is a torch tensor; it is flattened to a
        # 1-D numpy array for the audio component -- TODO confirm.
        yield (target_sr, i['tts_speech'].numpy().flatten())
|
175 |
|
176 |
def main():
    """Build the Gradio UI for 3s voice cloning, wire its events, and launch it."""
    # NOTE(review): this block was recovered from a flattened diff view in
    # which all indentation was lost. The nesting of the `with` blocks (which
    # widgets sit inside each Row/Column, and whether launch() is inside the
    # Blocks context) is reconstructed -- confirm against app.py.
    with gr.Blocks() as demo:
        gr.Markdown("### 3s Voice Clone")
        gr.Markdown("#### Clone any voice with just 3 seconds of audio. Upload or record audio, input transcription, and click 'Speech Synthesis'.")

        tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.")
        # Two alternative sources for the prompt audio; both deliver a file path.
        with gr.Row():
            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
        prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
        with gr.Row():
            # Default is the boolean of the first (label, value) pair, i.e. False.
            stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
            with gr.Column(scale=0.25):
                seed_button = gr.Button(value="\U0001F3B2")
                seed = gr.Number(value=0, label="Random Seed")

        generate_button = gr.Button("Speech Synthesis")
        audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)

        # The dice button writes the result of generate_seed into the seed field.
        seed_button.click(generate_seed, inputs=[], outputs=seed)
        # The order of `inputs` must match the positional parameters of
        # generate_audio.
        generate_button.click(generate_audio,
                              inputs=[tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream],
                              outputs=[audio_output])
        # Whenever either prompt source changes, prompt_wav_recognition runs on
        # it and its result is written into the transcription box.
        prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
        prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])

    demo.launch(max_threads=4)
|
203 |
|
|
|
204 |
if __name__ == '__main__':
|
205 |
# sft_spk = cosyvoice.list_avaliable_spks()
|
206 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|