CosyVoice2-0.5B

Runtime error

App Files Files Community

kemuriririn committed 21 days ago

Commit

4e4b6f0

1 Parent(s): b1f350e

(wip) remove useless code

Browse files

Files changed (1) hide show

app.py +36 -94

app.py CHANGED Viewed

@@ -63,9 +63,8 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
-inference_mode_list = ['3s Voice Clone', 'Instructed Voice Generation']
-instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button',
-                 'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
 stream_mode_list = [('No', False), ('Yes', True)]
 max_val = 0.8
 cosyvoice_instance = None
@@ -129,10 +128,6 @@ def postprocess(speech, top_db=60, hop_length=220, win_length=440):
     speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
     return speech
-def change_instruction(mode_checkbox_group):
-    return instruct_dict[mode_checkbox_group]
 @spaces.GPU
 def prompt_wav_recognition(prompt_wav):
     res = get_asr().generate(input=prompt_wav,
@@ -143,122 +138,69 @@ def prompt_wav_recognition(prompt_wav):
     return text
 @spaces.GPU
-def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
-                   seed, stream):
-    sft_dropdown, speed = '', 1.0
     if prompt_wav_upload is not None:
         prompt_wav = prompt_wav_upload
     elif prompt_wav_record is not None:
         prompt_wav = prompt_wav_record
     else:
         prompt_wav = None
-    # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
-    if mode_checkbox_group in ['Instructed Voice Generation']:
-        if instruct_text == '':
-            gr.Warning('You are using Instructed Voice Generation mode, please input the instruct.')
-            yield (target_sr, default_data)
-        if prompt_wav is None:
-            gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
-    # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
-    if mode_checkbox_group in ['Cross-lingual Clone']:
-        if get_cosyvoice().frontend.instruct is True:
-            gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
-            yield (target_sr, default_data)
-        if instruct_text != '':
-            gr.Info('You are using the cross-lingual Clone mode. The instruct text will be ignored.')
-        if prompt_wav is None:
-            gr.Warning('You are using the cross-lingual Clone mode. Please provide the prompt audio.')
-            yield (target_sr, default_data)
-        gr.Info('You are using the cross-lingual Clone mode. Please ensure that the synthesis text and prompt text are in different languages.')
-    # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
-    if mode_checkbox_group in ['3s Voice Clone', 'Cross-lingual Clone']:
-        if prompt_wav is None:
-            gr.Warning('Empty prompt found, please check the prompt text.')
-            yield (target_sr, default_data)
-        if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
-            gr.Warning('prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
-            yield (target_sr, default_data)
-    # sft mode only use sft_dropdown
-    if mode_checkbox_group in ['Pretrained Voice']:
-        if instruct_text != '' or prompt_wav is not None or prompt_text != '':
-            gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
-    # zero_shot mode only use prompt_wav prompt text
-    if mode_checkbox_group in ['3s Voice Clone']:
-        if prompt_text == '':
-            gr.Warning('Empty prompt found, please check the prompt text.')
-            yield (target_sr, default_data)
-        if instruct_text != '':
-            gr.Info('You are using 3s Voice Clone mode. Pretrained Voice/Instruct will be ingnored.')
-        info = torchaudio.info(prompt_wav)
-        if info.num_frames / info.sample_rate > 10:
-            gr.Warning('Please use prompt audio shorter than 10s.')
-            yield (target_sr, default_data)
-    if mode_checkbox_group == 'Pretrained Voice':
-        logging.info('get sft inference request')
-        set_all_random_seed(seed)
-        for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == '3s Voice Clone':
-        logging.info('get zero_shot inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
-        set_all_random_seed(seed)
-        for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == 'Cross-lingual Clone':
-        logging.info('get cross_lingual inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
-        set_all_random_seed(seed)
-        for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
-    else:
-        logging.info('get instruct inference request')
-        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
-        set_all_random_seed(seed)
-        for i in get_cosyvoice().inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
-            yield (target_sr, i['tts_speech'].numpy().flatten())
 def main():
     with gr.Blocks() as demo:
-        gr.Markdown("### Repo [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
-                    Pretrained Model [CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \
-                    [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
-                    [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
-                    [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
-        gr.Markdown("#### Please input the text to synthesize, choose inference mode and follow the controlling steps below.")
-        tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities. CosyVoice迎来全面升级，提供更准、更稳、更快、 更好的语音生成能力。")
         with gr.Row():
-            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Inference Mode', value=inference_mode_list[0])
-            instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
             stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
             with gr.Column(scale=0.25):
                 seed_button = gr.Button(value="\U0001F3B2")
                 seed = gr.Number(value=0, label="Random Seed")
-        with gr.Row():
-            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
-            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
-        prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
-        instruct_text = gr.Textbox(label="Instruct", lines=1, placeholder="Instruct transcription. e.g. A old sea captain, navigates life's storms with timeless wisdom and a heart of gold.", value='')
         generate_button = gr.Button("Speech Synthesis")
         audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,
-                              inputs=[tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text,
-                                      seed, stream],
                               outputs=[audio_output])
-        mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
         prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
         prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])
     demo.launch(max_threads=4)
 if __name__ == '__main__':
     # sft_spk = cosyvoice.list_avaliable_spks()
     prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)

 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
+inference_mode_list = ['3s Voice Clone']
+instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button'}
 stream_mode_list = [('No', False), ('Yes', True)]
 max_val = 0.8
 cosyvoice_instance = None
     speech = torch.concat([speech, torch.zeros(1, int(target_sr * 0.2))], dim=1)
     return speech
 @spaces.GPU
 def prompt_wav_recognition(prompt_wav):
     res = get_asr().generate(input=prompt_wav,
     return text
 @spaces.GPU
+def generate_audio(tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream):
+    speed = 1.0
     if prompt_wav_upload is not None:
         prompt_wav = prompt_wav_upload
     elif prompt_wav_record is not None:
         prompt_wav = prompt_wav_record
     else:
         prompt_wav = None
+    if prompt_text == '':
+        gr.Warning('Empty prompt found, please check the prompt text.')
+        yield (target_sr, default_data)
+        return
+    if prompt_wav is None:
+        gr.Warning('Empty prompt found, please upload or record audio.')
+        yield (target_sr, default_data)
+        return
+    info = torchaudio.info(prompt_wav)
+    if info.num_frames / info.sample_rate > 10:
+        gr.Warning('Please use prompt audio shorter than 10s.')
+        yield (target_sr, default_data)
+        return
+    if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
+        gr.Warning('Prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
+        yield (target_sr, default_data)
+        return
+    prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
+    set_all_random_seed(seed)
+    for i in infer_zeroshot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
+        yield (target_sr, i['tts_speech'].numpy().flatten())
 def main():
     with gr.Blocks() as demo:
+        gr.Markdown("### 3s Voice Clone")
+        gr.Markdown("#### Clone any voice with just 3 seconds of audio. Upload or record audio, input transcription, and click 'Speech Synthesis'.")
+        tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.")
+        with gr.Row():
+            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
+            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
+        prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
         with gr.Row():
             stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
             with gr.Column(scale=0.25):
                 seed_button = gr.Button(value="\U0001F3B2")
                 seed = gr.Number(value=0, label="Random Seed")
         generate_button = gr.Button("Speech Synthesis")
         audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,
+                              inputs=[tts_text, prompt_text, prompt_wav_upload, prompt_wav_record, seed, stream],
                               outputs=[audio_output])
         prompt_wav_upload.change(fn=prompt_wav_recognition, inputs=[prompt_wav_upload], outputs=[prompt_text])
         prompt_wav_record.change(fn=prompt_wav_recognition, inputs=[prompt_wav_record], outputs=[prompt_text])
     demo.launch(max_threads=4)
 if __name__ == '__main__':
     # sft_spk = cosyvoice.list_avaliable_spks()
     prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)