CosyVoice2-0.5B

Runtime error

App Files Files Community

kemuriririn committed 22 days ago

Commit

9060d5c

1 Parent(s): ba791a8

update

Browse files

Files changed (2) hide show

app.py +41 -19
cosyvoice/cli/cosyvoice.py +1 -0

app.py CHANGED Viewed

@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
 import torch
 os.system('nvidia-smi')
@@ -65,7 +67,39 @@ instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mi
                  'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
 stream_mode_list = [('No', False), ('Yes', True)]
 max_val = 0.8
 def generate_seed():
     seed = random.randint(1, 100000000)
@@ -91,7 +125,7 @@ def change_instruction(mode_checkbox_group):
     return instruct_dict[mode_checkbox_group]
 def prompt_wav_recognition(prompt_wav):
-    res = asr_model.generate(input=prompt_wav,
                              language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
                              use_itn=True,
     )
@@ -117,7 +151,7 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
             gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
     # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
     if mode_checkbox_group in ['Cross-lingual Clone']:
-        if cosyvoice.frontend.instruct is True:
             gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
             yield (target_sr, default_data)
         if instruct_text != '':
@@ -153,25 +187,25 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
     if mode_checkbox_group == 'Pretrained Voice':
         logging.info('get sft inference request')
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '3s Voice Clone':
         logging.info('get zero_shot inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == 'Cross-lingual Clone':
         logging.info('get cross_lingual inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     else:
         logging.info('get instruct inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
-        for i in cosyvoice.inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
@@ -216,23 +250,11 @@ def main():
 if __name__ == '__main__':
-    load_jit = True if os.environ.get('jit') == '1' else False
-    load_onnx = True if os.environ.get('onnx') == '1' else False
-    load_trt = True if os.environ.get('trt') == '1' else False
-    logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(load_jit, load_onnx, load_trt))
-    cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=load_jit, load_onnx=load_onnx, load_trt=load_trt)
     # sft_spk = cosyvoice.list_avaliable_spks()
     prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
     for stream in [True, False]:
-        for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=stream)):
             continue
     prompt_sr, target_sr = 16000, 24000
     default_data = np.zeros(target_sr)
-    model_dir = "FunAudioLLM/SenseVoiceSmall"
-    asr_model = AutoModel(
-        model=model_dir,
-        disable_update=True,
-        log_level='DEBUG',
-        device="cuda:0")
     main()

 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import threading
 import torch
 os.system('nvidia-smi')
                  'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
 stream_mode_list = [('No', False), ('Yes', True)]
 max_val = 0.8
+cosyvoice_instance = None
+asr_model = None
+cosyvoice_lock = threading.Lock()
+@spaces.GPU
+def get_cosyvoice():
+    global cosyvoice_instance, model_dir
+    load_jit = True if os.environ.get('jit') == '1' else False
+    load_onnx = True if os.environ.get('onnx') == '1' else False
+    load_trt = True if os.environ.get('trt') == '1' else False
+    with cosyvoice_lock:
+        if cosyvoice_instance is not None:
+            return cosyvoice_instance
+        else:
+            logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(load_jit, load_onnx, load_trt))
+            cosyvoice_instance= CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=load_jit, load_onnx=load_onnx,
+                           load_trt=load_trt)
+            return cosyvoice_instance
+@spaces.GPU
+def get_asr():
+    global asr_model
+    if asr_model is not None:
+        return asr_model
+    else:
+        logging.info('asr model load')
+        model_dir = "FunAudioLLM/SenseVoiceSmall"
+        asr_model = AutoModel(
+            model=model_dir,
+            disable_update=True,
+            log_level='DEBUG',
+            device="cuda:0")
+        return asr_model
 def generate_seed():
     seed = random.randint(1, 100000000)
     return instruct_dict[mode_checkbox_group]
 def prompt_wav_recognition(prompt_wav):
+    res = get_asr().generate(input=prompt_wav,
                              language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
                              use_itn=True,
     )
             gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
     # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
     if mode_checkbox_group in ['Cross-lingual Clone']:
+        if get_cosyvoice().frontend.instruct is True:
             gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
             yield (target_sr, default_data)
         if instruct_text != '':
     if mode_checkbox_group == 'Pretrained Voice':
         logging.info('get sft inference request')
         set_all_random_seed(seed)
+        for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == '3s Voice Clone':
         logging.info('get zero_shot inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
+        for i in get_cosyvoice().inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     elif mode_checkbox_group == 'Cross-lingual Clone':
         logging.info('get cross_lingual inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
+        for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     else:
         logging.info('get instruct inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
+        for i in get_cosyvoice().inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
 if __name__ == '__main__':
     # sft_spk = cosyvoice.list_avaliable_spks()
     prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
     for stream in [True, False]:
+        for i, j in enumerate(get_cosyvoice().inference_zero_shot('收到好友从远方寄来的生日礼物，那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐，笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=stream)):
             continue
     prompt_sr, target_sr = 16000, 24000
     default_data = np.zeros(target_sr)
     main()

cosyvoice/cli/cosyvoice.py CHANGED Viewed

@@ -103,6 +103,7 @@ class CosyVoice:
     @spaces.GPU
     def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
         if self.frontend.instruct is False:
             raise ValueError('{} do not support instruct inference'.format(self.model_dir))
         instruct_text = self.frontend.text_normalize(instruct_text, split=False)

     @spaces.GPU
     def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
+        assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
         if self.frontend.instruct is False:
             raise ValueError('{} do not support instruct inference'.format(self.model_dir))
         instruct_text = self.frontend.text_normalize(instruct_text, split=False)