kemuriririn committed on
Commit
9060d5c
·
1 Parent(s): ba791a8
Files changed (2) hide show
  1. app.py +41 -19
  2. cosyvoice/cli/cosyvoice.py +1 -0
app.py CHANGED
@@ -12,6 +12,8 @@
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
  import os
 
 
15
  import torch
16
 
17
  os.system('nvidia-smi')
@@ -65,7 +67,39 @@ instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mi
65
  'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
66
  stream_mode_list = [('No', False), ('Yes', True)]
67
  max_val = 0.8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  def generate_seed():
71
  seed = random.randint(1, 100000000)
@@ -91,7 +125,7 @@ def change_instruction(mode_checkbox_group):
91
  return instruct_dict[mode_checkbox_group]
92
 
93
  def prompt_wav_recognition(prompt_wav):
94
- res = asr_model.generate(input=prompt_wav,
95
  language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
96
  use_itn=True,
97
  )
@@ -117,7 +151,7 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
117
  gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
118
  # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
119
  if mode_checkbox_group in ['Cross-lingual Clone']:
120
- if cosyvoice.frontend.instruct is True:
121
  gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
122
  yield (target_sr, default_data)
123
  if instruct_text != '':
@@ -153,25 +187,25 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
153
  if mode_checkbox_group == 'Pretrained Voice':
154
  logging.info('get sft inference request')
155
  set_all_random_seed(seed)
156
- for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
157
  yield (target_sr, i['tts_speech'].numpy().flatten())
158
  elif mode_checkbox_group == '3s Voice Clone':
159
  logging.info('get zero_shot inference request')
160
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
161
  set_all_random_seed(seed)
162
- for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
163
  yield (target_sr, i['tts_speech'].numpy().flatten())
164
  elif mode_checkbox_group == 'Cross-lingual Clone':
165
  logging.info('get cross_lingual inference request')
166
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
167
  set_all_random_seed(seed)
168
- for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
169
  yield (target_sr, i['tts_speech'].numpy().flatten())
170
  else:
171
  logging.info('get instruct inference request')
172
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
173
  set_all_random_seed(seed)
174
- for i in cosyvoice.inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
175
  yield (target_sr, i['tts_speech'].numpy().flatten())
176
 
177
 
@@ -216,23 +250,11 @@ def main():
216
 
217
 
218
  if __name__ == '__main__':
219
- load_jit = True if os.environ.get('jit') == '1' else False
220
- load_onnx = True if os.environ.get('onnx') == '1' else False
221
- load_trt = True if os.environ.get('trt') == '1' else False
222
- logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(load_jit, load_onnx, load_trt))
223
- cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=load_jit, load_onnx=load_onnx, load_trt=load_trt)
224
  # sft_spk = cosyvoice.list_avaliable_spks()
225
  prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
226
  for stream in [True, False]:
227
- for i, j in enumerate(cosyvoice.inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=stream)):
228
  continue
229
  prompt_sr, target_sr = 16000, 24000
230
  default_data = np.zeros(target_sr)
231
-
232
- model_dir = "FunAudioLLM/SenseVoiceSmall"
233
- asr_model = AutoModel(
234
- model=model_dir,
235
- disable_update=True,
236
- log_level='DEBUG',
237
- device="cuda:0")
238
  main()
 
12
  # See the License for the specific language governing permissions and
13
  # limitations under the License.
14
  import os
15
+ import threading
16
+
17
  import torch
18
 
19
  os.system('nvidia-smi')
 
67
  'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
68
  stream_mode_list = [('No', False), ('Yes', True)]
69
  max_val = 0.8
70
+ cosyvoice_instance = None
71
+ asr_model = None
72
+ cosyvoice_lock = threading.Lock()
73
+
74
@spaces.GPU
def get_cosyvoice():
    """Return the shared CosyVoice2 model, constructing it on first use.

    The instance is cached in the module-level ``cosyvoice_instance``.
    Construction happens while holding ``cosyvoice_lock`` so that concurrent
    first callers do not build the model twice.

    The environment variables ``jit``, ``onnx`` and ``trt`` each enable the
    corresponding ``load_jit`` / ``load_onnx`` / ``load_trt`` option when set
    to the string ``'1'``; any other value (or unset) disables it.

    Returns:
        The cached ``CosyVoice2`` instance.
    """
    # NOTE(review): if @spaces.GPU runs this function in a separate worker
    # process (as ZeroGPU may do), the module-level cache would not persist
    # in the caller's process -- confirm against the deployment target.
    global cosyvoice_instance
    with cosyvoice_lock:
        if cosyvoice_instance is None:
            # The flags only matter when the model is actually built.
            load_jit = os.environ.get('jit') == '1'
            load_onnx = os.environ.get('onnx') == '1'
            load_trt = os.environ.get('trt') == '1'
            logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(load_jit, load_onnx, load_trt))
            cosyvoice_instance = CosyVoice2(
                'pretrained_models/CosyVoice2-0.5B',
                load_jit=load_jit,
                load_onnx=load_onnx,
                load_trt=load_trt,
            )
        return cosyvoice_instance
88
 
89
# Serialises first-time construction of the ASR model, mirroring the lock
# that get_cosyvoice() uses for the TTS model.
_asr_lock = threading.Lock()


@spaces.GPU
def get_asr():
    """Return the shared ASR model, constructing it on first use.

    The model is cached in the module-level ``asr_model``. Construction is
    performed while holding ``_asr_lock`` so that two concurrent first calls
    cannot both instantiate ``AutoModel``.

    Returns:
        The cached ``AutoModel`` instance built from
        ``"FunAudioLLM/SenseVoiceSmall"`` on device ``"cuda:0"``.
    """
    global asr_model
    with _asr_lock:
        if asr_model is None:
            logging.info('asr model load')
            model_dir = "FunAudioLLM/SenseVoiceSmall"
            asr_model = AutoModel(
                model=model_dir,
                disable_update=True,
                log_level='DEBUG',
                device="cuda:0",
            )
        return asr_model
103
 
104
  def generate_seed():
105
  seed = random.randint(1, 100000000)
 
125
  return instruct_dict[mode_checkbox_group]
126
 
127
  def prompt_wav_recognition(prompt_wav):
128
+ res = get_asr().generate(input=prompt_wav,
129
  language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
130
  use_itn=True,
131
  )
 
151
  gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
152
  # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
153
  if mode_checkbox_group in ['Cross-lingual Clone']:
154
+ if get_cosyvoice().frontend.instruct is True:
155
  gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
156
  yield (target_sr, default_data)
157
  if instruct_text != '':
 
187
  if mode_checkbox_group == 'Pretrained Voice':
188
  logging.info('get sft inference request')
189
  set_all_random_seed(seed)
190
+ for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
191
  yield (target_sr, i['tts_speech'].numpy().flatten())
192
  elif mode_checkbox_group == '3s Voice Clone':
193
  logging.info('get zero_shot inference request')
194
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
195
  set_all_random_seed(seed)
196
+ for i in get_cosyvoice().inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
197
  yield (target_sr, i['tts_speech'].numpy().flatten())
198
  elif mode_checkbox_group == 'Cross-lingual Clone':
199
  logging.info('get cross_lingual inference request')
200
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
201
  set_all_random_seed(seed)
202
+ for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
203
  yield (target_sr, i['tts_speech'].numpy().flatten())
204
  else:
205
  logging.info('get instruct inference request')
206
  prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
207
  set_all_random_seed(seed)
208
+ for i in get_cosyvoice().inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
209
  yield (target_sr, i['tts_speech'].numpy().flatten())
210
 
211
 
 
250
 
251
 
if __name__ == '__main__':
    # sft_spk = cosyvoice.list_avaliable_spks()
    prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)

    # Warm-up: run one zero-shot synthesis in streaming and non-streaming
    # mode and discard the output, so the model is loaded before the UI starts.
    warmup_text = '收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。'
    warmup_prompt_text = '希望你以后能够做的比我还好呦。'
    for stream in (True, False):
        for _chunk in get_cosyvoice().inference_zero_shot(warmup_text, warmup_prompt_text, prompt_speech_16k, stream=stream):
            pass

    # Module-level settings read by the request handlers: prompt audio sample
    # rate, output sample rate, and a zero-filled placeholder of target_sr samples.
    prompt_sr = 16000
    target_sr = 24000
    default_data = np.zeros(target_sr)

    main()
cosyvoice/cli/cosyvoice.py CHANGED
@@ -103,6 +103,7 @@ class CosyVoice:
103
 
104
  @spaces.GPU
105
  def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
 
106
  if self.frontend.instruct is False:
107
  raise ValueError('{} do not support instruct inference'.format(self.model_dir))
108
  instruct_text = self.frontend.text_normalize(instruct_text, split=False)
 
103
 
104
  @spaces.GPU
105
  def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
106
+ assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
107
  if self.frontend.instruct is False:
108
  raise ValueError('{} do not support instruct inference'.format(self.model_dir))
109
  instruct_text = self.frontend.text_normalize(instruct_text, split=False)