Spaces:
Runtime error
Runtime error
Commit
·
9060d5c
1
Parent(s):
ba791a8
update
Browse files- app.py +41 -19
- cosyvoice/cli/cosyvoice.py +1 -0
app.py
CHANGED
@@ -12,6 +12,8 @@
|
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
import os
|
|
|
|
|
15 |
import torch
|
16 |
|
17 |
os.system('nvidia-smi')
|
@@ -65,7 +67,39 @@ instruct_dict = {'3s Voice Clone': '1. Upload prompt wav file (or record from mi
|
|
65 |
'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
|
66 |
stream_mode_list = [('No', False), ('Yes', True)]
|
67 |
max_val = 0.8
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
def generate_seed():
|
71 |
seed = random.randint(1, 100000000)
|
@@ -91,7 +125,7 @@ def change_instruction(mode_checkbox_group):
|
|
91 |
return instruct_dict[mode_checkbox_group]
|
92 |
|
93 |
def prompt_wav_recognition(prompt_wav):
|
94 |
-
res =
|
95 |
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
|
96 |
use_itn=True,
|
97 |
)
|
@@ -117,7 +151,7 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
117 |
gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
|
118 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
119 |
if mode_checkbox_group in ['Cross-lingual Clone']:
|
120 |
-
if
|
121 |
gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
|
122 |
yield (target_sr, default_data)
|
123 |
if instruct_text != '':
|
@@ -153,25 +187,25 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
153 |
if mode_checkbox_group == 'Pretrained Voice':
|
154 |
logging.info('get sft inference request')
|
155 |
set_all_random_seed(seed)
|
156 |
-
for i in
|
157 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
158 |
elif mode_checkbox_group == '3s Voice Clone':
|
159 |
logging.info('get zero_shot inference request')
|
160 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
161 |
set_all_random_seed(seed)
|
162 |
-
for i in
|
163 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
164 |
elif mode_checkbox_group == 'Cross-lingual Clone':
|
165 |
logging.info('get cross_lingual inference request')
|
166 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
167 |
set_all_random_seed(seed)
|
168 |
-
for i in
|
169 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
170 |
else:
|
171 |
logging.info('get instruct inference request')
|
172 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
173 |
set_all_random_seed(seed)
|
174 |
-
for i in
|
175 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
176 |
|
177 |
|
@@ -216,23 +250,11 @@ def main():
|
|
216 |
|
217 |
|
218 |
if __name__ == '__main__':
|
219 |
-
load_jit = True if os.environ.get('jit') == '1' else False
|
220 |
-
load_onnx = True if os.environ.get('onnx') == '1' else False
|
221 |
-
load_trt = True if os.environ.get('trt') == '1' else False
|
222 |
-
logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(load_jit, load_onnx, load_trt))
|
223 |
-
cosyvoice = CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=load_jit, load_onnx=load_onnx, load_trt=load_trt)
|
224 |
# sft_spk = cosyvoice.list_avaliable_spks()
|
225 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
226 |
for stream in [True, False]:
|
227 |
-
for i, j in enumerate(
|
228 |
continue
|
229 |
prompt_sr, target_sr = 16000, 24000
|
230 |
default_data = np.zeros(target_sr)
|
231 |
-
|
232 |
-
model_dir = "FunAudioLLM/SenseVoiceSmall"
|
233 |
-
asr_model = AutoModel(
|
234 |
-
model=model_dir,
|
235 |
-
disable_update=True,
|
236 |
-
log_level='DEBUG',
|
237 |
-
device="cuda:0")
|
238 |
main()
|
|
|
12 |
# See the License for the specific language governing permissions and
|
13 |
# limitations under the License.
|
14 |
import os
|
15 |
+
import threading
|
16 |
+
|
17 |
import torch
|
18 |
|
19 |
os.system('nvidia-smi')
|
|
|
67 |
'Instructed Voice Generation': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
|
68 |
stream_mode_list = [('No', False), ('Yes', True)]
|
69 |
max_val = 0.8
|
70 |
+
cosyvoice_instance = None
|
71 |
+
asr_model = None
|
72 |
+
cosyvoice_lock = threading.Lock()
|
73 |
+
|
74 |
+
@spaces.GPU
|
75 |
+
def get_cosyvoice():
|
76 |
+
global cosyvoice_instance, model_dir
|
77 |
+
load_jit = True if os.environ.get('jit') == '1' else False
|
78 |
+
load_onnx = True if os.environ.get('onnx') == '1' else False
|
79 |
+
load_trt = True if os.environ.get('trt') == '1' else False
|
80 |
+
with cosyvoice_lock:
|
81 |
+
if cosyvoice_instance is not None:
|
82 |
+
return cosyvoice_instance
|
83 |
+
else:
|
84 |
+
logging.info('cosyvoice args load_jit {} load_onnx {} load_trt {}'.format(load_jit, load_onnx, load_trt))
|
85 |
+
cosyvoice_instance= CosyVoice2('pretrained_models/CosyVoice2-0.5B', load_jit=load_jit, load_onnx=load_onnx,
|
86 |
+
load_trt=load_trt)
|
87 |
+
return cosyvoice_instance
|
88 |
|
89 |
+
@spaces.GPU
|
90 |
+
def get_asr():
|
91 |
+
global asr_model
|
92 |
+
if asr_model is not None:
|
93 |
+
return asr_model
|
94 |
+
else:
|
95 |
+
logging.info('asr model load')
|
96 |
+
model_dir = "FunAudioLLM/SenseVoiceSmall"
|
97 |
+
asr_model = AutoModel(
|
98 |
+
model=model_dir,
|
99 |
+
disable_update=True,
|
100 |
+
log_level='DEBUG',
|
101 |
+
device="cuda:0")
|
102 |
+
return asr_model
|
103 |
|
104 |
def generate_seed():
|
105 |
seed = random.randint(1, 100000000)
|
|
|
125 |
return instruct_dict[mode_checkbox_group]
|
126 |
|
127 |
def prompt_wav_recognition(prompt_wav):
|
128 |
+
res = get_asr().generate(input=prompt_wav,
|
129 |
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
|
130 |
use_itn=True,
|
131 |
)
|
|
|
151 |
gr.Info('You are using Instructed Voice Generation mode, please upload the prompt audio.')
|
152 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
153 |
if mode_checkbox_group in ['Cross-lingual Clone']:
|
154 |
+
if get_cosyvoice().frontend.instruct is True:
|
155 |
gr.Warning('You are using the cross-lingual Clone mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
|
156 |
yield (target_sr, default_data)
|
157 |
if instruct_text != '':
|
|
|
187 |
if mode_checkbox_group == 'Pretrained Voice':
|
188 |
logging.info('get sft inference request')
|
189 |
set_all_random_seed(seed)
|
190 |
+
for i in get_cosyvoice().inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
|
191 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
192 |
elif mode_checkbox_group == '3s Voice Clone':
|
193 |
logging.info('get zero_shot inference request')
|
194 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
195 |
set_all_random_seed(seed)
|
196 |
+
for i in get_cosyvoice().inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
197 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
198 |
elif mode_checkbox_group == 'Cross-lingual Clone':
|
199 |
logging.info('get cross_lingual inference request')
|
200 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
201 |
set_all_random_seed(seed)
|
202 |
+
for i in get_cosyvoice().inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
|
203 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
204 |
else:
|
205 |
logging.info('get instruct inference request')
|
206 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
207 |
set_all_random_seed(seed)
|
208 |
+
for i in get_cosyvoice().inference_instruct2(tts_text, instruct_text, prompt_speech_16k, stream=stream, speed=speed):
|
209 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
210 |
|
211 |
|
|
|
250 |
|
251 |
|
252 |
if __name__ == '__main__':
|
|
|
|
|
|
|
|
|
|
|
253 |
# sft_spk = cosyvoice.list_avaliable_spks()
|
254 |
prompt_speech_16k = load_wav('zero_shot_prompt.wav', 16000)
|
255 |
for stream in [True, False]:
|
256 |
+
for i, j in enumerate(get_cosyvoice().inference_zero_shot('收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的祝福让我心中充满了甜蜜的快乐,笑容如花儿般绽放。', '希望你以后能够做的比我还好呦。', prompt_speech_16k, stream=stream)):
|
257 |
continue
|
258 |
prompt_sr, target_sr = 16000, 24000
|
259 |
default_data = np.zeros(target_sr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
260 |
main()
|
cosyvoice/cli/cosyvoice.py
CHANGED
@@ -103,6 +103,7 @@ class CosyVoice:
|
|
103 |
|
104 |
@spaces.GPU
|
105 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
|
|
106 |
if self.frontend.instruct is False:
|
107 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
108 |
instruct_text = self.frontend.text_normalize(instruct_text, split=False)
|
|
|
103 |
|
104 |
@spaces.GPU
|
105 |
def inference_instruct(self, tts_text, spk_id, instruct_text, stream=False, speed=1.0):
|
106 |
+
assert isinstance(self.model, CosyVoiceModel), 'inference_instruct is only implemented for CosyVoice!'
|
107 |
if self.frontend.instruct is False:
|
108 |
raise ValueError('{} do not support instruct inference'.format(self.model_dir))
|
109 |
instruct_text = self.frontend.text_normalize(instruct_text, split=False)
|