Mahiruoshi committed
Commit c7aa991 · Parent: 2618e11

Update main.py

Files changed (1): main.py (+263, -32)
main.py CHANGED
@@ -19,6 +19,8 @@ import os
 import pickle
 import openai
 from scipy.io.wavfile import write
+import librosa
+from mel_processing import spectrogram_torch
 def is_japanese(string):
     for ch in string:
         if ord(ch) > 0x3040 and ord(ch) < 0x30FF:
@@ -42,6 +44,7 @@ def extrac(text):
         i = romajitable.to_kana(i).katakana
         i = i.replace('\n','').replace(' ','')
         #Current length of single sentence: 20
+        '''
         if len(i)>1:
             if len(i) > 20:
                 try:
@@ -53,6 +56,8 @@ def extrac(text):
                    pass
            else:
                final_list.append(i)
+        '''
+        final_list.append(i)
     final_list = [x for x in final_list if x != '']
     print(final_list)
     return final_list
@@ -98,7 +103,7 @@ def get_symbols_from_json(path):
     return data['symbols']

 def sle(language,text):
-    text = text.replace('\n', ' ').replace('\r', '').replace(" ", "")
+    text = text.replace('\n', '').replace('\r', '').replace(" ", "")
     if language == "中文":
         tts_input1 = "[ZH]" + text + "[ZH]"
         return tts_input1
@@ -124,6 +129,7 @@ def get_text(text,hps_ms):
 def create_tts_fn(net_g,hps,speaker_id):
     speaker_id = int(speaker_id)
     def tts_fn(history,is_gpt,api_key,is_audio,audiopath,repeat_time,text, language, extract, n_scale= 0.667,n_scale_w = 0.8, l_scale = 1 ):
+        text = check_text(text)
         repeat_time = int(repeat_time)
         if is_gpt:
             openai.api_key = api_key
@@ -182,44 +188,241 @@ def create_tts_fn(net_g,hps,speaker_id):
            print(time_end)
            f1.write(str(c-1)+'\n'+time_start+' --> '+time_end+'\n'+sentence+'\n\n')
            audio_fin.append(audio)
-        try:
-            write(audiopath + '.wav',22050,np.concatenate(audio_fin))
-            if is_audio:
-                for i in range(repeat_time):
-                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
-                    os.system(cmd)
+        try:
+            write(audiopath + '.wav',22050,np.concatenate(audio_fin))
+            if is_audio:
+                for i in range(repeat_time):
+                    cmd = 'ffmpeg -y -i ' + audiopath + '.wav' + ' -ar 44100 '+ audiopath.replace('temp','temp'+str(i))
+                    os.system(cmd)

-        except:
-            pass
+        except:
+            pass

         file_path = "subtitles.srt"
         return history,file_path,(hps.data.sampling_rate, np.concatenate(audio_fin))
     return tts_fn

+def create_vc_fn(net_g,hps):
+    def vc_fn(text,language,n_scale,n_scale_w,l_scale,original_speaker, target_speaker, record_audio, upload_audio):
+        input_audio = record_audio if record_audio is not None else upload_audio
+        original_speaker_id = selection(original_speaker)
+        target_speaker_id = selection(target_speaker)
+        if input_audio is None:
+            stn_tst = get_text(sle(language,text),hps)
+            with torch.no_grad():
+                x_tst = stn_tst.unsqueeze(0).to(dev)
+                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(dev)
+                sid = torch.LongTensor([original_speaker_id]).to(dev)
+                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=n_scale, noise_scale_w=n_scale_w, length_scale=l_scale)[0][0,0].data.cpu().float().numpy()
+            sampling_rate = hps.data.sampling_rate
+        else:
+            sampling_rate, audio = input_audio
+            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
+            if len(audio.shape) > 1:
+                audio = librosa.to_mono(audio.transpose(1, 0))
+            if sampling_rate != hps.data.sampling_rate:
+                audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
+        with torch.no_grad():
+            y = torch.FloatTensor(audio)
+            y = y / max(-y.min(), y.max()) / 0.99
+            y = y.to(dev)
+            y = y.unsqueeze(0)
+            spec = spectrogram_torch(y, hps.data.filter_length,
+                hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                center=False).to(dev)
+            spec_lengths = torch.LongTensor([spec.size(-1)]).to(dev)
+            sid_src = torch.LongTensor([original_speaker_id]).to(dev)
+            sid_tgt = torch.LongTensor([target_speaker_id]).to(dev)
+            audio = net_g.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
+                0, 0].data.cpu().float().numpy()
+            del y, spec, spec_lengths, sid_src, sid_tgt
+        return "Success", (hps.data.sampling_rate, audio)
+    return vc_fn
+
 def bot(history,user_message):
-    return history + [[user_message, None]]
+    return history + [[check_text(user_message), None]]
+
+def selection(speaker):
+    if speaker == "高咲侑":
+        spk = 0
+        return spk
+
+    elif speaker == "歩夢":
+        spk = 1
+        return spk
+
+    elif speaker == "かすみ":
+        spk = 2
+        return spk
+
+    elif speaker == "しずく":
+        spk = 3
+        return spk
+
+    elif speaker == "果林":
+        spk = 4
+        return spk
+
+    elif speaker == "愛":
+        spk = 5
+        return spk
+
+    elif speaker == "彼方":
+        spk = 6
+        return spk
+
+    elif speaker == "せつ菜":
+        spk = 7
+        return spk
+
+    elif speaker == "エマ":
+        spk = 8
+        return spk
+
+    elif speaker == "璃奈":
+        spk = 9
+        return spk
+
+    elif speaker == "栞子":
+        spk = 10
+        return spk
+
+    elif speaker == "ランジュ":
+        spk = 11
+        return spk
+
+    elif speaker == "ミア":
+        spk = 12
+        return spk
+
+    elif speaker == "派蒙":
+        spk = 16
+        return spk
+
+    elif speaker == "c1":
+        spk = 18
+        return spk
+
+    elif speaker == "c2":
+        spk = 19
+        return spk
+
+    elif speaker == "華恋":
+        spk = 21
+        return spk
+
+    elif speaker == "まひる":
+        spk = 22
+        return spk
+
+    elif speaker == "なな":
+        spk = 23
+        return spk
+
+    elif speaker == "クロディーヌ":
+        spk = 24
+        return spk
+
+    elif speaker == "ひかり":
+        spk = 25
+        return spk
+
+    elif speaker == "純那":
+        spk = 26
+        return spk
+
+    elif speaker == "香子":
+        spk = 27
+        return spk
+
+    elif speaker == "真矢":
+        spk = 28
+        return spk
+
+    elif speaker == "双葉":
+        spk = 29
+        return spk
+
+    elif speaker == "ミチル":
+        spk = 30
+        return spk
+
+    elif speaker == "メイファン":
+        spk = 31
+        return spk
+
+    elif speaker == "やちよ":
+        spk = 32
+        return spk
+
+    elif speaker == "晶":
+        spk = 33
+        return spk
+
+    elif speaker == "いちえ":
+        spk = 34
+        return spk
+
+    elif speaker == "ゆゆ子":
+        spk = 35
+        return spk
+
+    elif speaker == "塁":
+        spk = 36
+        return spk
+
+    elif speaker == "珠緒":
+        spk = 37
+        return spk
+
+    elif speaker == "あるる":
+        spk = 38
+        return spk
+
+    elif speaker == "ララフィン":
+        spk = 39
+        return spk
+
+    elif speaker == "美空":
+        spk = 40
+        return spk
+
+    elif speaker == "静羽":
+        spk = 41
+        return spk
+
+    else:
+        return 0
+
+def check_text(input):
+    if isinstance(input, str):
+        return input
+    else:
+        with open(input.name, "r", encoding="utf-8") as f:
+            return f.read()

 if __name__ == '__main__':
     hps = utils.get_hparams_from_file('checkpoints/tmp/config.json')
     dev = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     models = []
-    schools = ["Nijigasaki High School","Seisho Music Academy","Rinmeikan Girls School","Frontier School of Arts","Siegfeld Institute of Music"]
+    schools_list = ["ShojoKageki-Nijigasaki","ShojoKageki","Nijigasaki"]
+    schools = []
     lan = ["中文","日文","自动","手动"]
     with open("checkpoints/info.json", "r", encoding="utf-8") as f:
         models_info = json.load(f)
-    checkpoint = models_info['Seisho Music Academy']["checkpoint"]
-    phone_dict = {
-        symbol: i for i, symbol in enumerate(symbols)
-    }
-    net_g = SynthesizerTrn(
-        len(symbols),
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        n_speakers=hps.data.n_speakers,
-        **hps.model).to(dev)
-    _ = net_g.eval()
-    _ = utils.load_checkpoint(checkpoint, net_g)
     for i in models_info:
+        checkpoint = models_info[i]["checkpoint"]
+        phone_dict = {
+            symbol: i for i, symbol in enumerate(symbols)
+        }
+        net_g = SynthesizerTrn(
+            len(symbols),
+            hps.data.filter_length // 2 + 1,
+            hps.train.segment_size // hps.data.hop_length,
+            n_speakers=hps.data.n_speakers,
+            **hps.model).to(dev)
+        _ = net_g.eval()
+        _ = utils.load_checkpoint(checkpoint, net_g)
         school = models_info[i]
         speakers = school["speakers"]
         content = []
@@ -230,12 +433,14 @@ if __name__ == '__main__':
            name = speakers[j]["name"]
            content.append((sid, name, title, example, create_tts_fn(net_g,hps,sid)))
        models.append(content)
-
+        schools.append((i,create_vc_fn(net_g,hps)))
    with gr.Blocks() as app:
        with gr.Tabs():
-            for i in schools:
+            for (i,vc_fn) in schools:
                with gr.TabItem(i):
-                    for (sid, name, title, example, tts_fn) in models[schools.index(i)]:
+                    idols = ["派蒙"]
+                    for (sid, name, title, example, tts_fn) in models[schools_list.index(i)]:
+                        idols.append(name)
                        with gr.TabItem(name):
                            with gr.Column():
                                with gr.Row():
@@ -255,19 +460,45 @@ if __name__ == '__main__':
                                with gr.Accordion(label="Setting", open=False):
                                    input2 = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
                                    input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
-                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.567)
-                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.7)
+                                    input4 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
+                                    input5 = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
                                    input6 = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1)
                                with gr.Accordion(label="Advanced Setting", open=False):
                                    audio_input3 = gr.Dropdown(label="重复次数", choices=list(range(101)), value='0', interactive=True)
                                    api_input1 = gr.Checkbox(value=False, label="接入chatgpt")
-                                    api_input2 = gr.TextArea(label="api-key",lines=1,value = '见 https://openai.com/blog/openai-api')
+                                    api_input2 = gr.TextArea(label="api-key",lines=1,value = '懂得都懂')
                                    output2 = gr.outputs.File(label="字幕文件:subtitles.srt")
                                    audio_input1 = gr.Checkbox(value=False, label="修改音频路径(live2d)")
-                                    audio_input2 = gr.TextArea(label="音频路径",lines=1,value = '#参考 D:/app_develop/live2d_whole/2010002/sounds/temp.wav')
-
+                                    audio_input2 = gr.TextArea(label="音频路径",lines=1,value = 'D:/path/to/live2d/sounds/temp.wav')
+                                    input3 = gr.Checkbox(value=False, label="长句切割(小说合成)")
+                                    inputxt = gr.File(label="Text")
+                                    btnbook = gr.Button("小说合成")
                            btnVC.click(bot, inputs = [chatbot,input1], outputs = [chatbot]).then(
                                tts_fn, inputs=[chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,input1,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
                            )
-
+                            btnbook.click(bot, inputs = [chatbot,inputxt], outputs = [chatbot]).then(
+                                tts_fn, inputs=[chatbot,api_input1,api_input2,audio_input1,audio_input2,audio_input3,inputxt,input2,input3,input4,input5,input6], outputs=[chatbot,output2,output1]
+                            )
+                    with gr.Tab("Voice Conversion(类似sovits)"):
+                        gr.Markdown("""
+                        声线转化,使用模型中的说话人作为音源时效果更佳
+                        """)
+                        with gr.Column():
+                            with gr.Accordion(label="方法1:录制或上传声音,可进行歌声合成", open=False):
+                                record_audio = gr.Audio(label="record your voice", source="microphone")
+                                upload_audio = gr.Audio(label="or upload audio here", source="upload")
+                            with gr.Accordion(label="方法2:由原说话人先进行tts后套娃,适用于合成中文等特殊场景", open=True):
+                                text = gr.TextArea(label="Text", value='由源说话人进行语音转化',lines = 1)
+                                language = gr.Dropdown(label="Language", choices=lan, value="自动", interactive=True)
+                                n_scale = gr.Slider(minimum=0, maximum=1.0, label="更改噪声比例(noise scale),以控制情感", value=0.6)
+                                n_scale_w = gr.Slider(minimum=0, maximum=1.0, label="更改噪声偏差(noise scale w),以控制音素长短", value=0.668)
+                                l_scale = gr.Slider(minimum=0.1, maximum=10, label="duration", value=1.1)
+                                source_speaker = gr.Dropdown(choices=idols, value=idols[-2], label="source speaker")
+                                target_speaker = gr.Dropdown(choices=idols, value=idols[-3], label="target speaker")
+                        with gr.Column():
+                            message_box = gr.Textbox(label="Message")
+                            converted_audio = gr.Audio(label='converted audio')
+                        btn = gr.Button("Convert!")
+                        btn.click(vc_fn, inputs=[text,language,n_scale,n_scale_w,l_scale,source_speaker, target_speaker, record_audio, upload_audio],
+                                  outputs=[message_box, converted_audio])
    app.launch()
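
Review note on the new selection() helper: it maps a speaker's display name to a speaker ID through roughly 150 lines of if/elif branches. The same mapping could live in a dictionary; the sketch below is an equivalent form shown for illustration only (same name-to-ID pairs, same fallback of 0), not what the commit actually adds.

# Equivalent of selection() as a lookup table; pairs copied from the diff above.
SPEAKER_IDS = {
    "高咲侑": 0, "歩夢": 1, "かすみ": 2, "しずく": 3, "果林": 4, "愛": 5,
    "彼方": 6, "せつ菜": 7, "エマ": 8, "璃奈": 9, "栞子": 10, "ランジュ": 11,
    "ミア": 12, "派蒙": 16, "c1": 18, "c2": 19, "華恋": 21, "まひる": 22,
    "なな": 23, "クロディーヌ": 24, "ひかり": 25, "純那": 26, "香子": 27,
    "真矢": 28, "双葉": 29, "ミチル": 30, "メイファン": 31, "やちよ": 32,
    "晶": 33, "いちえ": 34, "ゆゆ子": 35, "塁": 36, "珠緒": 37, "あるる": 38,
    "ララフィン": 39, "美空": 40, "静羽": 41,
}

def selection(speaker):
    # Unknown names fall back to ID 0, matching the final else branch in the diff.
    return SPEAKER_IDS.get(speaker, 0)

A table form also makes the unused IDs (13-15, 17 and 20) easy to spot.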
 
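A small numeric note on vc_fn: in the upload path, y = y / max(-y.min(), y.max()) / 0.99 divides the already-normalized signal by 0.99 again, so peaks land at about 1.01 rather than just below 1.0. If the intent is to leave 1% headroom before the waveform goes into spectrogram_torch, scaling by 0.99 does that; a minimal sketch (normalize_with_headroom is an illustrative name, not in the commit):

import torch

def normalize_with_headroom(y: torch.Tensor, headroom: float = 0.99) -> torch.Tensor:
    # Scale so the largest absolute sample sits at `headroom`, just below full scale.
    peak = max(-y.min().item(), y.max().item())
    return y * (headroom / peak) if peak > 0 else y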
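
The repeat loop kept in tts_fn still builds its ffmpeg command by string concatenation and runs it through os.system, which fails when audiopath contains spaces. A safer equivalent of that loop, assuming ffmpeg is on PATH (resample_copies is an illustrative name, not in the commit):

import subprocess

def resample_copies(audiopath: str, repeat_time: int, target_sr: int = 44100) -> None:
    # Mirrors the loop in tts_fn: write repeat_time copies of <audiopath>.wav resampled to target_sr.
    src = audiopath + '.wav'
    for i in range(repeat_time):
        dst = audiopath.replace('temp', 'temp' + str(i))
        subprocess.run(['ffmpeg', '-y', '-i', src, '-ar', str(target_sr), dst], check=False)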