Mahiruoshi committed on
Commit 1e6af1e · verified · 1 Parent(s): b98c304

Update app.py

Files changed (1)
  1. app.py +224 -8
app.py CHANGED
@@ -25,6 +25,8 @@ from torch.utils.data import DataLoader, Dataset
25
  from tqdm import tqdm
26
  from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
27
28
 
29
  import gradio as gr
30
 
@@ -36,13 +38,14 @@ import commons
36
  from text import cleaned_text_to_sequence, get_bert
37
  from text.cleaner import clean_text
38
  import utils
39
-
40
  from models import SynthesizerTrn
41
  from text.symbols import symbols
42
  import sys
 
43
 
44
  net_g = None
45
- '''
46
  device = (
47
  "cuda:0"
48
  if torch.cuda.is_available()
@@ -52,8 +55,7 @@ device = (
52
  else "cpu"
53
  )
54
  )
55
- '''
56
- device = "cpu"
57
  BandList = {
58
  "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
59
  "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
@@ -156,7 +158,6 @@ def infer(
156
  emo = emo.to(device).unsqueeze(0)
157
  del phones
158
  speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
159
- print(text)
160
  audio = (
161
  net_g.infer(
162
  x_tst,
@@ -188,6 +189,161 @@ def infer(
188
  torch.cuda.empty_cache()
189
  return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
190
191
  def loadmodel(model):
192
  _ = net_g.eval()
193
  _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
@@ -221,7 +377,7 @@ if __name__ == "__main__":
221
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
222
  )
223
  emotion = gr.Textbox(
224
- label="情感标注文本t",
225
  value = 'なんではるひかげやったの?!!'
226
  )
227
  style_weight = gr.Slider(
@@ -274,6 +430,66 @@ if __name__ == "__main__":
274
  ],
275
  outputs=[audio_output],
276
  )
277
-
278
  print("推理页面已开启!")
279
- app.launch(share=True)
 
25
  from tqdm import tqdm
26
  from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
27
 
28
+ from tools.sentence import extrac, is_japanese, is_chinese, seconds_to_ass_time, extract_text_from_file, remove_annotations, extract_and_convert
29
+ import re
30
 
31
  import gradio as gr
32
 
 
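
The helpers imported from tools.sentence above are not shown in this diff: extrac splits raw text into sentences, extract_text_from_file reads an uploaded document, remove_annotations strips markup from a sentence, and is_japanese/is_chinese classify the script. A rough sketch of the two script checks, assuming simple Unicode-range heuristics rather than the module's actual implementation:

def is_japanese(text):
    # Kana ranges are unambiguously Japanese; kanji alone could be Chinese.
    return any('\u3040' <= ch <= '\u30ff' for ch in text)

def is_chinese(text):
    # CJK Unified Ideographs with no kana present.
    return any('\u4e00' <= ch <= '\u9fff' for ch in text) and not is_japanese(text)
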
38
  from text import cleaned_text_to_sequence, get_bert
39
  from text.cleaner import clean_text
40
  import utils
41
+ from scipy.io.wavfile import write
42
  from models import SynthesizerTrn
43
  from text.symbols import symbols
44
  import sys
45
+ import shutil
46
 
47
  net_g = None
48
+
49
  device = (
50
  "cuda:0"
51
  if torch.cuda.is_available()
 
55
  else "cpu"
56
  )
57
  )
58
+
 
59
  BandList = {
60
  "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
61
  "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
 
158
  emo = emo.to(device).unsqueeze(0)
159
  del phones
160
  speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
 
161
  audio = (
162
  net_g.infer(
163
  x_tst,
 
189
  torch.cuda.empty_cache()
190
  return (hps.data.sampling_rate,gr.processing_utils.convert_to_16_bit_wav(audio))
191
 
192
+
193
+ def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
194
+ audio_fin = []
195
+ ass_entries = []
196
+ start_time = 0
197
+ #speaker = random.choice(cara_list)
198
+ ass_header = """[Script Info]
199
+ ; 我没意见
200
+ Title: Audiobook
201
+ ScriptType: v4.00+
202
+ WrapStyle: 0
203
+ PlayResX: 640
204
+ PlayResY: 360
205
+ ScaledBorderAndShadow: yes
206
+ [V4+ Styles]
207
+ Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
208
+ Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
209
+ [Events]
210
+ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
211
+ """
212
+
213
+ for sentence in group:
214
+ print(sentence)
215
+ FakeSpeaker = sentence.split("|")[0]
216
+ print(FakeSpeaker)
217
+ SpeakersList = re.split('\n', spealerList)
218
+ if FakeSpeaker in list(hps.data.spk2id.keys()):
219
+ speaker = FakeSpeaker
220
+ for i in SpeakersList:
221
+ if FakeSpeaker == i.split("|")[1]:
222
+ speaker = i.split("|")[0]
223
+ if sentence != '\n':
224
+ audio = infer_simple((remove_annotations(sentence.split("|")[-1]).replace(" ","")+"。").replace(",。","。").replace("。。","。"), sdp_ratio, noise_scale, noise_scale_w, length_scale,speaker)
225
+ silence_frames = int(silenceTime * sampling_rate)  # silenceTime seconds of padding at the output sampling rate
226
+ silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
227
+ audio_fin.append(audio)
228
+ audio_fin.append(silence_data)
229
+
230
+ duration = len(audio) / sampling_rate
231
+ print(duration)
232
+ end_time = start_time + duration + silenceTime
233
+ ass_entries.append("Dialogue: 0,{},{},Default,,0,0,0,,{}".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time), sentence.replace("|", ":")))
234
+ start_time = end_time
235
+
236
+ wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
237
+ ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')
238
+
239
+ write(wav_filename, sampling_rate, np.concatenate(audio_fin))
240
+
241
+ with open(ass_filename, 'w', encoding='utf-8') as f:
242
+ f.write(ass_header + '\n'.join(ass_entries))
243
+ return (hps.data.sampling_rate, np.concatenate(audio_fin))
244
+
245
+
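
seconds_to_ass_time is also imported from tools.sentence and is not part of this diff. ASS dialogue timestamps use the H:MM:SS.cc form, so a minimal sketch of such a helper, assuming centisecond precision, could look like:

def seconds_to_ass_time(seconds):
    # ASS timestamps: hours unpadded, minutes/seconds zero-padded, then centiseconds.
    h, rem = divmod(int(seconds), 3600)
    m, s = divmod(rem, 60)
    cs = int((seconds % 1) * 100)  # truncate fractional seconds to centiseconds
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
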
246
+ def infer_simple(
247
+ text,
248
+ sdp_ratio,
249
+ noise_scale,
250
+ noise_scale_w,
251
+ length_scale,
252
+ sid,
253
+ emotion = '',
254
+ reference_audio=None,
255
+ skip_start=False,
256
+ skip_end=False,
257
+ style_text=None,
258
+ style_weight=0.7,
259
+ ):
260
+ language = "JP"
261
+ if isinstance(reference_audio, np.ndarray):
262
+ emo = get_clap_audio_feature(reference_audio, device)
263
+ else:
264
+ emo = get_clap_text_feature(emotion, device)
265
+ emo = torch.squeeze(emo, dim=1)
266
+
267
+ bert, phones, tones, lang_ids = get_text(
268
+ text,
269
+ language,
270
+ hps,
271
+ device,
272
+ style_text=style_text,
273
+ style_weight=style_weight,
274
+ )
275
+ if skip_start:
276
+ phones = phones[3:]
277
+ tones = tones[3:]
278
+ lang_ids = lang_ids[3:]
279
+ bert = bert[:, 3:]
280
+ if skip_end:
281
+ phones = phones[:-2]
282
+ tones = tones[:-2]
283
+ lang_ids = lang_ids[:-2]
284
+ bert = bert[:, :-2]
285
+ with torch.no_grad():
286
+ x_tst = phones.to(device).unsqueeze(0)
287
+ tones = tones.to(device).unsqueeze(0)
288
+ lang_ids = lang_ids.to(device).unsqueeze(0)
289
+ bert = bert.to(device).unsqueeze(0)
290
+ x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
291
+ emo = emo.to(device).unsqueeze(0)
292
+ del phones
293
+ speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
294
+ audio = (
295
+ net_g.infer(
296
+ x_tst,
297
+ x_tst_lengths,
298
+ speakers,
299
+ tones,
300
+ lang_ids,
301
+ bert,
302
+ emo,
303
+ sdp_ratio=sdp_ratio,
304
+ noise_scale=noise_scale,
305
+ noise_scale_w=noise_scale_w,
306
+ length_scale=length_scale,
307
+ )[0][0, 0]
308
+ .data.cpu()
309
+ .float()
310
+ .numpy()
311
+ )
312
+ del (
313
+ x_tst,
314
+ tones,
315
+ lang_ids,
316
+ bert,
317
+ x_tst_lengths,
318
+ speakers,
319
+ emo,
320
+ ) # , emo
321
+ if torch.cuda.is_available():
322
+ torch.cuda.empty_cache()
323
+ return audio
324
+
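
infer_simple mirrors the existing infer but returns the raw float waveform instead of a Gradio (rate, int16) tuple, which is what the np.concatenate call in generate_audio_and_srt_for_group needs. A minimal standalone call, assuming a checkpoint has already been loaded with loadmodel; the values mirror the UI slider defaults:

audio = infer_simple(
    "こんにちは、ようこそ。",  # text to synthesize (language is hard-coded to JP inside)
    sdp_ratio=0.2,
    noise_scale=0.6,
    noise_scale_w=0.667,
    length_scale=1.0,
    sid="ましろ",  # must be a key of hps.data.spk2id
    emotion="なんではるひかげやったの?!!",  # CLAP text prompt (UI default)
)
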
325
+ def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,filepath,raw_text):
326
+ directory_path = filepath if torch.cuda.is_available() else "books"
327
+
328
+ if os.path.exists(directory_path):
329
+ shutil.rmtree(directory_path)
330
+
331
+ os.makedirs(directory_path)
332
+ if inputFile:
333
+ text = extract_text_from_file(inputFile.name)
334
+ else:
335
+ text = raw_text
336
+ sentences = extrac(extract_and_convert(text))
337
+ GROUP_SIZE = groupsize
338
+ for i in range(0, len(sentences), GROUP_SIZE):
339
+ group = sentences[i:i+GROUP_SIZE]
340
+ if spealerList == "":
341
+ spealerList = "无"
342
+ result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime)
343
+ if not torch.cuda.is_available():
344
+ return result
345
+ return result
346
+
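
audiobook expects one sentence per line in speaker|text form. The speaker field may name a configured voice directly, or an alias that spealerList resolves (one {ConfigSpeaker}|{TextAlias} pair per line, matching the defaults in the tab below). A small example input; under the default mapping つくし|筑紫, both lines are voiced by つくし:

つくし|我是来结束这个乐队的。
筑紫|ライブ、始まるよ。
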
347
  def loadmodel(model):
348
  _ = net_g.eval()
349
  _ = utils.load_checkpoint(model, net_g, None, skip_optimizer=True)
 
377
  minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
378
  )
379
  emotion = gr.Textbox(
380
+ label="情感标注文本",
381
  value = 'なんではるひかげやったの?!!'
382
  )
383
  style_weight = gr.Slider(
 
430
  ],
431
  outputs=[audio_output],
432
  )
433
+ with gr.Tab('拓展功能'):
434
+ with gr.Row():
435
+ with gr.Column():
436
+ gr.Markdown(
437
+ f"从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看自制galgame使用说明\n</a>"
438
+ )
439
+ inputFile = gr.UploadButton(label="txt文件输入")
440
+ raw_text = gr.TextArea(
441
+ label="文本输入",
442
+ info="输入纯日语或者中文",
443
+ value="つくし|我是来结束这个乐队的。",
444
+ )
445
+ groupSize = gr.Slider(
446
+ minimum=10, maximum=1000 if torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大字数"
447
+ )
448
+ silenceTime = gr.Slider(
449
+ minimum=0, maximum=1, value=0.5, step=0.01, label="句子的间隔"
450
+ )
451
+ filepath = gr.TextArea(
452
+ label="本地合成时的音频存储文件夹(会清空文件夹)",
453
+ value = "D:/audiobook/book1",
454
+ )
455
+ spealerList = gr.TextArea(
456
+ label="角色对应表,左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList}|{SeakerInUploadText}",
457
+ value = "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
458
+ )
459
+ speaker = gr.Dropdown(
460
+ choices=speakers, value = "ましろ", label="选择默认说话人"
461
+ )
462
+ with gr.Column():
463
+ sdp_ratio = gr.Slider(
464
+ minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
465
+ )
466
+ noise_scale = gr.Slider(
467
+ minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
468
+ )
469
+ noise_scale_w = gr.Slider(
470
+ minimum=0.1, maximum=2, value=0.667, step=0.01, label="音素长度"
471
+ )
472
+ length_scale = gr.Slider(
473
+ minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
474
+ )
475
+ LastAudioOutput = gr.Audio(label="当使用cuda时才能在本地文件夹浏览全部文件")
476
+ btn2 = gr.Button("点击生成", variant="primary")
477
+ btn2.click(
478
+ audiobook,
479
+ inputs=[
480
+ inputFile,
481
+ groupSize,
482
+ speaker,
483
+ sdp_ratio,
484
+ noise_scale,
485
+ noise_scale_w,
486
+ length_scale,
487
+ spealerList,
488
+ silenceTime,
489
+ filepath,
490
+ raw_text
491
+ ],
492
+ outputs=[LastAudioOutput],
493
+ )
494
  print("推理页面已开启!")
495
+ app.launch()