Dionyssos commited on
Commit
4e4c64c
·
1 Parent(s): c9121c5

light files

Browse files
Files changed (4) hide show
  1. mimic3_make_harvard_sentences.py +205 -137
  2. models.py +611 -0
  3. text_utils.py +116 -0
  4. utils.py +74 -0
mimic3_make_harvard_sentences.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import shutil
2
  import csv
3
  import io
@@ -62,12 +63,12 @@ import audiofile
62
  # [print(i) for i in foreign_voices]
63
  # print('\n_______________________________\n')
64
  # [print(i) for i in english_voices]
65
- # ====================================================== END PRINT LIST OF VOICES
66
  list_voices = [
67
  'en_US/m-ailabs_low#mary_ann',
68
  'en_UK/apope_low',
69
  'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
70
- 'human'
71
  ] # special - for human we load specific style file - no Mimic3 is run
72
 
73
 
@@ -290,7 +291,7 @@ for _id, _voice in enumerate(list_voices):
290
  with open('harvard.json', 'r') as f:
291
  harvard_individual_sentences = json.load(f)['sentences']
292
  total_audio_mimic3 = []
293
- total_audio_stts2 = []
294
  ix = 0
295
  for list_of_10 in harvard_individual_sentences[:1]: # 77
296
 
@@ -341,16 +342,22 @@ for _id, _voice in enumerate(list_voices):
341
  # # state.ssml = 1234546575
342
  # state.stdout = True
343
  # state.tts = True
344
- process_lines(state, wav_path='tmp1.wav')
 
345
  shutdown_tts(state)
346
- x, fs = audiofile.read('tmp1.wav')
347
- print(x.shape)
348
  else:
 
349
  # MSP['valence.train.votes'].get().sort_values('7').index[-1]
350
- human_style = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
351
- x, fs = audiofile.read(human_style)
 
 
 
352
  print(x.shape,' human') # crop human to almost mimic-3 duration
353
  total_audio_mimic3.append(x)
 
354
  print(fs, text, 'mimic3')
355
 
356
  # MIMIC3 = = = = = = = = = = = = = = END
@@ -358,7 +365,7 @@ for _id, _voice in enumerate(list_voices):
358
 
359
 
360
 
361
- style_vec = msinference.compute_style('tmp1.wav') # use mimic-3 as prompt
362
 
363
 
364
 
@@ -369,39 +376,47 @@ for _id, _voice in enumerate(list_voices):
369
  diffusion_steps=7,
370
  embedding_scale=1)
371
 
372
- total_audio_stts2.append(x)
 
 
373
 
 
 
 
 
 
 
 
374
 
375
-
376
-
377
-
378
-
379
- total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
380
- total_audio_stts2 = audresample.resample(total_audio_stts2, original_rate=24000, target_rate=16000)[0] # for audinterface
381
- audiofile.write(out_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 16000)
382
 
383
  total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
384
- total_audio_mimic3 = audresample.resample(total_audio_mimic3, original_rate=24000, target_rate=16000)[0]
 
 
 
 
 
 
385
  audiofile.write(out_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 16000)
386
 
387
- print('Saving:', out_dir + 'mimic3__' + _str + '.wav')
388
- else:
389
- print('Skip:', out_dir + 'styletts2__' + _str + '.wav')
390
 
391
 
392
  # AUD I N T E R F A C E
393
- # file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
394
 
395
 
396
 
397
- for engine in ['mimic3', 'styletts2']:
 
398
  harvard_of_voice = f'{out_dir}{engine}__{_str}'
399
  if not os.path.exists(harvard_of_voice + '.pkl'):
400
  df = interface.process_file(harvard_of_voice + '.wav')
401
  df.to_pickle(harvard_of_voice + '.pkl')
402
- else:
403
- # df = pd.read_pickle(harvard_of_voice + '.pkl')
404
- print(harvard_of_voice + '.pkl', 'FOUND')
405
 
406
 
407
 
@@ -411,150 +426,203 @@ for _id, _voice in enumerate(list_voices):
411
 
412
 
413
 
414
-
415
  print('\nVisuals\n')
416
 
417
  # ===============================================================================
418
  # V I S U A L S
419
  #
420
  # ===============================================================================
 
 
 
 
421
 
422
- for folder, list_voices in [
423
- ['foreign', foreign_voices],
424
- ['english', english_voices],
425
- ]:
426
- print(folder, list_voices[:4], '\n\nVISUALIZING VOICES')
427
- for _id, _voice in enumerate(list_voices[:4]):
428
- _str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
429
- _dir = folder + '_pkl/'
430
- if 'cmu-arctic' in _str:
431
- _str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
432
-
433
-
434
- vis_df = {}
435
- # LOAD PKL
436
- for engine in ['mimic3', 'styletts2']:
437
- harvard_of_voice = f'{_dir}{engine}__{_str}'
438
- if not os.path.exists(harvard_of_voice + '.pkl'):
439
- df = interface.process_file(harvard_of_voice + '.wav')
440
- df.to_pickle(harvard_of_voice + '.pkl')
441
- else:
442
- df = pd.read_pickle(harvard_of_voice + '.pkl')
443
- print(harvard_of_voice + '.pkl', 'FOUND')
444
 
445
- vis_df[engine] = df
446
- SHORT = min(len(vis_df['mimic3']), len(vis_df['styletts2']))
447
- for k,v in vis_df.items():
448
- p = v[:SHORT] # TRuncate extra segments - human is slower than mimic3
449
 
450
- p.reset_index(inplace= True)
451
- p.drop(columns=['file','start'], inplace=True)
452
- p.set_index('end', inplace=True)
453
- # p = p.filter(scene_classes) #['transport', 'indoor', 'outdoor'])
454
- p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
455
- vis_df[k] = p
 
456
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
 
458
- print(vis_df, '\n\n\n\n \n')
459
- # ============ VISUAL ADV cats of styletts2 vs mimic3 same-voice
 
 
 
 
460
 
461
 
 
462
 
463
- fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24),
464
- gridspec_kw={'hspace': 0, 'wspace': .04})
465
 
466
-
467
- # ADV
468
-
 
 
469
 
470
- time_stamp = vis_df['mimic3'].index.to_numpy()
471
- for j, dim in enumerate(['arousal',
472
- 'dominance',
473
- 'valence']):
474
-
475
- # MIMIC3
476
-
477
- ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
478
- color=(0,104/255,139/255),
479
- label='mean_1',
480
- linewidth=2)
481
- ax[j, 0].fill_between(time_stamp,
482
-
483
- vis_df['mimic3'][dim],
484
- vis_df['styletts2'][dim],
485
-
486
- color=(.2,.2,.2),
487
- alpha=0.244)
488
- if j == 0:
489
- ax[j, 0].legend(['StyleTTS2 style mimic3',
490
- 'StyleTTS2 style crema-d'],
491
- prop={'size': 10},
492
- # loc='lower right'
493
- )
494
- ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
495
-
496
- # TICK
497
- ax[j, 0].set_ylim([1e-7, .9999])
498
- # ax[j, 0].set_yticks([.25, .5,.75])
499
- # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
500
- ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
501
- ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
502
 
 
 
 
 
 
 
 
 
503
 
504
- ax[j, 0].grid()
505
-
506
- # CATEGORIE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
 
508
 
509
 
510
 
511
 
512
- time_stamp = vis_df['styletts2'].index.to_numpy()
513
- for j, dim in enumerate(['Angry',
514
- 'Sad',
515
- 'Happy',
516
- 'Surprise',
517
- 'Fear',
518
- 'Disgust',
519
- 'Contempt',
520
- # 'Neutral'
521
- ]): # ASaHSuFDCN
522
- j = j + 3 # skip A/D/V suplt
523
 
524
- # MIMIC3
525
 
526
- ax[j, 0].plot(time_stamp, vis_df['mimic3'][dim],
527
- color=(0,104/255,139/255),
528
- label='mean_1',
529
- linewidth=2)
530
- ax[j, 0].fill_between(time_stamp,
531
 
532
- vis_df['mimic3'][dim],
533
- vis_df['styletts2'][dim],
534
 
535
- color=(.2,.2,.2),
536
- alpha=0.244)
537
- # ax[j, 0].legend(['StyleTTS2 style mimic3',
538
- # 'StyleTTS2 style crema-d'],
539
- # prop={'size': 10},
540
- # # loc='upper left'
541
- # )
542
 
543
 
544
- ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
545
 
546
- # TICKS
547
- ax[j, 0].set_ylim([1e-7, .9999])
548
- ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
549
- ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
550
- ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
551
 
552
 
553
- ax[j, 0].grid()
554
 
555
 
 
 
 
 
 
556
 
557
- plt.savefig(f'bh_{_str}.png', bbox_inches='tight')
558
- plt.close()
559
 
560
- print('UNCOMMENT msinfereence')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/audeering/shift/tree/main -- RUN FROM THIS REPO
2
  import shutil
3
  import csv
4
  import io
 
63
  # [print(i) for i in foreign_voices]
64
  # print('\n_______________________________\n')
65
  # [print(i) for i in english_voices]
66
+ # ====================================================== LIST Mimic-3 ALL VOICES
67
  list_voices = [
68
  'en_US/m-ailabs_low#mary_ann',
69
  'en_UK/apope_low',
70
  'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
71
+ 'human',
72
  ] # special - for human we load specific style file - no Mimic3 is run
73
 
74
 
 
291
  with open('harvard.json', 'r') as f:
292
  harvard_individual_sentences = json.load(f)['sentences']
293
  total_audio_mimic3 = []
294
+ total_audio_styletts2 = []
295
  ix = 0
296
  for list_of_10 in harvard_individual_sentences[:1]: # 77
297
 
 
342
  # # state.ssml = 1234546575
343
  # state.stdout = True
344
  # state.tts = True
345
+ style_path = 'tmp1.wav'
346
+ process_lines(state, wav_path=style_path)
347
  shutdown_tts(state)
348
+ x, fs = audiofile.read(style_path)
349
+ # print(x.shape)
350
  else:
351
+ # --
352
  # MSP['valence.train.votes'].get().sort_values('7').index[-1]
353
+ # style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
354
+ # --
355
+ # MSP['emotion.test-1'].get().sort_values('valence').index[-1]
356
+ style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0220_0870.wav'
357
+ x, fs = audiofile.read(style_path) # assure is not very short - equl harvard sent len
358
  print(x.shape,' human') # crop human to almost mimic-3 duration
359
  total_audio_mimic3.append(x)
360
+ print(f'{len(total_audio_mimic3)=}')
361
  print(fs, text, 'mimic3')
362
 
363
  # MIMIC3 = = = = = = = = = = = = = = END
 
365
 
366
 
367
 
368
+ style_vec = msinference.compute_style(style_path) # use mimic-3 as prompt
369
 
370
 
371
 
 
376
  diffusion_steps=7,
377
  embedding_scale=1)
378
 
379
+ total_audio_styletts2.append(x)
380
+
381
+ # save styletts2 .wav
382
 
383
+ total_audio_styletts2 = np.concatenate(total_audio_styletts2) # -- concat 77x lists
384
+ total_audio_styletts2 = audresample.resample(total_audio_styletts2,
385
+ original_rate=24000,
386
+ target_rate=16000)[0]
387
+ print('RESAMPLEstyletts2', total_audio_styletts2.shape)
388
+ audiofile.write(out_dir + 'styletts2__' + _str + '.wav', total_audio_styletts2, 16000)
389
+ # print('Saving:', out_dir + 'styletts2__' + _str + '.wav')
390
 
391
+ # save mimic3 or human .wav
 
 
 
 
 
 
392
 
393
  total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
394
+ if 'human' not in _str:
395
+ total_audio_mimic3 = audresample.resample(total_audio_mimic3,
396
+ original_rate=24000,
397
+ target_rate=16000)[0]
398
+ else:
399
+ print('human is already on 16kHz - MSPpodcst file')
400
+ print('RESAMPLEmimic3', total_audio_mimic3.shape)
401
  audiofile.write(out_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 16000)
402
 
403
+ print(total_audio_mimic3.shape, total_audio_styletts2.shape, 'LEN OF TOTAL\n')
404
+ # print('Saving:', out_dir + 'mimic3__' + _str + '.wav')
 
405
 
406
 
407
  # AUD I N T E R F A C E
408
+
409
 
410
 
411
 
412
+ for engine in ['mimic3',
413
+ 'styletts2']:
414
  harvard_of_voice = f'{out_dir}{engine}__{_str}'
415
  if not os.path.exists(harvard_of_voice + '.pkl'):
416
  df = interface.process_file(harvard_of_voice + '.wav')
417
  df.to_pickle(harvard_of_voice + '.pkl')
418
+ print('\n\n', harvard_of_voice, df,'\n___________________________\n')
419
+
 
420
 
421
 
422
 
 
426
 
427
 
428
 
429
+ raise SystemExit
430
  print('\nVisuals\n')
431
 
432
  # ===============================================================================
433
  # V I S U A L S
434
  #
435
  # ===============================================================================
436
+ voice_pairs = [
437
+ [list_voices[0], list_voices[1]],
438
+ [list_voices[2], list_voices[3]]
439
+ ] # special - for human we load specific style file - no Mimic3 is run
440
 
441
+ # Plot 1 list_voices[0] list_voices[1]
442
+ # Plot 2 list_voices[2] list_voices[3]
443
+
444
+ for vox1, vox2 in voice_pairs: # 1 figure pro pair
445
+
446
+ _str1 = vox1.replace('/', '_').replace('#', '_').replace('_low', '')
447
+
448
+ if 'cmu-arctic' in _str1:
449
+ _str1 = _str1.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
450
+
451
+ _str2 = vox2.replace('/', '_').replace('#', '_').replace('_low', '')
452
+
453
+ if 'cmu-arctic' in _str2:
454
+ _str2 = _str2.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
 
 
 
 
 
 
 
 
455
 
 
 
 
 
456
 
457
+ vis_df = {
458
+ f'mimic3_{_str1}' : pd.read_pickle(out_dir + 'mimic3__' + _str1 + '.pkl'),
459
+ f'mimic3_{_str2}' : pd.read_pickle(out_dir + 'mimic3__' + _str2 + '.pkl'),
460
+ f'styletts2_{_str1}' : pd.read_pickle(out_dir + 'styletts2__' + _str1 + '.pkl'),
461
+ f'styletts2_{_str2}' : pd.read_pickle(out_dir + 'styletts2__' + _str2 + '.pkl'),
462
+ }
463
+
464
 
465
+
466
+
467
+ SHORT_LEN = min([len(v) for k, v in vis_df.items()]) # different TTS durations per voic
468
+ for k,v in vis_df.items():
469
+ p = v[:SHORT_LEN] # TRuncate extra segments - human is slower than mimic3
470
+ print('\n\n\n\n',k, p)
471
+ p.reset_index(inplace= True)
472
+ p.drop(columns=['file','start'], inplace=True)
473
+ p.set_index('end', inplace=True)
474
+ # p = p.filter(scene_classes) #['transport', 'indoor', 'outdoor'])
475
+ p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
476
+ vis_df[k] = p
477
+ preds = vis_df
478
+ fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24), gridspec_kw={'hspace': 0, 'wspace': .04})
479
+
480
+
481
+ # ADV - subplots
482
+
483
+ time_stamp = preds[f'mimic3_{_str2}'].index.to_numpy()
484
+ for j, dim in enumerate(['arousal',
485
+ 'dominance',
486
+ 'valence']):
487
+
488
+ # MIMIC3
489
+
490
+ ax[j, 0].plot(time_stamp, preds[f'styletts2_{_str1}'][dim],
491
+ color=(0,104/255,139/255),
492
+ label='mean_1',
493
+ linewidth=2)
494
+ ax[j, 0].fill_between(time_stamp,
495
+
496
+ preds[f'styletts2_{_str1}'][dim],
497
+ preds[f'mimic3_{_str1}'][dim],
498
+
499
+ color=(.2,.2,.2),
500
+ alpha=0.244)
501
+ if j == 0:
502
+ ax[j, 0].legend([f'mimic3_{_str1}',
503
+ f'StyleTTS2 using {_str1}'],
504
+ prop={'size': 10},
505
+ # loc='lower right'
506
+ )
507
+ ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
508
 
509
+ # TICK
510
+ ax[j, 0].set_ylim([1e-7, .9999])
511
+ # ax[j, 0].set_yticks([.25, .5,.75])
512
+ # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
513
+ ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
514
+ ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
515
 
516
 
517
+ # MIMIC3 4x speed
518
 
 
 
519
 
520
+ ax[j, 1].plot(time_stamp, preds[f'mimic3_{_str2}'][dim],
521
+ color=(0,104/255,139/255),
522
+ label='mean_1',
523
+ linewidth=2)
524
+ ax[j, 1].fill_between(time_stamp,
525
 
526
+ preds[f'styletts2_{_str2}'][dim],
527
+ preds[f'mimic3_{_str2}'][dim],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
528
 
529
+ color=(.2,.2,.2),
530
+ alpha=0.244)
531
+ if j == 0:
532
+ ax[j, 1].legend([f'mimic3_{_str2}',
533
+ f'StyleTTS2 using {_str2}'],
534
+ prop={'size': 10},
535
+ # loc='lower right'
536
+ )
537
 
538
+
539
+ ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)')
540
+
541
+
542
+
543
+ # TICK
544
+ ax[j, 1].set_ylim([1e-7, .9999])
545
+ # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
546
+ ax[j, 1].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
547
+ ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
548
+
549
+
550
+
551
+
552
+ ax[j, 0].grid()
553
+ ax[j, 1].grid()
554
+ # CATEGORIE
555
 
556
 
557
 
558
 
559
 
560
+ time_stamp = preds[f'mimic3_{_str1}'].index.to_numpy()
561
+ for j, dim in enumerate(['Angry',
562
+ 'Sad',
563
+ 'Happy',
564
+ 'Surprise',
565
+ 'Fear',
566
+ 'Disgust',
567
+ 'Contempt',
568
+ # 'Neutral'
569
+ ]): # ASaHSuFDCN
570
+ j = j + 3 # skip A/D/V suplt
571
 
572
+ # MIMIC3
573
 
574
+ ax[j, 0].plot(time_stamp, preds[f'mimic3_{_str1}'][dim],
575
+ color=(0,104/255,139/255),
576
+ label='mean_1',
577
+ linewidth=2)
578
+ ax[j, 0].fill_between(time_stamp,
579
 
580
+ preds[f'mimic3_{_str2}'][dim],
581
+ preds[f'styletts2_{_str2}'][dim],
582
 
583
+ color=(.2,.2,.2),
584
+ alpha=0.244)
585
+ # ax[j, 0].legend(['StyleTTS2 style mimic3',
586
+ # 'StyleTTS2 style crema-d'],
587
+ # prop={'size': 10},
588
+ # # loc='upper left'
589
+ # )
590
 
591
 
592
+ ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
593
 
594
+ # TICKS
595
+ ax[j, 0].set_ylim([1e-7, .9999])
596
+ ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
597
+ ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
598
+ ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
599
 
600
 
601
+ # MIMIC3 4x speed
602
 
603
 
604
+ ax[j, 1].plot(time_stamp, preds[f'mimic3_{_str2}'][dim],
605
+ color=(0,104/255,139/255),
606
+ label='mean_1',
607
+ linewidth=2)
608
+ ax[j, 1].fill_between(time_stamp,
609
 
610
+ preds[f'mimic3_{_str2}'][dim],
611
+ preds[f'styletts2_{_str2}'][dim],
612
 
613
+ color=(.2,.2,.2),
614
+ alpha=0.244)
615
+ # ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
616
+ # 'StyleTTS2 style crema-d'],
617
+ # prop={'size': 10},
618
+ # # loc='upper left'
619
+ # )
620
+ ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
621
+ ax[j, 1].set_ylim([1e-7, .999])
622
+ # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
623
+ ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
624
+ ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
625
+ ax[j, 0].grid()
626
+ ax[j, 1].grid()
627
+ plt.savefig(f'pair_{_str1}_{_str2}.png', bbox_inches='tight')
628
+ plt.close()
models.py ADDED
@@ -0,0 +1,611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #coding:utf-8
2
+
3
+ import os
4
+ import os.path as osp
5
+
6
+ import copy
7
+ import math
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
14
+
15
+ from Utils.ASR.models import ASRCNN
16
+ from Utils.JDC.model import JDCNet
17
+
18
+ from Modules.diffusion.sampler import KDiffusion, LogNormalDistribution
19
+ from Modules.diffusion.modules import Transformer1d, StyleTransformer1d
20
+ from Modules.diffusion.diffusion import AudioDiffusionConditional
21
+
22
+
23
+
24
+ from munch import Munch
25
+ import yaml
26
+
27
class LearnedDownSample(nn.Module):
    """Learned 2-D down-sampling built from a depthwise strided convolution.

    Args:
        layer_type: ``'none'`` (identity), ``'timepreserve'`` (stride 2 on the
            first spatial axis only) or ``'half'`` (stride 2 on both spatial
            axes).
        dim_in: Channel count; the convolution uses ``groups=dim_in`` so the
            number of channels is unchanged.

    Raises:
        RuntimeError: If ``layer_type`` is not one of the three values above.
    """

    def __init__(self, layer_type, dim_in):
        super().__init__()
        self.layer_type = layer_type

        # Lazily-evaluated constructors so only the selected layer is built.
        builders = {
            'none': nn.Identity,
            'timepreserve': lambda: spectral_norm(
                nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1),
                          groups=dim_in, padding=(1, 0))),
            'half': lambda: spectral_norm(
                nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2),
                          groups=dim_in, padding=1)),
        }
        if self.layer_type not in builders:
            raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
        self.conv = builders[self.layer_type]()

    def forward(self, x):
        """Apply the configured (possibly identity) down-sampling layer."""
        return self.conv(x)
43
+
44
class LearnedUpSample(nn.Module):
    """Learned 2-D up-sampling built from a depthwise transposed convolution.

    Args:
        layer_type: ``'none'`` (identity), ``'timepreserve'`` (stride 2 on the
            first spatial axis only) or ``'half'`` (stride 2 on both spatial
            axes).
        dim_in: Channel count; the transposed convolution uses
            ``groups=dim_in`` so the number of channels is unchanged.

    Raises:
        RuntimeError: If ``layer_type`` is not one of the three values above.
    """

    def __init__(self, layer_type, dim_in):
        super().__init__()
        self.layer_type = layer_type

        # Lazily-evaluated constructors so only the selected layer is built.
        builders = {
            'none': nn.Identity,
            'timepreserve': lambda: nn.ConvTranspose2d(
                dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1),
                groups=dim_in, output_padding=(1, 0), padding=(1, 0)),
            'half': lambda: nn.ConvTranspose2d(
                dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2),
                groups=dim_in, output_padding=1, padding=1),
        }
        if self.layer_type not in builders:
            raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
        self.conv = builders[self.layer_type]()

    def forward(self, x):
        """Apply the configured (possibly identity) up-sampling layer."""
        return self.conv(x)
61
+
62
class DownSample(nn.Module):
    """Parameter-free 2-D down-sampling by average pooling.

    Args:
        layer_type: ``'none'`` (pass-through), ``'timepreserve'`` (pool the
            first spatial axis by 2) or ``'half'`` (pool both spatial axes
            by 2).
    """

    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        """Down-sample ``x``; raises ``RuntimeError`` for an unknown mode."""
        mode = self.layer_type
        if mode == 'none':
            return x
        if mode == 'timepreserve':
            return F.avg_pool2d(x, (2, 1))
        if mode == 'half':
            if x.shape[-1] % 2 != 0:
                # Repeat the last column so the final axis has even length.
                x = torch.cat([x, x[..., -1:]], dim=-1)
            return F.avg_pool2d(x, 2)
        raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
78
+
79
+
80
class UpSample(nn.Module):
    """Parameter-free 2-D up-sampling by nearest-neighbour interpolation.

    Args:
        layer_type: ``'none'`` (pass-through), ``'timepreserve'`` (double the
            first spatial axis) or ``'half'`` (double both spatial axes).
    """

    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        """Up-sample ``x``; raises ``RuntimeError`` for an unknown mode."""
        mode = self.layer_type
        if mode == 'none':
            return x
        if mode == 'timepreserve':
            return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
        if mode == 'half':
            return F.interpolate(x, scale_factor=2, mode='nearest')
        raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
94
+
95
+
96
class ResBlk(nn.Module):
    """Pre-activation 2-D residual block with optional down-sampling.

    The shortcut path uses the parameter-free ``DownSample`` while the
    residual path uses ``LearnedDownSample``; both are configured with the
    same ``downsample`` mode.

    Args:
        dim_in: Input channel count.
        dim_out: Output channel count; a 1x1 convolution is added to the
            shortcut when it differs from ``dim_in``.
        actv: Activation module applied before each convolution.
        normalize: If true, apply affine instance normalisation before each
            activation.
        downsample: Mode string forwarded to ``DownSample`` and
            ``LearnedDownSample``.
    """

    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
                 normalize=False, downsample='none'):
        super().__init__()
        self.actv = actv
        self.normalize = normalize
        self.downsample = DownSample(downsample)
        self.downsample_res = LearnedDownSample(downsample, dim_in)
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out)

    def _build_weights(self, dim_in, dim_out):
        """Create the convolutions and the optional norm / shortcut layers."""
        self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
        self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
        if self.normalize:
            # Both norms see dim_in channels because conv1 keeps the width.
            self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
            self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
        if self.learned_sc:
            self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))

    def _shortcut(self, x):
        """Skip path: optional 1x1 projection followed by pooling."""
        skip = self.conv1x1(x) if self.learned_sc else x
        # The original guarded this with `if self.downsample:`; a module
        # without __bool__/__len__ is always truthy, so the call always runs.
        return self.downsample(skip)

    def _residual(self, x):
        """Main path: (norm) -> act -> conv -> down-sample -> (norm) -> act -> conv."""
        h = self.norm1(x) if self.normalize else x
        h = self.downsample_res(self.conv1(self.actv(h)))
        if self.normalize:
            h = self.norm2(h)
        return self.conv2(self.actv(h))

    def forward(self, x):
        """Sum both paths and rescale by 1/sqrt(2)."""
        skip = self._shortcut(x)
        main = self._residual(x)
        return (skip + main) / math.sqrt(2)  # unit variance
138
+
139
class StyleEncoder(nn.Module):
    """Convolutional encoder that maps a single-channel 2-D input to a style vector.

    Args:
        dim_in: Channel width after the first convolution.
        style_dim: Size of the returned style vector.
        max_conv_dim: Upper bound on the channel width as it doubles through
            the four down-sampling residual blocks.
    """

    def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
        super().__init__()
        repeat_num = 4

        layers = [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
        width = dim_in
        for _ in range(repeat_num):
            # Double the width each stage, capped at max_conv_dim.
            next_width = min(width * 2, max_conv_dim)
            layers.append(ResBlk(width, next_width, downsample='half'))
            width = next_width

        layers.extend([
            nn.LeakyReLU(0.2),
            spectral_norm(nn.Conv2d(width, width, 5, 1, 0)),
            nn.AdaptiveAvgPool2d(1),
            nn.LeakyReLU(0.2),
        ])
        self.shared = nn.Sequential(*layers)

        self.unshared = nn.Linear(width, style_dim)

    def forward(self, x):
        """Return one ``style_dim``-sized vector per batch element."""
        features = self.shared(x)
        flat = features.view(features.size(0), -1)
        return self.unshared(flat)
165
+
166
class LinearNorm(torch.nn.Module):
    """Linear layer whose weight is Xavier-uniform initialised.

    Args:
        in_dim: Input feature size.
        out_dim: Output feature size.
        bias: Whether the linear layer has a bias term.
        w_init_gain: Non-linearity name passed to
            ``torch.nn.init.calculate_gain`` to pick the init gain.
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super().__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        gain = torch.nn.init.calculate_gain(w_init_gain)
        torch.nn.init.xavier_uniform_(self.linear_layer.weight, gain=gain)

    def forward(self, x):
        """Apply the wrapped linear layer."""
        return self.linear_layer(x)
177
+
178
class ResBlk1d(nn.Module):
    """Pre-activation 1-D residual block with dropout and optional down-sampling.

    Args:
        dim_in: Input channel count.
        dim_out: Output channel count; a 1x1 convolution is added to the
            shortcut when it differs from ``dim_in``.
        actv: Activation module applied before each convolution.
        normalize: If true, apply affine instance normalisation before each
            activation.
        downsample: ``'none'`` disables down-sampling; any other value halves
            the last axis (average pooling on the shortcut, a depthwise
            strided convolution on the residual path).
        dropout_p: Dropout probability used on the residual path while the
            module is in training mode.
    """

    def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
                 normalize=False, downsample='none', dropout_p=0.2):
        super().__init__()
        self.actv = actv
        self.normalize = normalize
        self.downsample_type = downsample
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out)
        self.dropout_p = dropout_p

        if self.downsample_type != 'none':
            self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
        else:
            self.pool = nn.Identity()

    def _build_weights(self, dim_in, dim_out):
        """Create the convolutions and the optional norm / shortcut layers."""
        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
        self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
        if self.normalize:
            self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
            self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
        if self.learned_sc:
            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))

    def downsample(self, x):
        """Halve the last axis by average pooling (no-op in ``'none'`` mode)."""
        if self.downsample_type == 'none':
            return x
        if x.shape[-1] % 2 != 0:
            # Repeat the final frame so the length is even before pooling.
            x = torch.cat([x, x[..., -1:]], dim=-1)
        return F.avg_pool1d(x, 2)

    def _shortcut(self, x):
        """Skip path: optional 1x1 projection followed by pooling."""
        skip = self.conv1x1(x) if self.learned_sc else x
        return self.downsample(skip)

    def _activate(self, x):
        """Activation followed by (training-only) dropout."""
        return F.dropout(self.actv(x), p=self.dropout_p, training=self.training)

    def _residual(self, x):
        """Main path: (norm) -> act/drop -> conv -> pool -> (norm) -> act/drop -> conv."""
        h = self.norm1(x) if self.normalize else x
        h = self.pool(self.conv1(self._activate(h)))
        if self.normalize:
            h = self.norm2(h)
        return self.conv2(self._activate(h))

    def forward(self, x):
        """Sum both paths and rescale by 1/sqrt(2)."""
        skip = self._shortcut(x)
        main = self._residual(x)
        return (skip + main) / math.sqrt(2)  # unit variance
237
+
238
class LayerNorm(nn.Module):
    """Layer normalisation over axis 1 of the input.

    The input is transposed so axis 1 becomes the last axis, normalised there
    with learnable ``gamma`` / ``beta``, then transposed back.

    Args:
        channels: Size of axis 1 of the input.
        eps: Numerical-stability constant passed to ``F.layer_norm``.
    """

    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        """Normalise ``x`` along axis 1 and return it in the original layout."""
        channels_last = x.transpose(1, -1)
        normed = F.layer_norm(channels_last, (self.channels,),
                              weight=self.gamma, bias=self.beta, eps=self.eps)
        return normed.transpose(1, -1)
251
+
252
class TextEncoder(nn.Module):
    """Token encoder: embedding -> stack of Conv1d blocks -> bidirectional LSTM.

    Args:
        channels: Embedding size and channel width of every layer.
        kernel_size: Kernel size of each Conv1d; padding is
            ``(kernel_size - 1) // 2``.
        depth: Number of Conv1d blocks.
        n_symbols: Vocabulary size of the embedding table.
        actv: Activation module placed inside every Conv1d block.
    """

    def __init__(self, channels, kernel_size, depth, n_symbols, actv=nn.LeakyReLU(0.2)):
        super().__init__()
        self.embedding = nn.Embedding(n_symbols, channels)

        padding = (kernel_size - 1) // 2
        self.cnn = nn.ModuleList()
        for _ in range(depth):
            self.cnn.append(nn.Sequential(
                weight_norm(nn.Conv1d(channels, channels, kernel_size=kernel_size, padding=padding)),
                LayerNorm(channels),
                actv,
                nn.Dropout(0.2),
            ))

        self.lstm = nn.LSTM(channels, channels // 2, 1, batch_first=True, bidirectional=True)

    def forward(self, x, input_lengths, m):
        """Encode a padded batch of token ids.

        Args:
            x: Integer token ids, indexed ``[batch, time]``.
            input_lengths: 1-D tensor with the valid length of each sequence.
            m: Boolean padding mask, ``True`` at padded positions (see
                ``length_to_mask``); its last axis defines the output length.

        Returns:
            Tensor laid out ``[batch, channels, time]`` with masked positions
            set to zero.
        """
        x = self.embedding(x)  # [B, T, emb]
        x = x.transpose(1, 2)  # [B, emb, T]
        m = m.to(input_lengths.device).unsqueeze(1)
        x.masked_fill_(m, 0.0)

        for c in self.cnn:
            x = c(x)
            x.masked_fill_(m, 0.0)

        x = x.transpose(1, 2)  # [B, T, chn]

        # pack_padded_sequence needs the lengths on the CPU; a CPU tensor is
        # accepted directly, so no NumPy round trip is required.
        x = nn.utils.rnn.pack_padded_sequence(
            x, input_lengths.cpu(), batch_first=True, enforce_sorted=False)

        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(
            x, batch_first=True)

        x = x.transpose(-1, -2)
        # pad_packed_sequence only pads up to the longest sequence in the
        # batch; extend to the mask length. new_zeros allocates on x's device
        # with x's dtype, avoiding a host-side buffer and a device copy.
        x_pad = x.new_zeros(x.shape[0], x.shape[1], m.shape[-1])
        x_pad[:, :, :x.shape[-1]] = x
        x = x_pad

        x.masked_fill_(m, 0.0)

        return x

    def inference(self, x):
        """Encode token ids without masking or packing.

        Returns the LSTM output, laid out ``[batch, time, channels]``.
        """
        x = self.embedding(x)
        x = x.transpose(1, 2)
        # self.cnn is an nn.ModuleList, which is not callable as a whole:
        # run its blocks one after another.
        for layer in self.cnn:
            x = layer(x)
        x = x.transpose(1, 2)
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        return x

    def length_to_mask(self, lengths):
        """Return a boolean ``[batch, max_len]`` mask that is ``True`` past each length."""
        mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
        mask = torch.gt(mask + 1, lengths.unsqueeze(1))
        return mask
314
+
315
+
316
+
317
class AdaIN1d(nn.Module):
    """Adaptive instance normalisation for 1-D feature maps.

    A linear layer maps the style vector to a per-channel scale and shift
    that modulate the (non-affine) instance-normalised input.

    Args:
        style_dim: Size of the style vector.
        num_features: Number of channels of the normalised input.
    """

    def __init__(self, style_dim, num_features):
        super().__init__()
        self.norm = nn.InstanceNorm1d(num_features, affine=False)
        self.fc = nn.Linear(style_dim, num_features*2)

    def forward(self, x, s):
        """Return ``(1 + gamma) * instance_norm(x) + beta`` with gamma/beta from ``s``."""
        params = self.fc(s)
        # Trailing singleton axis lets scale/shift broadcast along the last axis of x.
        params = params.view(params.size(0), params.size(1), 1)
        gamma, beta = params.chunk(2, dim=1)
        normed = self.norm(x)
        return (1 + gamma) * normed + beta
329
+
330
class UpSample1d(nn.Module):
    """Optional nearest-neighbour x2 up-sampling along the last axis.

    Args:
        layer_type: ``'none'`` returns the input unchanged; any other value
            enables the x2 interpolation.
    """

    def __init__(self, layer_type):
        super().__init__()
        self.layer_type = layer_type

    def forward(self, x):
        """Return ``x`` itself in ``'none'`` mode, otherwise ``x`` up-sampled by 2."""
        if self.layer_type != 'none':
            return F.interpolate(x, scale_factor=2, mode='nearest')
        return x
340
+
341
class AdainResBlk1d(nn.Module):
    """Style-conditioned 1-D residual block with optional x2 up-sampling.

    Args:
        dim_in: Input channel count.
        dim_out: Output channel count; a 1x1 convolution is added to the
            shortcut when it differs from ``dim_in``.
        style_dim: Size of the style vector fed to both ``AdaIN1d`` layers.
        actv: Activation module applied after each AdaIN layer.
        upsample: ``'none'`` disables up-sampling; any other value doubles
            the last axis (nearest interpolation on the shortcut, a depthwise
            transposed convolution on the residual path).
        dropout_p: Dropout probability applied before each convolution.
    """

    def __init__(self, dim_in, dim_out, style_dim=64, actv=nn.LeakyReLU(0.2),
                 upsample='none', dropout_p=0.0):
        super().__init__()
        self.actv = actv
        self.upsample_type = upsample
        self.upsample = UpSample1d(upsample)
        self.learned_sc = dim_in != dim_out
        self._build_weights(dim_in, dim_out, style_dim)
        self.dropout = nn.Dropout(dropout_p)

        if upsample != 'none':
            self.pool = weight_norm(nn.ConvTranspose1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1, output_padding=1))
        else:
            self.pool = nn.Identity()

    def _build_weights(self, dim_in, dim_out, style_dim):
        """Create the convolutions, AdaIN layers and optional shortcut projection."""
        self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
        self.conv2 = weight_norm(nn.Conv1d(dim_out, dim_out, 3, 1, 1))
        self.norm1 = AdaIN1d(style_dim, dim_in)
        self.norm2 = AdaIN1d(style_dim, dim_out)
        if self.learned_sc:
            self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))

    def _shortcut(self, x):
        """Skip path: up-sample, then optional 1x1 projection."""
        skip = self.upsample(x)
        return self.conv1x1(skip) if self.learned_sc else skip

    def _residual(self, x, s):
        """Main path: AdaIN -> act -> up-sample -> conv, then AdaIN -> act -> conv."""
        h = self.pool(self.actv(self.norm1(x, s)))
        h = self.conv1(self.dropout(h))
        h = self.actv(self.norm2(h, s))
        return self.conv2(self.dropout(h))

    def forward(self, x, s):
        """Sum both paths and rescale by 1/sqrt(2)."""
        main = self._residual(x, s)
        skip = self._shortcut(x)
        return (main + skip) / math.sqrt(2)
386
+
387
class AdaLayerNorm(nn.Module):
    """Layer normalisation whose scale and shift are predicted from a style vector.

    Args:
        style_dim: Size of the style vector.
        channels: Size of the normalised axis.
        eps: Numerical-stability constant passed to ``F.layer_norm``.
    """

    def __init__(self, style_dim, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.fc = nn.Linear(style_dim, channels*2)

    def forward(self, x, s):
        """Return ``(1 + gamma) * layer_norm(x) + beta`` with gamma/beta from ``s``."""
        # Same pair of transposes as on the way out; for a 3-D input they
        # cancel each other, leaving the layout untouched.
        h_in = x.transpose(-1, -2).transpose(1, -1)

        params = self.fc(s)
        params = params.view(params.size(0), params.size(1), 1)
        gamma, beta = params.chunk(2, dim=1)
        # Move the channel axis last so scale/shift line up with the normalised axis.
        gamma = gamma.transpose(1, -1)
        beta = beta.transpose(1, -1)

        normed = F.layer_norm(h_in, (self.channels,), eps=self.eps)
        out = (1 + gamma) * normed + beta
        return out.transpose(1, -1).transpose(-1, -2)
408
+
409
class ProsodyPredictor(nn.Module):
    """Holds the duration, F0 and N prediction sub-networks.

    Args:
        style_dim: size of the style vector fed to the style-conditioned layers.
        d_hid: hidden width shared by the encoder, LSTMs and residual blocks.
        nlayers: number of layers handed to ``DurationEncoder``.
        max_dur: output size of ``duration_proj``.
        dropout: dropout probability forwarded to the encoder and residual blocks.
    """

    def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
        super().__init__()

        self.text_encoder = DurationEncoder(sty_dim=style_dim,
                                            d_model=d_hid,
                                            nlayers=nlayers,
                                            dropout=dropout)

        half = d_hid // 2
        lstm_in = d_hid + style_dim

        self.lstm = nn.LSTM(lstm_in, half, 1, batch_first=True, bidirectional=True)
        self.duration_proj = LinearNorm(d_hid, max_dur)

        self.shared = nn.LSTM(lstm_in, half, 1, batch_first=True, bidirectional=True)

        # The F0 and N stacks have the same layout: keep width, upsample while
        # halving width, keep width.
        self.F0 = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout),
            AdainResBlk1d(d_hid, half, style_dim, upsample=True, dropout_p=dropout),
            AdainResBlk1d(half, half, style_dim, dropout_p=dropout),
        ])

        self.N = nn.ModuleList([
            AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout),
            AdainResBlk1d(d_hid, half, style_dim, upsample=True, dropout_p=dropout),
            AdainResBlk1d(half, half, style_dim, dropout_p=dropout),
        ])

        self.F0_proj = nn.Conv1d(half, 1, 1, 1, 0)
        self.N_proj = nn.Conv1d(half, 1, 1, 1, 0)

    def F0Ntrain(self, x, s):
        """Run the shared LSTM, then the F0 and N stacks; return both with dim 1 squeezed."""
        shared_out, _ = self.shared(x.transpose(-1, -2))
        features = shared_out.transpose(-1, -2)

        pitch = features
        for layer in self.F0:
            pitch = layer(pitch, s)
        pitch = self.F0_proj(pitch)

        energy = features
        for layer in self.N:
            energy = layer(energy, s)
        energy = self.N_proj(energy)

        return pitch.squeeze(1), energy.squeeze(1)

    def length_to_mask(self, lengths):
        """Return a boolean mask that is True at positions >= the row's length."""
        positions = torch.arange(lengths.max()).unsqueeze(0)
        positions = positions.expand(lengths.shape[0], -1).type_as(lengths)
        return torch.gt(positions + 1, lengths.unsqueeze(1))
455
+
456
class DurationEncoder(nn.Module):
    """Alternating bidirectional-LSTM / AdaLayerNorm stack conditioned on style.

    Each of the ``nlayers`` stages is an LSTM over ``d_model + sty_dim``
    features followed by an ``AdaLayerNorm``; after every norm the style
    vector is concatenated again and padded positions are zeroed.

    Args:
        sty_dim: size of the style vector.
        d_model: feature width of the text representation.
        nlayers: number of (LSTM, AdaLayerNorm) pairs.
        dropout: probability used by ``F.dropout`` after each LSTM.
    """

    def __init__(self, sty_dim, d_model, nlayers, dropout=0.1):
        super().__init__()
        self.lstms = nn.ModuleList()
        for _ in range(nlayers):
            # NOTE: the `dropout` argument has no effect on a single-layer
            # nn.LSTM (PyTorch only warns); the effective dropout is the
            # F.dropout call in forward().
            self.lstms.append(nn.LSTM(d_model + sty_dim,
                                      d_model // 2,
                                      num_layers=1,
                                      batch_first=True,
                                      bidirectional=True,
                                      dropout=dropout))
            self.lstms.append(AdaLayerNorm(sty_dim, d_model))

        self.dropout = dropout
        self.d_model = d_model
        self.sty_dim = sty_dim

    def forward(self, x, style, text_lengths, m):
        """Encode ``x`` under ``style``.

        Args:
            x: 3-D tensor; presumably (batch, d_model, time) - TODO confirm
                against the caller.
            style: style tensor, expanded along the leading (time, batch) axes.
            text_lengths: per-item valid lengths used for sequence packing.
            m: padding mask; True entries are zero-filled. Its last dimension
                is the padded length of the result.

        Returns:
            The encoded tensor with the last two axes swapped, i.e.
            features on the final axis.
        """
        masks = m.to(text_lengths.device)

        x = x.permute(2, 0, 1)
        s = style.expand(x.shape[0], x.shape[1], -1)
        x = torch.cat([x, s], axis=-1)
        x.masked_fill_(masks.unsqueeze(-1).transpose(0, 1), 0.0)

        x = x.transpose(0, 1)
        input_lengths = text_lengths.cpu().numpy()
        x = x.transpose(-1, -2)

        for block in self.lstms:
            if isinstance(block, AdaLayerNorm):
                x = block(x.transpose(-1, -2), style).transpose(-1, -2)
                # Re-attach the style channels that the LSTM consumed.
                x = torch.cat([x, s.permute(1, -1, 0)], axis=1)
                x.masked_fill_(masks.unsqueeze(-1).transpose(-1, -2), 0.0)
            else:
                x = x.transpose(-1, -2)
                x = nn.utils.rnn.pack_padded_sequence(
                    x, input_lengths, batch_first=True, enforce_sorted=False)
                block.flatten_parameters()
                x, _ = block(x)
                x, _ = nn.utils.rnn.pad_packed_sequence(
                    x, batch_first=True)
                x = F.dropout(x, p=self.dropout, training=self.training)
                x = x.transpose(-1, -2)

                # pad_packed_sequence only pads to the longest sequence in the
                # batch; restore the mask's full length. new_zeros allocates on
                # x's own device and dtype, so there is no CPU round trip and
                # no silent cast to float32.
                x_pad = x.new_zeros(x.shape[0], x.shape[1], m.shape[-1])
                x_pad[:, :, :x.shape[-1]] = x
                x = x_pad

        return x.transpose(-1, -2)
511
+
512
+
513
+
514
+
515
def load_F0_models(path):
    """Create a ``JDCNet`` pitch extractor and load its weights from ``path``.

    Args:
        path: checkpoint file; the state dict is read from its ``'net'`` entry.

    Returns:
        The ``JDCNet`` instance, left in ``train()`` mode.
    """
    F0_model = JDCNet(num_class=1, seq_len=192)
    print(path, 'WHAT ARE YOU TRYING TO LOAD F0 L520')
    # A former `path.replace('.t7', '.pth')` statement stood here. Its result
    # was discarded (str.replace returns a new string), so it never changed
    # which file is loaded; it is removed so the code no longer suggests that
    # a '.pth' file is used. `path` is loaded exactly as given.
    # NOTE(security): torch.load unpickles the file - only load trusted checkpoints.
    params = torch.load(path, map_location='cpu')['net']
    F0_model.load_state_dict(params)
    _ = F0_model.train()

    return F0_model
526
+
527
def load_ASR_models(ASR_MODEL_PATH, ASR_MODEL_CONFIG):
    """Build an ``ASRCNN`` from a YAML config and load its checkpoint.

    Args:
        ASR_MODEL_PATH: checkpoint file; weights are read from its ``'model'`` entry.
        ASR_MODEL_CONFIG: YAML file whose ``'model_params'`` mapping is passed
            to ``ASRCNN`` as keyword arguments.

    Returns:
        The ``ASRCNN`` instance, left in ``train()`` mode.
    """
    # Read the constructor arguments.
    with open(ASR_MODEL_CONFIG) as config_file:
        model_params = yaml.safe_load(config_file)['model_params']

    # Instantiate, then restore the weights on CPU.
    asr_model = ASRCNN(**model_params)
    checkpoint = torch.load(ASR_MODEL_PATH, map_location='cpu')
    asr_model.load_state_dict(checkpoint['model'])

    asr_model.train()
    return asr_model
546
+
547
def build_model(args, text_aligner, pitch_extractor, bert):
    """Instantiate all sub-networks and return them bundled in a ``Munch``.

    Args:
        args: attribute-style config providing ``hidden_dim``, ``style_dim``,
            ``n_mels``, ``n_layer``, ``n_token``, ``max_dur``, ``dropout``,
            ``dim_in``, ``multispeaker`` plus the ``decoder`` and ``diffusion``
            sub-configs read below.
        text_aligner: pre-built module stored unchanged in the result.
        pitch_extractor: pre-built module stored unchanged in the result.
        bert: pre-built module; its ``config.hidden_size`` and
            ``config.max_position_embeddings`` size the diffusion parts.

    Returns:
        ``Munch`` with keys ``bert``, ``bert_encoder``, ``predictor``,
        ``decoder``, ``text_encoder``, ``predictor_encoder``,
        ``style_encoder``, ``diffusion``, ``text_aligner``, ``pitch_extractor``.
    """
    print(f'\n==============\n {args.decoder.type=}\n==============L584 models.py @ build_model()\n')

    from Modules.hifigan import Decoder

    decoder_cfg = args.decoder
    diffusion_cfg = args.diffusion
    bert_width = bert.config.hidden_size
    style_width = args.style_dim*2

    decoder = Decoder(
        dim_in=args.hidden_dim,
        style_dim=args.style_dim,
        dim_out=args.n_mels,
        resblock_kernel_sizes=decoder_cfg.resblock_kernel_sizes,
        upsample_rates=decoder_cfg.upsample_rates,
        upsample_initial_channel=decoder_cfg.upsample_initial_channel,
        resblock_dilation_sizes=decoder_cfg.resblock_dilation_sizes,
        upsample_kernel_sizes=decoder_cfg.upsample_kernel_sizes,
    )

    text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5,
                               depth=args.n_layer, n_symbols=args.n_token)

    predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim,
                                 nlayers=args.n_layer, max_dur=args.max_dur,
                                 dropout=args.dropout)

    # Two encoders with identical hyper-parameters: acoustic style first,
    # prosodic style second.
    style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim,
                                 max_conv_dim=args.hidden_dim)
    predictor_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim,
                                     max_conv_dim=args.hidden_dim)

    # Denoising network for the diffusion model.
    if args.multispeaker:
        transformer = StyleTransformer1d(channels=style_width,
                                         context_embedding_features=bert_width,
                                         context_features=style_width,
                                         **diffusion_cfg.transformer)
    else:
        transformer = Transformer1d(channels=style_width,
                                    context_embedding_features=bert_width,
                                    **diffusion_cfg.transformer)

    diffusion = AudioDiffusionConditional(
        in_channels=1,
        embedding_max_length=bert.config.max_position_embeddings,
        embedding_features=bert_width,
        embedding_mask_proba=diffusion_cfg.embedding_mask_proba,  # conditional dropout of batch elements
        channels=style_width,
        context_features=style_width,
    )

    sigma_distribution = LogNormalDistribution(mean=diffusion_cfg.dist.mean,
                                               std=diffusion_cfg.dist.std)
    diffusion.diffusion = KDiffusion(
        net=diffusion.unet,
        sigma_distribution=sigma_distribution,
        sigma_data=diffusion_cfg.dist.sigma_data,  # placeholder, changed dynamically once diffusion training starts
        dynamic_threshold=0.0
    )
    # Swap the default network for the transformer in both places that hold it.
    diffusion.diffusion.net = transformer
    diffusion.unet = transformer

    return Munch(
        bert=bert,
        bert_encoder=nn.Linear(bert_width, args.hidden_dim),

        predictor=predictor,
        decoder=decoder,
        text_encoder=text_encoder,

        predictor_encoder=predictor_encoder,
        style_encoder=style_encoder,
        diffusion=diffusion,

        text_aligner=text_aligner,
        pitch_extractor=pitch_extractor,
    )
text_utils.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ import numpy as np
3
+ import re
4
+ import codecs
5
+ # IPA Phonemizer: https://github.com/bootphon/phonemizer
6
+
7
_pad = "$"
_punctuation = ';:,.!?¡¿—…"«»“” '
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
_letters_ipa = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"

# Export all symbols: pad, punctuation, ASCII letters, then IPA letters.
symbols = [_pad, *_punctuation, *_letters, *_letters_ipa]

# Symbol -> position in `symbols`. A symbol that occurs more than once maps
# to its last position.
dicts = {symbol: index for index, symbol in enumerate(symbols)}
18
+
19
class TextCleaner:
    """Map each character of a string to its index in the symbol table."""

    def __init__(self, dummy=None):
        # `dummy` is accepted but never used.
        self.word_index_dictionary = dicts
        print(len(dicts))

    def __call__(self, text):
        """Return the indices of the known characters of ``text``.

        A character that is not in the table is left out of the result and
        the whole ``text`` is printed once for each such character.
        """
        lookup = self.word_index_dictionary
        indexes = []
        for char in text:
            if char in lookup:
                indexes.append(lookup[char])
            else:
                print(text)
        return indexes
31
+
32
+
33
+ # == Sentence Splitter
34
+
35
# Regex fragments used by split_into_sentences. Raw strings are used wherever
# a backslash occurs so that "\s" is not an invalid string escape.
alphabets = "([A-Za-z])"
prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = "(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = "[.](com|net|org|io|gov|edu|me)"
digits = "([0-9])"
multiple_dots = r'\.{2,}'


def split_into_sentences(text):
    """
    Split the text into sentences.

    Periods that do not end a sentence (titles, web domains, decimals,
    initials, acronyms, company suffixes) are first rewritten to the marker
    "<prd>"; real sentence ends then get a "<stop>" marker on which the text
    is split.

    If the text contains substrings "<prd>" or "<stop>", they would lead
    to incorrect splitting because they are used as markers for splitting.

    :param text: text to be split into sentences
    :type text: str

    :return: list of sentences (stripped; a trailing empty entry is dropped)
    :rtype: list[str]

    https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
    """
    text = " " + text + " "
    text = text.replace("\n", " ")
    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
    # An ellipsis keeps its dots and ends the sentence.
    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    # Move terminal punctuation outside a closing quote so the quote stays
    # with its sentence.
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")
    # Mark the remaining terminators, restore protected periods, split.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = [s.strip() for s in sentences]
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences
87
+
88
def store_ssml(text=None,
               voice=None):
    '''Build an SSML document and write it to ``_tmp_ssml.txt``.

    Every sentence is wrapped in ``<prosody volume> <prosody rate> <voice> <s>``
    elements inside a single ``<speak>`` root.  The rate grows with the
    sentence length (``len / 76`` clamped to [0.87, 1.14]); the volume is an
    integer drawn from ``np.random.rand()`` scaled to 24..97.

    text : list of sentences; ``None`` or an empty list produces an empty
           ``<speak></speak>`` document instead of raising
    voice: https://github.com/MycroftAI/mimic3-voices

    The file is written in the current working directory with a UTF-8 BOM
    ("utf-8-sig").
    '''
    sentences = [] if text is None else text
    first = sentences[0] if len(sentences) else ''
    print('\n___________________________\n', len(sentences), first, '\n___________________________________\n')

    parts = ['<speak>']
    for short_text in sentences:

        rate = min(max(.87, len(short_text) / 76), 1.14)

        volume = int(74 * np.random.rand() + 24)

        parts.append(f'<prosody volume=\'{volume}\'>')  # the other voice does not have volume
        parts.append(f'<prosody rate=\'{rate}\'>')
        parts.append(f'<voice name=\'{voice}\'>')
        parts.append('<s>')
        parts.append(short_text)
        parts.append('</s>')
        parts.append('</voice>')
        parts.append('</prosody>')
        parts.append('</prosody>')
    parts.append('</speak>')
    _s = ''.join(parts)
    print(len(sentences), '\n\n\n\n\n\n\n', _s)

    with codecs.open('_tmp_ssml.txt', 'w', "utf-8-sig") as f:
        f.write(_s)
utils.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from monotonic_align import maximum_path
2
+ from monotonic_align import mask_from_lens
3
+ from monotonic_align.core import maximum_path_c
4
+ import numpy as np
5
+ import torch
6
+ import copy
7
+ from torch import nn
8
+ import torch.nn.functional as F
9
+ import torchaudio
10
+ import librosa
11
+ import matplotlib.pyplot as plt
12
+ from munch import Munch
13
+
14
def maximum_path(neg_cent, mask):
    """ Cython optimized version.
    neg_cent: [b, t_t, t_s]
    mask: [b, t_t, t_s]

    Copies both inputs to contiguous NumPy arrays, lets ``maximum_path_c``
    fill an int32 path array in place, and returns that path as a tensor
    with ``neg_cent``'s device and dtype.
    """
    out_device, out_dtype = neg_cent.device, neg_cent.dtype

    def _as_int32(tensor):
        return np.ascontiguousarray(tensor.data.cpu().numpy().astype(np.int32))

    scores = np.ascontiguousarray(neg_cent.data.cpu().numpy().astype(np.float32))
    path = np.ascontiguousarray(np.zeros(scores.shape, dtype=np.int32))

    # Per-item extents, taken from the first row/column of the summed mask.
    t_t_max = _as_int32(mask.sum(1)[:, 0])
    t_s_max = _as_int32(mask.sum(2)[:, 0])

    maximum_path_c(path, scores, t_t_max, t_s_max)
    return torch.from_numpy(path).to(device=out_device, dtype=out_dtype)
28
+
29
def get_data_path_list(train_path=None, val_path=None):
    """Read the training and validation list files.

    Missing arguments fall back to ``Data/train_list.txt`` and
    ``Data/val_list.txt``.  Files are read as UTF-8 with undecodable bytes
    ignored.

    Returns:
        ``(train_list, val_list)`` - the raw lines of each file, newlines kept.
    """
    train_path = "Data/train_list.txt" if train_path is None else train_path
    val_path = "Data/val_list.txt" if val_path is None else val_path

    def _read_lines(list_path):
        with open(list_path, 'r', encoding='utf-8', errors='ignore') as handle:
            return handle.readlines()

    train_list = _read_lines(train_path)
    val_list = _read_lines(val_path)
    return train_list, val_list
41
+
42
def length_to_mask(lengths):
    """Return a boolean padding mask for a 1-D tensor of lengths.

    The result has one row per entry of ``lengths`` and ``lengths.max()``
    columns; an element is True where its column index is >= the row's length.
    """
    positions = torch.arange(lengths.max()).unsqueeze(0)
    positions = positions.expand(lengths.shape[0], -1).type_as(lengths)
    return torch.gt(positions + 1, lengths.unsqueeze(1))
46
+
47
+ # for norm consistency loss
48
def log_norm(x, mean=-4, std=4, dim=2):
    """
    normalized log mel -> mel -> norm -> log(norm)

    Undoes the ``(x - mean) / std`` normalisation, exponentiates, takes the
    norm along ``dim`` and returns its logarithm.
    """
    mel = torch.exp(x * std + mean)
    return torch.log(mel.norm(dim=dim))
54
+
55
def get_image(arrs):
    """Render ``arrs`` with ``imshow`` on a new 'agg'-backend figure and return the figure."""
    plt.switch_backend('agg')
    fig = plt.figure()
    plt.gca().imshow(arrs)
    return fig
62
+
63
def recursive_munch(d):
    """Convert nested dicts into ``Munch`` objects, descending through lists.

    Dicts become ``Munch`` instances with converted values, lists become new
    lists of converted items, and every other value is returned unchanged.
    """
    if isinstance(d, list):
        return [recursive_munch(item) for item in d]
    if isinstance(d, dict):
        return Munch({key: recursive_munch(value) for key, value in d.items()})
    return d
70
+
71
def log_print(message, logger):
    """Send ``message`` to ``logger.info`` and then echo it with ``print``."""
    emit_to_log = logger.info
    emit_to_log(message)
    print(message)
74
+