Dionyssos committed on
Commit
6430cbc
·
1 Parent(s): f7fd0c3

style = mimic3 generation

Browse files
Files changed (1) hide show
  1. mimic3_make_harvard_sentences.py +68 -69
mimic3_make_harvard_sentences.py CHANGED
@@ -34,41 +34,44 @@ import audiofile
34
 
35
 
36
  # ================================================ LIST OF VOICES
37
- # ROOT_DIR = '/data/dkounadis/mimic3-voices/'
38
- # foreign_voices = []
39
- # english_voices = []
40
- # for lang in os.listdir(ROOT_DIR + 'voices'):
41
 
42
- # for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
43
- # if 'en_' in lang:
44
-
45
- # try:
46
- # with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
47
- # for spk in f:
48
- # english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
49
- # # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
50
- # except FileNotFoundError:
51
- # english_voices.append(lang + '/' + voice)
52
-
53
- # else:
54
 
55
- # try:
56
- # with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
57
- # for spk in f:
58
- # foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
59
 
60
- # except FileNotFoundError:
61
- # foreign_voices.append(lang + '/' + voice)
62
- # #
63
- # [print(i) for i in foreign_voices]
64
- # print('\n_______________________________\n')
65
- # [print(i) for i in english_voices]
66
  # ====================================================== LIST Mimic-3 ALL VOICES
67
  list_voices = [
68
  'en_US/m-ailabs_low#mary_ann',
69
  'en_UK/apope_low',
70
  'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
71
- 'human',
 
 
 
72
  ] # special - for human we load specific style file - no Mimic3 is run
73
 
74
 
@@ -293,7 +296,7 @@ for _id, _voice in enumerate(list_voices):
293
  total_audio_mimic3 = []
294
  total_audio_styletts2 = []
295
  ix = 0
296
- for list_of_10 in harvard_individual_sentences[:1]: # 77
297
 
298
  text = ' '.join(list_of_10['sentences'])
299
 
@@ -312,7 +315,7 @@ for _id, _voice in enumerate(list_voices):
312
  f'<prosody rate=\'{rate}\'>'
313
  f'<voice name=\'{_voice}\'>'
314
  '<s>'
315
- f'{text}'
316
  '</s>'
317
  '</voice>'
318
  '</prosody>'
@@ -353,7 +356,9 @@ for _id, _voice in enumerate(list_voices):
353
  # style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
354
  # --
355
  # MSP['emotion.test-1'].get().sort_values('valence').index[-1]
356
- style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0220_0870.wav'
 
 
357
  x, fs = audiofile.read(style_path) # ensure it is not very short - roughly equal to the Harvard sentence length
358
  print(x.shape,' human') # crop human to almost mimic-3 duration
359
  total_audio_mimic3.append(x)
@@ -426,7 +431,7 @@ for _id, _voice in enumerate(list_voices):
426
 
427
 
428
 
429
- raise SystemExit
430
  print('\nVisuals\n')
431
 
432
  # ===============================================================================
@@ -475,32 +480,39 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
475
  p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
476
  vis_df[k] = p
477
  preds = vis_df
478
- fig, ax = plt.subplots(nrows=10, ncols=2, figsize=(24, 24), gridspec_kw={'hspace': 0, 'wspace': .04})
479
 
480
 
481
  # ADV - subplots
482
 
483
- time_stamp = preds[f'mimic3_{_str2}'].index.to_numpy()
484
  for j, dim in enumerate(['arousal',
485
  'dominance',
486
  'valence']):
487
 
488
  # MIMIC3
489
 
490
- ax[j, 0].plot(time_stamp, preds[f'styletts2_{_str1}'][dim],
 
 
491
  color=(0,104/255,139/255),
492
  label='mean_1',
493
  linewidth=2)
 
 
 
 
 
494
  ax[j, 0].fill_between(time_stamp,
495
 
496
  preds[f'styletts2_{_str1}'][dim],
497
  preds[f'mimic3_{_str1}'][dim],
498
-
499
- color=(.2,.2,.2),
500
- alpha=0.244)
501
  if j == 0:
502
- ax[j, 0].legend([f'mimic3_{_str1}',
503
- f'StyleTTS2 using {_str1}'],
504
  prop={'size': 10},
505
  # loc='lower right'
506
  )
@@ -508,8 +520,6 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
508
 
509
  # TICK
510
  ax[j, 0].set_ylim([1e-7, .9999])
511
- # ax[j, 0].set_yticks([.25, .5,.75])
512
- # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
513
  ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
514
  ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
515
 
@@ -517,21 +527,20 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
517
  # MIMIC3 4x speed
518
 
519
 
520
- ax[j, 1].plot(time_stamp, preds[f'mimic3_{_str2}'][dim],
521
  color=(0,104/255,139/255),
522
  label='mean_1',
523
  linewidth=2)
524
  ax[j, 1].fill_between(time_stamp,
525
-
526
- preds[f'styletts2_{_str2}'][dim],
527
  preds[f'mimic3_{_str2}'][dim],
528
-
529
- color=(.2,.2,.2),
530
- alpha=0.244)
531
  if j == 0:
532
- ax[j, 1].legend([f'mimic3_{_str2}',
533
- f'StyleTTS2 using {_str2}'],
534
- prop={'size': 10},
 
535
  # loc='lower right'
536
  )
537
 
@@ -561,34 +570,25 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
561
  for j, dim in enumerate(['Angry',
562
  'Sad',
563
  'Happy',
564
- 'Surprise',
565
  'Fear',
566
  'Disgust',
567
- 'Contempt',
568
  # 'Neutral'
569
  ]): # ASaHSuFDCN
570
  j = j + 3 # skip the A/D/V subplots
571
 
572
  # MIMIC3
573
 
574
- ax[j, 0].plot(time_stamp, preds[f'mimic3_{_str1}'][dim],
575
  color=(0,104/255,139/255),
576
  label='mean_1',
577
  linewidth=2)
578
  ax[j, 0].fill_between(time_stamp,
579
-
580
- preds[f'mimic3_{_str2}'][dim],
581
  preds[f'styletts2_{_str2}'][dim],
582
-
583
- color=(.2,.2,.2),
584
- alpha=0.244)
585
- # ax[j, 0].legend(['StyleTTS2 style mimic3',
586
- # 'StyleTTS2 style crema-d'],
587
- # prop={'size': 10},
588
- # # loc='upper left'
589
- # )
590
-
591
-
592
  ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
593
 
594
  # TICKS
@@ -601,7 +601,7 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
601
  # MIMIC3 4x speed
602
 
603
 
604
- ax[j, 1].plot(time_stamp, preds[f'mimic3_{_str2}'][dim],
605
  color=(0,104/255,139/255),
606
  label='mean_1',
607
  linewidth=2)
@@ -609,9 +609,8 @@ for vox1, vox2 in voice_pairs: # 1 figure pro pair
609
 
610
  preds[f'mimic3_{_str2}'][dim],
611
  preds[f'styletts2_{_str2}'][dim],
612
-
613
- color=(.2,.2,.2),
614
- alpha=0.244)
615
  # ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
616
  # 'StyleTTS2 style crema-d'],
617
  # prop={'size': 10},
 
34
 
35
 
36
  # ================================================ LIST OF VOICES
37
+ ROOT_DIR = '/data/dkounadis/mimic3-voices/'
38
+ foreign_voices = []
39
+ english_voices = []
40
+ for lang in os.listdir(ROOT_DIR + 'voices'):
41
 
42
+ for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
43
+ if 'en_' in lang:
44
+
45
+ try:
46
+ with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
47
+ for spk in f:
48
+ english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
49
+ # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
50
+ except FileNotFoundError:
51
+ english_voices.append(lang + '/' + voice)
52
+
53
+ else:
54
 
55
+ try:
56
+ with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
57
+ for spk in f:
58
+ foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
59
 
60
+ except FileNotFoundError:
61
+ foreign_voices.append(lang + '/' + voice)
62
+ #
63
+ [print(i) for i in foreign_voices]
64
+ print('\n_______________________________\n')
65
+ [print(i) for i in english_voices]
66
  # ====================================================== LIST Mimic-3 ALL VOICES
67
  list_voices = [
68
  'en_US/m-ailabs_low#mary_ann',
69
  'en_UK/apope_low',
70
  'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
71
+ # 'ko_KO/kss_low',
72
+ 'fr_FR/m-ailabs_low#gilles_g_le_blanc',
73
+
74
+ #'human',
75
  ] # special - for human we load specific style file - no Mimic3 is run
76
 
77
 
 
296
  total_audio_mimic3 = []
297
  total_audio_styletts2 = []
298
  ix = 0
299
+ for list_of_10 in harvard_individual_sentences[:4]: # 77
300
 
301
  text = ' '.join(list_of_10['sentences'])
302
 
 
315
  f'<prosody rate=\'{rate}\'>'
316
  f'<voice name=\'{_voice}\'>'
317
  '<s>'
318
+ f'{text[:-1] + ", .. !!!"}'
319
  '</s>'
320
  '</voice>'
321
  '</prosody>'
 
356
  # style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
357
  # --
358
  # MSP['emotion.test-1'].get().sort_values('valence').index[-1]
359
+ # style_path = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0220_0870.wav'
360
+ # --
361
+ style_path = '/cache/audb/librispeech/3.1.0/fe182b91/test-clean/3575/170457/3575-170457-0024.wav'
362
  x, fs = audiofile.read(style_path) # ensure it is not very short - roughly equal to the Harvard sentence length
363
  print(x.shape,' human') # crop human to almost mimic-3 duration
364
  total_audio_mimic3.append(x)
 
431
 
432
 
433
 
434
+
435
  print('\nVisuals\n')
436
 
437
  # ===============================================================================
 
480
  p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
481
  vis_df[k] = p
482
  preds = vis_df
483
+ fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(24, 19.2), gridspec_kw={'hspace': 0, 'wspace': .04})
484
 
485
 
486
  # ADV - subplots
487
 
488
+ time_stamp = preds[f'mimic3_{_str1}'].index.to_numpy()
489
  for j, dim in enumerate(['arousal',
490
  'dominance',
491
  'valence']):
492
 
493
  # MIMIC3
494
 
495
+ ax[j, 0].plot(time_stamp,
496
+ # np.ones_like(time_stamp) * .4, --> to find the line on the legend
497
+ preds[f'styletts2_{_str1}'][dim], # THIS IS THE BLUE LINE VERIFIED
498
  color=(0,104/255,139/255),
499
  label='mean_1',
500
  linewidth=2)
501
+ # ax[j, 0].plot(time_stamp, preds[f'styletts2_{_str1}'][dim],
502
+ # color=(.2, .2, .2),
503
+ # label='mean_1',
504
+ # linewidth=2,
505
+ # marker='o')
506
  ax[j, 0].fill_between(time_stamp,
507
 
508
  preds[f'styletts2_{_str1}'][dim],
509
  preds[f'mimic3_{_str1}'][dim],
510
+ color=(.5,.5,.5),
511
+ alpha=.4
512
+ )
513
  if j == 0:
514
+ ax[j, 0].legend([f'StyleTTS2 using {_str1}',
515
+ f'mimic3_{_str1}'],
516
  prop={'size': 10},
517
  # loc='lower right'
518
  )
 
520
 
521
  # TICK
522
  ax[j, 0].set_ylim([1e-7, .9999])
 
 
523
  ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
524
  ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
525
 
 
527
  # MIMIC3 4x speed
528
 
529
 
530
+ ax[j, 1].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
531
  color=(0,104/255,139/255),
532
  label='mean_1',
533
  linewidth=2)
534
  ax[j, 1].fill_between(time_stamp,
 
 
535
  preds[f'mimic3_{_str2}'][dim],
536
+ preds[f'styletts2_{_str2}'][dim],
537
+ color=(.5,.5,.5),
538
+ alpha=.4)
539
  if j == 0:
540
+ ax[j, 1].legend([
541
+ f'StyleTTS2 using {_str2}',
542
+ f'mimic3_{_str2}'],
543
+ prop={'size': 10},
544
  # loc='lower right'
545
  )
546
 
 
570
  for j, dim in enumerate(['Angry',
571
  'Sad',
572
  'Happy',
573
+ # 'Surprise',
574
  'Fear',
575
  'Disgust',
576
+ # 'Contempt',
577
  # 'Neutral'
578
  ]): # ASaHSuFDCN
579
  j = j + 3 # skip the A/D/V subplots
580
 
581
  # MIMIC3
582
 
583
+ ax[j, 0].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
584
  color=(0,104/255,139/255),
585
  label='mean_1',
586
  linewidth=2)
587
  ax[j, 0].fill_between(time_stamp,
 
 
588
  preds[f'styletts2_{_str2}'][dim],
589
+ preds[f'mimic3_{_str2}'][dim],
590
+ color=(.5,.5,.5),
591
+ alpha=.4)
 
 
 
 
 
 
 
592
  ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
593
 
594
  # TICKS
 
601
  # MIMIC3 4x speed
602
 
603
 
604
+ ax[j, 1].plot(time_stamp, preds[f'styletts2_{_str2}'][dim],
605
  color=(0,104/255,139/255),
606
  label='mean_1',
607
  linewidth=2)
 
609
 
610
  preds[f'mimic3_{_str2}'][dim],
611
  preds[f'styletts2_{_str2}'][dim],
612
+ color=(.5,.5,.5),
613
+ alpha=.4)
 
614
  # ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
615
  # 'StyleTTS2 style crema-d'],
616
  # prop={'size': 10},