Dionyssos committed on
Commit
e154110
·
1 Parent(s): 08238e0

2 figs 1x/4x for libri

Browse files
Files changed (2) hide show
  1. tts_harvard.py +10 -10
  2. visualize_tts_plesantness.py +135 -129
tts_harvard.py CHANGED
@@ -1,7 +1,7 @@
1
  # Synthesize all Harvard Lists 77x lists of 10x sentences to single .wav
2
 
3
  # 1. using mimic3 english 1x/4x non-english 1x/4x
4
- # Use visualize_tts_plesantness.py for figs --> 4figs eng 1x 4x vs human non-eng 1x 4x vs human libri
5
 
6
  import soundfile
7
  import json
@@ -89,22 +89,22 @@ synthetic_wav_paths_foreign_4x = ['./mimic3_foreign_4x/' + i for i in os.listdir
89
 
90
 
91
 
92
- for audio_prompt in ['mimic3',
93
- 'mimic3_speed',
94
  'human',
95
  'foreign',
96
- 'foreign_speed']:
97
- if not os.path.isfile(f'{audio_prompt}_k.wav'):
98
  total_audio = []
99
  ix = 0
100
- for list_of_10 in harvard_individual_sentences[:2]:
101
  # long_sentence = ' '.join(list_of_10['sentences'])
102
  # harvard.append(long_sentence.replace('.', ' '))
103
  for text in list_of_10['sentences']:
104
- if audio_prompt == 'mimic3':
105
  style_vec = msinference.compute_style(
106
  synthetic_wav_paths[ix % 134])
107
- elif audio_prompt == 'mimic3_speed':
108
  style_vec = msinference.compute_style(
109
  synthetic_wav_paths_4x[ix % 134])
110
  elif audio_prompt == 'human':
@@ -113,7 +113,7 @@ for audio_prompt in ['mimic3',
113
  elif audio_prompt == 'foreign':
114
  style_vec = msinference.compute_style(
115
  synthetic_wav_paths_foreign[ix % 204])
116
- elif audio_prompt == 'foreign_speed':
117
  style_vec = msinference.compute_style(
118
  synthetic_wav_paths_foreign_4x[ix % 204])
119
  else:
@@ -133,7 +133,7 @@ for audio_prompt in ['mimic3',
133
  print('_____________________')
134
  # -- for 77x lists
135
  total_audio = np.concatenate(total_audio)
136
- soundfile.write(f'{audio_prompt}_k.wav', total_audio, 24000)
137
 
138
  else:
139
  print('\nALREADY EXISTS\n')
 
1
  # Synthesize all Harvard Lists 77x lists of 10x sentences to single .wav
2
 
3
  # 1. using mimic3 english 1x/4x non-english 1x/4x
4
+ # Run visualize_tts_plesantness.py afterwards for the 4 figs [English 1x/4x vs human, non-English 1x/4x vs human (LibriSpeech)]
5
 
6
  import soundfile
7
  import json
 
89
 
90
 
91
 
92
+ for audio_prompt in ['english',
93
+ 'english_4x',
94
  'human',
95
  'foreign',
96
+ 'foreign_4x']:
97
+ if not os.path.isfile(f'{audio_prompt}_z.wav'):
98
  total_audio = []
99
  ix = 0
100
+ for list_of_10 in harvard_individual_sentences[:10000]:
101
  # long_sentence = ' '.join(list_of_10['sentences'])
102
  # harvard.append(long_sentence.replace('.', ' '))
103
  for text in list_of_10['sentences']:
104
+ if audio_prompt == 'english':
105
  style_vec = msinference.compute_style(
106
  synthetic_wav_paths[ix % 134])
107
+ elif audio_prompt == 'english_4x':
108
  style_vec = msinference.compute_style(
109
  synthetic_wav_paths_4x[ix % 134])
110
  elif audio_prompt == 'human':
 
113
  elif audio_prompt == 'foreign':
114
  style_vec = msinference.compute_style(
115
  synthetic_wav_paths_foreign[ix % 204])
116
+ elif audio_prompt == 'foreign_4x':
117
  style_vec = msinference.compute_style(
118
  synthetic_wav_paths_foreign_4x[ix % 204])
119
  else:
 
133
  print('_____________________')
134
  # -- for 77x lists
135
  total_audio = np.concatenate(total_audio)
136
+ soundfile.write(f'{audio_prompt}_z.wav', total_audio, 24000)
137
 
138
  else:
139
  print('\nALREADY EXISTS\n')
visualize_tts_plesantness.py CHANGED
@@ -9,6 +9,13 @@
9
  # human_770.wav
10
  # mimic3_770.wav
11
  # mimic3_speedup_770.wav
 
 
 
 
 
 
 
12
  import pandas as pd
13
  import os
14
 
@@ -80,13 +87,7 @@ def _sigmoid(x):
80
  # wavs are generated concat and plot time-series?
81
 
82
  # for mimic3/mimic3speed/human - concat all 77 and run timeseries with 7s hop 3s
83
- for long_audio in [
84
- 'mimic3_k.wav',
85
- 'mimic_speed_k.wav',
86
- 'human_k.wav'
87
- 'foreign_k.wav',
88
- 'foreign_speed_k.wav',
89
- ]:
90
  file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
91
  if not os.path.exists(file_interface):
92
 
@@ -241,6 +242,9 @@ for long_audio in [
241
  else:
242
  print(file_interface, 'FOUND')
243
  # df_pred = pd.read_pickle(file_interface)
 
 
 
244
  # ===============================================================================
245
  # V I S U A L S by loading all 3 pkl - mimic3 - speedup - human pd
246
  #
@@ -249,13 +253,7 @@ for long_audio in [
249
 
250
  preds = {}
251
  SHORTEST_PD = 100000 # segments
252
- for long_audio in [
253
- # 'mimic3.wav',
254
- # 'mimic3_speedup.wav',
255
- 'human_770.wav', # 'mimic3_all_77.wav', #
256
- 'mimic3_770.wav',
257
- 'mimic3_speed_770.wav'
258
- ]:
259
  file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
260
  y = pd.read_pickle(file_interface)
261
  preds[long_audio] = y
@@ -273,169 +271,177 @@ for k,v in preds.items():
273
  p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
274
  preds[k] = p
275
 
276
- print(p, '\n\n\n\n \n')
277
-
 
278
 
279
- # Show plots by 2
280
 
281
- fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(4.6, 24), gridspec_kw={'hspace': 0, 'wspace': .04})
282
 
283
 
284
- # ADV
285
 
286
- time_stamp = preds['human_770.wav'].index.to_numpy()
287
- for j, dim in enumerate(['arousal',
288
- 'dominance',
289
- 'valence']):
290
 
291
- # MIMIC3
 
292
 
293
- ax[j, 0].plot(time_stamp, preds['mimic3_770.wav'][dim],
294
- color=(0,104/255,139/255),
295
- label='mean_1',
296
- linewidth=2)
297
- ax[j, 0].fill_between(time_stamp,
298
 
299
- preds['mimic3_770.wav'][dim],
300
- preds['human_770.wav'][dim],
301
 
302
- color=(.2,.2,.2),
303
- alpha=0.244)
304
- if j == 0:
305
- ax[j, 0].legend(['StyleTTS2 style mimic3',
306
- 'StyleTTS2 style crema-d'],
307
- prop={'size': 10},
308
- # loc='lower right'
309
- )
310
- ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
311
-
312
- # TICK
313
- ax[j, 0].set_ylim([1e-7, .9999])
314
- # ax[j, 0].set_yticks([.25, .5,.75])
315
- # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
316
- ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
317
- ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
318
 
319
 
320
- # MIMIC3 4x speed
321
 
322
 
323
- ax[j, 1].plot(time_stamp, preds['mimic3_speed_770.wav'][dim],
324
- color=(0,104/255,139/255),
325
- label='mean_1',
326
- linewidth=2)
327
- ax[j, 1].fill_between(time_stamp,
328
 
329
- preds['mimic3_speed_770.wav'][dim],
330
- preds['human_770.wav'][dim],
331
 
332
- color=(.2,.2,.2),
333
- alpha=0.244)
334
- if j == 0:
335
- ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
336
- 'StyleTTS2 style crema-d'],
337
- prop={'size': 10},
338
- # loc='lower right'
339
- )
340
 
341
 
342
- ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)')
343
 
344
 
345
 
346
- # TICK
347
- ax[j, 1].set_ylim([1e-7, .9999])
348
- # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
349
- ax[j, 1].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
350
- ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
351
 
352
 
353
 
354
 
355
- ax[j, 0].grid()
356
- ax[j, 1].grid()
357
- # CATEGORIE
358
 
359
 
360
 
361
 
362
 
363
- time_stamp = preds['human_770.wav'].index.to_numpy()
364
- for j, dim in enumerate(['Angry',
365
- 'Sad',
366
- 'Happy',
367
- # 'Surprise',
368
- 'Fear',
369
- 'Disgust',
370
- # 'Contempt',
371
- # 'Neutral'
372
- ]): # ASaHSuFDCN
373
- j = j + 3 # skip A/D/V suplt
374
 
375
- # MIMIC3
376
 
377
- ax[j, 0].plot(time_stamp, preds['mimic3_770.wav'][dim],
378
- color=(0,104/255,139/255),
379
- label='mean_1',
380
- linewidth=2)
381
- ax[j, 0].fill_between(time_stamp,
382
 
383
- preds['mimic3_770.wav'][dim],
384
- preds['human_770.wav'][dim],
385
 
386
- color=(.2,.2,.2),
387
- alpha=0.244)
388
- # ax[j, 0].legend(['StyleTTS2 style mimic3',
389
- # 'StyleTTS2 style crema-d'],
390
- # prop={'size': 10},
391
- # # loc='upper left'
392
- # )
393
 
394
 
395
- ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
396
 
397
- # TICKS
398
- ax[j, 0].set_ylim([1e-7, .9999])
399
- ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
400
- ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
401
- ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
402
 
403
 
404
- # MIMIC3 4x speed
405
 
406
 
407
- ax[j, 1].plot(time_stamp, preds['mimic3_speed_770.wav'][dim],
408
- color=(0,104/255,139/255),
409
- label='mean_1',
410
- linewidth=2)
411
- ax[j, 1].fill_between(time_stamp,
412
 
413
- preds['mimic3_speed_770.wav'][dim],
414
- preds['human_770.wav'][dim],
415
 
416
- color=(.2,.2,.2),
417
- alpha=0.244)
418
- # ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
419
- # 'StyleTTS2 style crema-d'],
420
- # prop={'size': 10},
421
- # # loc='upper left'
422
- # )
423
- ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
424
- ax[j, 1].set_ylim([1e-7, .999])
425
- # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
426
- ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
427
- ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
428
-
429
 
430
 
431
 
432
 
433
 
434
- ax[j, 0].grid()
435
- ax[j, 1].grid()
436
 
437
 
438
 
439
- plt.savefig(f'fig8.pdf', bbox_inches='tight')
440
- plt.close()
441
 
 
9
  # human_770.wav
10
  # mimic3_770.wav
11
  # mimic3_speedup_770.wav
12
+ FULL_WAV = [
13
+ 'english_z.wav',
14
+ 'english_4x_z.wav',
15
+ 'human_z.wav',
16
+ 'foreign_z.wav',
17
+ 'foreign_4x_z.wav',
18
+ ]
19
  import pandas as pd
20
  import os
21
 
 
87
  # wavs are generated concat and plot time-series?
88
 
89
  # for mimic3/mimic3speed/human - concat all 77 and run timeseries with 7s hop 3s
90
+ for long_audio in FULL_WAV:
 
 
 
 
 
 
91
  file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
92
  if not os.path.exists(file_interface):
93
 
 
242
  else:
243
  print(file_interface, 'FOUND')
244
  # df_pred = pd.read_pickle(file_interface)
245
+
246
+
247
+
248
  # ===============================================================================
249
  # V I S U A L S by loading the pkl of every wav in FULL_WAV - english / foreign (1x and 4x) - human pd
250
  #
 
253
 
254
  preds = {}
255
  SHORTEST_PD = 100000 # segments
256
+ for long_audio in FULL_WAV:
 
 
 
 
 
 
257
  file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
258
  y = pd.read_pickle(file_interface)
259
  preds[long_audio] = y
 
271
  p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
272
  preds[k] = p
273
 
274
+ # print(p, '\n\n\n\n \n')
275
+
276
+ print(preds.keys(),'p')
277
 
 
278
 
 
279
 
280
 
281
+ # 2 PLOTS
282
 
283
+ for lang in ['english',
284
+ 'foreign']:
285
+
 
286
 
287
+ fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(21, 24),
288
+ gridspec_kw={'hspace': 0, 'wspace': .04})
289
 
 
 
 
 
 
290
 
291
+
 
292
 
293
+ time_stamp = preds['human_z.wav'].index.to_numpy()
294
+ for j, dim in enumerate(['arousal',
295
+ 'dominance',
296
+ 'valence']):
297
+
298
+ # MIMIC3
299
+
300
+ ax[j, 0].plot(time_stamp, preds[f'{lang}_z.wav'][dim],
301
+ color=(0,104/255,139/255),
302
+ label='mean_1',
303
+ linewidth=2)
304
+ ax[j, 0].fill_between(time_stamp,
305
+
306
+ preds[f'{lang}_z.wav'][dim],
307
+ preds['human_z.wav'][dim],
308
+
309
+ color=(.2,.2,.2),
310
+ alpha=0.244)
311
+ if j == 0:
312
+ ax[j, 0].legend([f'StyleTTS2 using {lang}',
313
+ f'StyleTTS2 uising LibriSpeech'],
314
+ prop={'size': 10},
315
+ )
316
+ ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
317
+
318
+ # TICK
319
+ ax[j, 0].set_ylim([1e-7, .9999])
320
+ # ax[j, 0].set_yticks([.25, .5,.75])
321
+ # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
322
+ ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
323
+ ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
324
 
325
 
326
+ # MIMIC3 4x speed
327
 
328
 
329
+ ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_z.wav'][dim],
330
+ color=(0,104/255,139/255),
331
+ label='mean_1',
332
+ linewidth=2)
333
+ ax[j, 1].fill_between(time_stamp,
334
 
335
+ preds[f'{lang}_4x_z.wav'][dim],
336
+ preds['human_z.wav'][dim],
337
 
338
+ color=(.2,.2,.2),
339
+ alpha=0.244)
340
+ if j == 0:
341
+ ax[j, 1].legend([f'StyleTTS2 using {lang} 4x speed',
342
+ f'StyleTTS2 using LibriSpeech'],
343
+ prop={'size': 10},
344
+ # loc='lower right'
345
+ )
346
 
347
 
348
+ ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)')
349
 
350
 
351
 
352
+ # TICK
353
+ ax[j, 1].set_ylim([1e-7, .9999])
354
+ # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
355
+ ax[j, 1].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
356
+ ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
357
 
358
 
359
 
360
 
361
+ ax[j, 0].grid()
362
+ ax[j, 1].grid()
363
+ # CATEGORIES (Angry, Sad, Happy, Fear, Disgust)
364
 
365
 
366
 
367
 
368
 
369
+ time_stamp = preds['human_z.wav'].index.to_numpy()
370
+ for j, dim in enumerate(['Angry',
371
+ 'Sad',
372
+ 'Happy',
373
+ # 'Surprise',
374
+ 'Fear',
375
+ 'Disgust',
376
+ # 'Contempt',
377
+ # 'Neutral'
378
+ ]): # ASaHSuFDCN
379
+ j = j + 3 # skip A/D/V suplt
380
 
381
+ # MIMIC3
382
 
383
+ ax[j, 0].plot(time_stamp, preds[f'{lang}_z.wav'][dim],
384
+ color=(0,104/255,139/255),
385
+ label='mean_1',
386
+ linewidth=2)
387
+ ax[j, 0].fill_between(time_stamp,
388
 
389
+ preds[f'{lang}_z.wav'][dim],
390
+ preds['human_z.wav'][dim],
391
 
392
+ color=(.2,.2,.2),
393
+ alpha=0.244)
394
+ # ax[j, 0].legend(['StyleTTS2 style mimic3',
395
+ # 'StyleTTS2 style crema-d'],
396
+ # prop={'size': 10},
397
+ # # loc='upper left'
398
+ # )
399
 
400
 
401
+ ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
402
 
403
+ # TICKS
404
+ ax[j, 0].set_ylim([1e-7, .9999])
405
+ ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
406
+ ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
407
+ ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
408
 
409
 
410
+ # MIMIC3 4x speed
411
 
412
 
413
+ ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_z.wav'][dim],
414
+ color=(0,104/255,139/255),
415
+ label='mean_1',
416
+ linewidth=2)
417
+ ax[j, 1].fill_between(time_stamp,
418
 
419
+ preds[f'{lang}_4x_z.wav'][dim],
420
+ preds['human_z.wav'][dim],
421
 
422
+ color=(.2,.2,.2),
423
+ alpha=0.244)
424
+ # ax[j, 1].legend(['StyleTTS2 style mimic3 4x speed',
425
+ # 'StyleTTS2 style crema-d'],
426
+ # prop={'size': 10},
427
+ # # loc='upper left'
428
+ # )
429
+ ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
430
+ ax[j, 1].set_ylim([1e-7, .999])
431
+ # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
432
+ ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
433
+ ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
434
+
435
 
436
 
437
 
438
 
439
 
440
+ ax[j, 0].grid()
441
+ ax[j, 1].grid()
442
 
443
 
444
 
445
+ plt.savefig(f'fig_{lang}_z.pdf', bbox_inches='tight')
446
+ plt.close()
447