dkounadis
/

artificial-styletts2

@@ -1,7 +1,7 @@
 # Synthesize all Harvard Lists 77x lists of 10x sentences to single .wav
 # 1. using mimic3 english 1x/4x non-english 1x/4x
-# Use visualize_tts_plesantness.py for figs --> 4figs eng 1x 4x vs human  non-eng 1x 4x vs human libri
 import soundfile
 import json
@@ -89,22 +89,22 @@ synthetic_wav_paths_foreign_4x = ['./mimic3_foreign_4x/' + i for i in os.listdir
-for audio_prompt in ['mimic3',
-                     'mimic3_speed',
                      'human',
                      'foreign',
-                     'foreign_speed']:
-    if not os.path.isfile(f'{audio_prompt}_k.wav'):
                     total_audio = []
                     ix = 0
-                    for list_of_10 in harvard_individual_sentences[:2]:
                         # long_sentence = ' '.join(list_of_10['sentences'])
                         # harvard.append(long_sentence.replace('.', ' '))
                         for text in list_of_10['sentences']:
-                            if audio_prompt == 'mimic3':
                                 style_vec = msinference.compute_style(
                                     synthetic_wav_paths[ix % 134])
-                            elif audio_prompt == 'mimic3_speed':
                                 style_vec = msinference.compute_style(
                                     synthetic_wav_paths_4x[ix % 134])
                             elif audio_prompt == 'human':
@@ -113,7 +113,7 @@ for audio_prompt in ['mimic3',
                             elif audio_prompt == 'foreign':
                                 style_vec = msinference.compute_style(
                                     synthetic_wav_paths_foreign[ix % 204])
-                            elif audio_prompt == 'foreign_speed':
                                 style_vec = msinference.compute_style(
                                     synthetic_wav_paths_foreign_4x[ix % 204])
                             else:
@@ -133,7 +133,7 @@ for audio_prompt in ['mimic3',
                         print('_____________________')
                     # -- for 77x lists
                     total_audio = np.concatenate(total_audio)
-                    soundfile.write(f'{audio_prompt}_k.wav', total_audio, 24000)
     else:
         print('\nALREADY EXISTS\n')

 # Synthesize all Harvard Lists (77 lists of 10 sentences each) into a single .wav per style prompt
 # 1. style prompts: mimic3 english 1x/4x speed, mimic3 non-english 1x/4x speed, and human
+# Then call visualize_tts_plesantness.py for 4 figs [eng 1x/4x vs human, non-eng 1x/4x vs human-libri]
 import soundfile
 import json
+for audio_prompt in ['english',
+                     'english_4x',
                      'human',
                      'foreign',
+                     'foreign_4x']:
+    if not os.path.isfile(f'{audio_prompt}_z.wav'):
                     total_audio = []
                     ix = 0
+                    for list_of_10 in harvard_individual_sentences[:10000]:
                         # long_sentence = ' '.join(list_of_10['sentences'])
                         # harvard.append(long_sentence.replace('.', ' '))
                         for text in list_of_10['sentences']:
+                            if audio_prompt == 'english':
                                 style_vec = msinference.compute_style(
                                     synthetic_wav_paths[ix % 134])
+                            elif audio_prompt == 'english_4x':
                                 style_vec = msinference.compute_style(
                                     synthetic_wav_paths_4x[ix % 134])
                             elif audio_prompt == 'human':
                             elif audio_prompt == 'foreign':
                                 style_vec = msinference.compute_style(
                                     synthetic_wav_paths_foreign[ix % 204])
+                            elif audio_prompt == 'foreign_4x':
                                 style_vec = msinference.compute_style(
                                     synthetic_wav_paths_foreign_4x[ix % 204])
                             else:
                         print('_____________________')
                     # -- for 77x lists
                     total_audio = np.concatenate(total_audio)
+                    soundfile.write(f'{audio_prompt}_z.wav', total_audio, 24000)
     else:
         print('\nALREADY EXISTS\n')

visualize_tts_plesantness.py CHANGED Viewed

@@ -9,6 +9,13 @@
 # human_770.wav
 # mimic3_770.wav
 # mimic3_speedup_770.wav
 import pandas as pd
 import os
@@ -80,13 +87,7 @@ def _sigmoid(x):
 # wavs are generated concat and plot time-series?
 # for mimic3/mimic3speed/human - concat all 77 and run timeseries with 7s hop 3s
-for long_audio in [
-    'mimic3_k.wav',
-    'mimic_speed_k.wav',
-    'human_k.wav'
-    'foreign_k.wav',
-    'foreign_speed_k.wav',
-                    ]:
     file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
     if not os.path.exists(file_interface):
@@ -241,6 +242,9 @@ for long_audio in [
     else:
         print(file_interface, 'FOUND')
         # df_pred = pd.read_pickle(file_interface)
 # ===============================================================================
 # V I S U A L S by loading all 3 pkl - mimic3 - speedup - human pd
 #
@@ -249,13 +253,7 @@ for long_audio in [
 preds  = {}
 SHORTEST_PD = 100000  # segments
-for long_audio in [
-    # 'mimic3.wav',
-    #                 'mimic3_speedup.wav',
-                    'human_770.wav',  # 'mimic3_all_77.wav', #
-                    'mimic3_770.wav',
-                    'mimic3_speed_770.wav'
-                    ]:
     file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
     y = pd.read_pickle(file_interface)
     preds[long_audio] = y
@@ -273,169 +271,177 @@ for k,v in preds.items():
     p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
     preds[k] = p
-    print(p, '\n\n\n\n \n')
-# Show plots by 2
-fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(4.6, 24), gridspec_kw={'hspace': 0, 'wspace': .04})
-# ADV
-time_stamp = preds['human_770.wav'].index.to_numpy()
-for j, dim in enumerate(['arousal',
-                         'dominance',
-                         'valence']):
-    # MIMIC3
-    ax[j, 0].plot(time_stamp, preds['mimic3_770.wav'][dim],
-                color=(0,104/255,139/255),
-                label='mean_1',
-                linewidth=2)
-    ax[j, 0].fill_between(time_stamp,
-                    preds['mimic3_770.wav'][dim],
-                    preds['human_770.wav'][dim],
-                    color=(.2,.2,.2),
-                    alpha=0.244)
-    if j == 0:
-        ax[j, 0].legend(['StyleTTS2 style mimic3',
-                        'StyleTTS2 style crema-d'],
-                        prop={'size': 10},
-                        #  loc='lower right'
-                        )
-    ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
-    # TICK
-    ax[j, 0].set_ylim([1e-7, .9999])
-    # ax[j, 0].set_yticks([.25, .5,.75])
-    # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
-    ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
-    ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
-   # MIMIC3   4x speed
-    ax[j, 1].plot(time_stamp, preds['mimic3_speed_770.wav'][dim],
-                color=(0,104/255,139/255),
-                label='mean_1',
-                linewidth=2)
-    ax[j, 1].fill_between(time_stamp,
-                    preds['mimic3_speed_770.wav'][dim],
-                    preds['human_770.wav'][dim],
-                    color=(.2,.2,.2),
-                    alpha=0.244)
-    if j == 0:
-        ax[j, 1].legend(['StyleTTS2 style mimic3   4x speed',
-                        'StyleTTS2 style crema-d'],
-                        prop={'size': 10},
-                        #  loc='lower right'
-                        )
-    ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)')
-    # TICK
-    ax[j, 1].set_ylim([1e-7, .9999])
-    # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
-    ax[j, 1].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
-    ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
-    ax[j, 0].grid()
-    ax[j, 1].grid()
-# CATEGORIE
-time_stamp = preds['human_770.wav'].index.to_numpy()
-for j, dim in enumerate(['Angry',
-                         'Sad',
-                         'Happy',
-                        #  'Surprise',
-                         'Fear',
-                         'Disgust',
-                        #  'Contempt',
-                        #  'Neutral'
-                         ]):   # ASaHSuFDCN
-    j = j + 3  # skip A/D/V suplt
-    # MIMIC3
-    ax[j, 0].plot(time_stamp, preds['mimic3_770.wav'][dim],
-                color=(0,104/255,139/255),
-                label='mean_1',
-                linewidth=2)
-    ax[j, 0].fill_between(time_stamp,
-                    preds['mimic3_770.wav'][dim],
-                    preds['human_770.wav'][dim],
-                    color=(.2,.2,.2),
-                    alpha=0.244)
-    # ax[j, 0].legend(['StyleTTS2 style mimic3',
-    #                  'StyleTTS2 style crema-d'],
-    #                  prop={'size': 10},
-    #                 #  loc='upper left'
-    # )
-    ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
-    # TICKS
-    ax[j, 0].set_ylim([1e-7, .9999])
-    ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
-    ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
-    ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
-   # MIMIC3   4x speed
-    ax[j, 1].plot(time_stamp, preds['mimic3_speed_770.wav'][dim],
-                color=(0,104/255,139/255),
-                label='mean_1',
-                linewidth=2)
-    ax[j, 1].fill_between(time_stamp,
-                    preds['mimic3_speed_770.wav'][dim],
-                    preds['human_770.wav'][dim],
-                    color=(.2,.2,.2),
-                    alpha=0.244)
-    # ax[j, 1].legend(['StyleTTS2 style mimic3   4x speed',
-    #                  'StyleTTS2 style crema-d'],
-    #                  prop={'size': 10},
-    #                 #  loc='upper left'
-    # )
-    ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
-    ax[j, 1].set_ylim([1e-7, .999])
-    # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
-    ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
-    ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
-    ax[j, 0].grid()
-    ax[j, 1].grid()
-plt.savefig(f'fig8.pdf', bbox_inches='tight')
-plt.close()

 # human_770.wav
 # mimic3_770.wav
 # mimic3_speedup_770.wav
+FULL_WAV  = [
+    'english_z.wav',
+    'english_4x_z.wav',
+    'human_z.wav',
+    'foreign_z.wav',
+    'foreign_4x_z.wav',
+                    ]
 import pandas as pd
 import os
 # The long wavs are already generated (all 77 lists concatenated into one file each).
 # For every wav in FULL_WAV: run the timeseries with 7s hop 3s, unless its .pkl already exists.
+for long_audio in FULL_WAV:
     file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
     if not os.path.exists(file_interface):
     else:
         print(file_interface, 'FOUND')
         # df_pred = pd.read_pickle(file_interface)
 # ===============================================================================
 # V I S U A L S by loading the pkl of every wav in FULL_WAV (english / english_4x / human / foreign / foreign_4x)
 #
 preds  = {}
 SHORTEST_PD = 100000  # segments
+for long_audio in FULL_WAV:
     file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
     y = pd.read_pickle(file_interface)
     preds[long_audio] = y
     p.index = p.index.map(mapper = (lambda x: x.total_seconds()))
     preds[k] = p
+    # print(p, '\n\n\n\n \n')
+print(preds.keys(),'p')
+# 2 PLOTS
+for lang in ['english',
+             'foreign']:
+            fig, ax = plt.subplots(nrows=8, ncols=2, figsize=(21, 24),
+                                   gridspec_kw={'hspace': 0, 'wspace': .04})
+            time_stamp = preds['human_z.wav'].index.to_numpy()
+            for j, dim in enumerate(['arousal',
+                                    'dominance',
+                                    'valence']):
+                # MIMIC3
+                ax[j, 0].plot(time_stamp, preds[f'{lang}_z.wav'][dim],
+                            color=(0,104/255,139/255),
+                            label='mean_1',
+                            linewidth=2)
+                ax[j, 0].fill_between(time_stamp,
+                                preds[f'{lang}_z.wav'][dim],
+                                preds['human_z.wav'][dim],
+                                color=(.2,.2,.2),
+                                alpha=0.244)
+                if j == 0:
+                    ax[j, 0].legend([f'StyleTTS2 using {lang}',
+                                     f'StyleTTS2 uising LibriSpeech'],
+                                    prop={'size': 10},
+                                    )
+                ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
+                # TICK
+                ax[j, 0].set_ylim([1e-7, .9999])
+                # ax[j, 0].set_yticks([.25, .5,.75])
+                # ax[j, 0].set_yticklabels(['0.25', '.5', '0.75'])
+                ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+                ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+            # MIMIC3   4x speed
+                ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_z.wav'][dim],
+                            color=(0,104/255,139/255),
+                            label='mean_1',
+                            linewidth=2)
+                ax[j, 1].fill_between(time_stamp,
+                                preds[f'{lang}_4x_z.wav'][dim],
+                                preds['human_z.wav'][dim],
+                                color=(.2,.2,.2),
+                                alpha=0.244)
+                if j == 0:
+                    ax[j, 1].legend([f'StyleTTS2 using {lang} 4x speed',
+                                    f'StyleTTS2 using LibriSpeech'],
+                                    prop={'size': 10},
+                                    #  loc='lower right'
+                                    )
+                ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)')
+                # TICK
+                ax[j, 1].set_ylim([1e-7, .9999])
+                # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
+                ax[j, 1].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+                ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
+                ax[j, 0].grid()
+                ax[j, 1].grid()
+            # CATEGORIE
+            time_stamp = preds['human_z.wav'].index.to_numpy()
+            for j, dim in enumerate(['Angry',
+                                    'Sad',
+                                    'Happy',
+                                    #  'Surprise',
+                                    'Fear',
+                                    'Disgust',
+                                    #  'Contempt',
+                                    #  'Neutral'
+                                    ]):   # ASaHSuFDCN
+                j = j + 3  # skip A/D/V suplt
+                # MIMIC3
+                ax[j, 0].plot(time_stamp, preds[f'{lang}_z.wav'][dim],
+                            color=(0,104/255,139/255),
+                            label='mean_1',
+                            linewidth=2)
+                ax[j, 0].fill_between(time_stamp,
+                                preds[f'{lang}_z.wav'][dim],
+                                preds['human_z.wav'][dim],
+                                color=(.2,.2,.2),
+                                alpha=0.244)
+                # ax[j, 0].legend(['StyleTTS2 style mimic3',
+                #                  'StyleTTS2 style crema-d'],
+                #                  prop={'size': 10},
+                #                 #  loc='upper left'
+                # )
+                ax[j, 0].set_ylabel(dim.lower(), color=(.4, .4, .4), fontsize=14)
+                # TICKS
+                ax[j, 0].set_ylim([1e-7, .9999])
+                ax[j, 0].set_xlim([time_stamp[0], time_stamp[-1]])
+                ax[j, 0].set_xticklabels(['' for _ in ax[j, 0].get_xticklabels()])
+                ax[j, 0].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
+            # MIMIC3   4x speed
+                ax[j, 1].plot(time_stamp, preds[f'{lang}_4x_z.wav'][dim],
+                            color=(0,104/255,139/255),
+                            label='mean_1',
+                            linewidth=2)
+                ax[j, 1].fill_between(time_stamp,
+                                preds[f'{lang}_4x_z.wav'][dim],
+                                preds['human_z.wav'][dim],
+                                color=(.2,.2,.2),
+                                alpha=0.244)
+                # ax[j, 1].legend(['StyleTTS2 style mimic3   4x speed',
+                #                  'StyleTTS2 style crema-d'],
+                #                  prop={'size': 10},
+                #                 #  loc='upper left'
+                # )
+                ax[j, 1].set_xlabel('767 Harvard Sentences (seconds)', fontsize=16, color=(.4,.4,.4))
+                ax[j, 1].set_ylim([1e-7, .999])
+                # ax[j, 1].set_yticklabels(['' for _ in ax[j, 1].get_yticklabels()])
+                ax[j, 1].set_xticklabels(['' for _ in ax[j, 1].get_xticklabels()])
+                ax[j, 1].set_xlim([time_stamp[0], time_stamp[-1]])
+                ax[j, 0].grid()
+                ax[j, 1].grid()
+            plt.savefig(f'fig_{lang}_z.pdf', bbox_inches='tight')
+            plt.close()