Dionyssos committed
Commit bb2cd38 · 1 Parent(s): f1368b1

fx live demo
README.md CHANGED
@@ -74,7 +74,7 @@ Following examples need `api.py` to be running. [Set this IP](https://huggingfac
 The following needs `api.py` to be already running on a tmux session.
 
 ```python
-# TTS & soundscape - overlay to .mp4
+# TTS & soundscape - output .mp4 saved in ./out/
 python landscape2soundscape.py
 ```
 
@@ -94,44 +94,36 @@ For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
 
 [![06](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____06_Menzel_AI900_001.jpg)](https://youtu.be/3M0y9OYzDfU)
 
-[![07](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____07_Courbet_AI967_001.jpg)](https://youtu.be/OBY666_By1k)
+[![07](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____07_Courbet_AI967_001.jpg)](https://youtu.be/56MH7zOHrNQ)
 
 [![08](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____08_Monet_AI1013_001.jpg)](https://youtu.be/gnGCYLcdLsA)
 
 [![10](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____10_Boecklin_967648_NG2-80_001_rsz.jpg)](https://www.youtube.com/watch?v=Y8QyYUgLaCg)
 
-[![11](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____11_Liebermann_NG4-94_001.jpg)](https://youtu.be/XDDzxDSrhb0)
+[![11](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____11_Liebermann_NG4-94_001.jpg)](https://youtu.be/RhUuS9HMLhg)
 
-[![12](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____12_Slevogt_AII1022_001.jpg)](https://youtu.be/I3YYKiUzHpA)
+[![12](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____12_Slevogt_AII1022_001.jpg)](https://youtu.be/NzzhhrUeKVY)
 
 
 
-# SoundScape Live (iterative) Demo - Paplay
+# SoundScape Live Demo - Paplay
 
-Special Flask API for playing sounds live
+Flask API for playing sounds live
 
 ```python
-CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python live_api.py
+CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python api.py
```
 
-Client - Describe any sound with words and it will be played back to you.
+Describe any sound via text; the TTS & soundscape are played back.
 
 ```python
-python live_demo.py  # will ask text input & play soundscape
+python live_demo.py  # type text & plays AudioGen sound & TTS
 ```
 
-# SoundScape (basic) Demo
-
-```python
-CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python demo.py
-```
-
-##
-
 # Audiobook
 
-Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [v2](https://www.youtube.com/watch?v=Pzo-kKaNg6s) / [v2.1](https://www.youtube.com/watch?v=X4qlKBBaegM) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
+Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [v2](https://www.youtube.com/watch?v=Pzo-kKaNg6s) / [v2.1](https://www.youtube.com/watch?v=X4qlKBBaegM) / [no diffusion](https://www.youtube.com/watch?v=vahKXpd6oLg)
 
 ```python
 # audiobook will be saved in ./tts_audiobooks
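
Note: a minimal sketch of the client request behind `python live_demo.py`, with the URL and form fields copied from `live_demo.py` further down in this commit (the `192.168.88.209` address is the LAN IP hard-coded there); treat it as a repo-specific illustration, not a stable API.

```python
# Minimal client sketch; fields mirror live_demo.py in this commit.
import requests

DESCRIPTION = 'dogs barking in a thunderstorm'
with open('_tmp.txt', 'w') as f:
    f.write(DESCRIPTION)                     # server reads the text from this .txt

payload = {
    'text': '_tmp.txt',
    'voice': 'fr_FR_m-ailabs_bernard',
    'soundscape': DESCRIPTION,
    'affective': True,
    'speed': 1.14,
}
with open('_tmp.txt', 'rb') as f:
    resp = requests.post('http://192.168.88.209:5000',
                         data=payload, files=[('_tmp.txt', f)])

with open('_gen_.wav', 'wb') as f:
    f.write(resp.content)                    # play with: paplay _gen_.wav
```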
api.py CHANGED
@@ -21,18 +21,8 @@ NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape f
 
 sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
 
-
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
-
-
-# SSH AGENT
-# eval $(ssh-agent -s)
-# ssh-add ~/.ssh/id_ed25519_github2024
-#
-# git remote set-url origin git@github.com:audeering/shift
-# ==
-
 def _shorten(filename):
     return filename.replace("/","")[-6:]
 
@@ -83,9 +73,9 @@ def _shift(x):
 def overlay(x, soundscape=None):
 
     if soundscape is not None:
-
+
         # SOUNDS
-
+
         background = sound_generator.generate(
            [soundscape] * NUM_SOUND_GENERATIONS
            ).reshape(-1).detach().cpu().numpy()  # bs, 11400 @.74s
@@ -98,8 +88,10 @@ def overlay(x, soundscape=None):
         background = audresample.resample(
            background,
            original_rate=16000,  # sound_generator.sample_rate,
-           target_rate=24000)[0, :-25000]  # discard last samples as they have the splash sound / polarity change;
-
+           target_rate=24000)[0, :-25000]
+        # TODO discards last samples due to splash sound / polarity change / on long sounds ~ videos / NOT DROP FOR live_demo.py
+
+
         # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
 
@@ -132,14 +124,14 @@ def overlay(x, soundscape=None):
         # print(total[40000:70000].tolist())
         print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
 
-        # background = np.concatenate(n_repeat * [background])
-
-        # background = _shift(background)
+        # less periodic - cloned sounds
+        for _ in range(4):
+            background = _shift(background)
         # print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
         #       f'{np.abs(background.max())=}\n{x.shape=}')
         total /= np.abs(total).max() + 1e-7  # amplify speech to full [-1,1]
-        x = .4 * x + .6 * total[:len(x)]
-
+        x = .26 * x + .74 * total[:len(x)]
+
     else:
         print('sound_background = None')
     return x
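
Note on the `overlay()` change: the background is now circularly shifted four times before mixing, and the speech/soundscape weights moved from .4/.6 to .26/.74. Below is a self-contained sketch of the idea; `_roll` is a hypothetical stand-in for `_shift` (whose body is not part of this diff), and `total` is simplified to the normalized background track.

```python
# Sketch of the new mixing logic; _roll stands in for _shift (not shown in this diff).
import numpy as np

def _roll(x: np.ndarray) -> np.ndarray:
    # circular shift by a random offset: repeats of the same 4.74 s clip
    # no longer line up, so the looped background sounds less periodic
    return np.roll(x, np.random.randint(1, len(x)))

def mix(speech: np.ndarray, background: np.ndarray) -> np.ndarray:
    for _ in range(4):                        # matches the new for-loop in overlay()
        background = _roll(background)
    total = background / (np.abs(background).max() + 1e-7)
    # new weights: speech attenuated, soundscape dominant (was .4 * x + .6 * total)
    return .26 * speech + .74 * total[:len(speech)]
```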
audiocraft/transformer.py CHANGED
@@ -203,7 +203,7 @@ class StreamingMultiheadAttention(nn.Module):
 
         else:
             # init on 1st token (for all 47 transf layers)
-            print(f'else skip')
+            print(f'AudioGen kv cache Flush')
             self.k_history = k
             self.v_history = v
 
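Note: the renamed print marks the branch where `StreamingMultiheadAttention` re-initializes its key/value history on the first generated token. A minimal sketch of that KV-cache pattern, assuming `(batch, heads, time, dim)` tensors and concatenation along the time axis; it is not the exact audiocraft code.

```python
# KV-cache sketch: flush/init on the 1st token, append on later tokens.
import torch

class KVCache:
    def __init__(self):
        self.k_history = None
        self.v_history = None

    def step(self, k: torch.Tensor, v: torch.Tensor):
        if self.k_history is None:
            # 1st token: initialize ('AudioGen kv cache Flush' branch above)
            self.k_history, self.v_history = k, v
        else:
            # later tokens: grow the cache along the time axis
            self.k_history = torch.cat([self.k_history, k], dim=2)
            self.v_history = torch.cat([self.v_history, v], dim=2)
        return self.k_history, self.v_history
```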
engineer_style_vectors_v2.py DELETED
@@ -1,331 +0,0 @@
-
-from pathlib import Path
-import shutil
-import csv
-import io
-import os
-import typing
-import wave
-import sys
-from mimic3_tts.__main__ import (CommandLineInterfaceState,
-                                 get_args,
-                                 initialize_args,
-                                 initialize_tts,
-                                 # print_voices,
-                                 # process_lines,
-                                 shutdown_tts,
-                                 OutputNaming,
-                                 process_line)
-
-
-def process_lines(state: CommandLineInterfaceState, wav_path=None):
-    '''MIMIC3 INTERNAL CALL that yields the sigh sound'''
-
-    args = state.args
-
-    result_idx = 0
-    print(f'why waitings in the for loop LIN {state.texts=}\n')
-    for line in state.texts:
-        print(f'LIN {line=}\n')  # prints \n so is empty - not getting the predefined text of state.texts
-        line_voice: typing.Optional[str] = None
-        line_id = ""
-        line = line.strip()
-        # if not line:
-        #     continue
-
-        if args.output_naming == OutputNaming.ID:
-            # Line has the format id|text instead of just text
-            with io.StringIO(line) as line_io:
-                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
-                row = next(reader)
-                line_id, line = row[0], row[-1]
-                if args.csv_voice:
-                    line_voice = row[1]
-
-        process_line(line, state, line_id=line_id, line_voice=line_voice)
-        result_idx += 1
-
-    print('\nARRive at All Audio writing\n\n\n\n')
-    # -------------------------------------------------------------------------
-
-    # Write combined audio to stdout
-    if state.all_audio:
-        # _LOGGER.debug("Writing WAV audio to stdout")
-
-        if sys.stdout.isatty() and (not state.args.stdout):
-            with io.BytesIO() as wav_io:
-                wav_file_play: wave.Wave_write = wave.open(wav_io, "wb")
-                with wav_file_play:
-                    wav_file_play.setframerate(state.sample_rate_hz)
-                    wav_file_play.setsampwidth(state.sample_width_bytes)
-                    wav_file_play.setnchannels(state.num_channels)
-                    wav_file_play.writeframes(state.all_audio)
-
-                # play_wav_bytes(state.args, wav_io.getvalue())
-                # wav_path = '_direct_call_2.wav'
-                with open(wav_path, 'wb') as wav_file:
-                    wav_file.write(wav_io.getvalue())
-                    wav_file.seek(0)
-
-# -----------------------------------------------------------------------------
-# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
-# ======================================================================
-out_dir = 'assets/'
-reference_wav_directory = 'assets/wavs/style_vector_v2/'
-Path(reference_wav_directory).mkdir(parents=True, exist_ok=True)
-Path(out_dir).mkdir(parents=True, exist_ok=True)
-
-wav_dir = 'assets/wavs/'
-Path(wav_dir).mkdir(parents=True, exist_ok=True)
-N_PIX = 11
-
-
-# =======================================================================
-# S T A R T   G E N E R A T E   png/wav
-# =======================================================================
-
-NOISE_SCALE = .667
-NOISE_W = .9001  # .8 / .90001 - default .8 in __main__.py @ L697 IGNORED DUE TO ARTEFACTS - FOR NOW USE default
-
-# VCTK speakers (one id per line in the original; condensed here)
-a = ['p239', 'p236', 'p264', 'p250', 'p259', 'p247', 'p261', 'p263', 'p283', 'p274',
-     'p286', 'p276', 'p270', 'p281', 'p277', 'p231', 'p238', 'p271', 'p257', 'p273',
-     'p284', 'p329', 'p361', 'p287', 'p360', 'p374', 'p376', 'p310', 'p304', 'p340',
-     'p347', 'p330', 'p308', 'p314', 'p317', 'p339', 'p311', 'p294', 'p305', 'p266',
-     'p335', 'p334', 'p318', 'p323', 'p351', 'p333', 'p313', 'p316', 'p244', 'p307',
-     'p363', 'p336', 'p312', 'p267', 'p297', 'p275', 'p295', 'p288', 'p258', 'p301',
-     'p232', 'p292', 'p272', 'p278', 'p280', 'p341', 'p268', 'p298', 'p299', 'p279',
-     'p285', 'p326', 'p300', 's5', 'p230', 'p254', 'p269', 'p293', 'p252', 'p345',
-     'p262', 'p243', 'p227', 'p343', 'p255', 'p229', 'p240', 'p248', 'p253', 'p233',
-     'p228', 'p251', 'p282', 'p246', 'p234', 'p226', 'p260', 'p245', 'p241', 'p303',
-     'p265', 'p306', 'p237', 'p249', 'p256', 'p302', 'p364', 'p225', 'p362']
-
-print(len(a))
-
-b = []
-
-for row in a:
-    b.append(f'en_US/vctk_low#{row}')
-
-# print(b)
-
-# 00000000 arctic
-
-a = ['awb',  # original lacked this comma, concatenating 'awb' and 'rms' into 'awbrms'
-     'rms', 'slt', 'ksp', 'clb', 'aew', 'bdl', 'lnh', 'jmk',
-     'rxr', 'fem', 'ljm', 'slp', 'ahw', 'axb', 'aup', 'eey', 'gka']
-
-for row in a:
-    b.append(f'en_US/cmu-arctic_low#{row}')
-
-# HIFItts
-
-a = ['9017', '6097', '92']
-
-for row in a:
-    b.append(f'en_US/hifi-tts_low#{row}')
-
-a = ['elliot_miller', 'judy_bieber', 'mary_ann']
-
-for row in a:
-    b.append(f'en_US/m-ailabs_low#{row}')
-
-# LJspeech - single speaker
-
-b.append('en_US/ljspeech_low')
-
-# en_UK apope - only speaker
-
-b.append('en_UK/apope_low')
-
-all_names = b
-
-
-VOICES = {}
-for _id, _voice in enumerate(all_names):
-
-    # If GitHub quota exceeded, copy mimic-voices from local copies
-    #
-    # https://github.com/MycroftAI/mimic3-voices
-    #
-    home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
-    Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
-    speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
-    if not os.path.isfile(home_voice_dir + 'generator.onnx'):
-        shutil.copyfile(
-            f'/data/dkounadis/cache/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
-            home_voice_dir + 'generator.onnx')  # 'en_US incl. voice
-
-    prepare_file = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
-    if 'cmu-arctic' in prepare_file:
-        prepare_file = prepare_file.replace('cmu-arctic', 'cmu_arctic') + '.wav'
-    else:
-        prepare_file = prepare_file + '.wav'  # [...cmu-arctic...](....cmu_arctic....wav)
-
-    file_true = prepare_file.split('.wav')[0] + '_true_.wav'
-    file_false = prepare_file.split('.wav')[0] + '_false_.wav'
-    print(prepare_file, file_false, file_true)
-
-    reference_wav = reference_wav_directory + prepare_file
-    rate = 4  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
-    _ssml = (
-        '<speak>'
-        '<prosody volume=\'64\'>'
-        f'<prosody rate=\'{rate}\'>'
-        f'<voice name=\'{_voice}\'>'
-        '<s>'
-        'Sweet dreams are made of this, .. !!! # I travel the world and the seven seas.'
-        '</s>'
-        '</voice>'
-        '</prosody>'
-        '</prosody>'
-        '</speak>'
-    )
-    with open('_tmp_ssml.txt', 'w') as f:
-        f.write(_ssml)
-
-    # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
-    # ps.wait()  # using ps to call mimic3 because samples don't have time to be written in stdout buffer
-    args = get_args()
-    args.ssml = True
-    args.text = [_ssml]  # ['aa', 'bb']  # txt
-    args.interactive = False
-    # args.output_naming = OutputNaming.TIME
-
-    state = CommandLineInterfaceState(args=args)
-    initialize_args(state)
-    initialize_tts(state)
-    # args.texts = [txt]  # ['aa', 'bb']  # txt
-    # state.stdout = '.'  # None  # 'makeme.wav'
-    # state.output_dir = '.noopy'
-    # state.interactive = False
-    # state.output_naming = OutputNaming.TIME
-    # # state.ssml = 1234546575
-    # state.stdout = True
-    # state.tts = True
-    process_lines(state, wav_path=reference_wav)
-    shutdown_tts(state)

landscape2soundscape.py CHANGED
@@ -8,10 +8,6 @@ import cv2
 # yt-dlp https://www.youtube.com/watch?v=UZ9uyQI3pF0 --downloader ffmpeg --downloader-args "ffmpeg_i:-ss 997 -to 2512"
 # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wav -ac 1
 # https://superuser.com/questions/583393/how-to-extract-subtitle-from-video-using-ffmpeg
-# ___________________________________________________________________________________________________
-# VIDEO FROM IMAGE with CAPTIONS
-#
-# UPLOAD to: Simaviro: Documents General WORK PACKAGES WP1 ContentRepository ANBPR_ROMANIA TTSvideos
 # __________________________________________________________________________________________________
 # TO DOWNLOAD SRT for youtube
 # yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
@@ -19,12 +15,8 @@ import cv2
 # _voice = 'en_US/cmu-arctic_low#lnh'  # 'en_US/vctk_low#p249'  # 'en_US/vctk_low#p282'
 # _voice = 'en_US/vctk_low#p351'
 # _voice = 'en_US/vctk_low#p351'  # avoid 318 it does the ghhhhhh
-# _voice = 'en_US/m-ailabs_low#judy_bieber'  # Nice voice for ('Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].mkv' 'Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].en-GB.srt'),
-# _voice = 'en_UK/apope_low'
-# _voice = 'en_US/m-ailabs_low#mary_ann'
 # _voice = 'en_US/vctk_low#p351'
 # _voice = 'en_US/hifi-tts_low#92'
-# voice_str = f'_{_voice.replace("/", "")}'
 
 
 
@@ -47,7 +39,7 @@ DESCRIPTIONS = [
     '01_Schick_AII840_001.txt',  # text
     'statue in shire, hill river, vogels.',  # audiocraft
     'G. Schick, Bildnis der Heinrike Dannecker, 1802',  # cv2 puttext title
-    'en_US/vctk_low#p326',  # 'en_US/m-ailabs_low#judy_bieber',  # 'en_US/m-ailabs_low#mary_ann',
+    'fr_FR_m-ailabs_bernard',  # 'en_US/m-ailabs_low#judy_bieber',  # 'en_US/m-ailabs_low#mary_ann',
     ],
     # 2
     [
@@ -65,7 +57,7 @@ DESCRIPTIONS = [
     'K. Schinkel Gotische Kirche Auf Einem Felsen 1815',
     'en_US/hifi-tts_low#6097',
     ],
-    #
+    # 4
     [
     '04_Friedrich_FV317_001.jpg',
     '04_Friedrich_FV317_001.txt',
@@ -73,7 +65,7 @@ DESCRIPTIONS = [
     'C. D. Friedrich, Der Watzmann, 1824',
     'en_US/m-ailabs_low#mary_ann',
     ],
-    #
+    # 5
     [
     '05_Blechen_FV40_001.jpg',
     '05_Blechen_FV40_001.txt',
@@ -95,7 +87,7 @@ DESCRIPTIONS = [
     '07_Courbet_AI967_001.txt',
     'Storm at the strand of waves Tsunami',
     'G. Courbet, Die Welle, 1870',
-    'en_US/m-ailabs_low#mary_ann',
+    'af_ZA_google-nwu_0184',
     ],
     # 8
     [
@@ -125,7 +117,7 @@ DESCRIPTIONS = [
     [
     '11_Liebermann_NG4-94_001.jpg',
     '11_Liebermann_NG4-94_001.txt',
-    'Tavern at the waterfront',
+    'Tavern and shrine and people talking glass plates drink',
     'M. Tiebermann, Gartenlokal An Der Havel Nikolskoe, 1916',
     'en_US/cmu-arctic_low#ljm',
     ],
@@ -135,7 +127,7 @@ DESCRIPTIONS = [
     '12_Slevogt_AII1022_001.txt',
     'sailing yachts pool fluss',
     'M. Slevogt, Segelboote Auf Der Alster Am Abend, 1905',
-    'en_US/m-ailabs_low#mary_ann',
+    'jv_ID_google-gmu_06207',
     ],
 ]
 
@@ -146,7 +138,7 @@ SILENT_VIDEO = '_silent_video.mp4'
 # SILENT CLIP
 
 
-for img, text, soundscape, title, voice in DESCRIPTIONS[2:4]:
+for img, text, soundscape, title, voice in DESCRIPTIONS:  # [2:4]:
 
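Note: each `DESCRIPTIONS` entry is the 5-element list unpacked by the loop above; with the slice removed, every painting is rendered. The field meanings below are taken from the inline comments of entry 1.

```python
# Schema of one DESCRIPTIONS entry, per the loop's unpacking:
entry = [
    '01_Schick_AII840_001.jpg',                         # img: painting image
    '01_Schick_AII840_001.txt',                         # text: narration source
    'statue in shire, hill river, vogels.',             # soundscape: AudioGen prompt
    'G. Schick, Bildnis der Heinrike Dannecker, 1802',  # title: cv2.putText caption
    'fr_FR_m-ailabs_bernard',                           # voice: TTS voice id
]
img, text, soundscape, title, voice = entry
```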
live_api.py DELETED
@@ -1,135 +0,0 @@
-
-# -*- coding: utf-8 -*-
-import numpy as np
-import soundfile
-import audresample
-import text_utils
-
-import re
-import subprocess
-import markdown
-import json
-from pathlib import Path
-from types import SimpleNamespace
-from flask import Flask, request, send_from_directory
-from flask_cors import CORS
-from audiocraft.builders import AudioGen  # , audio_write
-NUM_SOUND_GENERATIONS = 1  # they differ a lot and are unnatural to concatenate, prefer lm.n_draw
-sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
-
-
-# ====STYLE VECTOR====
-
-
-# AFFECTIVE = True
-# VOICE = 'en_UK/apope_low'  # en_US/m-ailabs_low#mary_ann
-
-# _dir = '/' if AFFECTIVE else '_v2/'
-# precomputed_style_vector = msinference.compute_style(
-#     'assets/wavs/style_vector' + _dir + VOICE.replace(
-#         '/', '_').replace(
-#         '#', '_').replace(
-#         'cmu-arctic', 'cmu_arctic').replace(
-#         '_low', '') + '.wav')
-# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
-
-
-# ==== STYLE VECTOR
-
-CACHE_DIR = 'flask_cache/'
-Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
-
-
-def tts_multi_sentence(scene=None):
-    if scene is not None and len(scene) >= 4:
-        print(f'Processing: {scene} ..')
-        # x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
-        x = sound_generator.generate(
-            [scene] * NUM_SOUND_GENERATIONS
-        ).reshape(1, -1).detach().cpu().numpy()  # bs, 11400
-
-        x /= np.abs(x).max() + 1e-7
-        # is 16kHz - AudioGen Fs
-        x = audresample.resample(x,
-                                 original_rate=16000,
-                                 target_rate=24000)[0, :]
-
-        print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
-    else:
-        print(scene, '\nDrop\n')
-        x = np.zeros(400)
-
-    # # StyleTTS2
-    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
-    #     assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
-    #     x = []
-    #     for _sentence in text:
-    #         x.append(msinference.inference(_sentence,
-    #                                        precomputed_style_vector,
-    #                                        alpha=0.3,
-    #                                        beta=0.7,
-    #                                        diffusion_steps=7,
-    #                                        embedding_scale=1))
-    #     x = np.concatenate(x)

-    # return overlay(x, sound_background)
-
-    return x
-
-
-app = Flask(__name__)
-cors = CORS(app)
-
-
-@app.route("/")
-def index():
-    with open('README.md', 'r') as f:
-        return markdown.markdown(f.read())
-
-
-@app.route("/", methods=['GET', 'POST', 'PUT'])
-def serve_wav():
-    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
-    # object-into-a-representation-suitable-for-mongodb
-    r = request.form.to_dict(flat=False)
-
-    args = SimpleNamespace(
-        text=None if r.get('text') is None else r.get('text'),  # string not file?
-        scene=r.get('scene')[0]
-    )
-    # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
-
-    x = tts_multi_sentence(args.scene)
-
-    OUT_FILE = 'tmp.wav'
-    soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)
-
-    # send server's output as default file -> srv_result.xx
-    print(f'\n=SERVER saved as {OUT_FILE=}\n')
-    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
-    response.headers['suffix-file-type'] = OUT_FILE
-    return response
-
-
-if __name__ == "__main__":
-    app.run(host="0.0.0.0")

live_demo.py CHANGED
@@ -1,74 +1,48 @@
-import argparse
+# Asks for txt input, creates TTS and sound via AudioGen, plays it back
+# Needs paplay installed on the client - live_demo.py
+
 import os
 import requests
 import subprocess
-
-
-
-
-def command_line_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument(
-        '--affective',
-        help="Select Emotional or non-emotional variant of Available voices: https://audeering.github.io/shift/",
-        action='store_false',
-    )
-    parser.add_argument(
-        '--device',
-        help="Device ID",
-        type=str,
-        default='cpu',
-    )
-    parser.add_argument(
-        '--text',
-        help="Text to be synthesized.",
-        default='How is hoowl',
-        type=str,
-    )
-    return parser
+from types import SimpleNamespace
 
 def send_to_server(args):
     url = "http://192.168.88.209:5000"
 
     payload = {
         'text': args.text,
-        'scene': args.scene
+        'voice': args.voice,
+        'soundscape': args.soundscape,
+        'affective': True,
+        'image': None,
+        'video': None,
+        'speed': 1.14,
+        'native': None,
     }
 
-    response = requests.post(url, data=payload)  # NONEs do not arrive to server's dict
-
-    # # Check the response from the server
-    # if response.status_code == 200:
-    #     print("\nRequest was successful!")
-    #     # print("Response:", response.__dict__.keys(), '\n=====\n')
-
-    # else:
-    #     print("Failed to send the request")
-    #     print("Status Code:", response.status_code)
-    #     print("Response:", response.text)
-    return response
+    return requests.post(url, data=payload, files=[(args.text, open('_tmp.txt', 'rb'))])  # NONEs do not arrive to server's dict
 
-def cli():  # args.out_file is not sent to server - server writes tmp - copied by client
-    parser = command_line_args()
-    args = parser.parse_args()
-    os.system('cls' if os.name == 'nt' else 'clear')
-    while True:
-        args.text = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
-        # _text, _scene = args.text.split('|')
-        # args.text = _text
-        args.scene = args.text  # _scene
-        if len(args.text) >= 4:
-            response = send_to_server(args)
-            out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
-            with open(out_file, 'wb') as f:
-                f.write(response.content)
-            subprocess.run(["paplay", out_file])
-        else:
-            print(f'__\n{args.text}\n')
-
-if __name__ == '__main__':
-    cli()
+args = SimpleNamespace()
+args.voice = 'fr_FR_m-ailabs_bernard'  # 'en_US/m-ailabs_low#judy_bieber'
+args.speed = 1.14
+os.system('cls' if os.name == 'nt' else 'clear')
+while True:
+    _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
+    args.soundscape = _str
+
+    # extra duration for AudioGen to sound cool!!!!
+    if len(_str) < 20:
+        _str += 'Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata few silence for audiogen to impress you.'
+    args.text = '_tmp.txt'  # input -> .txt (implementation geared to audiobooks in the API)
+
+    with open(args.text, 'w') as f:
+        f.write(_str)
+    if len(_str) >= 4:
+        response = send_to_server(args)
+        out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
+        with open(out_file, 'wb') as f:
+            f.write(response.content)
+        subprocess.run(["paplay", out_file])
+    else:
+        print(f'__\n{_str}\n')
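
Note: the new loop pads short inputs with filler text so AudioGen gets a longer conditioning prompt. The same trick as a small helper; the name `pad_prompt` is ours and the filler is trimmed from the script's full string.

```python
# Padding trick from live_demo.py factored into a helper (pad_prompt is a
# hypothetical name; threshold 20 is the value used in the script).
FILLER = ('Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam '
          'nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat.')

def pad_prompt(s: str, min_len: int = 20) -> str:
    # very short descriptions leave AudioGen too little to condition on
    return s if len(s) >= min_len else s + FILLER
```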
models.py CHANGED
@@ -109,6 +109,9 @@ class ResBlk(nn.Module):
 
 
 class StyleEncoder(nn.Module):
+
+    # used for both acoustic & prosodic ref_s/p
+
     def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
         super().__init__()
         blocks = []
@@ -549,5 +552,5 @@ def build_model(args, text_aligner, pitch_extractor, bert):
         text_aligner = text_aligner,
         pitch_extractor=pitch_extractor
     )
-
-    return nets
+
+    return nets
msinference.py CHANGED
@@ -134,6 +134,7 @@ _ = [model[key].eval() for key in model]
 _ = [model[key].to(device) for key in model]
 
 # params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
+# params_whole = torch.load('freevc2/yl4579_styletts2.pth', map_location='cpu')
 params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 params = params_whole['net']
 
@@ -167,7 +168,7 @@ def inference(text,
     ps = global_phonemizer.phonemize([text])
     # print(f'PHONEMIZER: {ps=}\n\n')  # PHONEMIZER: ps=['ɐbˈɛbæbləm ']
     ps = word_tokenize(ps[0])
-    # print(f'TOKENIZER: {ps=}\n\n')  # TOKENIZER: ps=['ɐbˈɛbæbləm']
+    # # print(f'TOKENIZER: {ps=}\n\n')  # TOKENIZER: ps=['ɐbˈɛbæbləm']
     ps = ' '.join(ps)
     tokens = textclenaer(ps)
     # print(f'TEXTCLEAN: {ps=}\n\n')  # TEXTCLEAN: ps='ɐbˈɛbæbləm'
@@ -198,11 +199,13 @@ def inference(text,
     # print('BERTdu', bert_dur.shape, tokens.shape, '\n')  # bert what is the 768 per token -> IS USED in sampler
     # BERTdu torch.Size([1, 11, 768]) torch.Size([1, 11])
 
-
+
 
     ref = ref_s[:, :128]
     s = ref_s[:, 128:]
 
+    # s = .74 * s  # prosody / arousal & fading unvoiced syllables [x0.7 - x1.2]
+
     d = model.predictor.text_encoder(d_en,
                                      s, input_lengths, text_mask)