scripts for landscape2soundscape

Browse files

Files changed (4) hide show

api.py +25 -20
landscape2soundscape.py +272 -0
msinference.py +1 -1
tts.py +3 -2

api.py CHANGED Viewed

@@ -38,18 +38,18 @@ def _shift(x):
     # x = x * fade_in
     return x
-def _background(x, sound_background=None):
     if sound_background is not None:
-        sound_background = sound_background[0, :]
         len_speech = len(x)
-        if len_speech < len(sound_background):
             n_repeat = len_speech // len(sound_background) + 1
             replica = [sound_background] * n_repeat
             replica = [_shift(_) for _ in replica]
             sound_background = np.concatenate(replica)
-        print(f'\nSOUND\nBACKGROUND\nSHAPE\n{sound_background=}\n{x.shape=}\n- - - -')
         x = .74 * x + .26 * sound_background[:len_speech]
     return x
@@ -90,7 +90,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
                                     embedding_scale=1))
         x = np.concatenate(x)
-        return _background(x, sound_background)
     # Fallback - Mimic-3
     text_utils.store_ssml(text=text, voice=voice)  # Text has to be list of single sentences
@@ -99,7 +99,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
     x, fs = soundfile.read('_tmp.wav')
     x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
-    return _background(x, sound_background)
@@ -131,14 +131,14 @@ def serve_wav():
     print('Saved all files on Server Side\n\n')
-    args = SimpleNamespace(text=None if r.get('text') is None else 'flask_cache/' + r.get('text')[0],
-                           video=None if r.get('video') is None else 'flask_cache/' + r.get('video')[0],
-                           image=None if r.get('image') is None else 'flask_cache/' + r.get('image')[0],
-                           voice=r.get('voice')[0],
-                           native=None if r.get('native') is None else 'flask_cache/' + r.get('native')[0],
-                           affective = r.get('affective')[0],
-                           scene=r.get('scene')[0]
-                                  )
     # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
@@ -202,7 +202,7 @@ def serve_wav():
                     '#', '_').replace(
                     'cmu-arctic', 'cmu_arctic').replace(
                     '_low', '') + '.wav')
-    print('\n  STYLE VECTOR \n', precomputed_style_vector)
     # ====SILENT VIDEO====
     if args.video is not None:
@@ -369,9 +369,9 @@ def serve_wav():
         # Fallback: No image nor video provided - do only tts
         x = tts_multi_sentence(text=text,
-                            precomputed_style_vector=precomputed_style_vector,
-                            voice=args.voice,
-                            scene=args.scene)
         OUT_FILE = './flask_cache/tmp.wav' #args.out_file + '.wav'
         soundfile.write(OUT_FILE, x, 24000)
@@ -388,8 +388,13 @@ def serve_wav():
     # response.headers["Content-Type"] = "audio/wav"
     # https://stackoverflow.com/questions/67591467/
     #            flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
-    response = send_from_directory('flask_cache/', path=OUT_FILE.split('/')[-1])
-    response.headers['suffix-file-type'] = OUT_FILE.split('/')[-1]
     return response

     # x = x * fade_in
     return x
+def overlay(x, sound_background=None):
+    """Mix a speech signal with an optional background sound.
+    x                : speech samples; its length defines the output length.
+    sound_background : optional background; when None, x is returned unchanged.
+                       presumably a torch tensor shaped (channels, samples) - TODO confirm
+    Returns .74 * x + .26 * background (background tiled / cropped to len(x)).
+    """
     if sound_background is not None:
+        # convert to numpy and keep only the first row
+        sound_background = sound_background.detach().cpu().numpy()[0, :]
         len_speech = len(x)
+        # background shorter than speech: repeat it until it covers the speech
+        if len_speech > len(sound_background):
             n_repeat = len_speech // len(sound_background) + 1
             replica = [sound_background] * n_repeat
+            # every copy is passed through _shift before concatenation
             replica = [_shift(_) for _ in replica]
             sound_background = np.concatenate(replica)
+        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
+        # crop background to the speech length and blend
         x = .74 * x + .26 * sound_background[:len_speech]
     return x
                                     embedding_scale=1))
         x = np.concatenate(x)
+        return overlay(x, sound_background)
     # Fallback - Mimic-3
     text_utils.store_ssml(text=text, voice=voice)  # Text has to be list of single sentences
     x, fs = soundfile.read('_tmp.wav')
     x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
+    return overlay(x, sound_background)
     print('Saved all files on Server Side\n\n')
+    args = SimpleNamespace(text=None if r.get('text') is None else 'flask_cache/' + r.get('text')[0].replace("/",""),
+                video=None if r.get('video') is None else 'flask_cache/' + r.get('video')[0].replace("/",""),
+                image=None if r.get('image') is None else 'flask_cache/' + r.get('image')[0].replace("/",""),
+                voice=r.get('voice')[0],
+                native=None if r.get('native') is None else 'flask_cache/' + r.get('native')[0].replace("/",""),
+                affective = r.get('affective')[0],
+                scene=r.get('scene')[0]
+                )
     # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
                     '#', '_').replace(
                     'cmu-arctic', 'cmu_arctic').replace(
                     '_low', '') + '.wav')
+    print('\n  STYLE VECTOR \n', precomputed_style_vector.shape)
     # ====SILENT VIDEO====
     if args.video is not None:
         # Fallback: No image nor video provided - do only tts
         x = tts_multi_sentence(text=text,
+                               precomputed_style_vector=precomputed_style_vector,
+                               voice=args.voice,
+                               scene=args.scene)
         OUT_FILE = './flask_cache/tmp.wav' #args.out_file + '.wav'
         soundfile.write(OUT_FILE, x, 24000)
     # response.headers["Content-Type"] = "audio/wav"
     # https://stackoverflow.com/questions/67591467/
     #            flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
+    # send server's output as default file -> srv_result.xx
+    print(f'\n=SERVER saved as {OUT_FILE=}\n')
+    response = send_from_directory('flask_cache/', path=OUT_FILE)
+    response.headers['suffix-file-type'] = OUT_FILE
     return response

landscape2soundscape.py ADDED Viewed

	@@ -0,0 +1,272 @@

+import numpy as np
+import subprocess
+import cv2
+# with subprocess and an extra argument 'scene' and a 'resized image saved as png' we can call the server
+# yt-dlp is installed in .d4
+# Download Part of Video
+# yt-dlp https://www.youtube.com/watch?v=UZ9uyQI3pF0 --downloader ffmpeg --downloader-args "ffmpeg_i:-ss 997 -to 2512"
+# ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wav -ac 1
+# https://superuser.com/questions/583393/how-to-extract-subtitle-from-video-using-ffmpeg
+def _shift(x):
+    n = x.shape[0]
+    i = np.random.randint(.24 * n, .74 * n)
+    return np.roll(x, i)
+#___________________________________________________________________________________________________
+#   VIDEO FROM IMAGE with CAPTIONS
+#
+# UPLOAD to: Simaviro: Documents General WORK PACKAGES WP1 ContentRepository ANBPR_ROMANIA TTSvideos
+# __________________________________________________________________________________________________
+# To download SRT subtitles from YouTube
+# yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
+# _voice = 'en_US/vctk_low#p330'
+# _voice = 'en_US/cmu-arctic_low#lnh' #en_US/vctk_low#p249'  # 'en_US/vctk_low#p282'
+# _voice = 'en_US/vctk_low#p351'
+# _voice = 'en_US/vctk_low#p351'  # avoid 318 it does the ghhhhhh
+# _voice = 'en_US/m-ailabs_low#judy_bieber'  # Nice voice for ('Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].mkv' 'Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].en-GB.srt'),
+# _voice = 'en_UK/apope_low'
+# _voice = 'en_US/m-ailabs_low#mary_ann'
+# _voice = 'en_US/vctk_low#p351'
+# _voice = 'en_US/hifi-tts_low#92'
+# voice_str = f'_{_voice.replace("/", "")}'
+# image/descriptions provided by other SHIFT tool or Human curator
+# https://simaviro.sharepoint.com/sites/SHIFT/Shared%20Documents/Forms/AllItems.aspx?csf=1&web=1&e=JNK8dQ&cid=363c253d%2D4d61%2D4db1%2D8ffd%2Ddedda749da2d&RootFolder=%2Fsites%2FSHIFT%2FShared%20Documents%2FGENERAL%2FWORK%20PACKAGES%2FWP1%2FContent%20Repository%2Fshift%5FSPK%5Fuse%5Fcases%5Fshare%2F02%5Fuc%5Fspk%5FLandscape2Soundscape%2FLandscape2Soundscape%5F12%5FMasterpieces&FolderCTID=0x01200058F5037C0101524B82F6F0788C02A563
+# STATIC_FRAME = 'uc_spk_Landscape2Soundscape_Masterpieces_pics/01_Schick_AII840_001.jpg' #'assets/image_from_T31.jpg'
+PIC_DIR = 'uc_spk_Landscape2Soundscape_Masterpieces_pics/'
+DESCRIPTIONS = [
+    # 1
+    [
+        '01_Schick_AII840_001.jpg',                               # image
+        '01_Schick_AII840_001.txt',                               # text
+        'Statue in shire hill on autumn beach.',                  # audiocraft
+        'Gottlieb Chick - Bildnis der Heinrike Dannecker - 1802', # cv2 puttext title
+        'en_US/m-ailabs_low#mary_ann',
+     ],
+    # 2
+    [
+        '02_Constable_AI555_001.jpg',
+        '02_Constable_AI555_001.txt',
+        'Meadows country farm village in sight',
+        'John Constable - Dorf an dem Flusse Stour - 1804',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    # 3
+    [
+        '03_Schinkel_WS200-002.jpg',
+        '03_Schinkel_WS200-002.txt',
+        'Arriving at the shore on horses',
+        'Karl Friedrich Schinkel - Gotische Kirche auf einem Felsen am Meer - 1815',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    #
+    [
+        '04_Friedrich_FV317_001.jpg',
+        '04_Friedrich_FV317_001.txt',
+        'Land steppes',
+        'Friedrich Caspar David - Der Watzmann - 1824/1825',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    #
+    [
+        '05_Blechen_FV40_001.jpg',
+        '05_Blechen_FV40_001.txt',
+        'fjords',
+        'Blechen - Carl Unwetter in der römischen Campagna - 1829',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    # 6
+    [
+        '06_Menzel_AI900_001.jpg'
+        '06_Menzel_AI900_001.txt',
+        'Olive trees in Seville',
+        'Adolph Menzel - Bauplatz mit Weiden - 1846',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    # 7
+    [
+        '07_Courbet_AI967_001.jpg',
+        '07_Courbet_AI967_001.txt',
+        'Storm at the strand of waves Tsunami',
+        'Gustave Courbet - Die Welle - 1869/1870',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    # 8
+    [
+        '08_Monet_AI1013_001.jpg',
+        '08_Monet_AI1013_001.txt',
+        'Mai flowers blossom picnic',
+        'Claude Monet - Sommertag - 1874',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    # 9
+    [
+        '09_Blechen_AII823_001.jpg',
+        '09_Blechen_AII823_001.txt',
+        'Cascade in Africa',
+        'Carl Blechen - Wasserfälle bei Tivoli - 1832',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    # 10
+    [
+        '10_Boecklin_967648_NG2-80_001_rsz.jpg',
+        '10_Boecklin_967648_NG2-80_001.txt',
+        'Hades ades at it sisland',
+        'Arnold Böcklin - Toteninsel - 1883',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    # 11
+    [
+        '11_Liebermann_NG4-94_001.jpg',
+        '11_Liebermann_NG4-94_001.txt',
+        'Tavern at the waterfront',
+        'Max Tiebermann - Gartenlokal an der Havel. Nikolskoe - 1916',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+    # 12
+    [
+        '12_Slevogt_AII1022_001.jpg',
+        '12_Slevogt_AII1022_001.txt',
+        'toy sailing yachts pool',
+        'Max Slevogt - Segelboote auf der Alster am Abend -1905',
+        'en_US/m-ailabs_low#mary_ann',
+    ],
+]
+SILENT_VIDEO = '_silent_video.mp4'
+# SILENT CLIP
+for _img_, _text_, soundscape_text, _title_, _voice_ in DESCRIPTIONS[:1]:
+    # cv2put txt
+    im = cv2.imread(PIC_DIR + _img_)  # IMG must have EVEN shape
+    h, w, _ = im.shape
+    im = im[(h%2):, (w%2):, :]  # assure even image
+    print(im.shape, "GLOBAL IM\n\n\n\n")
+    fram = np.zeros((94, im.shape[1], 3), dtype=np.uint8)
+    h, w, _ = fram.shape
+    font                   = cv2.FONT_HERSHEY_SIMPLEX
+    bottomLeftCornerOfText = (240, 74)  # w,h
+    fontScale              = 2
+    fontColor              = (255, 255, 255)
+    thickness              = 4
+    lineType               = 2
+    cv2.putText(fram, _title_, #'LandScape 2 SoundScape',
+        bottomLeftCornerOfText,
+        font,
+        fontScale,
+        fontColor,
+        thickness,
+        lineType)
+    offset_h = 24
+    im[offset_h:h+offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :] + .6 * fram).astype(np.uint8)
+    # cv2.imshow('i', im); cv2.waitKey(); cv2.destroyAllWindows()
+    # logo aud
+    logo = cv2.imread('assets/audeering_logo.jpg')[:740, :, :]
+    logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
+    h, w, _ = logo.shape
+    offset_h = im.shape[0] - h
+    im[offset_h:h+offset_h, :w, :] = (.23 * im[offset_h:h+offset_h, :w, :] + .77 * logo).astype(np.uint8)
+    # logo SMB
+    logo = cv2.imread('assets/SMB_logo.png')#[:740, :, :]
+    logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
+    h, w, _ = logo.shape
+    offset_h = im.shape[0] - h
+    # fill logo SMB with the pixels of im - where SMB is empty
+    ptc = im[offset_h:h+offset_h, :w, :]
+    logo[logo == 0] = ptc[logo == 0]  # fill empty
+    im[offset_h:h+offset_h, :w, :] = (.13 * im[offset_h:h+offset_h, :w, :] + .86 * logo).astype(np.uint8)
+    # # logo shift
+    # logo = cv2.imread('assets/shift_logo.png')#[:740, :, :]
+    # logo = cv2.resize(logo, (logo.shape[1]//2, logo.shape[0]//2))
+    # h, w, _ = logo.shape
+    # offset_h = im.shape[0] - h #-274
+    # offset_w = im.shape[1] - w #400
+    # # # fill logo SMB with the pixels of im - where SMB is empty
+    # ptc = im[offset_h:h+offset_h, :w, :]
+    # # msk = np.tile(logo[:, :,0:1] > 252, [1,1,3])
+    # # logo[msk] = ptc[msk]  # fill empty
+    # im[offset_h:h+offset_h, offset_w:w+offset_w, :] = (.0 * im[offset_h:h+offset_h, offset_w:w+offset_w, :] + 1 * logo).astype(np.uint8)
+    # silent video - img
+    # im = cv2.resize(im, (700, 700))
+    cv2.imwrite('pic_logo_emb.png', im)
+    # raw, _ = soundfile.read(soundscape_file)  # 12345, 2
+    # # fill
+    # soundscape = []
+    # for _replica in range(math.ceil(len(total) / raw.shape[0])+1):
+    #     soundscape.append(raw)  # _shift non defined for stereo
+    # soundscape = np.concatenate(soundscape, 0)
+    # total = .36 * np.concatenate([total[:, None],
+    #                              total[:, None]], 1) + .64 * soundscape[:len(total), :]
+    # outfile
+    OUT_FILE = _img_.split('/')[-1].replace('.','__') + '.mp4'  # assets / -1
+    print(f'{OUT_FILE=}\n')
+    # call API passing img
+    subprocess.run(
+            [
+             "python",
+             "tts.py",
+             "--text", PIC_DIR + _text_,
+             '--image', 'pic_logo_emb.png',
+              # "--title", _title_,
+              # '--soundscape_text', soundscape_text,
+             '--voice', _voice_,
+             '--out_file', OUT_FILE,
+                ])
+    # soundfile.write(AUDIO_TRACK, total, 22050)
+    # subprocess.call(
+    #     ["ffmpeg",
+    #         "-y",
+    #         "-i",
+    #         SILENT_VIDEO,
+    #         "-i",
+    #         AUDIO_TRACK,
+    #         #"-c:v",
+    #         #"copy",
+    #         "-map",
+    #         "0:v:0",
+    #         "-map",
+    #         " 1:a:0",
+    #         "-vf",
+    #         "pad",
+    #         OUT_FILE])

msinference.py CHANGED Viewed

@@ -183,7 +183,7 @@ def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding
     # print(f'TEXTCLEAN: {ps=}\n\n') #TEXTCLEAN: ps='ɐbˈɛbæbləm'
     tokens.insert(0, 0)
     tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
-    print(f'TOKENSFINAL: {ps=}\n\n')
     with torch.no_grad():
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)

     # print(f'TEXTCLEAN: {ps=}\n\n') #TEXTCLEAN: ps='ɐbˈɛbæbləm'
     tokens.insert(0, 0)
     tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)
+    # print(f'TOKENSFINAL: {ps=}\n\n')
     with torch.no_grad():
         input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)

tts.py CHANGED Viewed

@@ -65,7 +65,7 @@ def command_line_args():
         '--out_file',
         help="Output file name.",
         type=str,
-        default='out'
     )
     parser.add_argument(
         '--scene',
@@ -86,7 +86,7 @@ def send_to_server(args):
         'image': args.image,
         'video': args.video,
         'scene': args.scene,
-        'out_file': args.out_file
     }
     # In data= we can write args
@@ -147,6 +147,7 @@ def cli():
     response = send_to_server(args)
     with open(
         args.out_file + '.' + response.headers['suffix-file-type'].split('.')[-1],
         'wb'
         ) as f:

         '--out_file',
         help="Output file name.",
         type=str,
+        default='b6'
     )
     parser.add_argument(
         '--scene',
         'image': args.image,
         'video': args.video,
         'scene': args.scene,
+        # 'out_file': args.out_file   # let server save as temp
     }
     # In data= we can write args
     response = send_to_server(args)
     with open(
+        # args.out_file is not sent to the server - the server writes a tmp file, which the client copies
         args.out_file + '.' + response.headers['suffix-file-type'].split('.')[-1],
         'wb'
         ) as f: