File size: 16,560 Bytes


# -*- coding: utf-8 -*-
import numpy as np
import soundfile
import audresample
import text_utils
import msinference
import re
import srt
import subprocess
import cv2
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from moviepy.editor import *
from audiocraft.audiogen import AudioGen, audio_write
CACHE_DIR = 'flask_cache/'
SOUNDSCAPE_DURATION = 6
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=SOUNDSCAPE_DURATION)
print(f'{sound_generator.sample_rate=}')
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)


# SSH AGENT
#   eval $(ssh-agent -s)
#   ssh-add ~/.ssh/id_ed25519_github2024
#
#   git remote set-url origin [email protected]:audeering/shift
# == 

def _shift(x):
    n = x.shape[0]
    i = np.random.randint(.24 * n, max(1, .74 * n))  # high should be above >= 0
    x = np.roll(x, i)
    # we can add the one or fade it and then amplify
    # the audio is so short 6s that is difficult to not hear the shift somewhere
    # Just concatenate - raw - and then shift - the longconcat audio - many times may fix it
    # fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4))  +  .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
    return x  #* fade_in   # silence this

def overlay(x, scene=None):
    if scene is not None:
        
        # generate 4
        print('Generating AudioCraft')
        back = [sound_generator.generate(
                                        [scene]
                                            )[0].detach().cpu().numpy()[0, :] for _ in range(1)]
        
        print([j.shape for j in back], len(back), 'BACK')
        
        # upsample to 24kHZ of StyleTTS
        print('Resampling')
        back = [audresample.resample(i,
            original_rate=sound_generator.sample_rate,  # 16000
            target_rate=24000
            )[0, :] for i in back]
        print('Cloning backgrounds')
        # clone/elongate by 4x
        back = [(_shift(np.concatenate([_shift(single_gen)] * 4))) for single_gen in back]
        
        
        # long ~30s
        back = np.concatenate(back)
        for _ in range(4):
            back = _shift(back)
        
        # clone to exact len of TTS
        n_repeat = len(x) // back.shape[0] + 2
        
        # Additional Repeat - Reach full length of TTS
        print(f'Additional Repeat {n_repeat=}')
        back = np.concatenate(n_repeat * [back])
        back = _shift(back)
        print(f'\n====SOUND BACKGROUND SHAPE\n{back.shape=}',
              f'{np.abs(back.max())=}\n{x.shape=}')
        x = .9 * x + .1 * back[:len(x)]
    else:
        print('sound_background = None')
    return x

def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''create 24kHZ np.array with tts

       precomputed_style_vector :   required if en_US or en_UK in voice, so
                                    to perform affective TTS.
       text  : string
       voice : string or None (falls to styleTTS)
       scene : 'A castle in far away lands' -> if passed will generate background sound scene
       '''
    
        
    # StyleTTS2
    if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
        assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
        x = []
        for _sentence in text:
            x.append(msinference.inference(_sentence,
                        precomputed_style_vector,
                                    alpha=0.3,
                                    beta=0.7,
                                    diffusion_steps=7,
                                    embedding_scale=1))
        x = np.concatenate(x)
        
        return overlay(x, scene=scene)
    
    # Fallback - Mimic-3
    text_utils.store_ssml(text=text, voice=voice)  # Text has to be list of single sentences
    ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
    ps.wait()
    x, fs = soundfile.read('_tmp.wav')
    x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
    
    return overlay(x, sound_background)
    



# voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True,  with_stress=True)

app = Flask(__name__)
cors = CORS(app)


@app.route("/")
def index():
    with open('README.md', 'r') as f:
        return markdown.markdown(f.read())


@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    #                      object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)
    
    
    # Physically Save Client Files
    for filename, obj in request.files.items():
        obj.save(f'{CACHE_DIR}{filename.replace("/","")}')
        
    print('Saved all files on Server Side\n\n') 

    args = SimpleNamespace(text=None if r.get('text') is None else CACHE_DIR + r.get('text')[0].replace("/",""),
                video=None if r.get('video') is None else CACHE_DIR + r.get('video')[0].replace("/",""),
                image=None if r.get('image') is None else CACHE_DIR + r.get('image')[0].replace("/",""),
                voice=r.get('voice')[0],
                native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/",""),
                affective = r.get('affective')[0],
                scene=r.get('scene')[0]
                )
    # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
    

    print(args, 'ENTER Script')
    do_video_dub = True if args.text.endswith('.srt') else False

    SILENT_VIDEO = '_silent_video.mp4'
    AUDIO_TRACK = '_audio_track.wav'

    if do_video_dub:
        print('==\nFound .srt : {args.txt}, thus Video should be given as well\n\n')
        with open(args.text, "r") as f:
            s = f.read()
        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()] for j in srt.parse(s)]
        assert args.video is not None
        native_audio_file = '_tmp.wav'
        subprocess.call(
            ["ffmpeg",
                "-y",  # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
                "-i",
                args.video,
                "-f",
                "mp3",
                "-ar",
                "24000",  # "22050 for mimic3",
                "-vn",
                native_audio_file])
        x_native, _ = soundfile.read(native_audio_file)  # reads mp3
        x_native = x_native[:, 0]  # stereo
        # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wa
    else:
        with open(args.text, 'r') as f:
            t = ''.join(f)
        t = re.sub(' +', ' ', t)  # delete spaces
        text = text_utils.split_into_sentences(t)  # split to short sentences (~200 phonemes max)
        
    # ====STYLE VECTOR====

    precomputed_style_vector = None
    if args.native:  # Voice Cloning
        try:
            precomputed_style_vector = msinference.compute_style(args.native)
        except soundfile.LibsndfileError:  # Fallback - internal voice
            print('\n  Could not voice clone audio:', args.native, 'fallback to video or Internal TTS voice.\n')
        if do_video_dub:  # Clone voice via Video
            native_audio_file = args.video.replace('.', '').replace('/', '')
            native_audio_file += '__native_audio_track.wav'
            soundfile.write('tgt_spk.wav',
                np.concatenate([
                    x_native[:int(4 * 24000)]], 0).astype(np.float32), 24000)  # 27400?
            precomputed_style_vector = msinference.compute_style('tgt_spk.wav')

    # NOTE: style vector may be None

    if precomputed_style_vector is None:
        if 'en_US' in args.voice or 'en_UK' in args.voice:
            _dir = '/' if args.affective else '_v2/'
            precomputed_style_vector = msinference.compute_style(
                'assets/wavs/style_vector' + _dir + args.voice.replace(
                    '/', '_').replace(
                    '#', '_').replace(
                    'cmu-arctic', 'cmu_arctic').replace(
                    '_low', '') + '.wav')
    print('\n  STYLE VECTOR \n', precomputed_style_vector.shape)
    # ====SILENT VIDEO====

    if args.video is not None:
        # banner
        frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
        font                   = cv2.FONT_HERSHEY_SIMPLEX
        bottomLeftCornerOfText = (240, 74)  # w,h
        fontScale              = 2
        fontColor              = (255, 255, 255)
        thickness              = 4
        lineType               = 2
        cv2.putText(frame_tts, 'TTS',
            bottomLeftCornerOfText,
            font,
            fontScale,
            fontColor,
            thickness,
            lineType)
        #     cv2.imshow('i', frame_tts); cv2.waitKey(); cv2.destroyAllWindows()
        # ====================================== NATIVE VOICE
        frame_orig = np.zeros((104, 1920, 3), dtype=np.uint8)
        font                   = cv2.FONT_HERSHEY_SIMPLEX
        bottomLeftCornerOfText = (101, 74)  # w,h
        fontScale              = 2
        fontColor              = (255, 255, 255)
        thickness              = 4
        lineType               = 1000
        cv2.putText(frame_orig, 'ORIGINAL VOICE',
            bottomLeftCornerOfText,
            font,
            fontScale,
            fontColor,
            thickness,
            lineType)
        # ====SILENT VIDEO EXTRACT====
        # DONLOAD SRT from youtube
        #
        #     yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
        #
        #
        # .mkv ->.mp4 moviepy loads only .mp4
        #
        #     ffmpeg -y -i Distaff\ \[qVonBgRXcWU\].mkv -c copy -c:a aac Distaff_qVonBgRXcWU.mp4
        #           video_file, srt_file = ['assets/Head_of_fortuna.mp4',
        #                         'assets/head_of_fortuna_en.srt']
        #
        video_file = args.video
        vf = VideoFileClip(video_file)
        try:
            # inpaint banners if native voice
            num = x_native.shape[0]
            is_tts = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, num) + 9.4))  # fade heaviside

            def inpaint_banner(get_frame, t):
                '''blend banner - (now plays) tts or native voic
                '''
                im = np.copy(get_frame(t))

                ix = int(t * 24000)

                if is_tts[ix] > .5:  # mask is 1 thus tts else native
                    frame = frame_tts
                else:
                    frame = frame_orig
                h, w, _ = frame.shape
                # im[-h:, -w:, :] = (.4 * im[-h:, -w:, :] + .6 * frame_orig).astype(np.uint8)
                offset_h = 24
                im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :] 
                                                    + .6 * frame).astype(np.uint8)

                # im2 = np.concatenate([im, frame_tts], 0)
                # cv2.imshow('t', im2); cv2.waitKey(); cv2.destroyAllWindows()
                return im  # np.concatenate([im, frane_ttts], 0)
        except UnboundLocalError:  # args.native == False
            def inpaint_banner(get_frame, t):
                im = np.copy(get_frame(t))
                frame = frame_tts
                h, w, _ = frame.shape
                offset_h = 24
                im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h+offset_h, :w, :] 
                                                    + .6 * frame).astype(np.uint8)
                return im
        vf = vf.fl(inpaint_banner)
        vf.write_videofile(SILENT_VIDEO)

        # ==== TTS .srt ====

        if do_video_dub:
            OUT_FILE = 'tmp.mp4' #args.out_file + '_video_dub.mp4'
            subtitles = text
            MAX_LEN = int(subtitles[-1][2] + 17) * 24000  
            # 17 extra seconds fail-safe for long-last-segment
            print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
            pieces = []
            for k, (_text_, orig_start, orig_end) in enumerate(subtitles):

                # PAUSES ?????????????????????????


                pieces.append(tts_multi_sentence(text=[_text_],
                                                 precomputed_style_vector=precomputed_style_vector,
                                                 voice=args.voice,
                                                 scene=args.scene)
                              )
            total = np.concatenate(pieces, 0)
            # x = audresample.resample(x.astype(np.float32), 24000, 22050)  # reshapes (64,) -> (1,64)
            # PAD SHORTEST of  TTS / NATIVE
            if len(x_native) > len(total):
                total = np.pad(total, (0, max(0, x_native.shape[0] - total.shape[0])))

            else:  # pad native to len of is_tts & total
                x_native = np.pad(x_native, (0, max(0, total.shape[0] - x_native.shape[0])))
            # print(total.shape, x_native.shape, 'PADDED TRACKS')
            soundfile.write(AUDIO_TRACK,
                            # (is_tts * total + (1-is_tts) * x_native)[:, None],
                            (.64 * total + .27 * x_native)[:, None],
                            24000)
        else:  # Video from plain (.txt)
            OUT_FILE = 'tmp.mp4'
            x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               scene=args.scene)
            soundfile.write(AUDIO_TRACK, x, 24000)

    # IMAGE 2 SPEECH

    if args.image is not None:

        STATIC_FRAME = args.image  # 'assets/image_from_T31.jpg'
        OUT_FILE = 'tmp.mp4' #args.out_file + '_image_to_speech.mp4'

        # SILENT CLIP

        clip_silent = ImageClip(STATIC_FRAME).set_duration(5)  # as long as the audio - TTS first
        clip_silent.write_videofile(SILENT_VIDEO, fps=24)

        x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               scene=args.scene
                               )
        soundfile.write(AUDIO_TRACK, x, 24000)
    if args.video or args.image:
        # write final output video
        subprocess.call(
            ["ffmpeg",
                "-y",
                "-i",
                SILENT_VIDEO,
                "-i",
                AUDIO_TRACK,
                "-c:v",
                "copy",
                "-map",
                "0:v:0",
                "-map",
                " 1:a:0",
                CACHE_DIR + OUT_FILE])

        print(f'\noutput video is saved as {OUT_FILE}')
        
    else:
        
        # Fallback: No image nor video provided - do only tts
        x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector, 
                               voice=args.voice,
                               scene=args.scene)
        OUT_FILE = 'tmp.wav'
        soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)


    

    # audios = [msinference.inference(text, 
    #                                 msinference.compute_style(f'voices/{voice}.wav'), 
    #                                 alpha=0.3, beta=0.7, diffusion_steps=7, embedding_scale=1)]
    # # for t in [text]:
    # output_buffer = io.BytesIO()
    # write(output_buffer, 24000, np.concatenate(audios))
    # response = Response(output_buffer.getvalue())
    # response.headers["Content-Type"] = "audio/wav"
    # https://stackoverflow.com/questions/67591467/
    #            flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
    
    
    
    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    print('________________\n              ? \n_______________')
    return response


if __name__ == "__main__":
    app.run(host="0.0.0.0")