trial live demo
- live_api.py +172 -0
- live_demo.py +173 -0
live_api.py
ADDED
@@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
import numpy as np
import soundfile
import audresample
import text_utils
import msinference
import re
import srt
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.audiogen import AudioGen, audio_write

sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=6)

CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)


def _shift(x):
    # Roll the background sound by a random offset so repeated copies
    # do not sound identical.
    n = x.shape[0]
    i = np.random.randint(int(.24 * n), max(1, int(.74 * n)))  # high must be >= 1
    x = np.roll(x, i)
    # fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
    # x = x * fade_in
    return x


def overlay(x, sound_background=None):
    # Mix speech with the generated scene, tiling the background
    # (with random shifts) if it is shorter than the speech.
    if sound_background is not None:
        sound_background = sound_background.detach().cpu().numpy()[0, :]
        len_speech = len(x)
        if len_speech > len(sound_background):
            n_repeat = len_speech // len(sound_background) + 1
            replica = [sound_background] * n_repeat
            replica = [_shift(_) for _ in replica]
            sound_background = np.concatenate(replica)

        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
        x = .74 * x + .26 * sound_background[:len_speech]
    return x


def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''Create a 24kHz np.array with TTS.

    precomputed_style_vector : required if en_US or en_UK is in voice,
                               to perform affective TTS.
    text                     : list of single sentences
    voice                    : string or None (falls back to StyleTTS2)
    scene                    : e.g. 'A castle in far away lands' - if passed,
                               generates a background sound scene
    '''
    # Generate sound scene - upsample to 24kHz
    if scene is not None:
        sound_background = sound_generator.generate([scene])[0]
        sound_background = audio_write(None,
                                       sound_background.cpu(),
                                       24000,  # sound_generator.sample_rate,
                                       strategy="loudness",
                                       loudness_compressor=True)
    else:
        sound_background = None

    # StyleTTS2 (check voice is None first to avoid a TypeError on the `in` tests)
    if (voice is None) or ('en_US/' in voice) or ('en_UK/' in voice):
        assert precomputed_style_vector is not None, 'For affective TTS, a style vector is needed.'
        x = []
        for _sentence in text:
            x.append(msinference.inference(_sentence,
                                           precomputed_style_vector,
                                           alpha=0.3,
                                           beta=0.7,
                                           diffusion_steps=7,
                                           embedding_scale=1))
        x = np.concatenate(x)
        return overlay(x, sound_background)

    # Fallback - Mimic-3
    text_utils.store_ssml(text=text, voice=voice)  # text has to be a list of single sentences
    ps = subprocess.Popen('cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
    ps.wait()
    x, fs = soundfile.read('_tmp.wav')
    x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1, 64)
    return overlay(x, sound_background)


# voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)

app = Flask(__name__)
cors = CORS(app)


@app.route("/")
def index():
    with open('README.md', 'r') as f:
        return markdown.markdown(f.read())


@app.route("/", methods=['POST', 'PUT'])  # GET on "/" is handled by index()
def serve_wav():
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    # object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)

    args = SimpleNamespace(
        text=r.get('text'),  # list of strings, not a file
        voice=r.get('voice')[0],
        native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
        affective=r.get('affective')[0] == 'True',  # form fields arrive as strings
        scene=r.get('scene')[0]
    )
    # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')

    print(args, 'ENTER Script')
    do_video_dub = False

    # ==== STYLE VECTOR ====
    precomputed_style_vector = None
    # NOTE: style vector may be None

    if precomputed_style_vector is None:
        if 'en_US' in args.voice or 'en_UK' in args.voice:
            _dir = '/' if args.affective else '_v2/'
            precomputed_style_vector = msinference.compute_style(
                'assets/wavs/style_vector' + _dir + args.voice.replace(
                    '/', '_').replace(
                    '#', '_').replace(
                    'cmu-arctic', 'cmu_arctic').replace(
                    '_low', '') + '.wav')
            print('\n STYLE VECTOR \n', precomputed_style_vector.shape)

    x = tts_multi_sentence(text=args.text,
                           precomputed_style_vector=precomputed_style_vector,
                           voice=args.voice,
                           scene=args.scene)
    OUT_FILE = 'tmp.wav'
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)

    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response


if __name__ == "__main__":
    app.run(host="0.0.0.0")
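A minimal client sketch for the endpoint above; the localhost URL and the two-sentence payload are assumptions, the field names mirror what serve_wav() reads from the form:

    import requests

    # Hypothetical local address; live_demo.py below hard-codes 192.168.88.209:5000.
    URL = 'http://127.0.0.1:5000'

    payload = {
        'text': ['Hello there.', 'This is a second sentence.'],  # one entry per sentence
        'voice': 'en_US/m-ailabs_low#judy_bieber',
        'affective': True,
        'scene': 'calm background sounds of a castle',
    }
    response = requests.post(URL, data=payload)
    with open('out.wav', 'wb') as f:
        f.write(response.content)  # 24kHz wav written by serve_wav()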
live_demo.py
ADDED
@@ -0,0 +1,173 @@
import numpy as np
import argparse
import os
import requests
import subprocess


# SSH AGENT
# eval $(ssh-agent -s)
# ssh-add ~/.ssh/id_ed25519_github2024
#
# git remote set-url origin git@github.com:audeering/shift


# https://stackoverflow.com/questions/57158779/how-to-stop-audio-with-playsound-module
# import multiprocessing
# from playsound import playsound

# p = multiprocessing.Process(target=playsound, args=("file.mp3",))
# p.start()
# input("press ENTER to stop playback")
# p.terminate()
# from playsound import playsound
# playsound('/path/to/a/sound/file/you/want/to/play.mp3')


def command_line_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '--affective',
        help="Select the emotional or non-emotional variant of the available voices: https://audeering.github.io/shift/",
        action='store_false',
    )
    parser.add_argument(
        '--device',
        help="Device ID",
        type=str,
        default='cpu',
    )
    parser.add_argument(
        '--text',
        help="Text to be synthesized.",
        default='sample.txt',
        type=str,
    )
    parser.add_argument(
        '--native',
        help="""
        --native: (without argument) a flag to do voice cloning using the speech from --video,
        --native my_voice.wav: voice cloning from user-provided audio""",
        # nargs='?',
        # const=None,
        # default=False  # default has to be None
    )
    parser.add_argument(
        '--voice',
        help="TTS voice - available voices: https://audeering.github.io/shift/",
        default="en_US/m-ailabs_low#judy_bieber",  # 'en_US/cmu-arctic_low#lnh',
        type=str,
    )
    parser.add_argument(
        '--image',
        help="If provided, it is set as the background of the output video, see --text",
        type=str,
    )
    parser.add_argument(
        '--video',
        help="Video file for video translation. The voice is cloned from the video.",
        type=str,
    )
    parser.add_argument(
        '--out_file',
        help="Output file name.",
        type=str,
        default='b6'
    )
    parser.add_argument(
        '--scene',
        help='Sound scene description.',
        type=str,
        default='calm background sounds of a castle'
    )
    return parser


def send_to_server(args):
    url = "http://192.168.88.209:5000"

    payload = {
        'affective': args.affective,
        'voice': args.voice,
        'native': args.native,
        'text': args.text,
        'image': args.image,
        'video': args.video,
        'scene': args.scene,
        # 'out_file': args.out_file  # let the server save it as a temp file
    }

    # In data= we can send args; in files= we send the actual files, if provided.
    # args.text may be a path (default 'sample.txt') or a raw sentence typed at
    # the prompt, so only open it as a file if it exists.
    text_file = open(args.text, 'rb') if os.path.isfile(args.text) else None

    image_file, video_file, native_file = None, None, None
    if args.image is not None:
        print('\nLOADING IMAGE\n')
        try:
            image_file = open(args.image, 'rb')
        except FileNotFoundError:
            pass

    if args.video is not None:
        print('\nLOADING vid\n')
        try:
            video_file = open(args.video, 'rb')
        except FileNotFoundError:
            pass

    if args.native is not None:
        print('\nLOADING natv\n')
        try:
            native_file = open(args.native, 'rb')
        except FileNotFoundError:
            pass

    # --------------------- send this extra

    print('Sending...\n')

    response = requests.post(url, data=payload,
                             files=[(args.image, image_file)])  # Nones do not arrive in the server's dict

    # Check the response from the server
    if response.status_code == 200:
        print("\nRequest was successful!")
        # print("Response:", response.__dict__.keys(), '\n=====\n')
    else:
        print("Failed to send the request")
        print("Status Code:", response.status_code)
        print("Response:", response.text)
    return response


def cli():  # args.out_file is not sent to the server - the server writes tmp, which the client copies
    parser = command_line_args()
    args = parser.parse_args()
    while True:
        args.text = input("Type your text: ")
        response = send_to_server(args)
        out_file = args.out_file + '.' + response.headers['suffix-file-type'].split('.')[-1]

        with open(out_file, 'wb') as f:
            f.write(response.content)
        print('Response at client\n----------------------------', response.headers)

        subprocess.run(["paplay", out_file])


if __name__ == '__main__':
    cli()

# Assume also video and text-for-video: we have to write some classes for video for audiocraft,
# then call tts.py on this video with non-empty labels - thus calling audiocraft.