# -*- coding: utf-8 -*-
"""Flask server for affective TTS.

Serves 24 kHz speech synthesized by StyleTTS2 (for en_US / en_UK voices)
or Mimic-3 (fallback), optionally mixed with an AudioGen-generated
background sound scene.
"""
import numpy as np
import soundfile
import audresample
import text_utils
import msinference
import re
import srt
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.audiogen import AudioGen, audio_write

# Loaded once at import time — model download / GPU allocation happens here.
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=6)

CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)


def _shift(x):
    """Circularly shift ``x`` by a random offset in roughly [24%, 74%) of its length.

    Used to decorrelate repeated copies of a background-sound loop so the
    tiling is less audible.
    """
    n = x.shape[0]
    # int() bounds: np.random.randint requires integer low/high (float args
    # are deprecated); max(1, ...) keeps high >= 1 so the call never fails
    # on very short inputs.
    i = np.random.randint(int(.24 * n), max(1, int(.74 * n)))
    # fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
    # x = x * fade_in
    return np.roll(x, i)


def overlay(x, sound_background=None):
    """Mix speech ``x`` with an optional background sound.

    Parameters
    ----------
    x : np.ndarray
        Mono speech signal.
    sound_background : torch.Tensor or None
        Background sound, shape (1, samples); if None, ``x`` is returned
        unchanged.

    Returns
    -------
    np.ndarray
        ``x`` alone, or a .74/.26 weighted mix of speech and background.
    """
    if sound_background is not None:
        sound_background = sound_background.detach().cpu().numpy()[0, :]
        len_speech = len(x)
        if len_speech > len(sound_background):
            # Tile the background until it covers the speech, randomly
            # shifting each replica so the loop point is less noticeable.
            n_repeat = len_speech // len(sound_background) + 1
            replica = [_shift(r) for r in [sound_background] * n_repeat]
            sound_background = np.concatenate(replica)
        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
        x = .74 * x + .26 * sound_background[:len_speech]
    return x


def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''create 24kHZ np.array with tts

       precomputed_style_vector :   required if en_US or en_UK in voice, so
                                    to perform affective TTS.
       text                     :   list of single sentences
       voice                    :   string or None (falls to styleTTS)
       scene                    :   'A castle in far away lands' -> if passed
                                    will generate background sound scene
    '''
    # Generate sound scene - up sample to 24KHz
    if scene is not None:
        sound_background = sound_generator.generate([scene])[0]
        # NOTE(review): audio_write's return value is fed to overlay(),
        # which calls .detach().cpu().numpy() on it — confirm audiocraft
        # returns a tensor here and not a file path.
        sound_background = audio_write(None,
                                       sound_background.cpu(),
                                       24000,  # sound_generator.sample_rate,
                                       strategy="loudness",
                                       loudness_compressor=True)
    else:
        sound_background = None
    # StyleTTS2 — the None check must run FIRST: `'en_US/' in None`
    # raises TypeError, so the original ordering crashed for voice=None.
    if (voice is None) or ('en_US/' in voice) or ('en_UK/' in voice):
        assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
        x = []
        for _sentence in text:
            x.append(msinference.inference(_sentence,
                                           precomputed_style_vector,
                                           alpha=0.3,
                                           beta=0.7,
                                           diffusion_steps=7,
                                           embedding_scale=1))
        x = np.concatenate(x)
        return overlay(x, sound_background)
    # Fallback - Mimic-3
    text_utils.store_ssml(text=text, voice=voice)  # Text has to be list of single sentences
    ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
    ps.wait()
    x, fs = soundfile.read('_tmp.wav')
    # audresample.resample(signal, original_rate, target_rate): convert the
    # mimic3 wav from its native rate `fs` to 24 kHz. The original call had
    # the two rates swapped. Also reshapes (64,) -> (1, 64), hence [0, :].
    x = audresample.resample(x.astype(np.float32), fs, 24000)[0, :]
    return overlay(x, sound_background)


# voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(
#     language='en-us', preserve_punctuation=True, with_stress=True)

app = Flask(__name__)
cors = CORS(app)


@app.route("/")
def index():
    """Serve the rendered README as the landing page."""
    with open('README.md', 'r') as f:
        return markdown.markdown(f.read())


# NOTE(review): this rule duplicates "/" from index() and both accept GET —
# only one endpoint can win the GET dispatch; verify clients use POST/PUT.
@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    """Synthesize speech from the posted form fields and return a wav file.

    Expected multipart/form fields: text (list of sentences), voice,
    affective, scene, and optionally native (video-dub source).
    """
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    # object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)
    args = SimpleNamespace(
        text=None if r.get('text') is None else r.get('text'),  # string not file?
        voice=r.get('voice')[0],
        # strip "/" so the client-supplied name cannot escape CACHE_DIR
        native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
        # NOTE(review): raw form string — 'False' is truthy below; confirm
        # the client sends an empty value for "not affective".
        affective=r.get('affective')[0],
        scene=r.get('scene')[0]
    )
    # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
    print(args, 'ENTER Script')
    do_video_dub = False

    # ====STYLE VECTOR====
    precomputed_style_vector = None  # NOTE: style vector may be None
    if precomputed_style_vector is None:
        if 'en_US' in args.voice or 'en_UK' in args.voice:
            _dir = '/' if args.affective else '_v2/'
            # Map the mimic-style voice name onto the precomputed style-vector
            # wav filename layout under assets/wavs/.
            precomputed_style_vector = msinference.compute_style(
                'assets/wavs/style_vector' + _dir + args.voice.replace(
                    '/', '_').replace(
                    '#', '_').replace(
                    'cmu-arctic', 'cmu_arctic').replace(
                    '_low', '') + '.wav')
    print('\n  STYLE VECTOR \n', precomputed_style_vector.shape)

    x = tts_multi_sentence(text=args.text,
                           precomputed_style_vector=precomputed_style_vector,
                           voice=args.voice,
                           scene=args.scene)
    OUT_FILE = 'tmp.wav'
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)

    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response


if __name__ == "__main__":
    app.run(host="0.0.0.0")