|
|
|
|
|
import numpy as np |
|
import soundfile |
|
import audresample |
|
import text_utils |
|
import msinference |
|
import re |
|
import srt |
|
import subprocess |
|
import markdown |
|
import json |
|
from pathlib import Path |
|
from types import SimpleNamespace |
|
from flask import Flask, request, send_from_directory |
|
from flask_cors import CORS |
|
from audiocraft.audiogen import AudioGen, audio_write |
|
|
|
# Load the pretrained AudioGen checkpoint once, at import time, so that every
# call to `sound_generator.generate(...)` reuses the same model instance.
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
# NOTE(review): `duration` is presumably the clip length in seconds - confirm
# against the AudioGen implementation imported by this file.
sound_generator.set_generation_params(duration=6)

# Directory (relative to the working directory) that holds files served back
# to clients; created eagerly so later writes cannot fail on a missing folder.
CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
|
|
|
|
def _shift(x): |
|
n = x.shape[0] |
|
i = np.random.randint(.24 * n, max(1, .74 * n)) |
|
x = np.roll(x, i) |
|
|
|
|
|
return x |
|
|
|
def overlay(x, sound_background=None):
    '''Mix a background sound underneath a speech signal.

    x : np.ndarray holding the speech samples (its `len()` sets the output
        length).
    sound_background : None, or an object supporting
        `.detach().cpu().numpy()` (e.g. a torch tensor) whose resulting array
        is indexed as `[0, :]`, i.e. only its first row is used.

    If the background is shorter than the speech it is repeated, each replica
    being randomly rotated with `_shift` so the loop point is less audible,
    and then cut to the speech length.

    Returns `0.74 * x + 0.26 * background`. `x` is returned unchanged when
    `sound_background` is None or contains no samples.
    '''
    if sound_background is None:
        return x

    sound_background = sound_background.detach().cpu().numpy()[0, :]
    len_speech = len(x)

    if len(sound_background) == 0:
        # Nothing to mix in; also avoids a division by zero when computing
        # the number of replicas below.
        return x

    if len_speech > len(sound_background):
        # One extra replica guarantees the concatenation covers the speech.
        n_repeat = len_speech // len(sound_background) + 1
        sound_background = np.concatenate(
            [_shift(sound_background) for _ in range(n_repeat)])

    print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
    return .74 * x + .26 * sound_background[:len_speech]
|
|
|
def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''create 24kHZ np.array with tts

    precomputed_style_vector : required if voice is None or contains 'en_US/'
                               or 'en_UK/', so to perform affective TTS via
                               msinference.inference.
    text  : iterable of sentences. On the affective path every item is
            synthesized separately and the results are concatenated; on the
            mimic3 path it is handed as-is to text_utils.store_ssml.
    voice : string or None (falls to styleTTS)
    scene : 'A castle in far away lands' -> if passed will generate background
            sound scene, which is mixed under the speech by overlay().

    Returns the np.array produced by overlay().
    Raises AssertionError if the affective path is taken without a style
    vector, and subprocess.CalledProcessError if mimic3 exits with an error.
    '''
    if scene is not None:
        sound_background = sound_generator.generate([scene])[0]
        sound_background = audio_write(None,
                                       sound_background.cpu(),
                                       24000,
                                       strategy="loudness",
                                       loudness_compressor=True)
    else:
        sound_background = None

    # `voice is None` must be tested first: the substring checks would raise
    # TypeError on None before the None case could ever be reached.
    if voice is None or 'en_US/' in voice or 'en_UK/' in voice:
        assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
        x = np.concatenate([
            msinference.inference(_sentence,
                                  precomputed_style_vector,
                                  alpha=0.3,
                                  beta=0.7,
                                  diffusion_steps=7,
                                  embedding_scale=1)
            for _sentence in text
        ])
        return overlay(x, sound_background)

    # Non-affective voices: go through mimic3 using temporary files in the
    # current working directory.
    text_utils.store_ssml(text=text, voice=voice)
    # Feed the SSML file on stdin and capture the wav on stdout without a
    # shell; check=True surfaces a mimic3 failure instead of reading a
    # truncated _tmp.wav afterwards.
    with open('_tmp_ssml.txt', 'rb') as ssml_in, open('_tmp.wav', 'wb') as wav_out:
        subprocess.run(['mimic3', '--ssml'], stdin=ssml_in, stdout=wav_out, check=True)
    x, fs = soundfile.read('_tmp.wav')
    # audresample.resample(signal, original_rate, target_rate): convert from
    # the file's rate `fs` to the 24 kHz used by the rest of the pipeline.
    x = audresample.resample(x.astype(np.float32), fs, 24000)[0, :]

    return overlay(x, sound_background)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Flask application object; the routes below are registered on it.
app = Flask(__name__)
# Wrap the app with flask_cors using its default options.
cors = CORS(app)
|
|
|
|
|
@app.route("/")
def index():
    '''Serve README.md (read from the working directory) rendered as HTML.'''
    # Explicit UTF-8 so rendering does not depend on the platform's default
    # locale encoding.
    with open('README.md', 'r', encoding='utf-8') as f:
        return markdown.markdown(f.read())
|
|
|
|
|
# Spellings of a form value that mean "off"; compared case-insensitively.
_FALSE_STRINGS = frozenset({'', '0', 'false', 'no', 'off', 'none'})


def _first_form_value(form, key):
    '''Return the first value submitted for `key`, or None if it is absent.'''
    values = form.get(key)
    return values[0] if values else None


def _as_flag(value):
    '''Interpret a form string as a boolean; None and "off" spellings are False.'''
    if value is None:
        return False
    return str(value).strip().lower() not in _FALSE_STRINGS


@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    '''Synthesize speech from the submitted form and return it as a wav file.

    Form fields read:
      text      : one or more sentences (repeated field) - required
      voice     : voice identifier - required
      affective : flag choosing the style-vector folder ('style_vector/' when
                  on, 'style_vector_v2/' when off); absent counts as off
      scene     : optional text prompt for a generated background sound
      native    : optional file name, resolved inside CACHE_DIR (only logged)

    Returns the file CACHE_DIR/tmp.wav with an extra 'suffix-file-type'
    header, or a 400 response when `text` or `voice` is missing.

    NOTE(review): index() is registered on "/" as well; GET requests
    presumably resolve to index(), leaving POST/PUT for this view - confirm.
    '''
    r = request.form.to_dict(flat=False)

    native = _first_form_value(r, 'native')
    args = SimpleNamespace(
        text=r.get('text'),
        voice=_first_form_value(r, 'voice'),
        native=None if native is None else CACHE_DIR + native.replace("/", ""),
        # Form values are strings; a plain truthiness test would treat the
        # string "False" as enabled.
        affective=_as_flag(_first_form_value(r, 'affective')),
        scene=_first_form_value(r, 'scene'),
    )

    print(args, 'ENTER Script')

    missing = [name for name in ('text', 'voice') if getattr(args, name) is None]
    if missing:
        return 'Missing required form field(s): ' + ', '.join(missing), 400

    # English voices use the affective model, which needs a style vector
    # computed from the reference wav matching the requested voice.
    precomputed_style_vector = None
    if 'en_US' in args.voice or 'en_UK' in args.voice:
        _dir = '/' if args.affective else '_v2/'
        precomputed_style_vector = msinference.compute_style(
            'assets/wavs/style_vector' + _dir + args.voice.replace(
                '/', '_').replace(
                '#', '_').replace(
                'cmu-arctic', 'cmu_arctic').replace(
                '_low', '') + '.wav')
        print('\n STYLE VECTOR \n', precomputed_style_vector.shape)

    x = tts_multi_sentence(text=args.text,
                           precomputed_style_vector=precomputed_style_vector,
                           voice=args.voice,
                           scene=args.scene)

    # NOTE(review): every request writes the same file, so overlapping
    # requests could overwrite each other's output - confirm single-client use.
    OUT_FILE = 'tmp.wav'
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)

    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response
|
|
|
|
|
if __name__ == "__main__":
    # Bind to all network interfaces so the server is reachable from other
    # machines, not only from localhost.
    app.run(host="0.0.0.0")
|
|