# -*- coding: utf-8 -*-
import numpy as np
import soundfile
import audresample
import text_utils
import msinference
import re
import srt
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.audiogen import AudioGen, audio_write
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=6)
CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
def _shift(x):
    # Randomly roll the background sound so repeated copies are not identical.
    n = x.shape[0]
    i = np.random.randint(int(.24 * n), max(1, int(.74 * n)))  # high must be >= 1
    x = np.roll(x, i)
    # fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
    # x = x * fade_in
    return x
def overlay(x, sound_background=None):
    '''Mix speech x with a background sound, looping the background
    if it is shorter than the speech.'''
    if sound_background is not None:
        sound_background = sound_background.detach().cpu().numpy()[0, :]
        len_speech = len(x)
        if len_speech > len(sound_background):
            n_repeat = len_speech // len(sound_background) + 1
            replica = [sound_background] * n_repeat
            replica = [_shift(_) for _ in replica]
            sound_background = np.concatenate(replica)
        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
        x = .74 * x + .26 * sound_background[:len_speech]
    return x
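
# Minimal usage sketch for overlay() (illustrative values; overlay() expects
# the background as a torch tensor of shape (1, T), as returned by AudioGen):
#
#   speech = np.zeros(24000, dtype=np.float32)   # 1 s of silence at 24 kHz
#   background = torch.zeros(1, 8000)            # shorter than the speech
#   mixed = overlay(speech, background)          # -> np.array of len(speech)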
def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''Create a 24 kHz np.array with TTS.

    precomputed_style_vector : required if en_US or en_UK is in voice,
                               to perform affective TTS.
    text                     : list of single sentences
    voice                    : string or None (None falls back to StyleTTS2)
    scene                    : e.g. 'A castle in far away lands'; if passed,
                               a background sound scene is generated
    '''
    # Generate the sound scene - upsample to 24 kHz
    if scene is not None:
        sound_background = sound_generator.generate([scene])[0]
        # NOTE: audio_write is expected to return the loudness-normalized
        # waveform here, since overlay() consumes it as a tensor.
        sound_background = audio_write(None,
                                       sound_background.cpu(),
                                       24000,  # sound_generator.sample_rate,
                                       strategy="loudness",
                                       loudness_compressor=True)
    else:
        sound_background = None
    # StyleTTS2
    if (voice is None) or ('en_US/' in voice) or ('en_UK/' in voice):
        assert precomputed_style_vector is not None, 'For affective TTS, a style vector is needed.'
        x = []
        for _sentence in text:
            x.append(msinference.inference(_sentence,
                                           precomputed_style_vector,
                                           alpha=0.3,
                                           beta=0.7,
                                           diffusion_steps=7,
                                           embedding_scale=1))
        x = np.concatenate(x)
        return overlay(x, sound_background)
    # Fallback - Mimic-3
    text_utils.store_ssml(text=text, voice=voice)  # text has to be a list of single sentences
    ps = subprocess.Popen('cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
    ps.wait()
    x, fs = soundfile.read('_tmp.wav')
    x = audresample.resample(x.astype(np.float32), fs, 24000)[0, :]  # returns (1, samples); take channel 0
    return overlay(x, sound_background)
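
# Usage sketch for tts_multi_sentence() (the voice name and the style-vector
# wav below are illustrative, not guaranteed to exist in this repo):
#
#   style = msinference.compute_style('assets/wavs/style_vector/en_US_vctk_p236.wav')
#   wav = tts_multi_sentence(precomputed_style_vector=style,
#                            text=['Hello there.', 'How are you today?'],
#                            voice='en_US/vctk_low#p236',
#                            scene='A castle in far away lands')
#   soundfile.write('example.wav', wav, 24000)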
# voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
app = Flask(__name__)
cors = CORS(app)
@app.route("/")
def index():
with open('README.md', 'r') as f:
return markdown.markdown(f.read())
@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
# https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
# object-into-a-representation-suitable-for-mongodb
r = request.form.to_dict(flat=False)
args = SimpleNamespace(
text=None if r.get('text') is None else r.get('text'), # string not file?
voice=r.get('voice')[0],
native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/",""),
affective = r.get('affective')[0],
scene=r.get('scene')[0]
)
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
print(args, 'ENTER Script')
do_video_dub = False
    # ==== STYLE VECTOR ====
    # The style vector stays None unless an en_US/en_UK (StyleTTS2) voice is requested.
    precomputed_style_vector = None
    if 'en_US' in args.voice or 'en_UK' in args.voice:
        _dir = '/' if args.affective else '_v2/'
        # Map the voice name to its style-vector wav, e.g. (illustrative voice)
        # 'en_US/vctk_low#p236' -> 'assets/wavs/style_vector/en_US_vctk_p236.wav'
        precomputed_style_vector = msinference.compute_style(
            'assets/wavs/style_vector' + _dir + args.voice.replace(
                '/', '_').replace(
                '#', '_').replace(
                'cmu-arctic', 'cmu_arctic').replace(
                '_low', '') + '.wav')
        print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
    x = tts_multi_sentence(text=args.text,
                           precomputed_style_vector=precomputed_style_vector,
                           voice=args.voice,
                           scene=args.scene)
    OUT_FILE = 'tmp.wav'
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response
if __name__ == "__main__":
app.run(host="0.0.0.0")
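
# Example client call (a sketch; assumes the server runs on localhost:5000
# and that the illustrative voice exists):
#
#   import requests
#   resp = requests.post('http://localhost:5000/',
#                        data={'text': ['Hello there.', 'How are you today?'],
#                              'voice': 'en_US/vctk_low#p236',
#                              'affective': 'True',
#                              'scene': 'A castle in far away lands'})
#   with open('out.wav', 'wb') as f:
#       f.write(resp.content)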