# -*- coding: utf-8 -*-
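# live_api.py - Flask TTS server for the artificial-styletts2 live demo:
# speech is synthesised with StyleTTS2 (affective en_US/en_UK voices) or
# Mimic-3 (all other voices), optionally mixed with an AudioGen sound scene.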
import numpy as np
import soundfile
import audresample
import text_utils
import msinference
import subprocess
import markdown
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.audiogen import AudioGen, audio_write
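# Load AudioGen once at startup; generate() below returns ~6 s of
# background audio per scene prompt (see set_generation_params).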
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=6)
CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
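# Synthesised wavs are written here and served back via send_from_directory.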
def _shift(x):
    # Roll the clip by a random offset (24%..74% of its length) so that
    # concatenated copies of the same background clip do not align.
    n = x.shape[0]
    i = np.random.randint(int(.24 * n), max(1, int(.74 * n)))  # max(1, ...) keeps randint's upper bound positive for very short clips
    x = np.roll(x, i)
    # fade_in = .5 + .5 * np.tanh(4 * (np.linspace(-10, 10, x.shape[0]) + 9.4))
    # x = x * fade_in
    return x
def overlay(x, sound_background=None):
    # Mix the speech x with an optional background-sound track.
    if sound_background is not None:
        sound_background = sound_background.detach().cpu().numpy()[0, :]
        len_speech = len(x)
        if len_speech > len(sound_background):
            # Tile the background until it covers the speech, shifting each
            # copy so the repetition is less audible.
            n_repeat = len_speech // len(sound_background) + 1
            replica = [sound_background] * n_repeat
            replica = [_shift(_) for _ in replica]
            sound_background = np.concatenate(replica)
        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
        x = .74 * x + .26 * sound_background[:len_speech]
    return x
def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''Create a 24 kHz np.array with TTS.

    precomputed_style_vector : required if en_US or en_UK is in voice,
                               in order to perform affective TTS
    text  : list of single sentences
    voice : string or None (None falls back to StyleTTS2)
    scene : e.g. 'A castle in far away lands' -> if passed, a background
            sound scene is generated
    '''
    # Generate sound scene - upsample to 24 kHz
    if scene is not None:
        sound_background = sound_generator.generate([scene])[0]
        sound_background = audio_write(None,
                                       sound_background.cpu(),
                                       24000,  # sound_generator.sample_rate
                                       strategy="loudness",
                                       loudness_compressor=True)
    else:
        sound_background = None
    # StyleTTS2
    # check voice is None first - the substring tests would raise on None
    if (voice is None) or ('en_US/' in voice) or ('en_UK/' in voice):
        assert precomputed_style_vector is not None, 'For affective TTS, a style vector is needed.'
        x = []
        for _sentence in text:
            x.append(msinference.inference(_sentence,
                                           precomputed_style_vector,
                                           alpha=0.3,
                                           beta=0.7,
                                           diffusion_steps=7,
                                           embedding_scale=1))
        x = np.concatenate(x)
        return overlay(x, sound_background)
    # Fallback - Mimic-3
    text_utils.store_ssml(text=text, voice=voice)  # text has to be a list of single sentences
    ps = subprocess.Popen('cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
    ps.wait()
    x, fs = soundfile.read('_tmp.wav')
    # audresample.resample(signal, original_rate, target_rate) returns shape (1, N)
    x = audresample.resample(x.astype(np.float32), fs, 24000)[0, :]
    return overlay(x, sound_background)
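# Example (a minimal sketch; the voice id and style-vector wav below are
# hypothetical - the filename follows the replace chain used in serve_wav):
#   style = msinference.compute_style('assets/wavs/style_vector/en_US_m-ailabs_judy_bieber.wav')
#   wav = tts_multi_sentence(precomputed_style_vector=style,
#                            text=['Hello there.', 'How are you today?'],
#                            voice='en_US/m-ailabs_low#judy_bieber',
#                            scene='wind howling')
#   soundfile.write('out.wav', wav, 24000)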
# voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
app = Flask(__name__)
cors = CORS(app)
@app.route("/")
def index():
    with open('README.md', 'r') as f:
        return markdown.markdown(f.read())
@app.route("/", methods=['POST', 'PUT'])  # GET on "/" is answered by index()
def serve_wav():
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)
    args = SimpleNamespace(
        text=r.get('text'),  # flat=False wraps every form value in a list
        voice=r.get('voice')[0],
        native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
        # form values arrive as strings, so plain truthiness would treat
        # 'False' as True; compare against the literal instead
        affective=r.get('affective')[0] != 'False',
        scene=r.get('scene')[0]
    )
    # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
    print(args, 'ENTER Script')
    # ====STYLE VECTOR====
    # NOTE: stays None for non-StyleTTS2 voices (Mimic-3 fallback)
    precomputed_style_vector = None
    if 'en_US' in args.voice or 'en_UK' in args.voice:
        _dir = '/' if args.affective else '_v2/'
        precomputed_style_vector = msinference.compute_style(
            'assets/wavs/style_vector' + _dir + args.voice.replace(
                '/', '_').replace(
                '#', '_').replace(
                'cmu-arctic', 'cmu_arctic').replace(
                '_low', '') + '.wav')
        print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
    x = tts_multi_sentence(text=args.text,
                           precomputed_style_vector=precomputed_style_vector,
                           voice=args.voice,
                           scene=args.scene)
    OUT_FILE = 'tmp.wav'
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response
if __name__ == "__main__":
    app.run(host="0.0.0.0")
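# Example request (a sketch; Flask's default port 5000 and the voice id are
# assumptions - every field read in serve_wav above must be supplied):
#   curl -X POST http://localhost:5000/ \
#        -F text='A sentence to speak.' \
#        -F voice='en_US/m-ailabs_low#judy_bieber' \
#        -F affective=True \
#        -F scene='wind howling' \
#        --output out.wav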