import numpy as np
import soundfile
import audresample
import text_utils
import msinference
import re
import srt
import subprocess
import cv2
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from moviepy.editor import *
from audiocraft.builders import AudioGen

CACHE_DIR = 'flask_cache/'
NUM_SOUND_GENERATIONS = 1

sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()

Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

import nltk
nltk.download('punkt')


def _resize(image, width=None, height=None, inter=cv2.INTER_AREA):
    '''Resize keeping the aspect ratio; only one of width/height is needed.
    https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py'''
    (h, w) = image.shape[:2]

    if width is None and height is None:
        return image

    if width is None:
        # scale to the requested height
        r = height / float(h)
        dim = (int(w * r), height)
    else:
        # scale to the requested width
        r = width / float(w)
        dim = (width, int(h * r))

    return cv2.resize(image, dim, interpolation=inter)
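
# Worked example (hypothetical sizes): a 104x1920 banner resized with width=1280
# gives r = 1280 / 1920, i.e. a (69, 1280, 3) frame -- int(104 * 2 / 3) == 69.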


def _shift(x):
    # circular shift by a random offset between 24% and 74% of the length
    n = x.shape[0]
    i = np.random.randint(int(.24 * n), max(1, int(.74 * n)))
    return np.roll(x, i)
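
# E.g. for x = np.arange(10.) the offset is drawn from [2, 7); with i = 3,
# np.roll(x, 3) returns array([7., 8., 9., 0., 1., ..., 6.]).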


def overlay(x, scene=None):
    '''mix speech x with an AudioGen background described by scene'''
    if scene is not None:
        print(f'AudioGen {NUM_SOUND_GENERATIONS} x {scene}')
        background = sound_generator.generate(
            [scene] * NUM_SOUND_GENERATIONS
        ).reshape(-1).detach().cpu().numpy()

        # AudioGen outputs 16 kHz; resample to the 24 kHz used by the TTS
        print('Resampling')
        background = audresample.resample(
            background,
            original_rate=16000,
            target_rate=24000)[0, :]

        # tile the background so it is at least as long as the speech
        n_repeat = len(x) // background.shape[0] + 2
        print(f'Additional Repeat {n_repeat=}')
        background = np.concatenate(n_repeat * [background])

        print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
              f'{np.abs(background.max())=}\n{x.shape=}')
        x = .1 * x + .9 * background[:len(x)]
    else:
        print('sound_background = None')
    return x
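
# Usage sketch (hypothetical scene text; `speech` is a 24 kHz float array):
#   speech = np.zeros(24000, dtype=np.float32)
#   mixed = overlay(speech, scene='wind and rain')  # 10% speech + 90% background
#   plain = overlay(speech)                         # returned unchanged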


def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None,
                       speed=None):
    '''Create a 24 kHz np.array of TTS from a list of sentences.

    precomputed_style_vector : required if en_US or en_UK is in voice
                               (or voice is None), to perform affective TTS
    text  : list of sentences
    voice : string or None (falls back to StyleTTS)
    scene : e.g. 'A castle in far away lands'; if given, a background
            sound scene is generated and mixed under the speech
    speed : speaking-rate factor for the non-English voices
    '''
    # test voice is None first; `in` on None would raise TypeError
    if voice is None or ('en_US/' in voice) or ('en_UK/' in voice):
        assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
        x = []
        for _sentence in text:
            x.append(msinference.inference(_sentence,
                                           precomputed_style_vector,
                                           alpha=0.3,
                                           beta=0.7,
                                           diffusion_steps=7,
                                           embedding_scale=1))
    else:
        x = []
        for _sentence in text:
            x.append(msinference.foreign(text=_sentence,
                                         lang=voice,
                                         speed=speed))

    x = np.concatenate(x)
    x /= np.abs(x).max() + 1e-7  # peak-normalise
    return overlay(x, scene=scene)
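
# Usage sketch (voice id and style-vector path are hypothetical):
#   style = msinference.compute_style('assets/wavs/style_vector/en_US_vctk_p228.wav')
#   wav = tts_multi_sentence(precomputed_style_vector=style,
#                            text=['Hello there.', 'General Kenobi.'],
#                            voice='en_US/vctk#p228',
#                            speed=1.0)
#   soundfile.write('demo.wav', wav, 24000)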


app = Flask(__name__)
cors = CORS(app)


@app.route("/")
def index():
    # landing page: render the repository README
    with open('README.md', 'r') as f:
        return markdown.markdown(f.read())
@app.route("/", methods=['GET', 'POST', 'PUT']) |
|
def serve_wav(): |
|
|
|
|
|
r = request.form.to_dict(flat=False) |
|
|
|
|
|
|
|
for filename, obj in request.files.items(): |
|
obj.save(f'{CACHE_DIR}{filename.replace("/","")}') |
|
|
|
print('Saved all files on Server Side\n\n') |
|
|
|
args = SimpleNamespace( |
|
text = None if r.get('text') is None else CACHE_DIR + r.get('text' )[0][-6:], |
|
video = None if r.get('video') is None else CACHE_DIR + r.get('video')[0][-6:], |
|
image = None if r.get('image') is None else CACHE_DIR + r.get('image')[0][-6:], |
|
native = None if r.get('native') is None else CACHE_DIR + r.get('native')[0][-6:], |
|
affective = r.get('affective')[0], |
|
voice = r.get('voice')[0], |
|
speed = float(r.get('speed')[0]), |
|
scene=r.get('scene')[0] if r.get('scene') is not None else None, |
|
) |
|
|
|
|
|
|
|

    print(args, 'ENTER Script')
    do_video_dub = args.text is not None and args.text.endswith('.srt')

    SILENT_VIDEO = '_silent_video.mp4'
    AUDIO_TRACK = '_audio_track.wav'

    if do_video_dub:
        print(f'==\nFound .srt : {args.text}, thus Video should be given as well\n\n')
        with open(args.text, "r") as f:
            s = f.read()
        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()]
                for j in srt.parse(s)]
        assert args.video is not None
        # extract the video's own audio track at 24 kHz
        native_audio_file = '_tmp.wav'
        subprocess.call(
            ["ffmpeg",
             "-y",
             "-i",
             args.video,
             "-f",
             "wav",  # write a real wav, matching the .wav filename below
             "-ar",
             "24000",
             "-vn",
             native_audio_file])
        x_native, _ = soundfile.read(native_audio_file)
        if x_native.ndim > 1:
            x_native = x_native[:, 0]  # keep one channel
    else:
        with open(args.text, 'r') as f:
            t = ''.join(f)
            t = re.sub(' +', ' ', t)  # collapse runs of spaces
            text = text_utils.split_into_sentences(t)
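
    # `text` now holds one of two shapes (values hypothetical):
    #   .srt dub  : [['Hello there', 0.0, 2.4], ['General Kenobi', 2.4, 5.1], ...]
    #   plain .txt: ['Hello there.', 'General Kenobi.', ...]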

    # style vector: clone from the user's reference audio if given; on failure
    # fall back to the video's own soundtrack (dub) or a stock voice below
    precomputed_style_vector = None
    if args.native:
        try:
            precomputed_style_vector = msinference.compute_style(args.native)
        except soundfile.LibsndfileError:
            print('\n Could not voice clone audio:', args.native,
                  'fallback to video or Internal TTS voice.\n')
            if do_video_dub:
                # first 4 s of the video's audio as speaker reference
                soundfile.write('tgt_spk.wav',
                                x_native[:int(4 * 24000)].astype(np.float32),
                                24000)
                precomputed_style_vector = msinference.compute_style('tgt_spk.wav')

    if precomputed_style_vector is None:
        if 'en_US' in args.voice or 'en_UK' in args.voice:
            _dir = '/' if args.affective else '_v2/'
            precomputed_style_vector = msinference.compute_style(
                'assets/wavs/style_vector' + _dir + args.voice.replace(
                    '/', '_').replace(
                    '#', '_').replace(
                    'cmu-arctic', 'cmu_arctic').replace(
                    '_low', '') + '.wav')
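
    # E.g. a hypothetical voice id 'en_US/cmu-arctic#lnh_low' with affective=True
    # resolves to 'assets/wavs/style_vector/en_US_cmu_arctic_lnh.wav'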

    if args.video is not None:
        # banners blended onto the video to show which voice is playing
        frame_tts = np.zeros((104, 1920, 3), dtype=np.uint8)
        cv2.putText(frame_tts,
                    'TTS',
                    (240, 74),                 # bottom-left corner of text
                    cv2.FONT_HERSHEY_SIMPLEX,
                    2,                         # font scale
                    (255, 255, 255),
                    4,                         # thickness
                    cv2.LINE_AA)

        frame_orig = np.zeros((104, 1920, 3), dtype=np.uint8)
        cv2.putText(frame_orig,
                    'ORIGINAL VOICE',
                    (101, 74),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    2,
                    (255, 255, 255),
                    4,
                    cv2.LINE_AA)

        print(f'\n______________________________\n'
              f'Gen Banners for TTS/Native Title {frame_tts.shape=} {frame_orig.shape=}'
              f'\n______________________________\n')

        video_file = args.video
        vf = VideoFileClip(video_file)

        # fit the banners to the video width
        h, w, _ = vf.get_frame(0).shape
        frame_tts = _resize(frame_tts, width=w)
        frame_orig = _resize(frame_orig, width=w)
        h, w, _ = frame_orig.shape

        try:
            # gate: >.5 shows the TTS banner, <=.5 the original-voice banner;
            # x_native only exists in the video-dub path, hence the try/except
            num = x_native.shape[0]
            is_tts = .5 + .5 * np.tanh(4 * (np.linspace(-10, 10, num) + 9.4))

            def inpaint_banner(get_frame, t):
                '''blend banner - (now plays) tts or native voice'''
                im = np.copy(get_frame(t))
                ix = min(int(t * 24000), num - 1)  # audio sample for video time t
                if is_tts[ix] > .5:
                    frame = frame_tts
                else:
                    frame = frame_orig

                offset_h = 24
                print(f' > inpaint_banner() HAS NATIVE: {frame.shape=} {im.shape=}\n\n\n\n')
                im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                    + .6 * frame).astype(np.uint8)
                return im

        except UnboundLocalError:
            # no native audio -> always blend the TTS banner

            def inpaint_banner(get_frame, t):
                im = np.copy(get_frame(t))
                local_frame = frame_tts
                if local_frame.shape[1] != im.shape[1]:
                    local_frame = _resize(frame_tts, width=im.shape[1])
                h, w, _ = local_frame.shape
                offset_h = 24
                im[offset_h:h + offset_h, :w, :] = (.4 * im[offset_h:h + offset_h, :w, :]
                                                    + .6 * local_frame).astype(np.uint8)
                return im

        vf = vf.fl(inpaint_banner)
        vf.write_videofile(SILENT_VIDEO)

        if do_video_dub:
            OUT_FILE = 'tmp.mp4'
            subtitles = text
            MAX_LEN = int(subtitles[-1][2] + 17) * 24000  # last subtitle end + 17 s tail
            print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')

            # synthesise each subtitle and concatenate the pieces back-to-back
            pieces = []
            for _text_, orig_start, orig_end in subtitles:
                pieces.append(tts_multi_sentence(text=[_text_],
                                                 precomputed_style_vector=precomputed_style_vector,
                                                 voice=args.voice,
                                                 scene=args.scene,
                                                 speed=args.speed))
            total = np.concatenate(pieces, 0)

            # zero-pad the shorter signal so TTS and native audio can be mixed
            if len(x_native) > len(total):
                total = np.pad(total, (0, x_native.shape[0] - total.shape[0]))
            else:
                x_native = np.pad(x_native, (0, total.shape[0] - x_native.shape[0]))

            soundfile.write(AUDIO_TRACK,
                            (.64 * total + .27 * x_native)[:, None],  # duck the original voice
                            24000)
        else:
            OUT_FILE = 'tmp.mp4'
            x = tts_multi_sentence(text=text,
                                   precomputed_style_vector=precomputed_style_vector,
                                   voice=args.voice,
                                   scene=args.scene,
                                   speed=args.speed)
            soundfile.write(AUDIO_TRACK, x, 24000)

    if args.image is not None:
        STATIC_FRAME = args.image
        OUT_FILE = 'tmp.mp4'

        # still image -> 5 s silent clip to carry the audio track
        clip_silent = ImageClip(STATIC_FRAME).set_duration(5)
        clip_silent.write_videofile(SILENT_VIDEO, fps=24)

        x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               scene=args.scene,
                               speed=args.speed)
        soundfile.write(AUDIO_TRACK, x, 24000)

    if args.video or args.image:
        # mux the silent video with the synthesised audio; equivalent to:
        #   ffmpeg -y -i _silent_video.mp4 -i _audio_track.wav \
        #          -c:v copy -map 0:v:0 -map 1:a:0 flask_cache/tmp.mp4
        subprocess.call(
            ["ffmpeg",
             "-y",
             "-i",
             SILENT_VIDEO,
             "-i",
             AUDIO_TRACK,
             "-c:v",
             "copy",
             "-map",
             "0:v:0",
             "-map",
             "1:a:0",
             CACHE_DIR + OUT_FILE])

        print(f'\noutput video is saved as {OUT_FILE}')
    else:
        # neither video nor image was sent: return plain 24 kHz speech
        x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               scene=args.scene,
                               speed=args.speed)
        OUT_FILE = 'tmp.wav'
        soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)

    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE  # lets the client choose .wav vs .mp4
    return response
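

# Minimal client sketch (all values hypothetical: assumes the server runs on
# http://localhost:5000, a local text file named 'in.txt' -- short enough to
# survive the 6-character truncation in serve_wav() -- and a voice id that
# exists on the server):
#
#   import requests
#   with open('in.txt', 'rb') as f:
#       resp = requests.post('http://localhost:5000/',
#                            data={'text': 'in.txt',
#                                  'affective': 'True',
#                                  'voice': 'en_US/vctk#p228',
#                                  'speed': '1.4'},
#                            files={'in.txt': f})
#   suffix = resp.headers['suffix-file-type']  # 'tmp.wav' or 'tmp.mp4'
#   with open('client_' + suffix, 'wb') as out:
#       out.write(resp.content)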
if __name__ == "__main__": |
|
app.run(host="0.0.0.0") |
|
|