trial live demo
- live_api.py +172 -0
- live_demo.py +173 -0
live_api.py
ADDED
@@ -0,0 +1,172 @@
# -*- coding: utf-8 -*-
import numpy as np
import soundfile
import audresample
import text_utils
import msinference
import re
import srt
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.audiogen import AudioGen, audio_write

sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=6)

CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)


def _shift(x):
    # Roll the background sound by a random offset so repeated copies
    # do not sound identical.
    n = x.shape[0]
    i = np.random.randint(int(.24 * n), max(1, int(.74 * n)))  # high must be >= 1
    x = np.roll(x, i)
    # fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
    # x = x * fade_in
    return x


def overlay(x, sound_background=None):
    # Mix speech with the generated scene, tiling the background
    # (with random shifts) if it is shorter than the speech.
    if sound_background is not None:
        sound_background = sound_background.detach().cpu().numpy()[0, :]
        len_speech = len(x)
        if len_speech > len(sound_background):
            n_repeat = len_speech // len(sound_background) + 1
            replica = [sound_background] * n_repeat
            replica = [_shift(_) for _ in replica]
            sound_background = np.concatenate(replica)

        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
        x = .74 * x + .26 * sound_background[:len_speech]
    return x


def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''Create a 24kHz np.array with TTS.

    precomputed_style_vector : required if en_US or en_UK is in voice,
                               to perform affective TTS.
    text                     : list of single sentences
    voice                    : string or None (falls back to StyleTTS2)
    scene                    : e.g. 'A castle in far away lands' - if passed,
                               generates a background sound scene
    '''
    # Generate sound scene - upsample to 24kHz
    if scene is not None:
        sound_background = sound_generator.generate([scene])[0]
        sound_background = audio_write(None,
                                       sound_background.cpu(),
                                       24000,  # sound_generator.sample_rate,
                                       strategy="loudness",
                                       loudness_compressor=True)
    else:
        sound_background = None

    # StyleTTS2 (check voice is None first to avoid a TypeError on the `in` tests)
    if (voice is None) or ('en_US/' in voice) or ('en_UK/' in voice):
        assert precomputed_style_vector is not None, 'For affective TTS, a style vector is needed.'
        x = []
        for _sentence in text:
            x.append(msinference.inference(_sentence,
                                           precomputed_style_vector,
                                           alpha=0.3,
                                           beta=0.7,
                                           diffusion_steps=7,
                                           embedding_scale=1))
        x = np.concatenate(x)
        return overlay(x, sound_background)

    # Fallback - Mimic-3
    text_utils.store_ssml(text=text, voice=voice)  # text has to be a list of single sentences
    ps = subprocess.Popen('cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
    ps.wait()
    x, fs = soundfile.read('_tmp.wav')
    x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1, 64)
    return overlay(x, sound_background)


# voices = {}
# import phonemizer
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)

app = Flask(__name__)
cors = CORS(app)


@app.route("/")
def index():
    with open('README.md', 'r') as f:
        return markdown.markdown(f.read())


@app.route("/", methods=['POST', 'PUT'])  # GET on "/" is handled by index()
def serve_wav():
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    # object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)

    args = SimpleNamespace(
        text=r.get('text'),  # list of strings, not a file
        voice=r.get('voice')[0],
        native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
        affective=r.get('affective')[0] == 'True',  # form fields arrive as strings
        scene=r.get('scene')[0]
    )
    # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')

    print(args, 'ENTER Script')
    do_video_dub = False

    # ==== STYLE VECTOR ====
    precomputed_style_vector = None
    # NOTE: style vector may be None

    if precomputed_style_vector is None:
        if 'en_US' in args.voice or 'en_UK' in args.voice:
            _dir = '/' if args.affective else '_v2/'
            precomputed_style_vector = msinference.compute_style(
                'assets/wavs/style_vector' + _dir + args.voice.replace(
                    '/', '_').replace(
                    '#', '_').replace(
                    'cmu-arctic', 'cmu_arctic').replace(
                    '_low', '') + '.wav')
            print('\n STYLE VECTOR \n', precomputed_style_vector.shape)

    x = tts_multi_sentence(text=args.text,
                           precomputed_style_vector=precomputed_style_vector,
                           voice=args.voice,
                           scene=args.scene)
    OUT_FILE = 'tmp.wav'
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)

    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response


if __name__ == "__main__":
    app.run(host="0.0.0.0")
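A minimal client sketch for the endpoint above; the localhost URL and the two-sentence payload are assumptions, the field names mirror what serve_wav() reads from the form:

    import requests

    # Hypothetical local address; live_demo.py below hard-codes 192.168.88.209:5000.
    URL = 'http://127.0.0.1:5000'

    payload = {
        'text': ['Hello there.', 'This is a second sentence.'],  # one entry per sentence
        'voice': 'en_US/m-ailabs_low#judy_bieber',
        'affective': True,
        'scene': 'calm background sounds of a castle',
    }
    response = requests.post(URL, data=payload)
    with open('out.wav', 'wb') as f:
        f.write(response.content)  # 24kHz wav written by serve_wav()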
live_demo.py
ADDED
@@ -0,0 +1,173 @@
import numpy as np
import argparse
import os
import requests
import subprocess


# SSH AGENT
# eval $(ssh-agent -s)
# ssh-add ~/.ssh/id_ed25519_github2024
#
# git remote set-url origin git@github.com:audeering/shift


# https://stackoverflow.com/questions/57158779/how-to-stop-audio-with-playsound-module
# import multiprocessing
# from playsound import playsound

# p = multiprocessing.Process(target=playsound, args=("file.mp3",))
# p.start()
# input("press ENTER to stop playback")
# p.terminate()
# from playsound import playsound
# playsound('/path/to/a/sound/file/you/want/to/play.mp3')


def command_line_args():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '--affective',
        help="Select the emotional or non-emotional variant of the available voices: https://audeering.github.io/shift/",
        action='store_false',
    )
    parser.add_argument(
        '--device',
        help="Device ID",
        type=str,
        default='cpu',
    )
    parser.add_argument(
        '--text',
        help="Text to be synthesized.",
        default='sample.txt',
        type=str,
    )
    parser.add_argument(
        '--native',
        help="""
        --native: (without argument) a flag to do voice cloning using the speech from --video,
        --native my_voice.wav: voice cloning from user-provided audio""",
        # nargs='?',
        # const=None,
        # default=False  # default has to be None
    )
    parser.add_argument(
        '--voice',
        help="TTS voice - available voices: https://audeering.github.io/shift/",
        default="en_US/m-ailabs_low#judy_bieber",  # 'en_US/cmu-arctic_low#lnh',
        type=str,
    )
    parser.add_argument(
        '--image',
        help="If provided, it is set as the background of the output video, see --text",
        type=str,
    )
    parser.add_argument(
        '--video',
        help="Video file for video translation. The voice is cloned from the video.",
        type=str,
    )
    parser.add_argument(
        '--out_file',
        help="Output file name.",
        type=str,
        default='b6'
    )
    parser.add_argument(
        '--scene',
        help='Sound scene description.',
        type=str,
        default='calm background sounds of a castle'
    )
    return parser


def send_to_server(args):
    url = "http://192.168.88.209:5000"

    payload = {
        'affective': args.affective,
        'voice': args.voice,
        'native': args.native,
        'text': args.text,
        'image': args.image,
        'video': args.video,
        'scene': args.scene,
        # 'out_file': args.out_file  # let the server save it as a temp file
    }

    # In data= we can send args; in files= we send the actual files, if provided.
    # args.text may be a path (default 'sample.txt') or a raw sentence typed at
    # the prompt, so only open it as a file if it exists.
    text_file = open(args.text, 'rb') if os.path.isfile(args.text) else None

    image_file, video_file, native_file = None, None, None
    if args.image is not None:
        print('\nLOADING IMAGE\n')
        try:
            image_file = open(args.image, 'rb')
        except FileNotFoundError:
            pass

    if args.video is not None:
        print('\nLOADING vid\n')
        try:
            video_file = open(args.video, 'rb')
        except FileNotFoundError:
            pass

    if args.native is not None:
        print('\nLOADING natv\n')
        try:
            native_file = open(args.native, 'rb')
        except FileNotFoundError:
            pass

    # --------------------- send this extra

    print('Sending...\n')

    response = requests.post(url, data=payload,
                             files=[(args.image, image_file)])  # Nones do not arrive in the server's dict

    # Check the response from the server
    if response.status_code == 200:
        print("\nRequest was successful!")
        # print("Response:", response.__dict__.keys(), '\n=====\n')
    else:
        print("Failed to send the request")
        print("Status Code:", response.status_code)
        print("Response:", response.text)
    return response


def cli():  # args.out_file is not sent to the server - the server writes tmp, which the client copies
    parser = command_line_args()
    args = parser.parse_args()
    while True:
        args.text = input("Type your text: ")
        response = send_to_server(args)
        out_file = args.out_file + '.' + response.headers['suffix-file-type'].split('.')[-1]

        with open(out_file, 'wb') as f:
            f.write(response.content)
        print('Response at client\n----------------------------', response.headers)

        subprocess.run(["paplay", out_file])


if __name__ == '__main__':
    cli()

# Assume also video and text-for-video: we have to write some classes for video for audiocraft,
# then call tts.py on this video with non-empty labels - thus calling audiocraft.