slow OK 4s live

Files changed:
- README.md (+15, -0)
- live_api.py (+43, -87)
- live_demo.py (+12, -111)
README.md CHANGED

@@ -124,3 +124,18 @@ From an image and text create a video:
 
 python tts.py --text sample.txt --image assets/image_from_T31.jpg
 ```
+
+
+# Live Demo - Paplay
+
+Server (Flask)
+
+```bash
+CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python live_api.py
+```
+
+Client (Ubuntu)
+
+```bash
+python live_demo.py  # will ask for text input & play the soundscape
+```
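For reference, the two commands above amount to one HTTP round trip: the client POSTs the text/scene description and plays back the WAV it receives. A minimal sketch of that round trip, reusing the endpoint URL and payload keys that appear in live_demo.py (the IP is LAN-specific and assumed reachable):

```python
import requests
import subprocess

# Endpoint and payload keys as in live_demo.py; the address is LAN-specific.
url = "http://192.168.88.209:5000"
scene = "calm background sounds of a castle"

response = requests.post(url, data={'text': scene, 'scene': scene})
if response.status_code == 200:
    with open('_gen_.wav', 'wb') as f:
        f.write(response.content)             # server responds with the rendered WAV
    subprocess.run(["paplay", "_gen_.wav"])   # PulseAudio playback, as in the demo
```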
live_api.py CHANGED

@@ -4,7 +4,7 @@ import numpy as np
 import soundfile
 import audresample
 import text_utils
-
+
 import re
 import srt
 import subprocess

@@ -17,89 +17,67 @@ from flask_cors import CORS
 from audiocraft.audiogen import AudioGen, audio_write
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=
+sound_generator.set_generation_params(duration=4)
+
+
+# ====STYLE VECTOR====
+
+# AFFECTIVE = True
+# VOICE = 'en_UK/apope_low'  # en_US/m-ailabs_low#mary_ann
+
+# _dir = '/' if AFFECTIVE else '_v2/'
+# precomputed_style_vector = msinference.compute_style(
+#     'assets/wavs/style_vector' + _dir + VOICE.replace(
+#         '/', '_').replace(
+#         '#', '_').replace(
+#         'cmu-arctic', 'cmu_arctic').replace(
+#         '_low', '') + '.wav')
+# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
+
+# ==== STYLE VECTOR
 
 CACHE_DIR = 'flask_cache/'
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
 
-def _shift(x):
-
-    x = np.roll(x, i)
-    # fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
-    # x = x * fade_in
-    return x
-
-def overlay(x, sound_background=None):
-    if sound_background is not None:
-        sound_background = sound_background.detach().cpu().numpy()[0, :]
-        len_speech = len(x)
-        if len_speech > len(sound_background):
-            n_repeat = len_speech // len(sound_background) + 1
-            replica = [sound_background] * n_repeat
-            replica = [_shift(_) for _ in replica]
-            sound_background = np.concatenate(replica)
-
-        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
-        x = .74 * x + .26 * sound_background[:len_speech]
-    return x
-
-def tts_multi_sentence(precomputed_style_vector=None,
-                       text=None,
-                       voice=None,
-                       scene=None):
-    '''create a 24kHz np.array with TTS
-
-    precomputed_style_vector : required if en_US or en_UK is in voice, so
-                               as to perform affective TTS.
-    text  : string
-    voice : string or None (falls back to StyleTTS)
-    scene : 'A castle in far away lands' -> if passed, generates a background sound scene
-    '''
-    # Generate sound scene - upsample to 24kHz
+def tts_multi_sentence(scene=None):
     if scene is not None:
 
         sound_background = sound_generator.generate([scene])[0]
         sound_background = audio_write(None,
                                        sound_background.cpu(),
-                                       24000,
+                                       24000,  # same as StyleTTS sample_rate
                                        strategy="loudness",
-                                       loudness_compressor=True)
+                                       loudness_compressor=True).detach().cpu().numpy()[0, :]
     else:
         sound_background = None
 
-    # StyleTTS2
-    if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
-        assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
-        x = []
-        for _sentence in text:
-            x.append(msinference.inference(_sentence,
-                                           precomputed_style_vector,
-                                           alpha=0.3,
-                                           beta=0.7,
-                                           diffusion_steps=7,
-                                           embedding_scale=1))
-        x = np.concatenate(x)
-
-        text_utils.store_ssml(text=text, voice=voice)  # text has to be a list of single sentences
-        ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
-        ps.wait()
-        x, fs = soundfile.read('_tmp.wav')
-        x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
-
-    return overlay(x, sound_background)
+    # # StyleTTS2
+    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
+    #     assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
+    #     x = []
+    #     for _sentence in text:
+    #         x.append(msinference.inference(_sentence,
+    #                                        precomputed_style_vector,
+    #                                        alpha=0.3,
+    #                                        beta=0.7,
+    #                                        diffusion_steps=7,
+    #                                        embedding_scale=1))
+    #     x = np.concatenate(x)
 
+    # return overlay(x, sound_background)
 
-# voices = {}
-# import phonemizer
-# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
+    return sound_background
 
 app = Flask(__name__)
 cors = CORS(app)

@@ -120,40 +98,18 @@ def serve_wav():
 
     args = SimpleNamespace(
         text=None if r.get('text') is None else r.get('text'),  # string, not file?
-        voice=r.get('voice')[0],
-        native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
-        affective=r.get('affective')[0],
         scene=r.get('scene')[0]
     )
     # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
 
 
-    print(args, 'ENTER Script')
-    do_video_dub = False
-
-    # ====STYLE VECTOR====
-
-    precomputed_style_vector = None
-    # NOTE: style vector may be None
 
-    if precomputed_style_vector is None:
-        if 'en_US' in args.voice or 'en_UK' in args.voice:
-            _dir = '/' if args.affective else '_v2/'
-            precomputed_style_vector = msinference.compute_style(
-                'assets/wavs/style_vector' + _dir + args.voice.replace(
-                    '/', '_').replace(
-                    '#', '_').replace(
-                    'cmu-arctic', 'cmu_arctic').replace(
-                    '_low', '') + '.wav')
-    print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
 
 
-    x = tts_multi_sentence(
-        voice=args.voice,
-        scene=args.scene)
+    x = tts_multi_sentence(args.scene)
+    # print('\n\n\n\n Obtain TTS output shape', x.shape)
     OUT_FILE = 'tmp.wav'
     soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
 
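After this change, tts_multi_sentence() only renders the AudioGen soundscape; the StyleTTS2/mimic3 speech paths are commented out and the background-overlay logic is removed. A minimal sketch of the remaining generation path, using only the AudioGen calls that appear above (audio_write's loudness normalization is omitted here; writing at 24000 Hz mirrors the server code, not AudioGen's native rate):

```python
import soundfile
from audiocraft.audiogen import AudioGen

# Same checkpoint and the commit's 4-second clip length.
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=4)

def generate_scene(scene):
    # generate() takes a list of text descriptions and returns a
    # [batch, channels, time] tensor; keep the mono track of item 0.
    wav = sound_generator.generate([scene])
    return wav[0, 0, :].detach().cpu().numpy()

x = generate_scene('calm background sounds of a castle')
soundfile.write('flask_cache/tmp.wav', x, 24000)  # 24 kHz, as in serve_wav()
```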
live_demo.py CHANGED

@@ -1,29 +1,9 @@
-import numpy as np
 import argparse
 import os
 import requests
 import subprocess
 
 
-# SSH AGENT
-#   eval $(ssh-agent -s)
-#   ssh-add ~/.ssh/id_ed25519_github2024
-#
-# git remote set-url origin git@github.com:audeering/shift
-
-
-
-# https://stackoverflow.com/questions/57158779/how-to-stop-audio-with-playsound-module
-# import multiprocessing
-# from playsound import playsound
-
-# p = multiprocessing.Process(target=playsound, args=("file.mp3",))
-# p.start()
-# input("press ENTER to stop playback")
-# p.terminate()
-# from playsound import playsound
-# playsound('/path/to/a/sound/file/you/want/to/play.mp3')
-
 
 
 def command_line_args():

@@ -44,98 +24,20 @@ def command_line_args():
     parser.add_argument(
         '--text',
         help="Text to be synthesized.",
-        default='
-        type=str,
-    )
-    parser.add_argument(
-        '--native',
-        help="""
-            --native: (without argument) a flag to do voice cloning using the speech from --video,
-            --native my_voice.wav: voice cloning from user-provided audio""",
-        # nargs='?',
-        # const=None,
-        # default=False  # default has to be None
-    )
-    parser.add_argument(
-        '--voice',
-        help="TTS voice - Available voices: https://audeering.github.io/shift/",
-        default="en_US/m-ailabs_low#judy_bieber",  # 'en_US/cmu-arctic_low#lnh',
-        type=str,
-    )
-    parser.add_argument(
-        '--image',
-        help="If provided, is set as background for the output video, see --text",
-        type=str,
-    )
-    parser.add_argument(
-        '--video',
-        help="Video file for video translation. Voice is cloned from the video",
+        default='How is hoowl',
         type=str,
     )
-    parser.add_argument(
-        '--out_file',
-        help="Output file name.",
-        type=str,
-        default='b6'
-    )
-    parser.add_argument(
-        '--scene',
-        help='Sound scene description.',
-        type=str,
-        default='calm background sounds of a castle'
-    )
     return parser
 
 def send_to_server(args):
     url = "http://192.168.88.209:5000"
 
     payload = {
-        'affective': args.affective,
-        'voice': args.voice,
-        'native': args.native,
         'text': args.text,
-        'image': args.image,
-        'video': args.video,
-        'scene': args.scene,
-        # 'out_file': args.out_file  # let server save as temp
+        'scene': args.scene
     }
 
-
-    # In files=, send the actual files if provided
-    text_file = open(args.text, 'rb')
-
-    image_file, video_file, native_file = None, None, None
-    if args.image is not None:
-        print('\nLOADING IMAGE\n')
-        try:
-            image_file = open(args.image, 'rb')
-        except FileNotFoundError:
-            pass
-
-
-    if args.video is not None:
-        print('\nLOADING vid\n')
-        try:
-            video_file = open(args.video, 'rb')
-        except FileNotFoundError:
-            pass
-
-    if args.native is not None:
-        print('\nLOADING natv\n')
-        try:
-            native_file = open(args.native, 'rb')
-        except FileNotFoundError:
-            pass
-
-
-
-    # --------------------- send this extra
-
-    print('Sending...\n')
-
-    response = requests.post(url, data=payload,
-                             files=[(args.image, image_file)])  # Nones do not arrive in the server's dict
+    response = requests.post(url, data=payload)  # Nones do not arrive in the server's dict
 
     # Check the response from the server
     if response.status_code == 200:

@@ -152,22 +54,21 @@ def send_to_server(args):
 def cli():  # args.out_file is not sent to the server - server writes tmp - copied by client
     parser = command_line_args()
     args = parser.parse_args()
+    os.system('cls' if os.name == 'nt' else 'clear')
     while True:
-        args.text = input("
+        args.text = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
+        # _text, _scene = args.text.split('|')
+        # args.text = _text
+        args.scene = args.text  # _scene
         response = send_to_server(args)
-        out_file =
-
+        out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
+
         with open(out_file, 'wb') as f:
             f.write(response.content)
-        print('Response AT client []\n----------------------------', response.headers)
-
-
+        # print('Response AT client []\n----------------------------', response.headers)
+
         subprocess.run(["paplay", out_file])
-
 
 
 if __name__ == '__main__':
     cli()
-
-# assume also video and text; for video we have to write some classes for audiocraft
-# then call tts.py on this video with nonempty labels - thus calls audiocraft
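The commented-out lines in cli() sketch a 'text|scene' convention that this commit collapses into a single prompt (the same string is sent as both text and scene). If that split were revived, it could look like the following; the helper name is hypothetical and the '|' separator comes only from the commented code:

```python
def parse_prompt(raw):
    # Hypothetical helper: split "text|scene"; otherwise use the whole
    # string for both fields, which is what live_demo.py currently does.
    if '|' in raw:
        _text, _scene = raw.split('|', 1)
    else:
        _text = _scene = raw
    return _text.strip(), _scene.strip()

# Inside cli()'s while-loop this would replace the two assignments:
#   args.text, args.scene = parse_prompt(input("Describe Any Sound: "))
```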