fx live dem
Browse files- README.md +10 -18
- api.py +11 -19
- audiocraft/transformer.py +1 -1
- engineer_style_vectors_v2.py +0 -331
- landscape2soundscape.py +7 -15
- live_api.py +0 -135
- live_demo.py +34 -60
- models.py +5 -2
- msinference.py +5 -2
README.md
CHANGED
@@ -74,7 +74,7 @@ Following examples need `api.py` to be running. [Set this IP](https://huggingfac
|
|
74 |
The following needs `api.py` to be already running on a tmux session.
|
75 |
|
76 |
```python
|
77 |
-
# TTS & soundscape -
|
78 |
python landscape2soundscape.py
|
79 |
```
|
80 |
|
@@ -94,44 +94,36 @@ For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
|
|
94 |
|
95 |
[![06](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____06_Menzel_AI900_001.jpg)](https://youtu.be/3M0y9OYzDfU)
|
96 |
|
97 |
-
[![07](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____07_Courbet_AI967_001.jpg)](https://youtu.be/
|
98 |
|
99 |
[![08](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____08_Monet_AI1013_001.jpg)](https://youtu.be/gnGCYLcdLsA)
|
100 |
|
101 |
[![10](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____10_Boecklin_967648_NG2-80_001_rsz.jpg)](https://www.youtube.com/watch?v=Y8QyYUgLaCg)
|
102 |
|
103 |
-
[![11](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____11_Liebermann_NG4-94_001.jpg)](https://youtu.be/
|
104 |
|
105 |
-
[![12](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____12_Slevogt_AII1022_001.jpg)](https://youtu.be/
|
106 |
|
107 |
|
108 |
|
109 |
|
110 |
-
# SoundScape Live
|
111 |
|
112 |
-
|
113 |
|
114 |
```python
|
115 |
-
CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python
|
116 |
```
|
117 |
|
118 |
-
|
119 |
|
120 |
```python
|
121 |
-
python live_demo.py #
|
122 |
```
|
123 |
|
124 |
-
# SoundScape (basic) Demo
|
125 |
-
|
126 |
-
```python
|
127 |
-
CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python demo.py
|
128 |
-
```
|
129 |
-
|
130 |
-
##
|
131 |
-
|
132 |
# Audiobook
|
133 |
|
134 |
-
Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [v2](https://www.youtube.com/watch?v=Pzo-kKaNg6s) / [v2.1](https://www.youtube.com/watch?v=X4qlKBBaegM)/ [
|
135 |
|
136 |
```python
|
137 |
# audiobook will be saved in ./tts_audiobooks
|
|
|
74 |
The following needs `api.py` to be already running on a tmux session.
|
75 |
|
76 |
```python
|
77 |
+
# TTS & soundscape - output .mp4 saved in ./out/
|
78 |
python landscape2soundscape.py
|
79 |
```
|
80 |
|
|
|
94 |
|
95 |
[![06](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____06_Menzel_AI900_001.jpg)](https://youtu.be/3M0y9OYzDfU)
|
96 |
|
97 |
+
[![07](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____07_Courbet_AI967_001.jpg)](https://youtu.be/56MH7zOHrNQ)
|
98 |
|
99 |
[![08](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____08_Monet_AI1013_001.jpg)](https://youtu.be/gnGCYLcdLsA)
|
100 |
|
101 |
[![10](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____10_Boecklin_967648_NG2-80_001_rsz.jpg)](https://www.youtube.com/watch?v=Y8QyYUgLaCg)
|
102 |
|
103 |
+
[![11](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____11_Liebermann_NG4-94_001.jpg)](https://youtu.be/RhUuS9HMLhg)
|
104 |
|
105 |
+
[![12](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____12_Slevogt_AII1022_001.jpg)](https://youtu.be/NzzhhrUeKVY)
|
106 |
|
107 |
|
108 |
|
109 |
|
110 |
+
# SoundScape Live Demo - Paplay
|
111 |
|
112 |
+
Flask API for playing sounds live
|
113 |
|
114 |
```python
|
115 |
+
CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python api.py
|
116 |
```
|
117 |
|
118 |
+
Describe any sound via text; the TTS & soundscape are played back
|
119 |
|
120 |
```python
|
121 |
+
python live_demo.py # type text & plays AudioGen sound & TTS
|
122 |
```
|
123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
124 |
# Audiobook
|
125 |
|
126 |
+
Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [v2](https://www.youtube.com/watch?v=Pzo-kKaNg6s) / [v2.1](https://www.youtube.com/watch?v=X4qlKBBaegM) / [no diffusion](https://www.youtube.com/watch?v=vahKXpd6oLg)
|
127 |
|
128 |
```python
|
129 |
# audiobook will be saved in ./tts_audiobooks
|
api.py
CHANGED
@@ -21,18 +21,8 @@ NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape f
|
|
21 |
|
22 |
sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
|
23 |
|
24 |
-
|
25 |
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
26 |
|
27 |
-
|
28 |
-
|
29 |
-
# SSH AGENT
|
30 |
-
# eval $(ssh-agent -s)
|
31 |
-
# ssh-add ~/.ssh/id_ed25519_github2024
|
32 |
-
#
|
33 |
-
# git remote set-url origin [email protected]:audeering/shift
|
34 |
-
# ==
|
35 |
-
|
36 |
def _shorten(filename):
|
37 |
return filename.replace("/","")[-6:]
|
38 |
|
@@ -83,9 +73,9 @@ def _shift(x):
|
|
83 |
def overlay(x, soundscape=None):
|
84 |
|
85 |
if soundscape is not None:
|
86 |
-
|
87 |
# SOUNDS
|
88 |
-
|
89 |
background = sound_generator.generate(
|
90 |
[soundscape] * NUM_SOUND_GENERATIONS
|
91 |
).reshape(-1).detach().cpu().numpy() # bs, 11400 @.74s
|
@@ -98,8 +88,10 @@ def overlay(x, soundscape=None):
|
|
98 |
background = audresample.resample(
|
99 |
background,
|
100 |
original_rate=16000, # sound_generator.sample_rate,
|
101 |
-
target_rate=24000)[0, :-25000]
|
102 |
-
|
|
|
|
|
103 |
# background /= np.abs(background).max() + 1e-7 Apply in sound_generator()
|
104 |
|
105 |
|
@@ -132,14 +124,14 @@ def overlay(x, soundscape=None):
|
|
132 |
# print(total[40000:70000].tolist())
|
133 |
print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
|
134 |
|
135 |
-
#
|
136 |
-
|
137 |
-
|
138 |
# print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
139 |
# f'{np.abs(background.max())=}\n{x.shape=}')
|
140 |
total /= np.abs(total).max() + 1e-7 # amplify speech to full [-1,1]
|
141 |
-
x = .
|
142 |
-
|
143 |
else:
|
144 |
print('sound_background = None')
|
145 |
return x
|
|
|
21 |
|
22 |
sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
|
23 |
|
|
|
24 |
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
def _shorten(filename):
|
27 |
return filename.replace("/","")[-6:]
|
28 |
|
|
|
73 |
def overlay(x, soundscape=None):
|
74 |
|
75 |
if soundscape is not None:
|
76 |
+
|
77 |
# SOUNDS
|
78 |
+
|
79 |
background = sound_generator.generate(
|
80 |
[soundscape] * NUM_SOUND_GENERATIONS
|
81 |
).reshape(-1).detach().cpu().numpy() # bs, 11400 @.74s
|
|
|
88 |
background = audresample.resample(
|
89 |
background,
|
90 |
original_rate=16000, # sound_generator.sample_rate,
|
91 |
+
target_rate=24000)[0, :-25000]
|
92 |
+
# TODO: discards the last samples due to splash sound / polarity change on long sounds (~ videos); should NOT drop them for live_demo.py
|
93 |
+
|
94 |
+
|
95 |
# background /= np.abs(background).max() + 1e-7 Apply in sound_generator()
|
96 |
|
97 |
|
|
|
124 |
# print(total[40000:70000].tolist())
|
125 |
print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
|
126 |
|
127 |
+
# less periodic - cloned sounds
|
128 |
+
for _ in range(4):
|
129 |
+
background = _shift(background)
|
130 |
# print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
131 |
# f'{np.abs(background.max())=}\n{x.shape=}')
|
132 |
total /= np.abs(total).max() + 1e-7 # amplify speech to full [-1,1]
|
133 |
+
x = .26 * x + .74 * total[:len(x)]
|
134 |
+
|
135 |
else:
|
136 |
print('sound_background = None')
|
137 |
return x
|
audiocraft/transformer.py
CHANGED
@@ -203,7 +203,7 @@ class StreamingMultiheadAttention(nn.Module):
|
|
203 |
|
204 |
else:
|
205 |
# init on 1st token (for all 47 transf layers)
|
206 |
-
print(f'
|
207 |
self.k_history = k
|
208 |
self.v_history = v
|
209 |
|
|
|
203 |
|
204 |
else:
|
205 |
# init on 1st token (for all 47 transf layers)
|
206 |
+
print(f'AudioGen kv cache Flush')
|
207 |
self.k_history = k
|
208 |
self.v_history = v
|
209 |
|
engineer_style_vectors_v2.py
DELETED
@@ -1,331 +0,0 @@
|
|
1 |
-
|
2 |
-
from pathlib import Path
|
3 |
-
import shutil
|
4 |
-
import csv
|
5 |
-
import io
|
6 |
-
import os
|
7 |
-
import typing
|
8 |
-
import wave
|
9 |
-
import sys
|
10 |
-
from mimic3_tts.__main__ import (CommandLineInterfaceState,
|
11 |
-
get_args,
|
12 |
-
initialize_args,
|
13 |
-
initialize_tts,
|
14 |
-
# print_voices,
|
15 |
-
# process_lines,
|
16 |
-
shutdown_tts,
|
17 |
-
OutputNaming,
|
18 |
-
process_line)
|
19 |
-
|
20 |
-
|
21 |
-
def process_lines(state: CommandLineInterfaceState, wav_path=None):
|
22 |
-
'''MIMIC3 INTERNAL CALL that yields the sigh sound'''
|
23 |
-
|
24 |
-
args = state.args
|
25 |
-
|
26 |
-
result_idx = 0
|
27 |
-
print(f'why waitings in the for loop LIN {state.texts=}\n')
|
28 |
-
for line in state.texts:
|
29 |
-
print(f'LIN {line=}\n') # prints \n so is empty not getting the predifne text of state.texts
|
30 |
-
line_voice: typing.Optional[str] = None
|
31 |
-
line_id = ""
|
32 |
-
line = line.strip()
|
33 |
-
# if not line:
|
34 |
-
# continue
|
35 |
-
|
36 |
-
if args.output_naming == OutputNaming.ID:
|
37 |
-
# Line has the format id|text instead of just text
|
38 |
-
with io.StringIO(line) as line_io:
|
39 |
-
reader = csv.reader(line_io, delimiter=args.csv_delimiter)
|
40 |
-
row = next(reader)
|
41 |
-
line_id, line = row[0], row[-1]
|
42 |
-
if args.csv_voice:
|
43 |
-
line_voice = row[1]
|
44 |
-
|
45 |
-
process_line(line, state, line_id=line_id, line_voice=line_voice)
|
46 |
-
result_idx += 1
|
47 |
-
|
48 |
-
print('\nARRive at All Audio writing\n\n\n\n')
|
49 |
-
# -------------------------------------------------------------------------
|
50 |
-
|
51 |
-
# Write combined audio to stdout
|
52 |
-
if state.all_audio:
|
53 |
-
# _LOGGER.debug("Writing WAV audio to stdout")
|
54 |
-
|
55 |
-
if sys.stdout.isatty() and (not state.args.stdout):
|
56 |
-
with io.BytesIO() as wav_io:
|
57 |
-
wav_file_play: wave.Wave_write = wave.open(wav_io, "wb")
|
58 |
-
with wav_file_play:
|
59 |
-
wav_file_play.setframerate(state.sample_rate_hz)
|
60 |
-
wav_file_play.setsampwidth(state.sample_width_bytes)
|
61 |
-
wav_file_play.setnchannels(state.num_channels)
|
62 |
-
wav_file_play.writeframes(state.all_audio)
|
63 |
-
|
64 |
-
# play_wav_bytes(state.args, wav_io.getvalue())
|
65 |
-
# wav_path = '_direct_call_2.wav'
|
66 |
-
with open(wav_path, 'wb') as wav_file:
|
67 |
-
wav_file.write(wav_io.getvalue())
|
68 |
-
wav_file.seek(0)
|
69 |
-
|
70 |
-
# -----------------------------------------------------------------------------
|
71 |
-
# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
|
72 |
-
# ======================================================================
|
73 |
-
out_dir = 'assets/'
|
74 |
-
reference_wav_directory = 'assets/wavs/style_vector_v2/'
|
75 |
-
Path(reference_wav_directory).mkdir(parents=True, exist_ok=True)
|
76 |
-
Path(out_dir).mkdir(parents=True, exist_ok=True)
|
77 |
-
|
78 |
-
wav_dir = 'assets/wavs/'
|
79 |
-
Path(wav_dir).mkdir(parents=True, exist_ok=True)
|
80 |
-
N_PIX = 11
|
81 |
-
|
82 |
-
|
83 |
-
# =======================================================================
|
84 |
-
# S T A R T G E N E R A T E png/wav
|
85 |
-
# =======================================================================
|
86 |
-
|
87 |
-
NOISE_SCALE = .667
|
88 |
-
NOISE_W = .9001 #.8 #.90001 # default .8 in __main__.py @ L697 IGNORED DUE TO ARTEfACTS - FOR NOW USE default
|
89 |
-
|
90 |
-
a = [
|
91 |
-
'p239',
|
92 |
-
'p236',
|
93 |
-
'p264',
|
94 |
-
'p250',
|
95 |
-
'p259',
|
96 |
-
'p247',
|
97 |
-
'p261',
|
98 |
-
'p263',
|
99 |
-
'p283',
|
100 |
-
'p274',
|
101 |
-
'p286',
|
102 |
-
'p276',
|
103 |
-
'p270',
|
104 |
-
'p281',
|
105 |
-
'p277',
|
106 |
-
'p231',
|
107 |
-
'p238',
|
108 |
-
'p271',
|
109 |
-
'p257',
|
110 |
-
'p273',
|
111 |
-
'p284',
|
112 |
-
'p329',
|
113 |
-
'p361',
|
114 |
-
'p287',
|
115 |
-
'p360',
|
116 |
-
'p374',
|
117 |
-
'p376',
|
118 |
-
'p310',
|
119 |
-
'p304',
|
120 |
-
'p340',
|
121 |
-
'p347',
|
122 |
-
'p330',
|
123 |
-
'p308',
|
124 |
-
'p314',
|
125 |
-
'p317',
|
126 |
-
'p339',
|
127 |
-
'p311',
|
128 |
-
'p294',
|
129 |
-
'p305',
|
130 |
-
'p266',
|
131 |
-
'p335',
|
132 |
-
'p334',
|
133 |
-
'p318',
|
134 |
-
'p323',
|
135 |
-
'p351',
|
136 |
-
'p333',
|
137 |
-
'p313',
|
138 |
-
'p316',
|
139 |
-
'p244',
|
140 |
-
'p307',
|
141 |
-
'p363',
|
142 |
-
'p336',
|
143 |
-
'p312',
|
144 |
-
'p267',
|
145 |
-
'p297',
|
146 |
-
'p275',
|
147 |
-
'p295',
|
148 |
-
'p288',
|
149 |
-
'p258',
|
150 |
-
'p301',
|
151 |
-
'p232',
|
152 |
-
'p292',
|
153 |
-
'p272',
|
154 |
-
'p278',
|
155 |
-
'p280',
|
156 |
-
'p341',
|
157 |
-
'p268',
|
158 |
-
'p298',
|
159 |
-
'p299',
|
160 |
-
'p279',
|
161 |
-
'p285',
|
162 |
-
'p326',
|
163 |
-
'p300',
|
164 |
-
's5',
|
165 |
-
'p230',
|
166 |
-
'p254',
|
167 |
-
'p269',
|
168 |
-
'p293',
|
169 |
-
'p252',
|
170 |
-
'p345',
|
171 |
-
'p262',
|
172 |
-
'p243',
|
173 |
-
'p227',
|
174 |
-
'p343',
|
175 |
-
'p255',
|
176 |
-
'p229',
|
177 |
-
'p240',
|
178 |
-
'p248',
|
179 |
-
'p253',
|
180 |
-
'p233',
|
181 |
-
'p228',
|
182 |
-
'p251',
|
183 |
-
'p282',
|
184 |
-
'p246',
|
185 |
-
'p234',
|
186 |
-
'p226',
|
187 |
-
'p260',
|
188 |
-
'p245',
|
189 |
-
'p241',
|
190 |
-
'p303',
|
191 |
-
'p265',
|
192 |
-
'p306',
|
193 |
-
'p237',
|
194 |
-
'p249',
|
195 |
-
'p256',
|
196 |
-
'p302',
|
197 |
-
'p364',
|
198 |
-
'p225',
|
199 |
-
'p362']
|
200 |
-
|
201 |
-
print(len(a))
|
202 |
-
|
203 |
-
b = []
|
204 |
-
|
205 |
-
for row in a:
|
206 |
-
b.append(f'en_US/vctk_low#{row}')
|
207 |
-
|
208 |
-
# print(b)
|
209 |
-
|
210 |
-
# 00000000 arctic
|
211 |
-
|
212 |
-
|
213 |
-
a = [
|
214 |
-
'awb' # comma
|
215 |
-
'rms',
|
216 |
-
'slt',
|
217 |
-
'ksp',
|
218 |
-
'clb',
|
219 |
-
'aew',
|
220 |
-
'bdl',
|
221 |
-
'lnh',
|
222 |
-
'jmk',
|
223 |
-
'rxr',
|
224 |
-
'fem',
|
225 |
-
'ljm',
|
226 |
-
'slp',
|
227 |
-
'ahw',
|
228 |
-
'axb',
|
229 |
-
'aup',
|
230 |
-
'eey',
|
231 |
-
'gka',
|
232 |
-
]
|
233 |
-
|
234 |
-
|
235 |
-
for row in a:
|
236 |
-
b.append(f'en_US/cmu-arctic_low#{row}')
|
237 |
-
|
238 |
-
# HIFItts
|
239 |
-
|
240 |
-
a = ['9017',
|
241 |
-
'6097',
|
242 |
-
'92']
|
243 |
-
|
244 |
-
for row in a:
|
245 |
-
b.append(f'en_US/hifi-tts_low#{row}')
|
246 |
-
|
247 |
-
a = [
|
248 |
-
'elliot_miller',
|
249 |
-
'judy_bieber',
|
250 |
-
'mary_ann']
|
251 |
-
|
252 |
-
for row in a:
|
253 |
-
b.append(f'en_US/m-ailabs_low#{row}')
|
254 |
-
|
255 |
-
# LJspeech - single speaker
|
256 |
-
|
257 |
-
b.append(f'en_US/ljspeech_low')
|
258 |
-
|
259 |
-
# en_UK apope - only speaker
|
260 |
-
|
261 |
-
b.append(f'en_UK/apope_low')
|
262 |
-
|
263 |
-
all_names = b
|
264 |
-
|
265 |
-
|
266 |
-
VOICES = {}
|
267 |
-
for _id, _voice in enumerate(all_names):
|
268 |
-
|
269 |
-
# If GitHub Quota exceded copy mimic-voices from local copies
|
270 |
-
#
|
271 |
-
# https://github.com/MycroftAI/mimic3-voices
|
272 |
-
#
|
273 |
-
home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
|
274 |
-
Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
|
275 |
-
speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
|
276 |
-
if not os.path.isfile(home_voice_dir + 'generator.onnx'):
|
277 |
-
shutil.copyfile(
|
278 |
-
f'/data/dkounadis/cache/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
|
279 |
-
home_voice_dir + 'generator.onnx') # 'en_US incl. voice
|
280 |
-
|
281 |
-
prepare_file = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
|
282 |
-
if 'cmu-arctic' in prepare_file:
|
283 |
-
prepare_file = prepare_file.replace('cmu-arctic', 'cmu_arctic') + '.wav'
|
284 |
-
else:
|
285 |
-
prepare_file = prepare_file + '.wav' # [...cmu-arctic...](....cmu_arctic....wav)
|
286 |
-
|
287 |
-
file_true = prepare_file.split('.wav')[0] + '_true_.wav'
|
288 |
-
file_false = prepare_file.split('.wav')[0] + '_false_.wav'
|
289 |
-
print(prepare_file, file_false, file_true)
|
290 |
-
|
291 |
-
|
292 |
-
reference_wav = reference_wav_directory + prepare_file
|
293 |
-
rate = 4 # high speed sounds nice if used as speaker-reference audio for StyleTTS2
|
294 |
-
_ssml = (
|
295 |
-
'<speak>'
|
296 |
-
'<prosody volume=\'64\'>'
|
297 |
-
f'<prosody rate=\'{rate}\'>'
|
298 |
-
f'<voice name=\'{_voice}\'>'
|
299 |
-
'<s>'
|
300 |
-
'Sweet dreams are made of this, .. !!! # I travel the world and the seven seas.'
|
301 |
-
'</s>'
|
302 |
-
'</voice>'
|
303 |
-
'</prosody>'
|
304 |
-
'</prosody>'
|
305 |
-
'</speak>'
|
306 |
-
)
|
307 |
-
with open('_tmp_ssml.txt', 'w') as f:
|
308 |
-
f.write(_ssml)
|
309 |
-
|
310 |
-
|
311 |
-
# ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
|
312 |
-
# ps.wait() # using ps to call mimic3 because samples dont have time to be written in stdout buffer
|
313 |
-
args = get_args()
|
314 |
-
args.ssml = True
|
315 |
-
args.text = [_ssml] #['aa', 'bb'] #txt
|
316 |
-
args.interactive = False
|
317 |
-
# args.output_naming = OutputNaming.TIME
|
318 |
-
|
319 |
-
state = CommandLineInterfaceState(args=args)
|
320 |
-
initialize_args(state)
|
321 |
-
initialize_tts(state)
|
322 |
-
# args.texts = [txt] #['aa', 'bb'] #txt
|
323 |
-
# state.stdout = '.' #None #'makeme.wav'
|
324 |
-
# state.output_dir = '.noopy'
|
325 |
-
# state.interactive = False
|
326 |
-
# state.output_naming = OutputNaming.TIME
|
327 |
-
# # state.ssml = 1234546575
|
328 |
-
# state.stdout = True
|
329 |
-
# state.tts = True
|
330 |
-
process_lines(state, wav_path=reference_wav)
|
331 |
-
shutdown_tts(state)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
landscape2soundscape.py
CHANGED
@@ -8,10 +8,6 @@ import cv2
|
|
8 |
# yt-dlp https://www.youtube.com/watch?v=UZ9uyQI3pF0 --downloader ffmpeg --downloader-args "ffmpeg_i:-ss 997 -to 2512"
|
9 |
# ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wav -ac 1
|
10 |
# https://superuser.com/questions/583393/how-to-extract-subtitle-from-video-using-ffmpeg
|
11 |
-
#___________________________________________________________________________________________________
|
12 |
-
# VIDEO FROM IMAGE with CAPTIONS
|
13 |
-
#
|
14 |
-
# UPLOAD to: Simaviro: Documents General WORK PACKAGES WP1 ContentRepository ANBPR_ROMANIA TTSvideos
|
15 |
# __________________________________________________________________________________________________
|
16 |
# TO DONLOAD SRT for youtub
|
17 |
# yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
|
@@ -19,12 +15,8 @@ import cv2
|
|
19 |
# _voice = 'en_US/cmu-arctic_low#lnh' #en_US/vctk_low#p249' # 'en_US/vctk_low#p282'
|
20 |
# _voice = ''en_US/vctk_low#p351''
|
21 |
# _voice = 'en_US/vctk_low#p351' # avoid 318 it does the ghhhhhh
|
22 |
-
# _voice = 'en_US/m-ailabs_low#judy_bieber' # Nice voice for ('Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].mkv' 'Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].en-GB.srt'),
|
23 |
-
# _voice = 'en_UK/apope_low'
|
24 |
-
# _voice = 'en_US/m-ailabs_low#mary_ann'
|
25 |
# _voice = 'en_US/vctk_low#p351'
|
26 |
# _voice = 'en_US/hifi-tts_low#92'
|
27 |
-
# voice_str = f'_{_voice.replace("/", "")}'
|
28 |
|
29 |
|
30 |
|
@@ -47,7 +39,7 @@ DESCRIPTIONS = [
|
|
47 |
'01_Schick_AII840_001.txt', # text
|
48 |
'statue in shire, hill river, vogels.', # audiocraft
|
49 |
'G. Schick, Bildnis der Heinrike Dannecker, 1802', # cv2 puttext title
|
50 |
-
'
|
51 |
],
|
52 |
# 2
|
53 |
[
|
@@ -65,7 +57,7 @@ DESCRIPTIONS = [
|
|
65 |
'K. Schinkel Gotische Kirche Auf Einem Felsen 1815',
|
66 |
'en_US/hifi-tts_low#6097',
|
67 |
],
|
68 |
-
#
|
69 |
[
|
70 |
'04_Friedrich_FV317_001.jpg',
|
71 |
'04_Friedrich_FV317_001.txt',
|
@@ -73,7 +65,7 @@ DESCRIPTIONS = [
|
|
73 |
'C. D. Friedrich, Der Watzmann, 1824',
|
74 |
'en_US/m-ailabs_low#mary_ann',
|
75 |
],
|
76 |
-
#
|
77 |
[
|
78 |
'05_Blechen_FV40_001.jpg',
|
79 |
'05_Blechen_FV40_001.txt',
|
@@ -95,7 +87,7 @@ DESCRIPTIONS = [
|
|
95 |
'07_Courbet_AI967_001.txt',
|
96 |
'Storm at the strand of waves Tsunami',
|
97 |
'G. Courbet, Die Welle, 1870',
|
98 |
-
'
|
99 |
],
|
100 |
# 8
|
101 |
[
|
@@ -125,7 +117,7 @@ DESCRIPTIONS = [
|
|
125 |
[
|
126 |
'11_Liebermann_NG4-94_001.jpg',
|
127 |
'11_Liebermann_NG4-94_001.txt',
|
128 |
-
'Tavern
|
129 |
'M. Tiebermann, Gartenlokal An Der Havel Nikolskoe, 1916',
|
130 |
'en_US/cmu-arctic_low#ljm',
|
131 |
],
|
@@ -135,7 +127,7 @@ DESCRIPTIONS = [
|
|
135 |
'12_Slevogt_AII1022_001.txt',
|
136 |
'sailing yachts pool fluss',
|
137 |
'M. Slevogt, Segelboote Auf Der Alster Am Abend, 1905',
|
138 |
-
'
|
139 |
],
|
140 |
]
|
141 |
|
@@ -146,7 +138,7 @@ SILENT_VIDEO = '_silent_video.mp4'
|
|
146 |
# SILENT CLIP
|
147 |
|
148 |
|
149 |
-
for img, text, soundscape, title, voice in DESCRIPTIONS[2:4]:
|
150 |
|
151 |
|
152 |
|
|
|
8 |
# yt-dlp https://www.youtube.com/watch?v=UZ9uyQI3pF0 --downloader ffmpeg --downloader-args "ffmpeg_i:-ss 997 -to 2512"
|
9 |
# ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wav -ac 1
|
10 |
# https://superuser.com/questions/583393/how-to-extract-subtitle-from-video-using-ffmpeg
|
|
|
|
|
|
|
|
|
11 |
# __________________________________________________________________________________________________
|
12 |
# TO DONLOAD SRT for youtub
|
13 |
# yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
|
|
|
15 |
# _voice = 'en_US/cmu-arctic_low#lnh' #en_US/vctk_low#p249' # 'en_US/vctk_low#p282'
|
16 |
# _voice = ''en_US/vctk_low#p351''
|
17 |
# _voice = 'en_US/vctk_low#p351' # avoid 318 it does the ghhhhhh
|
|
|
|
|
|
|
18 |
# _voice = 'en_US/vctk_low#p351'
|
19 |
# _voice = 'en_US/hifi-tts_low#92'
|
|
|
20 |
|
21 |
|
22 |
|
|
|
39 |
'01_Schick_AII840_001.txt', # text
|
40 |
'statue in shire, hill river, vogels.', # audiocraft
|
41 |
'G. Schick, Bildnis der Heinrike Dannecker, 1802', # cv2 puttext title
|
42 |
+
'fr_FR_m-ailabs_bernard', #'en_US/m-ailabs_low#judy_bieber', #'en_US/m-ailabs_low#mary_ann',
|
43 |
],
|
44 |
# 2
|
45 |
[
|
|
|
57 |
'K. Schinkel Gotische Kirche Auf Einem Felsen 1815',
|
58 |
'en_US/hifi-tts_low#6097',
|
59 |
],
|
60 |
+
# 4
|
61 |
[
|
62 |
'04_Friedrich_FV317_001.jpg',
|
63 |
'04_Friedrich_FV317_001.txt',
|
|
|
65 |
'C. D. Friedrich, Der Watzmann, 1824',
|
66 |
'en_US/m-ailabs_low#mary_ann',
|
67 |
],
|
68 |
+
# 5
|
69 |
[
|
70 |
'05_Blechen_FV40_001.jpg',
|
71 |
'05_Blechen_FV40_001.txt',
|
|
|
87 |
'07_Courbet_AI967_001.txt',
|
88 |
'Storm at the strand of waves Tsunami',
|
89 |
'G. Courbet, Die Welle, 1870',
|
90 |
+
'af_ZA_google-nwu_0184',
|
91 |
],
|
92 |
# 8
|
93 |
[
|
|
|
117 |
[
|
118 |
'11_Liebermann_NG4-94_001.jpg',
|
119 |
'11_Liebermann_NG4-94_001.txt',
|
120 |
+
'Tavern and shrine and people talking glass plates drink',
|
121 |
'M. Tiebermann, Gartenlokal An Der Havel Nikolskoe, 1916',
|
122 |
'en_US/cmu-arctic_low#ljm',
|
123 |
],
|
|
|
127 |
'12_Slevogt_AII1022_001.txt',
|
128 |
'sailing yachts pool fluss',
|
129 |
'M. Slevogt, Segelboote Auf Der Alster Am Abend, 1905',
|
130 |
+
'jv_ID_google-gmu_06207',
|
131 |
],
|
132 |
]
|
133 |
|
|
|
138 |
# SILENT CLIP
|
139 |
|
140 |
|
141 |
+
for img, text, soundscape, title, voice in DESCRIPTIONS: #[2:4]:
|
142 |
|
143 |
|
144 |
|
live_api.py
DELETED
@@ -1,135 +0,0 @@
|
|
1 |
-
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
import numpy as np
|
4 |
-
import soundfile
|
5 |
-
import audresample
|
6 |
-
import text_utils
|
7 |
-
|
8 |
-
import re
|
9 |
-
import subprocess
|
10 |
-
import markdown
|
11 |
-
import json
|
12 |
-
from pathlib import Path
|
13 |
-
from types import SimpleNamespace
|
14 |
-
from flask import Flask, request, send_from_directory
|
15 |
-
from flask_cors import CORS
|
16 |
-
from audiocraft.builders import AudioGen #, audio_write
|
17 |
-
NUM_SOUND_GENERATIONS = 1 # they differ a lot and are unnatural to concatenate, prefer lm.n_draw
|
18 |
-
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
19 |
-
|
20 |
-
|
21 |
-
# ====STYLE VECTOR====
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
# AFFECTIVE = True
|
26 |
-
# VOICE = 'en_UK/apope_low' # en_US/m-ailabs_low#mary_ann
|
27 |
-
|
28 |
-
# _dir = '/' if AFFECTIVE else '_v2/'
|
29 |
-
# precomputed_style_vector = msinference.compute_style(
|
30 |
-
# 'assets/wavs/style_vector' + _dir + VOICE.replace(
|
31 |
-
# '/', '_').replace(
|
32 |
-
# '#', '_').replace(
|
33 |
-
# 'cmu-arctic', 'cmu_arctic').replace(
|
34 |
-
# '_low', '') + '.wav')
|
35 |
-
# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
|
36 |
-
|
37 |
-
|
38 |
-
# ==== STYLE VECTOR
|
39 |
-
|
40 |
-
CACHE_DIR = 'flask_cache/'
|
41 |
-
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
def tts_multi_sentence(scene=None):
|
47 |
-
if scene is not None and len(scene) >= 4:
|
48 |
-
print(f'Processing: {scene} ..')
|
49 |
-
# x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
|
50 |
-
x = sound_generator.generate(
|
51 |
-
[scene] * NUM_SOUND_GENERATIONS
|
52 |
-
).reshape(1, -1).detach().cpu().numpy() # bs, 11400
|
53 |
-
|
54 |
-
x /= np.abs(x).max() + 1e-7
|
55 |
-
# is 16kHz - AUdiogen Fs
|
56 |
-
x = audresample.resample(x,
|
57 |
-
original_rate=16000,
|
58 |
-
target_rate=24000)[0, :]
|
59 |
-
|
60 |
-
|
61 |
-
#
|
62 |
-
print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
|
63 |
-
else:
|
64 |
-
print(scene, '\nDrop\n')
|
65 |
-
x = np.zeros(400)
|
66 |
-
|
67 |
-
# # StyleTTS2
|
68 |
-
# if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
|
69 |
-
# assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
|
70 |
-
# x = []
|
71 |
-
# for _sentence in text:
|
72 |
-
# x.append(msinference.inference(_sentence,
|
73 |
-
# precomputed_style_vector,
|
74 |
-
# alpha=0.3,
|
75 |
-
# beta=0.7,
|
76 |
-
# diffusion_steps=7,
|
77 |
-
# embedding_scale=1))
|
78 |
-
# x = np.concatenate(x)
|
79 |
-
|
80 |
-
# return overlay(x, sound_background)
|
81 |
-
|
82 |
-
return x
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
app = Flask(__name__)
|
90 |
-
cors = CORS(app)
|
91 |
-
|
92 |
-
|
93 |
-
@app.route("/")
|
94 |
-
def index():
|
95 |
-
with open('README.md', 'r') as f:
|
96 |
-
return markdown.markdown(f.read())
|
97 |
-
|
98 |
-
|
99 |
-
@app.route("/", methods=['GET', 'POST', 'PUT'])
|
100 |
-
def serve_wav():
|
101 |
-
# https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
|
102 |
-
# object-into-a-representation-suitable-for-mongodb
|
103 |
-
r = request.form.to_dict(flat=False)
|
104 |
-
|
105 |
-
|
106 |
-
args = SimpleNamespace(
|
107 |
-
text=None if r.get('text') is None else r.get('text'), # string not file?
|
108 |
-
scene=r.get('scene')[0]
|
109 |
-
)
|
110 |
-
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
x = tts_multi_sentence(args.scene)
|
119 |
-
|
120 |
-
OUT_FILE = 'tmp.wav'
|
121 |
-
soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
# send server's output as default file -> srv_result.xx
|
128 |
-
print(f'\n=SERVER saved as {OUT_FILE=}\n')
|
129 |
-
response = send_from_directory(CACHE_DIR, path=OUT_FILE)
|
130 |
-
response.headers['suffix-file-type'] = OUT_FILE
|
131 |
-
return response
|
132 |
-
|
133 |
-
|
134 |
-
if __name__ == "__main__":
|
135 |
-
app.run(host="0.0.0.0")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
live_demo.py
CHANGED
@@ -1,74 +1,48 @@
|
|
1 |
-
|
|
|
|
|
2 |
import os
|
3 |
import requests
|
4 |
import subprocess
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
def command_line_args():
|
10 |
-
parser = argparse.ArgumentParser(
|
11 |
-
formatter_class=argparse.ArgumentDefaultsHelpFormatter
|
12 |
-
)
|
13 |
-
parser.add_argument(
|
14 |
-
'--affective',
|
15 |
-
help="Select Emotional or non-emotional variant of Available voices: https://audeering.github.io/shift/",
|
16 |
-
action='store_false',
|
17 |
-
)
|
18 |
-
parser.add_argument(
|
19 |
-
'--device',
|
20 |
-
help="Device ID",
|
21 |
-
type=str,
|
22 |
-
default='cpu',
|
23 |
-
)
|
24 |
-
parser.add_argument(
|
25 |
-
'--text',
|
26 |
-
help="Text to be synthesized.",
|
27 |
-
default='How is hoowl',
|
28 |
-
type=str,
|
29 |
-
)
|
30 |
-
return parser
|
31 |
|
32 |
def send_to_server(args):
|
33 |
url = "http://192.168.88.209:5000"
|
34 |
|
35 |
payload = {
|
36 |
'text': args.text,
|
37 |
-
'
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
}
|
39 |
|
40 |
-
|
41 |
-
|
42 |
-
# # Check the response from the server
|
43 |
-
# if response.status_code == 200:
|
44 |
-
# print("\nRequest was successful!")
|
45 |
-
# # print("Response:", respdonse.__dict__.keys(), '\n=====\n')
|
46 |
|
47 |
-
# else:
|
48 |
-
# print("Failed to send the request")
|
49 |
-
# print("Status Code:", response.status_code)
|
50 |
-
# print("Response:", response.text)
|
51 |
-
return response
|
52 |
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
# _text, _scene = args.text.split('|')
|
61 |
-
# args.text = _text
|
62 |
-
args.scene = args.text #_scene
|
63 |
-
if len(args.text) >= 4:
|
64 |
-
response = send_to_server(args)
|
65 |
-
out_file = '_gen_.wav' #+ response.headers['suffix-file-type'].split('.')[-1]
|
66 |
-
with open(out_file, 'wb') as f:
|
67 |
-
f.write(response.content)
|
68 |
-
subprocess.run(["paplay", out_file])
|
69 |
-
else:
|
70 |
-
print(f'__\n{args.text}\n')
|
71 |
|
72 |
-
|
73 |
-
if
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Asks for txt input, creates TTS and sound via AudioGen, plays it back
|
2 |
+
# Need to have paplay installed on client - live_demo.py
|
3 |
+
|
4 |
import os
|
5 |
import requests
|
6 |
import subprocess
|
7 |
+
from types import SimpleNamespace
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
|
9 |
def send_to_server(args):
    """POST a TTS / soundscape request to the API server and return the reply.

    Parameters
    ----------
    args : namespace-like object
        Must provide ``text`` (path of the text file to upload; it is also
        used as the multipart field name), ``voice`` and ``soundscape``.
        An optional ``speed`` attribute overrides the default rate of 1.14.

    Returns
    -------
    requests.Response
        The raw server response (``.content`` holds the returned bytes).
    """
    url = "http://192.168.88.209:5000"

    payload = {
        'text': args.text,
        'voice': args.voice,
        'soundscape': args.soundscape,
        'affective': True,
        'image': None,
        'video': None,
        # Honour the caller's speed when present; 1.14 was the hard-coded value.
        'speed': getattr(args, 'speed', 1.14),
        'native': None,
    }

    # Per the original note: None-valued fields do not arrive in the server's dict.
    # Open the file that `args.text` actually names and close it once the
    # request has completed, instead of leaking a handle on every call.
    with open(args.text, 'rb') as text_file:
        return requests.post(url, data=payload, files=[(args.text, text_file)])
|
|
|
|
|
|
|
|
|
|
|
24 |
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
+
# Interactive demo loop: read a sound description, send it to the API server
# (the description is the soundscape prompt AND the text to be spoken), save
# the returned audio to `_gen_.wav` and play it with `paplay`.
args = SimpleNamespace()
args.voice = 'fr_FR_m-ailabs_bernard'  # 'en_US/m-ailabs_low#judy_bieber'
args.speed = 1.14
args.text = '_tmp.txt'  # input -> .txt (the API takes text as a file, with audiobooks in mind)
os.system('cls' if os.name == 'nt' else 'clear')
while True:
    try:
        _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the demo cleanly instead of with a traceback.
        break
    args.soundscape = _str

    # Validate the RAW prompt. The check used to run after the filler below was
    # appended, so it could never fail and empty prompts were sent to the server.
    if len(_str) < 4:
        print(f'__\n{_str}\n')
        continue

    # Extra spoken text so the clip is long enough for AudioGen to be heard.
    # The leading space keeps the filler from fusing with the prompt's last word.
    if len(_str) < 20:
        _str += ' Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata few silence for audiogen to impress you.'

    # NOTE(review): utf-8 assumed to match how the server reads the file - confirm.
    with open(args.text, 'w', encoding='utf-8') as f:
        f.write(_str)

    response = send_to_server(args)
    if not response.ok:
        # Do not write an error page into the .wav and try to play it.
        print(f'Server returned status {response.status_code}')
        continue

    out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
    with open(out_file, 'wb') as f:
        f.write(response.content)
    subprocess.run(["paplay", out_file])
|
models.py
CHANGED
@@ -109,6 +109,9 @@ class ResBlk(nn.Module):
|
|
109 |
|
110 |
|
111 |
class StyleEncoder(nn.Module):
|
|
|
|
|
|
|
112 |
def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
|
113 |
super().__init__()
|
114 |
blocks = []
|
@@ -549,5 +552,5 @@ def build_model(args, text_aligner, pitch_extractor, bert):
|
|
549 |
text_aligner = text_aligner,
|
550 |
pitch_extractor=pitch_extractor
|
551 |
)
|
552 |
-
|
553 |
-
return nets
|
|
|
109 |
|
110 |
|
111 |
class StyleEncoder(nn.Module):
|
112 |
+
|
113 |
+
# used for both acoustic & prosodic ref_s/p
|
114 |
+
|
115 |
def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
|
116 |
super().__init__()
|
117 |
blocks = []
|
|
|
552 |
text_aligner = text_aligner,
|
553 |
pitch_extractor=pitch_extractor
|
554 |
)
|
555 |
+
|
556 |
+
return nets
|
msinference.py
CHANGED
@@ -134,6 +134,7 @@ _ = [model[key].eval() for key in model]
|
|
134 |
_ = [model[key].to(device) for key in model]
|
135 |
|
136 |
# params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
|
|
|
137 |
params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
|
138 |
params = params_whole['net']
|
139 |
|
@@ -167,7 +168,7 @@ def inference(text,
|
|
167 |
ps = global_phonemizer.phonemize([text])
|
168 |
# print(f'PHONEMIZER: {ps=}\n\n') #PHONEMIZER: ps=['ɐbˈɛbæbləm ']
|
169 |
ps = word_tokenize(ps[0])
|
170 |
-
# print(f'TOKENIZER: {ps=}\n\n') #OKENIZER: ps=['ɐbˈɛbæbləm']
|
171 |
ps = ' '.join(ps)
|
172 |
tokens = textclenaer(ps)
|
173 |
# print(f'TEXTCLEAN: {ps=}\n\n') #TEXTCLEAN: ps='ɐbˈɛbæbləm'
|
@@ -198,11 +199,13 @@ def inference(text,
|
|
198 |
# print('BERTdu', bert_dur.shape, tokens.shape, '\n') # bert what is the 768 per token -> IS USED in sampler
|
199 |
# BERTdu torch.Size([1, 11, 768]) torch.Size([1, 11])
|
200 |
|
201 |
-
|
202 |
|
203 |
ref = ref_s[:, :128]
|
204 |
s = ref_s[:, 128:]
|
205 |
|
|
|
|
|
206 |
d = model.predictor.text_encoder(d_en,
|
207 |
s, input_lengths, text_mask)
|
208 |
|
|
|
134 |
_ = [model[key].to(device) for key in model]
|
135 |
|
136 |
# params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
|
137 |
+
# params_whole = torch.load('freevc2/yl4579_styletts2.pth', map_location='cpu')
|
138 |
params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
|
139 |
params = params_whole['net']
|
140 |
|
|
|
168 |
ps = global_phonemizer.phonemize([text])
|
169 |
# print(f'PHONEMIZER: {ps=}\n\n') #PHONEMIZER: ps=['ɐbˈɛbæbləm ']
|
170 |
ps = word_tokenize(ps[0])
|
171 |
+
# # print(f'TOKENIZER: {ps=}\n\n') #TOKENIZER: ps=['ɐbˈɛbæbləm']
|
172 |
ps = ' '.join(ps)
|
173 |
tokens = textclenaer(ps)
|
174 |
# print(f'TEXTCLEAN: {ps=}\n\n') #TEXTCLEAN: ps='ɐbˈɛbæbləm'
|
|
|
199 |
# print('BERTdu', bert_dur.shape, tokens.shape, '\n') # bert what is the 768 per token -> IS USED in sampler
|
200 |
# BERTdu torch.Size([1, 11, 768]) torch.Size([1, 11])
|
201 |
|
202 |
+
|
203 |
|
204 |
ref = ref_s[:, :128]
|
205 |
s = ref_s[:, 128:]
|
206 |
|
207 |
+
# s = .74 * s # prosody / arousal & fading unvoiced syllables [x0.7 - x1.2]
|
208 |
+
|
209 |
d = model.predictor.text_encoder(d_en,
|
210 |
s, input_lengths, text_mask)
|
211 |
|