Dionyssos committed
Commit bb2cd38 · 1 Parent(s): f1368b1

fx live demo
README.md CHANGED
@@ -74,7 +74,7 @@ Following examples need `api.py` to be running. [Set this IP](https://huggingfac
 The following needs `api.py` to be already running on a tmux session.
 
 ```python
-# TTS & soundscape - overlay to .mp4
+# TTS & soundscape - output .mp4 saved in ./out/
 python landscape2soundscape.py
 ```
 
@@ -94,44 +94,36 @@ For SHIFT demo / Collaboration with [SMB](https://www.smb.museum/home/)
 
 [![06](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____06_Menzel_AI900_001.jpg)](https://youtu.be/3M0y9OYzDfU)
 
-[![07](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____07_Courbet_AI967_001.jpg)](https://youtu.be/OBY666_By1k)
+[![07](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____07_Courbet_AI967_001.jpg)](https://youtu.be/56MH7zOHrNQ)
 
 [![08](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____08_Monet_AI1013_001.jpg)](https://youtu.be/gnGCYLcdLsA)
 
 [![10](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____10_Boecklin_967648_NG2-80_001_rsz.jpg)](https://www.youtube.com/watch?v=Y8QyYUgLaCg)
 
-[![11](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____11_Liebermann_NG4-94_001.jpg)](https://youtu.be/XDDzxDSrhb0)
+[![11](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____11_Liebermann_NG4-94_001.jpg)](https://youtu.be/RhUuS9HMLhg)
 
-[![12](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____12_Slevogt_AII1022_001.jpg)](https://youtu.be/I3YYKiUzHpA)
+[![12](uc_spk_Landscape2Soundscape_Masterpieces_pics/thumb____12_Slevogt_AII1022_001.jpg)](https://youtu.be/NzzhhrUeKVY)
 
 
 
-# SoundScape Live (iterative) Demo - Paplay
+# SoundScape Live Demo - Paplay
 
-Special Flask API for playing sounds live
+Flask API for playing sounds live
 
 ```python
-CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python live_api.py
+CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python api.py
```
 
-Client - Describe any sound with words and it will be played back to you.
+Describe any sound via text; the TTS & soundscape are played back.
 
 ```python
-python live_demo.py  # will ask text input & play soundscape
+python live_demo.py  # type text & plays AudioGen sound & TTS
 ```
 
-# SoundScape (basic) Demo
-
-```python
-CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python demo.py
-```
-
-##
-
 # Audiobook
 
-Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [v2](https://www.youtube.com/watch?v=Pzo-kKaNg6s) / [v2.1](https://www.youtube.com/watch?v=X4qlKBBaegM) / [female voice](https://www.youtube.com/watch?v=pzrLYCaWD2A)
+Create audiobook from `.docx`. Listen to it - YouTube [male voice](https://www.youtube.com/watch?v=5-cpf7u18JE) / [v2](https://www.youtube.com/watch?v=Pzo-kKaNg6s) / [v2.1](https://www.youtube.com/watch?v=X4qlKBBaegM) / [no diffusion](https://www.youtube.com/watch?v=vahKXpd6oLg)
 
 ```python
 # audiobook will be saved in ./tts_audiobooks
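
Note: a minimal sketch of the client request behind `python live_demo.py`, with the URL and form fields copied from `live_demo.py` further down in this commit (the `192.168.88.209` address is the LAN IP hard-coded there); treat it as a repo-specific illustration, not a stable API.

```python
# Minimal client sketch; fields mirror live_demo.py in this commit.
import requests

DESCRIPTION = 'dogs barking in a thunderstorm'
with open('_tmp.txt', 'w') as f:
    f.write(DESCRIPTION)                     # server reads the text from this .txt

payload = {
    'text': '_tmp.txt',
    'voice': 'fr_FR_m-ailabs_bernard',
    'soundscape': DESCRIPTION,
    'affective': True,
    'speed': 1.14,
}
with open('_tmp.txt', 'rb') as f:
    resp = requests.post('http://192.168.88.209:5000',
                         data=payload, files=[('_tmp.txt', f)])

with open('_gen_.wav', 'wb') as f:
    f.write(resp.content)                    # play with: paplay _gen_.wav
```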
api.py CHANGED
@@ -21,18 +21,8 @@ NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape f
 
 sound_generator = AudioGen(duration=4.74, device='cuda:0').to('cuda:0').eval()
 
-
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
-
-
-# SSH AGENT
-# eval $(ssh-agent -s)
-# ssh-add ~/.ssh/id_ed25519_github2024
-#
-# git remote set-url origin git@github.com:audeering/shift
-# ==
-
 def _shorten(filename):
     return filename.replace("/","")[-6:]
 
@@ -83,9 +73,9 @@ def _shift(x):
 def overlay(x, soundscape=None):
 
     if soundscape is not None:
-
+
         # SOUNDS
-
+
         background = sound_generator.generate(
            [soundscape] * NUM_SOUND_GENERATIONS
            ).reshape(-1).detach().cpu().numpy()  # bs, 11400 @.74s
@@ -98,8 +88,10 @@ def overlay(x, soundscape=None):
         background = audresample.resample(
            background,
            original_rate=16000,  # sound_generator.sample_rate,
-           target_rate=24000)[0, :-25000]  # discard last samples as they have the splash sound / polarity change;
-
+           target_rate=24000)[0, :-25000]
+        # TODO discards last samples due to splash sound / polarity change / on long sounds ~ videos / NOT DROP FOR live_demo.py
+
+
         # background /= np.abs(background).max() + 1e-7  Apply in sound_generator()
 
@@ -132,14 +124,14 @@ def overlay(x, soundscape=None):
         # print(total[40000:70000].tolist())
         print(np.logical_and(total > .1, total < .9).sum(), total.shape, 'ev')
 
-        # background = np.concatenate(n_repeat * [background])
-
-        # background = _shift(background)
+        # less periodic - cloned sounds
+        for _ in range(4):
+            background = _shift(background)
         # print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
         #       f'{np.abs(background.max())=}\n{x.shape=}')
         total /= np.abs(total).max() + 1e-7  # amplify speech to full [-1,1]
-        x = .4 * x + .6 * total[:len(x)]
-
+        x = .26 * x + .74 * total[:len(x)]
+
     else:
         print('sound_background = None')
     return x
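
Note on the `overlay()` change: the background is now circularly shifted four times before mixing, and the speech/soundscape weights moved from .4/.6 to .26/.74. Below is a self-contained sketch of the idea; `_roll` is a hypothetical stand-in for `_shift` (whose body is not part of this diff), and `total` is simplified to the normalized background track.

```python
# Sketch of the new mixing logic; _roll stands in for _shift (not shown in this diff).
import numpy as np

def _roll(x: np.ndarray) -> np.ndarray:
    # circular shift by a random offset: repeats of the same 4.74 s clip
    # no longer line up, so the looped background sounds less periodic
    return np.roll(x, np.random.randint(1, len(x)))

def mix(speech: np.ndarray, background: np.ndarray) -> np.ndarray:
    for _ in range(4):                        # matches the new for-loop in overlay()
        background = _roll(background)
    total = background / (np.abs(background).max() + 1e-7)
    # new weights: speech attenuated, soundscape dominant (was .4 * x + .6 * total)
    return .26 * speech + .74 * total[:len(speech)]
```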
audiocraft/transformer.py CHANGED
@@ -203,7 +203,7 @@ class StreamingMultiheadAttention(nn.Module):
 
         else:
             # init on 1st token (for all 47 transf layers)
-            print(f'else skip')
+            print(f'AudioGen kv cache Flush')
             self.k_history = k
             self.v_history = v
 
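Note: the renamed print marks the branch where `StreamingMultiheadAttention` re-initializes its key/value history on the first generated token. A minimal sketch of that KV-cache pattern, assuming `(batch, heads, time, dim)` tensors and concatenation along the time axis; it is not the exact audiocraft code.

```python
# KV-cache sketch: flush/init on the 1st token, append on later tokens.
import torch

class KVCache:
    def __init__(self):
        self.k_history = None
        self.v_history = None

    def step(self, k: torch.Tensor, v: torch.Tensor):
        if self.k_history is None:
            # 1st token: initialize ('AudioGen kv cache Flush' branch above)
            self.k_history, self.v_history = k, v
        else:
            # later tokens: grow the cache along the time axis
            self.k_history = torch.cat([self.k_history, k], dim=2)
            self.v_history = torch.cat([self.v_history, v], dim=2)
        return self.k_history, self.v_history
```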
engineer_style_vectors_v2.py DELETED
@@ -1,331 +0,0 @@
-
-from pathlib import Path
-import shutil
-import csv
-import io
-import os
-import typing
-import wave
-import sys
-from mimic3_tts.__main__ import (CommandLineInterfaceState,
-                                 get_args,
-                                 initialize_args,
-                                 initialize_tts,
-                                 # print_voices,
-                                 # process_lines,
-                                 shutdown_tts,
-                                 OutputNaming,
-                                 process_line)
-
-
-def process_lines(state: CommandLineInterfaceState, wav_path=None):
-    '''MIMIC3 INTERNAL CALL that yields the sigh sound'''
-
-    args = state.args
-
-    result_idx = 0
-    print(f'why waitings in the for loop LIN {state.texts=}\n')
-    for line in state.texts:
-        print(f'LIN {line=}\n')  # prints \n so is empty - not getting the predefined text of state.texts
-        line_voice: typing.Optional[str] = None
-        line_id = ""
-        line = line.strip()
-        # if not line:
-        #     continue
-
-        if args.output_naming == OutputNaming.ID:
-            # Line has the format id|text instead of just text
-            with io.StringIO(line) as line_io:
-                reader = csv.reader(line_io, delimiter=args.csv_delimiter)
-                row = next(reader)
-                line_id, line = row[0], row[-1]
-                if args.csv_voice:
-                    line_voice = row[1]
-
-        process_line(line, state, line_id=line_id, line_voice=line_voice)
-        result_idx += 1
-
-    print('\nARRive at All Audio writing\n\n\n\n')
-    # -------------------------------------------------------------------------
-
-    # Write combined audio to stdout
-    if state.all_audio:
-        # _LOGGER.debug("Writing WAV audio to stdout")
-
-        if sys.stdout.isatty() and (not state.args.stdout):
-            with io.BytesIO() as wav_io:
-                wav_file_play: wave.Wave_write = wave.open(wav_io, "wb")
-                with wav_file_play:
-                    wav_file_play.setframerate(state.sample_rate_hz)
-                    wav_file_play.setsampwidth(state.sample_width_bytes)
-                    wav_file_play.setnchannels(state.num_channels)
-                    wav_file_play.writeframes(state.all_audio)
-
-                # play_wav_bytes(state.args, wav_io.getvalue())
-                # wav_path = '_direct_call_2.wav'
-                with open(wav_path, 'wb') as wav_file:
-                    wav_file.write(wav_io.getvalue())
-                    wav_file.seek(0)
-
-# -----------------------------------------------------------------------------
-# cat _tmp_ssml.txt | mimic3 --cuda --ssml --noise-w 0.90001 --length-scale 0.91 --noise-scale 0.04 > noise_w=0.90_en_happy_2.wav
-# ======================================================================
-out_dir = 'assets/'
-reference_wav_directory = 'assets/wavs/style_vector_v2/'
-Path(reference_wav_directory).mkdir(parents=True, exist_ok=True)
-Path(out_dir).mkdir(parents=True, exist_ok=True)
-
-wav_dir = 'assets/wavs/'
-Path(wav_dir).mkdir(parents=True, exist_ok=True)
-N_PIX = 11
-
-
-# =======================================================================
-# S T A R T   G E N E R A T E   png/wav
-# =======================================================================
-
-NOISE_SCALE = .667
-NOISE_W = .9001  # .8 / .90001 - default .8 in __main__.py @ L697 IGNORED DUE TO ARTEFACTS - FOR NOW USE default
-
-# VCTK speakers (one id per line in the original; condensed here)
-a = ['p239', 'p236', 'p264', 'p250', 'p259', 'p247', 'p261', 'p263', 'p283', 'p274',
-     'p286', 'p276', 'p270', 'p281', 'p277', 'p231', 'p238', 'p271', 'p257', 'p273',
-     'p284', 'p329', 'p361', 'p287', 'p360', 'p374', 'p376', 'p310', 'p304', 'p340',
-     'p347', 'p330', 'p308', 'p314', 'p317', 'p339', 'p311', 'p294', 'p305', 'p266',
-     'p335', 'p334', 'p318', 'p323', 'p351', 'p333', 'p313', 'p316', 'p244', 'p307',
-     'p363', 'p336', 'p312', 'p267', 'p297', 'p275', 'p295', 'p288', 'p258', 'p301',
-     'p232', 'p292', 'p272', 'p278', 'p280', 'p341', 'p268', 'p298', 'p299', 'p279',
-     'p285', 'p326', 'p300', 's5', 'p230', 'p254', 'p269', 'p293', 'p252', 'p345',
-     'p262', 'p243', 'p227', 'p343', 'p255', 'p229', 'p240', 'p248', 'p253', 'p233',
-     'p228', 'p251', 'p282', 'p246', 'p234', 'p226', 'p260', 'p245', 'p241', 'p303',
-     'p265', 'p306', 'p237', 'p249', 'p256', 'p302', 'p364', 'p225', 'p362']
-
-print(len(a))
-
-b = []
-
-for row in a:
-    b.append(f'en_US/vctk_low#{row}')
-
-# print(b)
-
-# 00000000 arctic
-
-a = ['awb',  # original lacked this comma, concatenating 'awb' and 'rms' into 'awbrms'
-     'rms', 'slt', 'ksp', 'clb', 'aew', 'bdl', 'lnh', 'jmk',
-     'rxr', 'fem', 'ljm', 'slp', 'ahw', 'axb', 'aup', 'eey', 'gka']
-
-for row in a:
-    b.append(f'en_US/cmu-arctic_low#{row}')
-
-# HIFItts
-
-a = ['9017', '6097', '92']
-
-for row in a:
-    b.append(f'en_US/hifi-tts_low#{row}')
-
-a = ['elliot_miller', 'judy_bieber', 'mary_ann']
-
-for row in a:
-    b.append(f'en_US/m-ailabs_low#{row}')
-
-# LJspeech - single speaker
-
-b.append('en_US/ljspeech_low')
-
-# en_UK apope - only speaker
-
-b.append('en_UK/apope_low')
-
-all_names = b
-
-
-VOICES = {}
-for _id, _voice in enumerate(all_names):
-
-    # If GitHub quota exceeded, copy mimic-voices from local copies
-    #
-    # https://github.com/MycroftAI/mimic3-voices
-    #
-    home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
-    Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
-    speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
-    if not os.path.isfile(home_voice_dir + 'generator.onnx'):
-        shutil.copyfile(
-            f'/data/dkounadis/cache/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
-            home_voice_dir + 'generator.onnx')  # 'en_US incl. voice
-
-    prepare_file = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
-    if 'cmu-arctic' in prepare_file:
-        prepare_file = prepare_file.replace('cmu-arctic', 'cmu_arctic') + '.wav'
-    else:
-        prepare_file = prepare_file + '.wav'  # [...cmu-arctic...](....cmu_arctic....wav)
-
-    file_true = prepare_file.split('.wav')[0] + '_true_.wav'
-    file_false = prepare_file.split('.wav')[0] + '_false_.wav'
-    print(prepare_file, file_false, file_true)
-
-    reference_wav = reference_wav_directory + prepare_file
-    rate = 4  # high speed sounds nice if used as speaker-reference audio for StyleTTS2
-    _ssml = (
-        '<speak>'
-        '<prosody volume=\'64\'>'
-        f'<prosody rate=\'{rate}\'>'
-        f'<voice name=\'{_voice}\'>'
-        '<s>'
-        'Sweet dreams are made of this, .. !!! # I travel the world and the seven seas.'
-        '</s>'
-        '</voice>'
-        '</prosody>'
-        '</prosody>'
-        '</speak>'
-    )
-    with open('_tmp_ssml.txt', 'w') as f:
-        f.write(_ssml)
-
-    # ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > {reference_wav}', shell=True)
-    # ps.wait()  # using ps to call mimic3 because samples don't have time to be written in stdout buffer
-    args = get_args()
-    args.ssml = True
-    args.text = [_ssml]  # ['aa', 'bb']  # txt
-    args.interactive = False
-    # args.output_naming = OutputNaming.TIME
-
-    state = CommandLineInterfaceState(args=args)
-    initialize_args(state)
-    initialize_tts(state)
-    # args.texts = [txt]  # ['aa', 'bb']  # txt
-    # state.stdout = '.'  # None  # 'makeme.wav'
-    # state.output_dir = '.noopy'
-    # state.interactive = False
-    # state.output_naming = OutputNaming.TIME
-    # # state.ssml = 1234546575
-    # state.stdout = True
-    # state.tts = True
-    process_lines(state, wav_path=reference_wav)
-    shutdown_tts(state)

landscape2soundscape.py CHANGED
@@ -8,10 +8,6 @@ import cv2
 # yt-dlp https://www.youtube.com/watch?v=UZ9uyQI3pF0 --downloader ffmpeg --downloader-args "ffmpeg_i:-ss 997 -to 2512"
 # ffmpeg -i Sandra\ Kotevska\,\ Painting\ Rose\ bush\,\ mixed\ media\,\ 2017.\ \[NMzC_036MtE\].mkv -f mp3 -ar 22050 -vn out44.wav -ac 1
 # https://superuser.com/questions/583393/how-to-extract-subtitle-from-video-using-ffmpeg
-# ___________________________________________________________________________________________________
-# VIDEO FROM IMAGE with CAPTIONS
-#
-# UPLOAD to: Simaviro: Documents General WORK PACKAGES WP1 ContentRepository ANBPR_ROMANIA TTSvideos
 # __________________________________________________________________________________________________
 # TO DOWNLOAD SRT for youtube
 # yt-dlp --write-sub --sub-lang en --convert-subs "srt" https://www.youtube.com/watch?v=F1Ib7TAu7eg&list=PL4x2B6LSwFewdDvRnUTpBM7jkmpwouhPv&index=2
@@ -19,12 +15,8 @@ import cv2
 # _voice = 'en_US/cmu-arctic_low#lnh'  # 'en_US/vctk_low#p249'  # 'en_US/vctk_low#p282'
 # _voice = 'en_US/vctk_low#p351'
 # _voice = 'en_US/vctk_low#p351'  # avoid 318 it does the ghhhhhh
-# _voice = 'en_US/m-ailabs_low#judy_bieber'  # Nice voice for ('Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].mkv' 'Arta culinara romaneasca - Groza Irina [phIF0NxgwlQ].en-GB.srt'),
-# _voice = 'en_UK/apope_low'
-# _voice = 'en_US/m-ailabs_low#mary_ann'
 # _voice = 'en_US/vctk_low#p351'
 # _voice = 'en_US/hifi-tts_low#92'
-# voice_str = f'_{_voice.replace("/", "")}'
 
 
 
@@ -47,7 +39,7 @@ DESCRIPTIONS = [
     '01_Schick_AII840_001.txt',  # text
     'statue in shire, hill river, vogels.',  # audiocraft
     'G. Schick, Bildnis der Heinrike Dannecker, 1802',  # cv2 puttext title
-    'en_US/vctk_low#p326',  # 'en_US/m-ailabs_low#judy_bieber',  # 'en_US/m-ailabs_low#mary_ann',
+    'fr_FR_m-ailabs_bernard',  # 'en_US/m-ailabs_low#judy_bieber',  # 'en_US/m-ailabs_low#mary_ann',
     ],
     # 2
     [
@@ -65,7 +57,7 @@ DESCRIPTIONS = [
     'K. Schinkel Gotische Kirche Auf Einem Felsen 1815',
     'en_US/hifi-tts_low#6097',
     ],
-    #
+    # 4
     [
     '04_Friedrich_FV317_001.jpg',
     '04_Friedrich_FV317_001.txt',
@@ -73,7 +65,7 @@ DESCRIPTIONS = [
     'C. D. Friedrich, Der Watzmann, 1824',
     'en_US/m-ailabs_low#mary_ann',
     ],
-    #
+    # 5
     [
     '05_Blechen_FV40_001.jpg',
     '05_Blechen_FV40_001.txt',
@@ -95,7 +87,7 @@ DESCRIPTIONS = [
     '07_Courbet_AI967_001.txt',
     'Storm at the strand of waves Tsunami',
     'G. Courbet, Die Welle, 1870',
-    'en_US/m-ailabs_low#mary_ann',
+    'af_ZA_google-nwu_0184',
     ],
     # 8
     [
@@ -125,7 +117,7 @@ DESCRIPTIONS = [
     [
     '11_Liebermann_NG4-94_001.jpg',
     '11_Liebermann_NG4-94_001.txt',
-    'Tavern at the waterfront',
+    'Tavern and shrine and people talking glass plates drink',
     'M. Tiebermann, Gartenlokal An Der Havel Nikolskoe, 1916',
     'en_US/cmu-arctic_low#ljm',
     ],
@@ -135,7 +127,7 @@ DESCRIPTIONS = [
     '12_Slevogt_AII1022_001.txt',
     'sailing yachts pool fluss',
     'M. Slevogt, Segelboote Auf Der Alster Am Abend, 1905',
-    'en_US/m-ailabs_low#mary_ann',
+    'jv_ID_google-gmu_06207',
     ],
 ]
 
@@ -146,7 +138,7 @@ SILENT_VIDEO = '_silent_video.mp4'
 # SILENT CLIP
 
 
-for img, text, soundscape, title, voice in DESCRIPTIONS[2:4]:
+for img, text, soundscape, title, voice in DESCRIPTIONS:  # [2:4]:
 
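Note: each `DESCRIPTIONS` entry is the 5-element list unpacked by the loop above; with the slice removed, every painting is rendered. The field meanings below are taken from the inline comments of entry 1.

```python
# Schema of one DESCRIPTIONS entry, per the loop's unpacking:
entry = [
    '01_Schick_AII840_001.jpg',                         # img: painting image
    '01_Schick_AII840_001.txt',                         # text: narration source
    'statue in shire, hill river, vogels.',             # soundscape: AudioGen prompt
    'G. Schick, Bildnis der Heinrike Dannecker, 1802',  # title: cv2.putText caption
    'fr_FR_m-ailabs_bernard',                           # voice: TTS voice id
]
img, text, soundscape, title, voice = entry
```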
live_api.py DELETED
@@ -1,135 +0,0 @@
-
-# -*- coding: utf-8 -*-
-import numpy as np
-import soundfile
-import audresample
-import text_utils
-
-import re
-import subprocess
-import markdown
-import json
-from pathlib import Path
-from types import SimpleNamespace
-from flask import Flask, request, send_from_directory
-from flask_cors import CORS
-from audiocraft.builders import AudioGen  # , audio_write
-NUM_SOUND_GENERATIONS = 1  # they differ a lot and are unnatural to concatenate, prefer lm.n_draw
-sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
-
-
-# ====STYLE VECTOR====
-
-
-# AFFECTIVE = True
-# VOICE = 'en_UK/apope_low'  # en_US/m-ailabs_low#mary_ann
-
-# _dir = '/' if AFFECTIVE else '_v2/'
-# precomputed_style_vector = msinference.compute_style(
-#     'assets/wavs/style_vector' + _dir + VOICE.replace(
-#         '/', '_').replace(
-#         '#', '_').replace(
-#         'cmu-arctic', 'cmu_arctic').replace(
-#         '_low', '') + '.wav')
-# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
-
-
-# ==== STYLE VECTOR
-
-CACHE_DIR = 'flask_cache/'
-Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
-
-
-def tts_multi_sentence(scene=None):
-    if scene is not None and len(scene) >= 4:
-        print(f'Processing: {scene} ..')
-        # x = sound_generator.generate([scene])[0, :, :].detach().cpu().numpy()
-        x = sound_generator.generate(
-            [scene] * NUM_SOUND_GENERATIONS
-        ).reshape(1, -1).detach().cpu().numpy()  # bs, 11400
-
-        x /= np.abs(x).max() + 1e-7
-        # is 16kHz - AudioGen Fs
-        x = audresample.resample(x,
-                                 original_rate=16000,
-                                 target_rate=24000)[0, :]
-
-        print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')
-    else:
-        print(scene, '\nDrop\n')
-        x = np.zeros(400)
-
-    # # StyleTTS2
-    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
-    #     assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
-    #     x = []
-    #     for _sentence in text:
-    #         x.append(msinference.inference(_sentence,
-    #                                        precomputed_style_vector,
-    #                                        alpha=0.3,
-    #                                        beta=0.7,
-    #                                        diffusion_steps=7,
-    #                                        embedding_scale=1))
-    #     x = np.concatenate(x)

-    # return overlay(x, sound_background)
-
-    return x
-
-
-app = Flask(__name__)
-cors = CORS(app)
-
-
-@app.route("/")
-def index():
-    with open('README.md', 'r') as f:
-        return markdown.markdown(f.read())
-
-
-@app.route("/", methods=['GET', 'POST', 'PUT'])
-def serve_wav():
-    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
-    # object-into-a-representation-suitable-for-mongodb
-    r = request.form.to_dict(flat=False)
-
-    args = SimpleNamespace(
-        text=None if r.get('text') is None else r.get('text'),  # string not file?
-        scene=r.get('scene')[0]
-    )
-    # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
-
-    x = tts_multi_sentence(args.scene)
-
-    OUT_FILE = 'tmp.wav'
-    soundfile.write(CACHE_DIR + OUT_FILE, x, 16000)
-
-    # send server's output as default file -> srv_result.xx
-    print(f'\n=SERVER saved as {OUT_FILE=}\n')
-    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
-    response.headers['suffix-file-type'] = OUT_FILE
-    return response
-
-
-if __name__ == "__main__":
-    app.run(host="0.0.0.0")

live_demo.py CHANGED
@@ -1,74 +1,48 @@
-import argparse
+# Asks for txt input, creates TTS and sound via AudioGen, plays it back
+# Needs paplay installed on the client - live_demo.py
+
 import os
 import requests
 import subprocess
-
-
-
-
-def command_line_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument(
-        '--affective',
-        help="Select Emotional or non-emotional variant of Available voices: https://audeering.github.io/shift/",
-        action='store_false',
-    )
-    parser.add_argument(
-        '--device',
-        help="Device ID",
-        type=str,
-        default='cpu',
-    )
-    parser.add_argument(
-        '--text',
-        help="Text to be synthesized.",
-        default='How is hoowl',
-        type=str,
-    )
-    return parser
+from types import SimpleNamespace
 
 def send_to_server(args):
     url = "http://192.168.88.209:5000"
 
     payload = {
         'text': args.text,
-        'scene': args.scene
+        'voice': args.voice,
+        'soundscape': args.soundscape,
+        'affective': True,
+        'image': None,
+        'video': None,
+        'speed': 1.14,
+        'native': None,
     }
 
-    response = requests.post(url, data=payload)  # NONEs do not arrive to server's dict
-
-    # # Check the response from the server
-    # if response.status_code == 200:
-    #     print("\nRequest was successful!")
-    #     # print("Response:", response.__dict__.keys(), '\n=====\n')
-
-    # else:
-    #     print("Failed to send the request")
-    #     print("Status Code:", response.status_code)
-    #     print("Response:", response.text)
-    return response
+    return requests.post(url, data=payload, files=[(args.text, open('_tmp.txt', 'rb'))])  # NONEs do not arrive to server's dict
 
-def cli():  # args.out_file is not sent to server - server writes tmp - copied by client
-    parser = command_line_args()
-    args = parser.parse_args()
-    os.system('cls' if os.name == 'nt' else 'clear')
-    while True:
-        args.text = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
-        # _text, _scene = args.text.split('|')
-        # args.text = _text
-        args.scene = args.text  # _scene
-        if len(args.text) >= 4:
-            response = send_to_server(args)
-            out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
-            with open(out_file, 'wb') as f:
-                f.write(response.content)
-            subprocess.run(["paplay", out_file])
-        else:
-            print(f'__\n{args.text}\n')
-
-if __name__ == '__main__':
-    cli()
+args = SimpleNamespace()
+args.voice = 'fr_FR_m-ailabs_bernard'  # 'en_US/m-ailabs_low#judy_bieber'
+args.speed = 1.14
+os.system('cls' if os.name == 'nt' else 'clear')
+while True:
+    _str = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
+    args.soundscape = _str
+
+    # extra duration for AudioGen to sound cool!!!!
+    if len(_str) < 20:
+        _str += 'Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata few silence for audiogen to impress you.'
+    args.text = '_tmp.txt'  # input -> .txt (implementation geared to audiobooks in the API)
+
+    with open(args.text, 'w') as f:
+        f.write(_str)
+    if len(_str) >= 4:
+        response = send_to_server(args)
+        out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
+        with open(out_file, 'wb') as f:
+            f.write(response.content)
+        subprocess.run(["paplay", out_file])
+    else:
+        print(f'__\n{_str}\n')
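
Note: the new loop pads short inputs with filler text so AudioGen gets a longer conditioning prompt. The same trick as a small helper; the name `pad_prompt` is ours and the filler is trimmed from the script's full string.

```python
# Padding trick from live_demo.py factored into a helper (pad_prompt is a
# hypothetical name; threshold 20 is the value used in the script).
FILLER = ('Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam '
          'nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat.')

def pad_prompt(s: str, min_len: int = 20) -> str:
    # very short descriptions leave AudioGen too little to condition on
    return s if len(s) >= min_len else s + FILLER
```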
models.py CHANGED
@@ -109,6 +109,9 @@ class ResBlk(nn.Module):
 
 
 class StyleEncoder(nn.Module):
+
+    # used for both acoustic & prosodic ref_s/p
+
     def __init__(self, dim_in=48, style_dim=48, max_conv_dim=384):
         super().__init__()
         blocks = []
@@ -549,5 +552,5 @@ def build_model(args, text_aligner, pitch_extractor, bert):
         text_aligner = text_aligner,
         pitch_extractor=pitch_extractor
     )
-
-    return nets
+
+    return nets
msinference.py CHANGED
@@ -134,6 +134,7 @@ _ = [model[key].eval() for key in model]
 _ = [model[key].to(device) for key in model]
 
 # params_whole = torch.load("Models/LibriTTS/epochs_2nd_00020.pth", map_location='cpu')
+# params_whole = torch.load('freevc2/yl4579_styletts2.pth', map_location='cpu')
 params_whole = torch.load(str(cached_path("hf://yl4579/StyleTTS2-LibriTTS/Models/LibriTTS/epochs_2nd_00020.pth")), map_location='cpu')
 params = params_whole['net']
 
@@ -167,7 +168,7 @@ def inference(text,
     ps = global_phonemizer.phonemize([text])
     # print(f'PHONEMIZER: {ps=}\n\n')  # PHONEMIZER: ps=['ɐbˈɛbæbləm ']
     ps = word_tokenize(ps[0])
-    # print(f'TOKENIZER: {ps=}\n\n')  # TOKENIZER: ps=['ɐbˈɛbæbləm']
+    # # print(f'TOKENIZER: {ps=}\n\n')  # TOKENIZER: ps=['ɐbˈɛbæbləm']
     ps = ' '.join(ps)
     tokens = textclenaer(ps)
     # print(f'TEXTCLEAN: {ps=}\n\n')  # TEXTCLEAN: ps='ɐbˈɛbæbləm'
@@ -198,11 +199,13 @@ def inference(text,
     # print('BERTdu', bert_dur.shape, tokens.shape, '\n')  # bert what is the 768 per token -> IS USED in sampler
     # BERTdu torch.Size([1, 11, 768]) torch.Size([1, 11])
 
-
+
 
     ref = ref_s[:, :128]
     s = ref_s[:, 128:]
 
+    # s = .74 * s  # prosody / arousal & fading unvoiced syllables [x0.7 - x1.2]
+
     d = model.predictor.text_encoder(d_en,
                                      s, input_lengths, text_mask)