Dionyssos committed on
Commit
d72b2c3
•
1 Parent(s): f11b6ad

audiocraft

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. api.py +397 -0
  2. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_0184.wav +0 -0
  3. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_1919.wav +0 -0
  4. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_2418.wav +0 -0
  5. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_6590.wav +0 -0
  6. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_7130.wav +0 -0
  7. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_7214.wav +0 -0
  8. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_8148.wav +0 -0
  9. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_8924.wav +0 -0
  10. {mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_8963.wav +0 -0
  11. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_00737.wav +0 -0
  12. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_00779.wav +0 -0
  13. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_01232.wav +0 -0
  14. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_01701.wav +0 -0
  15. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_02194.wav +0 -0
  16. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_03042.wav +0 -0
  17. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_0834.wav +0 -0
  18. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_1010.wav +0 -0
  19. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_3108.wav +0 -0
  20. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_3713.wav +0 -0
  21. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_3958.wav +0 -0
  22. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_4046.wav +0 -0
  23. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_4811.wav +0 -0
  24. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_5958.wav +0 -0
  25. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_9169.wav +0 -0
  26. {mimic3_foreign → assets/mimic3_foreign}/bn_multi_rm.wav +0 -0
  27. {mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_angela_merkel.wav +0 -0
  28. {mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_eva_k.wav +0 -0
  29. {mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_karlsson.wav +0 -0
  30. {mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_ramona_deininger.wav +0 -0
  31. {mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_rebecca_braunert_plunkett.wav +0 -0
  32. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_amused.wav +0 -0
  33. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_angry.wav +0 -0
  34. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_disgusted.wav +0 -0
  35. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_drunk.wav +0 -0
  36. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_neutral.wav +0 -0
  37. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_sleepy.wav +0 -0
  38. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_surprised.wav +0 -0
  39. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_whisper.wav +0 -0
  40. {mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten.wav +0 -0
  41. {mimic3_foreign → assets/mimic3_foreign}/el_GR_rapunzelina.wav +0 -0
  42. {mimic3_foreign → assets/mimic3_foreign}/es_ES_carlfm.wav +0 -0
  43. {mimic3_foreign → assets/mimic3_foreign}/es_ES_m-ailabs_karen_savage.wav +0 -0
  44. {mimic3_foreign → assets/mimic3_foreign}/es_ES_m-ailabs_tux.wav +0 -0
  45. {mimic3_foreign → assets/mimic3_foreign}/es_ES_m-ailabs_victor_villarraza.wav +0 -0
  46. {mimic3_foreign → assets/mimic3_foreign}/fa_haaniye.wav +0 -0
  47. {mimic3_foreign → assets/mimic3_foreign}/fi_FI_harri-tapani-ylilammi.wav +0 -0
  48. {mimic3_foreign → assets/mimic3_foreign}/fr_FR_m-ailabs_bernard.wav +0 -0
  49. {mimic3_foreign → assets/mimic3_foreign}/fr_FR_m-ailabs_ezwa.wav +0 -0
  50. {mimic3_foreign → assets/mimic3_foreign}/fr_FR_m-ailabs_gilles_g_le_blanc.wav +0 -0
api.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # -*- coding: utf-8 -*-
3
+ import numpy as np
4
+ import soundfile
5
+ import audresample
6
+ import text_utils
7
+ import msinference
8
+ import re
9
+ import srt
10
+ import subprocess
11
+ import cv2
12
+ import markdown
13
+ import json
14
+ from pathlib import Path
15
+ from types import SimpleNamespace
16
+ from flask import Flask, request, send_from_directory
17
+ from flask_cors import CORS
18
+ from moviepy.editor import *
19
+ from audiocraft.audiogen import AudioGen, audio_write
20
+
21
# Load the pretrained AudioGen checkpoint once, when the module is imported,
# and fix its generation length via set_generation_params(duration=6).
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=6)

# Make sure the directory used for uploads and rendered outputs exists.
Path('./flask_cache').mkdir(exist_ok=True, parents=True)
25
+
26
+ # SSH AGENT
27
+ # eval $(ssh-agent -s)
28
+ # ssh-add ~/.ssh/id_ed25519_github2024
29
+ #
30
+ # git remote set-url origin [email protected]:audeering/shift
31
+ # ==
32
+
33
def _shift(x):
    """Return ``x`` circularly rotated by a random offset.

    The offset is drawn uniformly from ``[int(.24 * n), max(1, int(.74 * n)))``
    where ``n = x.shape[0]``, i.e. roughly the middle half of the signal.

    x : np.ndarray. ``np.roll`` is called without an ``axis``, so an input
        with more than one dimension is rolled in flattened order.
    returns : np.ndarray with the same shape and the same elements as ``x``.
    """
    n = x.shape[0]
    # np.random.randint is specified for integer bounds with an exclusive
    # ``high``; convert explicitly instead of relying on float truncation.
    # max(1, ...) keeps ``high`` above ``low`` for very short inputs.
    low = int(.24 * n)
    high = max(1, int(.74 * n))
    offset = np.random.randint(low, high)
    return np.roll(x, offset)
40
+
41
def _background(x, sound_background=None):
    """Mix the speech signal ``x`` with an optional background sound.

    x : 1-D array of speech samples.
    sound_background : None, or an array indexed as ``[row, sample]``; only
        row 0 is used.  # NOTE(review): presumably [channel, sample] - confirm
    returns : ``x`` unchanged when there is no (or an empty) background,
        otherwise ``.74 * x + .26 * background`` with the same length as ``x``.
    """
    if sound_background is None:
        return x

    sound_background = sound_background[0, :]
    len_speech = len(x)
    len_background = len(sound_background)
    if len_background == 0:
        # Nothing to mix in, and tiling would divide by zero.
        return x

    # Tile enough randomly shifted copies to cover the whole speech signal.
    # The previous condition (speech shorter than background) was inverted:
    # long speech was never tiled and the mix failed on mismatched lengths.
    n_repeat = len_speech // len_background + 1
    replica = [_shift(sound_background) for _ in range(n_repeat)]
    sound_background = np.concatenate(replica)

    print(f'\nSOUND\nBACKGROUND\nSHAPE\n{sound_background=}\n{x.shape=}\n- - - -')
    return .74 * x + .26 * sound_background[:len_speech]
55
+
56
def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''Create a 24 kHz np.array with TTS, optionally over a background scene.

    precomputed_style_vector : required if voice is None or contains
        'en_US/' or 'en_UK/', so to perform affective TTS (StyleTTS2).
    text  : list of single sentences (each one is synthesised separately)
    voice : string or None (falls to StyleTTS2); any other voice is
        synthesised with Mimic-3
    scene : e.g. 'A castle in far away lands' -> if passed will generate
        a background sound scene that is mixed under the speech

    Raises AssertionError if StyleTTS2 is selected without a style vector.
    '''
    # Generate sound scene
    if scene is not None:
        sound_background = sound_generator.generate([scene])[0]
        # NOTE(review): 24000 is passed where the commented-out original used
        # sound_generator.sample_rate; audio_write is called with a None
        # path and its return value is used as the waveform - confirm.
        sound_background = audio_write(None,
                                       sound_background.cpu(),
                                       24000,  # sound_generator.sample_rate,
                                       strategy="loudness",
                                       loudness_compressor=True)
    else:
        sound_background = None

    # StyleTTS2
    # ``voice is None`` must be tested first: ``'en_US/' in None`` raises
    # TypeError before the None check is ever reached.
    if voice is None or 'en_US/' in voice or 'en_UK/' in voice:
        assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
        x = [msinference.inference(_sentence,
                                   precomputed_style_vector,
                                   alpha=0.3,
                                   beta=0.7,
                                   diffusion_steps=7,
                                   embedding_scale=1)
             for _sentence in text]
        return _background(np.concatenate(x), sound_background)

    # Fallback - Mimic-3
    text_utils.store_ssml(text=text, voice=voice)  # Text has to be list of single sentences
    # Equivalent of `cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav` without
    # going through a shell. The exit status is ignored, as before.
    with open('_tmp_ssml.txt', 'rb') as ssml, open('_tmp.wav', 'wb') as wav:
        subprocess.run(['mimic3', '--ssml'], stdin=ssml, stdout=wav)
    x, fs = soundfile.read('_tmp.wav')
    # audresample.resample(signal, original_rate, target_rate): the file is
    # at ``fs`` and must be brought to 24 kHz (the rates were swapped).
    # The result is 2-D, (64,) -> (1, 64), hence the [0, :].
    x = audresample.resample(x.astype(np.float32), fs, 24000)[0, :]

    return _background(x, sound_background)
103
+
104
+
105
+
106
+
107
+ # voices = {}
108
+ # import phonemizer
109
+ # global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
110
+
111
# Flask application object; flask_cors is applied with its default settings.
app = Flask(__name__)
cors = CORS(app)
113
+
114
+
115
@app.route("/")
def index():
    """Return README.md from the working directory rendered to HTML."""
    # Explicit encoding: the default depends on the server locale and can
    # fail on non-ASCII characters in the README.
    with open('README.md', 'r', encoding='utf-8') as f:
        return markdown.markdown(f.read())
119
+
120
+
121
def _form_value(form, key):
    """Return the first value posted for ``key``, or None when it is absent."""
    values = form.get(key)
    return values[0] if values else None


def _cached_upload(form, key):
    """Return the ``flask_cache/`` path of the upload named by ``key``, or None."""
    name = _form_value(form, key)
    if name is None:
        return None
    # Strip '/' exactly as is done when the upload is saved, so the lookup
    # matches the stored file and cannot point outside flask_cache/.
    return 'flask_cache/' + name.replace('/', '')


def _parse_flag(value):
    """Interpret a form string as a boolean (form values are always str)."""
    if value is None:
        return False
    return value.strip().lower() not in ('', '0', 'false', 'no', 'none')


def _text_banner(label, origin, line_type):
    """Return a black (104, 1920, 3) uint8 strip with ``label`` drawn in white."""
    banner = np.zeros((104, 1920, 3), dtype=np.uint8)
    cv2.putText(banner, label,
                origin,                    # bottom-left corner of text (w, h)
                cv2.FONT_HERSHEY_SIMPLEX,
                2,                         # fontScale
                (255, 255, 255),           # fontColor
                4,                         # thickness
                line_type)
    return banner


def _blend_banner(image, banner):
    """Blend ``banner`` (weight .6) into ``image`` (weight .4), 24 rows from the top."""
    offset_h = 24
    h, w, _ = banner.shape
    region = image[offset_h:h + offset_h, :w, :]
    # Crop the banner to the region that actually exists so frames smaller
    # than the banner do not raise a broadcasting error.
    patch = banner[:region.shape[0], :region.shape[1], :]
    image[offset_h:h + offset_h, :w, :] = (.4 * region + .6 * patch).astype(np.uint8)
    return image


@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    """Run TTS for the uploaded text and return the resulting wav / mp4.

    Form fields (first value of each is used):
      text      : name of the uploaded .txt or .srt file (required)
      video     : name of an uploaded video (required when text is .srt)
      image     : name of an uploaded image
      native    : name of an uploaded audio file to clone the voice from
      voice     : voice identifier passed to tts_multi_sentence
      affective : flag selecting the style-vector directory
      scene     : text prompt for the background sound, optional
    Uploaded files are stored in flask_cache/ under their form key with
    '/' removed.

    Returns flask_cache/tmp.mp4 (video or image given) or flask_cache/tmp.wav
    via send_from_directory, with the file name repeated in the
    'suffix-file-type' header; a 400 response if a required input is missing.
    """
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    # object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)

    # Physically Save Client Files
    for filename, obj in request.files.items():
        obj.save(f'flask_cache/{filename.replace("/","")}')

    print('Saved all files on Server Side\n\n')

    # Missing optional fields become None instead of raising on ``None[0]``.
    args = SimpleNamespace(text=_cached_upload(r, 'text'),
                           video=_cached_upload(r, 'video'),
                           image=_cached_upload(r, 'image'),
                           voice=_form_value(r, 'voice'),
                           native=_cached_upload(r, 'native'),
                           # NOTE(review): any non-empty string used to count
                           # as True; confirm what the client sends for False.
                           affective=_parse_flag(_form_value(r, 'affective')),
                           scene=_form_value(r, 'scene'))

    print(args, 'ENTER Script')
    if args.text is None:
        return 'Form field "text" is required.', 400

    do_video_dub = args.text.endswith('.srt')

    SILENT_VIDEO = '_silent_video.mp4'
    AUDIO_TRACK = '_audio_track.wav'

    x_native = None  # native audio track of the video; only set when dubbing

    if do_video_dub:
        print(f'==\nFound .srt : {args.text}, thus Video should be given as well\n\n')
        if args.video is None:
            return 'A video is required when "text" is an .srt file.', 400
        with open(args.text, "r") as f:
            s = f.read()
        text = [[j.content, j.start.total_seconds(), j.end.total_seconds()] for j in srt.parse(s)]
        if not text:
            return 'The .srt file contains no subtitles.', 400
        native_audio_file = '_tmp.wav'
        subprocess.call(
            ["ffmpeg",
             "-y",  # https://stackoverflow.com/questions/39788972/ffmpeg-overwrite-output-file-if-exists
             "-i",
             args.video,
             "-f",
             "mp3",
             "-ar",
             "24000",  # "22050 for mimic3",
             "-vn",
             native_audio_file])
        x_native, _ = soundfile.read(native_audio_file)  # reads mp3
        if x_native.ndim > 1:
            x_native = x_native[:, 0]  # stereo -> keep first channel
    else:
        with open(args.text, 'r') as f:
            t = ''.join(f)
        t = re.sub(' +', ' ', t)  # collapse runs of spaces
        text = text_utils.split_into_sentences(t)  # split to short sentences (~200 phonemes max)

    # ====STYLE VECTOR====

    precomputed_style_vector = None
    if args.native:  # Voice Cloning
        try:
            precomputed_style_vector = msinference.compute_style(args.native)
        except soundfile.LibsndfileError:  # Fallback - internal voice
            print('\n Could not voice clone audio:', args.native, 'fallback to video or Internal TTS voice.\n')
            if do_video_dub:  # Clone voice via Video: first 4 s of its audio
                soundfile.write('tgt_spk.wav',
                                x_native[:int(4 * 24000)].astype(np.float32),
                                24000)
                precomputed_style_vector = msinference.compute_style('tgt_spk.wav')

    # NOTE: style vector may be None

    if precomputed_style_vector is None and args.voice is not None:
        if 'en_US' in args.voice or 'en_UK' in args.voice:
            _dir = '/' if args.affective else '_v2/'
            precomputed_style_vector = msinference.compute_style(
                'assets/wavs/style_vector' + _dir + args.voice.replace(
                    '/', '_').replace(
                    '#', '_').replace(
                    'cmu-arctic', 'cmu_arctic').replace(
                    '_low', '') + '.wav')
    print('\n  STYLE VECTOR \n', precomputed_style_vector)

    # ====SILENT VIDEO====

    if args.video is not None:
        # banners
        frame_tts = _text_banner('TTS', (240, 74), 2)
        frame_orig = _text_banner('ORIGINAL VOICE', (101, 74), 1000)

        # moviepy loads only .mp4 (convert .mkv beforehand, e.g.
        # ffmpeg -y -i in.mkv -c copy -c:a aac out.mp4)
        vf = VideoFileClip(args.video)

        if x_native is not None and len(x_native) > 0:
            # inpaint banners if native voice
            num = x_native.shape[0]
            is_tts = .5 + .5 * np.tanh(4 * (np.linspace(-10, 10, num) + 9.4))  # fade heaviside

            def inpaint_banner(get_frame, t):
                '''blend banner - (now plays) tts or native voice'''
                # Clamp: the video may be longer than the audio track.
                ix = min(int(t * 24000), num - 1)
                # mask is 1 thus tts else native
                frame = frame_tts if is_tts[ix] > .5 else frame_orig
                return _blend_banner(np.copy(get_frame(t)), frame)
        else:  # no native track (plain .txt with video): always TTS banner
            def inpaint_banner(get_frame, t):
                return _blend_banner(np.copy(get_frame(t)), frame_tts)

        vf = vf.fl(inpaint_banner)
        vf.write_videofile(SILENT_VIDEO)

        # ==== TTS .srt ====

        if do_video_dub:
            OUT_FILE = './flask_cache/tmp.mp4'  # args.out_file + '_video_dub.mp4'
            subtitles = text
            MAX_LEN = int(subtitles[-1][2] + 17) * 24000
            # 17 extra seconds fail-safe for long-last-segment
            print("TOTAL LEN SAMPLES ", MAX_LEN, '\n====================')
            pieces = [tts_multi_sentence(text=[_text_],
                                         precomputed_style_vector=precomputed_style_vector,
                                         voice=args.voice,
                                         scene=args.scene)
                      for _text_, _orig_start, _orig_end in subtitles]
            total = np.concatenate(pieces, 0)
            # PAD SHORTEST of TTS / NATIVE
            if len(x_native) > len(total):
                total = np.pad(total, (0, x_native.shape[0] - total.shape[0]))
            else:
                x_native = np.pad(x_native, (0, total.shape[0] - x_native.shape[0]))
            soundfile.write(AUDIO_TRACK,
                            (.64 * total + .27 * x_native)[:, None],
                            24000)
        else:  # Video from plain (.txt)
            OUT_FILE = './flask_cache/tmp.mp4'  # args.out_file + '_video_from_txt.mp4'
            x = tts_multi_sentence(text=text,
                                   precomputed_style_vector=precomputed_style_vector,
                                   voice=args.voice,
                                   scene=args.scene)
            soundfile.write(AUDIO_TRACK, x, 24000)

    # IMAGE 2 SPEECH

    if args.image is not None:
        STATIC_FRAME = args.image  # 'assets/image_from_T31.jpg'
        OUT_FILE = './flask_cache/tmp.mp4'  # args.out_file + '_image_to_speech.mp4'

        # SILENT CLIP
        # NOTE(review): fixed 5 s clip; the original note suggests it should
        # be as long as the audio (run TTS first) - confirm.
        clip_silent = ImageClip(STATIC_FRAME).set_duration(5)
        clip_silent.write_videofile(SILENT_VIDEO, fps=24)

        x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               scene=args.scene)
        soundfile.write(AUDIO_TRACK, x, 24000)

    # This was an ``elif`` on the image check, which made the mux step
    # unreachable for image requests, so tmp.mp4 was never written for them.
    if args.video or args.image:
        # write final output video
        subprocess.call(
            ["ffmpeg",
             "-y",
             "-i",
             SILENT_VIDEO,
             "-i",
             AUDIO_TRACK,
             "-c:v",
             "copy",
             "-map",
             "0:v:0",
             "-map",
             " 1:a:0",
             OUT_FILE])

        print(f'\noutput video is saved as {OUT_FILE}')

    else:
        # Fallback: No image nor video provided - do only tts
        x = tts_multi_sentence(text=text,
                               precomputed_style_vector=precomputed_style_vector,
                               voice=args.voice,
                               scene=args.scene)
        OUT_FILE = './flask_cache/tmp.wav'  # args.out_file + '.wav'
        soundfile.write(OUT_FILE, x, 24000)

    # https://stackoverflow.com/questions/67591467/
    # flask-shows-typeerror-send-from-directory-missing-1-required-positional-argum
    response = send_from_directory('flask_cache/', path=OUT_FILE.split('/')[-1])
    response.headers['suffix-file-type'] = OUT_FILE.split('/')[-1]
    return response
394
+
395
+
396
# Start Flask's built-in server, listening on all interfaces, only when this
# file is executed as a script.
if __name__ == "__main__":
    app.run(host="0.0.0.0")
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_0184.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_1919.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_2418.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_6590.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_7130.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_7214.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_8148.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_8924.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/af_ZA_google-nwu_8963.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_00737.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_00779.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_01232.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_01701.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_02194.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_03042.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_0834.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_1010.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_3108.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_3713.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_3958.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_4046.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_4811.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_5958.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_9169.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/bn_multi_rm.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_angela_merkel.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_eva_k.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_karlsson.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_ramona_deininger.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_m-ailabs_rebecca_braunert_plunkett.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_amused.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_angry.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_disgusted.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_drunk.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_neutral.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_sleepy.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_surprised.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten-emotion_whisper.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/de_DE_thorsten.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/el_GR_rapunzelina.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/es_ES_carlfm.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/es_ES_m-ailabs_karen_savage.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/es_ES_m-ailabs_tux.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/es_ES_m-ailabs_victor_villarraza.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/fa_haaniye.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/fi_FI_harri-tapani-ylilammi.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/fr_FR_m-ailabs_bernard.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/fr_FR_m-ailabs_ezwa.wav RENAMED
File without changes
{mimic3_foreign → assets/mimic3_foreign}/fr_FR_m-ailabs_gilles_g_le_blanc.wav RENAMED
File without changes