Dionyssos committed on
Commit 6e78f43 · 1 Parent(s): 0572d9a

trial live demo

Files changed (2)
  1. live_api.py +172 -0
  2. live_demo.py +173 -0
live_api.py ADDED
@@ -0,0 +1,172 @@
+
+ # -*- coding: utf-8 -*-
+ import numpy as np
+ import soundfile
+ import audresample
+ import text_utils
+ import msinference
+ import re
+ import srt
+ import subprocess
+ import markdown
+ import json
+ from pathlib import Path
+ from types import SimpleNamespace
+ from flask import Flask, request, send_from_directory
+ from flask_cors import CORS
+ from audiocraft.audiogen import AudioGen, audio_write
+
+ sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
+ sound_generator.set_generation_params(duration=6)
+
+ CACHE_DIR = 'flask_cache/'
+ Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
+
+
+ def _shift(x):
+     n = x.shape[0]
+     i = np.random.randint(.24 * n, max(1, .74 * n))  # max(1, ...) keeps high >= 1
+     x = np.roll(x, i)
+     # fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
+     # x = x * fade_in
+     return x
+
+ def overlay(x, sound_background=None):
+     if sound_background is not None:
+         sound_background = sound_background.detach().cpu().numpy()[0, :]
+         len_speech = len(x)
+         if len_speech > len(sound_background):
+             # tile the background until it covers the speech, shifting each copy
+             n_repeat = len_speech // len(sound_background) + 1
+             replica = [sound_background] * n_repeat
+             replica = [_shift(_) for _ in replica]
+             sound_background = np.concatenate(replica)
+
+         print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
+         x = .74 * x + .26 * sound_background[:len_speech]
+     return x
+
+ def tts_multi_sentence(precomputed_style_vector=None,
+                        text=None,
+                        voice=None,
+                        scene=None):
+     '''Create a 24 kHz np.array with TTS.
+
+     precomputed_style_vector : required if en_US or en_UK is in voice,
+                                to perform affective TTS.
+     text  : list of sentences (strings)
+     voice : string or None (falls back to StyleTTS2)
+     scene : e.g. 'A castle in far away lands' - if passed, a background sound scene is generated
+     '''
+     # Generate sound scene - upsample to 24 kHz
+     if scene is not None:
+         sound_background = sound_generator.generate([scene])[0]
+         sound_background = audio_write(None,
+                                        sound_background.cpu(),
+                                        24000,  # sound_generator.sample_rate,
+                                        strategy="loudness",
+                                        loudness_compressor=True)
+     else:
+         sound_background = None
+
+     # StyleTTS2
+     if (voice is None) or ('en_US/' in voice) or ('en_UK/' in voice):  # check None first to avoid TypeError
+         assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
+         x = []
+         for _sentence in text:
+             x.append(msinference.inference(_sentence,
+                                            precomputed_style_vector,
+                                            alpha=0.3,
+                                            beta=0.7,
+                                            diffusion_steps=7,
+                                            embedding_scale=1))
+         x = np.concatenate(x)
+
+         return overlay(x, sound_background)
+
+     # Fallback - Mimic-3
+     text_utils.store_ssml(text=text, voice=voice)  # text has to be a list of single sentences
+     ps = subprocess.Popen('cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
+     ps.wait()
+     x, fs = soundfile.read('_tmp.wav')
+     x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # resample returns shape (1, N)
+
+     return overlay(x, sound_background)
+
+
+
+ # voices = {}
+ # import phonemizer
+ # global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
+
+ app = Flask(__name__)
+ cors = CORS(app)
+
+
+ @app.route("/")
+ def index():
+     with open('README.md', 'r') as f:
+         return markdown.markdown(f.read())
+
+
+ @app.route("/", methods=['GET', 'POST', 'PUT'])
+ def serve_wav():
+     # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
+     # object-into-a-representation-suitable-for-mongodb
+     r = request.form.to_dict(flat=False)
+
+     args = SimpleNamespace(
+         text=None if r.get('text') is None else r.get('text'),  # list of strings, not a file
+         voice=r.get('voice')[0],
+         native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
+         affective=r.get('affective')[0],
+         scene=r.get('scene')[0]
+     )
+     # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
+
+     print(args, 'ENTER Script')
+     do_video_dub = False  # video dubbing is not handled in this trial demo
+
+     # ==== STYLE VECTOR ====
+
+     precomputed_style_vector = None
+     # NOTE: style vector may be None
+
+     if precomputed_style_vector is None:
+         if 'en_US' in args.voice or 'en_UK' in args.voice:
+             _dir = '/' if args.affective else '_v2/'
+             precomputed_style_vector = msinference.compute_style(
+                 'assets/wavs/style_vector' + _dir + args.voice.replace(
+                     '/', '_').replace(
+                     '#', '_').replace(
+                     'cmu-arctic', 'cmu_arctic').replace(
+                     '_low', '') + '.wav')
+             print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
+
+     x = tts_multi_sentence(text=args.text,
+                            precomputed_style_vector=precomputed_style_vector,
+                            voice=args.voice,
+                            scene=args.scene)
+     OUT_FILE = 'tmp.wav'
+     soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
+
+     # send the server's output as a default file -> srv_result.xx
+     print(f'\n=SERVER saved as {OUT_FILE=}\n')
+     response = send_from_directory(CACHE_DIR, path=OUT_FILE)
+     response.headers['suffix-file-type'] = OUT_FILE
+     return response
+
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0")
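
For reference, a minimal client-side smoke test of this endpoint could look roughly like the sketch below; it assumes the server is reachable on Flask's default port 5000 and that a precomputed style vector exists for the chosen voice under assets/wavs/. The field names simply mirror what serve_wav() reads from request.form.

import requests
import soundfile

# Hypothetical smoke test for live_api.py (not part of this commit).
payload = {
    'text': 'Hello from the live demo.',
    'voice': 'en_US/m-ailabs_low#judy_bieber',
    'affective': 'True',
    'scene': 'calm background sounds of a castle',
}
resp = requests.post('http://127.0.0.1:5000/', data=payload)
resp.raise_for_status()
with open('srv_result.wav', 'wb') as f:
    f.write(resp.content)                        # 24 kHz wav written by serve_wav()
print(resp.headers.get('suffix-file-type'))      # 'tmp.wav'
x, fs = soundfile.read('srv_result.wav')
print(x.shape, fs)
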
live_demo.py ADDED
@@ -0,0 +1,173 @@
+ import numpy as np
+ import argparse
+ import os
+ import requests
+ import subprocess
+
+
+ # SSH AGENT
+ # eval $(ssh-agent -s)
+ # ssh-add ~/.ssh/id_ed25519_github2024
+ #
+ # git remote set-url origin [email protected]:audeering/shift
+
+
+ # https://stackoverflow.com/questions/57158779/how-to-stop-audio-with-playsound-module
+ # import multiprocessing
+ # from playsound import playsound
+
+ # p = multiprocessing.Process(target=playsound, args=("file.mp3",))
+ # p.start()
+ # input("press ENTER to stop playback")
+ # p.terminate()
+ # from playsound import playsound
+ # playsound('/path/to/a/sound/file/you/want/to/play.mp3')
+
+
+ def command_line_args():
+     parser = argparse.ArgumentParser(
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter
+     )
+     parser.add_argument(
+         '--affective',
+         help="Select the emotional or the non-emotional variant of the available voices: https://audeering.github.io/shift/",
+         action='store_false',
+     )
+     parser.add_argument(
+         '--device',
+         help="Device ID",
+         type=str,
+         default='cpu',
+     )
+     parser.add_argument(
+         '--text',
+         help="Text to be synthesized.",
+         default='sample.txt',
+         type=str,
+     )
+     parser.add_argument(
+         '--native',
+         help="""
+         --native: (without an argument) a flag to do voice cloning using the speech from --video,
+         --native my_voice.wav: voice cloning from user-provided audio""",
+         # nargs='?',
+         # const=None,
+         # default=False  # default has to be None
+     )
+     parser.add_argument(
+         '--voice',
+         help="TTS voice - available voices: https://audeering.github.io/shift/",
+         default="en_US/m-ailabs_low#judy_bieber",  # 'en_US/cmu-arctic_low#lnh',
+         type=str,
+     )
+     parser.add_argument(
+         '--image',
+         help="If provided, it is set as the background of the output video; see --text",
+         type=str,
+     )
+     parser.add_argument(
+         '--video',
+         help="Video file for video translation. The voice is cloned from the video.",
+         type=str,
+     )
+     parser.add_argument(
+         '--out_file',
+         help="Output file name.",
+         type=str,
+         default='b6'
+     )
+     parser.add_argument(
+         '--scene',
+         help='Sound scene description.',
+         type=str,
+         default='calm background sounds of a castle'
+     )
+     return parser
+
+ def send_to_server(args):
+     url = "http://192.168.88.209:5000"
+
+     payload = {
+         'affective': args.affective,
+         'voice': args.voice,
+         'native': args.native,
+         'text': args.text,
+         'image': args.image,
+         'video': args.video,
+         'scene': args.scene,
+         # 'out_file': args.out_file  # let the server save it as a temp file
+     }
+
+     # data= carries the string arguments
+
+     # files= carries the actual files, if provided
+     # --text may be typed text rather than a file (see cli below), so only open it if it exists
+     text_file = open(args.text, 'rb') if os.path.isfile(args.text) else None
+
+     image_file, video_file, native_file = None, None, None
+     if args.image is not None:
+         print('\nLOADING IMAGE\n')
+         try:
+             image_file = open(args.image, 'rb')
+         except FileNotFoundError:
+             pass
+
+     if args.video is not None:
+         print('\nLOADING vid\n')
+         try:
+             video_file = open(args.video, 'rb')
+         except FileNotFoundError:
+             pass
+
+     if args.native is not None:
+         print('\nLOADING natv\n')
+         try:
+             native_file = open(args.native, 'rb')
+         except FileNotFoundError:
+             pass
+
+     # --------------------- send this extra
+
+     print('Sending...\n')
+
+     response = requests.post(url, data=payload,
+                              files=[(args.image, image_file)])  # None values do not arrive in the server's dict
+
+     # Check the response from the server
+     if response.status_code == 200:
+         print("\nRequest was successful!")
+         # print("Response:", response.__dict__.keys(), '\n=====\n')
+     else:
+         print("Failed to send the request")
+         print("Status Code:", response.status_code)
+         print("Response:", response.text)
+     return response
+
+
+ def cli():  # args.out_file is not sent to the server - the server writes a tmp file that the client copies
+     parser = command_line_args()
+     args = parser.parse_args()
+     while True:
+         args.text = input("Type your text: ")
+         response = send_to_server(args)
+         out_file = args.out_file + '.' + response.headers['suffix-file-type'].split('.')[-1]
+
+         with open(out_file, 'wb') as f:
+             f.write(response.content)
+         print('Response at client\n----------------------------', response.headers)
+
+         subprocess.run(["paplay", out_file])
+
+
+ if __name__ == '__main__':
+     cli()
+
+ # TODO: also support video plus text-for-video; we have to write some video classes for audiocraft,
+ # then call tts.py on that video with non-empty labels, which in turn calls audiocraft.
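
A plausible end-to-end run of the pair, assuming the dependencies of live_api.py are installed, paplay is available on the client, and the hard-coded URL in send_to_server() is edited to point at the machine running the server: start the server with python live_api.py (app.run listens on 0.0.0.0, Flask's default port 5000), then start the client with python live_demo.py and type a sentence at the prompt. The synthesized 24 kHz audio comes back in the response, is written as <out_file>.wav (b6.wav by default) and played back, after which the loop asks for the next sentence.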