Dionyssos committed on
Commit c4effd2 · 1 Parent(s): 6e78f43

slow OK 4s live

Files changed (3):
  1. README.md +15 -0
  2. live_api.py +43 -87
  3. live_demo.py +12 -111
README.md CHANGED
@@ -124,3 +124,18 @@ From an image and text create a video:
 
 python tts.py --text sample.txt --image assets/image_from_T31.jpg
 ```
+
+
+# Live Demo - Paplay
+
+Flask
+
+```bash
+CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python live_api.py
+```
+
+Client (Ubuntu)
+
+```bash
+python live_demo.py  # will ask text input & play soundscape
+```
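
To script the client by hand instead of running live_demo.py, here is a minimal sketch of the request it sends; the server address and payload keys are taken from live_demo.py in this commit, while the scene prompt is illustrative:

```python
# Minimal client sketch mirroring live_demo.py: POST a scene description
# to the Flask server and play the returned 24 kHz WAV via PulseAudio.
import requests
import subprocess

URL = "http://192.168.88.209:5000"  # LAN address hard-coded in live_demo.py

scene = "calm background sounds of a castle"  # illustrative prompt
response = requests.post(URL, data={"text": scene, "scene": scene})

if response.status_code == 200:
    with open("_gen_.wav", "wb") as f:
        f.write(response.content)  # server responds with the rendered WAV bytes
    subprocess.run(["paplay", "_gen_.wav"])
```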
live_api.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
 import soundfile
 import audresample
 import text_utils
-import msinference
+
 import re
 import srt
 import subprocess
@@ -17,89 +17,67 @@ from flask_cors import CORS
 from audiocraft.audiogen import AudioGen, audio_write
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=6)
+sound_generator.set_generation_params(duration=4)
+
+
+# ==== STYLE VECTOR ====
+
+
+
+# AFFECTIVE = True
+# VOICE = 'en_UK/apope_low'  # en_US/m-ailabs_low#mary_ann
+
+# _dir = '/' if AFFECTIVE else '_v2/'
+# precomputed_style_vector = msinference.compute_style(
+#     'assets/wavs/style_vector' + _dir + VOICE.replace(
+#         '/', '_').replace(
+#         '#', '_').replace(
+#         'cmu-arctic', 'cmu_arctic').replace(
+#         '_low', '') + '.wav')
+# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
+
+
+# ==== STYLE VECTOR
 
 CACHE_DIR = 'flask_cache/'
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
 
-def _shift(x):
-    n = x.shape[0]
-    i = np.random.randint(.24 * n, max(1, .74 * n))  # high must be >= 1
-    x = np.roll(x, i)
-    # fade_in = .5 + .5 * np.tanh(4 * (np.linspace(-10, 10, x.shape[0]) + 9.4))
-    # x = x * fade_in
-    return x
-
-def overlay(x, sound_background=None):
-    if sound_background is not None:
-        sound_background = sound_background.detach().cpu().numpy()[0, :]
-        len_speech = len(x)
-        if len_speech > len(sound_background):
-            n_repeat = len_speech // len(sound_background) + 1
-            replica = [sound_background] * n_repeat
-            replica = [_shift(_) for _ in replica]
-            sound_background = np.concatenate(replica)
-
-        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
-        x = .74 * x + .26 * sound_background[:len_speech]
-    return x
-
-def tts_multi_sentence(precomputed_style_vector=None,
-                       text=None,
-                       voice=None,
-                       scene=None):
-    '''Create a 24 kHz np.array with TTS.
-
-    precomputed_style_vector : required if en_US or en_UK in voice,
-                               to perform affective TTS.
-    text  : string
-    voice : string or None (falls back to StyleTTS)
-    scene : 'A castle in far away lands' -> if passed, generates a background sound scene
-    '''
-    # Generate sound scene - upsample to 24 kHz
+
+
+def tts_multi_sentence(scene=None):
     if scene is not None:
 
         sound_background = sound_generator.generate([scene])[0]
         sound_background = audio_write(None,
                                        sound_background.cpu(),
-                                       24000,  # sound_generator.sample_rate,
+                                       24000,  # same as the StyleTTS sample rate
                                        strategy="loudness",
-                                       loudness_compressor=True)
+                                       loudness_compressor=True).detach().cpu().numpy()[0, :]
     else:
         sound_background = None
 
-    # StyleTTS2
-    if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
-        assert precomputed_style_vector is not None, 'For affective TTS, a style vector is needed.'
-        x = []
-        for _sentence in text:
-            x.append(msinference.inference(_sentence,
-                                           precomputed_style_vector,
-                                           alpha=0.3,
-                                           beta=0.7,
-                                           diffusion_steps=7,
-                                           embedding_scale=1))
-        x = np.concatenate(x)
+    # # StyleTTS2
+    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
+    #     assert precomputed_style_vector is not None, 'For affective TTS, a style vector is needed.'
+    #     x = []
+    #     for _sentence in text:
+    #         x.append(msinference.inference(_sentence,
+    #                                        precomputed_style_vector,
+    #                                        alpha=0.3,
+    #                                        beta=0.7,
+    #                                        diffusion_steps=7,
+    #                                        embedding_scale=1))
+    #     x = np.concatenate(x)
 
-    return overlay(x, sound_background)
+    # return overlay(x, sound_background)
 
-    # Fallback - Mimic-3
-    text_utils.store_ssml(text=text, voice=voice)  # text has to be a list of single sentences
-    ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
-    ps.wait()
-    x, fs = soundfile.read('_tmp.wav')
-    x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1, 64)
+    return sound_background
 
-    return overlay(x, sound_background)
 
 
 
-# voices = {}
-# import phonemizer
-# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
 
 app = Flask(__name__)
 cors = CORS(app)
@@ -120,40 +98,18 @@ def serve_wav():
 
     args = SimpleNamespace(
         text=None if r.get('text') is None else r.get('text'),  # string, not a file?
-        voice=r.get('voice')[0],
-        native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
-        affective=r.get('affective')[0],
         scene=r.get('scene')[0]
     )
     # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
 
 
-    print(args, 'ENTER Script')
-    do_video_dub = False
-
-    # ==== STYLE VECTOR ====
-
-    precomputed_style_vector = None
-    # NOTE: style vector may be None
-
-    if precomputed_style_vector is None:
-        if 'en_US' in args.voice or 'en_UK' in args.voice:
-            _dir = '/' if args.affective else '_v2/'
-            precomputed_style_vector = msinference.compute_style(
-                'assets/wavs/style_vector' + _dir + args.voice.replace(
-                    '/', '_').replace(
-                    '#', '_').replace(
-                    'cmu-arctic', 'cmu_arctic').replace(
-                    '_low', '') + '.wav')
-    print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
 
 
 
-    x = tts_multi_sentence(text=args.text,
-                           precomputed_style_vector=precomputed_style_vector,
-                           voice=args.voice,
-                           scene=args.scene)
+    x = tts_multi_sentence(args.scene)
+    # print('\n\n\n\n Obtained TTS output shape', x.shape)
     OUT_FILE = 'tmp.wav'
     soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
 
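The server-side change above routes every request through AudioGen alone. For reference, here is a standalone sketch of that soundscape path, assuming audiocraft is installed; the prompt and output filename are illustrative, and audio_write's loudness normalization is skipped for brevity:

```python
# Standalone sketch of the AudioGen path inside the new tts_multi_sentence():
# render a 4 s soundscape for a text prompt and save it as a mono WAV.
import soundfile
from audiocraft.audiogen import AudioGen

sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=4)  # the 6 s -> 4 s change of this commit

wav = sound_generator.generate(['calm background sounds of a castle'])[0]
x = wav.detach().cpu().numpy()[0, :]  # (channels, samples) -> 1-D mono array

# live_api.py hard-codes 24000 (the StyleTTS rate) rather than using
# sound_generator.sample_rate; the sketch keeps that choice.
soundfile.write('scene.wav', x, 24000)
```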
 
live_demo.py CHANGED
@@ -1,29 +1,9 @@
-import numpy as np
 import argparse
 import os
 import requests
 import subprocess
 
 
-# SSH AGENT
-# eval $(ssh-agent -s)
-# ssh-add ~/.ssh/id_ed25519_github2024
-#
-# git remote set-url origin git@github.com:audeering/shift
-
-
-
-# https://stackoverflow.com/questions/57158779/how-to-stop-audio-with-playsound-module
-# import multiprocessing
-# from playsound import playsound
-
-# p = multiprocessing.Process(target=playsound, args=("file.mp3",))
-# p.start()
-# input("press ENTER to stop playback")
-# p.terminate()
-# from playsound import playsound
-# playsound('/path/to/a/sound/file/you/want/to/play.mp3')
-
 
 
 def command_line_args():
@@ -44,98 +24,20 @@ def command_line_args():
     parser.add_argument(
         '--text',
         help="Text to be synthesized.",
-        default='sample.txt',
-        type=str,
-    )
-    parser.add_argument(
-        '--native',
-        help="""
-        --native: (without argument) a flag to do voice cloning using the speech from --video,
-        --native my_voice.wav: voice cloning from user-provided audio""",
-        # nargs='?',
-        # const=None,
-        # default=False  # default has to be None
-    )
-    parser.add_argument(
-        '--voice',
-        help="TTS voice - available voices: https://audeering.github.io/shift/",
-        default="en_US/m-ailabs_low#judy_bieber",  # 'en_US/cmu-arctic_low#lnh',
-        type=str,
-    )
-    parser.add_argument(
-        '--image',
-        help="If provided, is set as background for the output video, see --text",
-        type=str,
-    )
-    parser.add_argument(
-        '--video',
-        help="Video file for video translation. Voice cloned from the video",
+        default='How is hoowl',
         type=str,
     )
-    parser.add_argument(
-        '--out_file',
-        help="Output file name.",
-        type=str,
-        default='b6'
-    )
-    parser.add_argument(
-        '--scene',
-        help='Sound scene description.',
-        type=str,
-        default='calm background sounds of a castle'
-    )
     return parser
 
 def send_to_server(args):
     url = "http://192.168.88.209:5000"
 
     payload = {
-        'affective': args.affective,
-        'voice': args.voice,
-        'native': args.native,
         'text': args.text,
-        'image': args.image,
-        'video': args.video,
-        'scene': args.scene,
-        # 'out_file': args.out_file  # let the server save as temp
+        'scene': args.scene
     }
 
-    # In data= we can write args
-
-    # In files= send actual files if provided
-    text_file = open(args.text, 'rb')
-
-    image_file, video_file, native_file = None, None, None
-    if args.image is not None:
-        print('\nLOADING IMAGE\n')
-        try:
-            image_file = open(args.image, 'rb')
-        except FileNotFoundError:
-            pass
-
-
-    if args.video is not None:
-        print('\nLOADING vid\n')
-        try:
-            video_file = open(args.video, 'rb')
-        except FileNotFoundError:
-            pass
-
-    if args.native is not None:
-        print('\nLOADING natv\n')
-        try:
-            native_file = open(args.native, 'rb')
-        except FileNotFoundError:
-            pass
-
-
-
-    # --------------------- send this extra
-
-    print('Sending...\n')
-
-    response = requests.post(url, data=payload,
-                             files=[(args.image, image_file)])  # Nones do not arrive to the server's dict
+    response = requests.post(url, data=payload)  # Nones do not arrive to the server's dict
 
     # Check the response from the server
     if response.status_code == 200:
@@ -152,22 +54,21 @@ def send_to_server(args):
 def cli():  # args.out_file is not sent to the server - the server writes tmp, copied by the client
     parser = command_line_args()
     args = parser.parse_args()
+    os.system('cls' if os.name == 'nt' else 'clear')
     while True:
-        args.text = input("Type your text: ")
+        args.text = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
+        # _text, _scene = args.text.split('|')
+        # args.text = _text
+        args.scene = args.text  # _scene
         response = send_to_server(args)
-        out_file = args.out_file + '.' + response.headers['suffix-file-type'].split('.')[-1]
-
+        out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
+
         with open(out_file, 'wb') as f:
             f.write(response.content)
-        print('Response at client\n----------------------------', response.headers)
-
-
+        # print('Response at client\n----------------------------', response.headers)
+
         subprocess.run(["paplay", out_file])
-
 
 
 if __name__ == '__main__':
     cli()
-
-# assume also video and text for video: we have to write some classes for video for audiocraft,
-# then call tts.py on this video with non-empty labels - thus calls audiocraft
 
 
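For completeness, here is a hedged, self-contained sketch of the server half this client talks to: a Flask route that reads 'scene' from the POSTed form, renders audio, and returns the WAV bytes the client saves as _gen_.wav. The route path, send_file usage, and the noise stub are assumptions; the real serve_wav() in live_api.py renders with AudioGen as shown in the diff above.

```python
# Hypothetical server stub matching the client's contract:
# form fields 'text'/'scene' in, WAV bytes out.
import numpy as np
import soundfile
from flask import Flask, request, send_file

app = Flask(__name__)

def render_scene(scene):
    # Stand-in for live_api.py's tts_multi_sentence(scene): 4 s of quiet noise.
    return np.random.uniform(-0.1, 0.1, 4 * 24000).astype(np.float32)

@app.route("/", methods=["POST"])
def serve_wav():
    scene = request.form.get("scene")  # same key live_demo.py posts
    x = render_scene(scene)
    soundfile.write("tmp.wav", x, 24000)
    return send_file("tmp.wav", mimetype="audio/wav")

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)  # port matches the client's URL
```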