StormblessedKal committed
Commit 5f8297f
Parent: a81bf6e

deliverable api

src/__pycache__/predict.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/predict.cpython-310.pyc and b/src/__pycache__/predict.cpython-310.pyc differ
 
src/__pycache__/rp_schema.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/rp_schema.cpython-310.pyc and b/src/__pycache__/rp_schema.cpython-310.pyc differ
 
src/__pycache__/se_extractor.cpython-310.pyc CHANGED
Binary files a/src/__pycache__/se_extractor.cpython-310.pyc and b/src/__pycache__/se_extractor.cpython-310.pyc differ
 
src/predict.py CHANGED
@@ -6,6 +6,11 @@ repository, with some modifications to make it work with the RP platform.
 
 from concurrent.futures import ThreadPoolExecutor
 import numpy as np
+import base64
+from pydub.utils import mediainfo
+import tempfile
+
+
 
 from runpod.serverless.utils import rp_cuda
 import boto3
@@ -52,6 +57,9 @@ from text_utils import TextCleaner
 from pydantic import BaseModel, HttpUrl
 from api import BaseSpeakerTTS, ToneColorConverter
 
+from pydub import AudioSegment
+
+
 class Predictor:
     def __init__(self):
         self.model = None
@@ -135,7 +143,70 @@ class Predictor:
         self.tone_color_converter.load_ckpt(f'{self.ckpt_converter}/checkpoint.pth')
 
 
-    def predict(self,s3_url,passage,method_type='voice_clone'):
+    def createvoice(self,audio_base_64,cut_audio,process_audio):
+        file_bytes = base64.b64decode(audio_base_64)
+        file_buffer = io.BytesIO(file_bytes)
+
+        header = file_buffer.read(12)
+        print(header)
+        file_format = None
+        bucket_name = 'demovidelyuseruploads'
+        if b'WAVE' in header:
+            file_format = 'wav'
+        elif header.startswith((b'\xff\xfb', b'\xff\xf3', b'\xff\xe3', b'\xff\xfa')):
+            file_format = 'mp3'
+        else:
+            file_format = 'unknown'
+        if file_format == 'unknown':
+            return {'error':'unrecognized file format, encode audio file as base64 str'}
+
+        unique_filename = f"{uuid.uuid4()}"
+
+        local_filename = f"{unique_filename}.{file_format}"
+        with open(local_filename, 'wb') as file_out:
+            file_out.write(file_bytes)
+
+        wav_filename = local_filename
+        if file_format == "mp3":
+            wav_filename = f"{unique_filename}.wav"
+            subprocess.run(["ffmpeg", "-i", local_filename, wav_filename])
+            os.remove(local_filename)
+        print(wav_filename)
+
+        # cut_audio > 0 means the caller asked for trimming
+        if cut_audio > 0:
+            # trim the reference down to roughly cut_audio seconds
+            se_extractor.extract_segments_to_cut_audio(cut_audio,wav_filename)
+
+        file_url = f"https://{bucket_name}.s3.amazonaws.com/{wav_filename}"
+
+        if process_audio:
+            (new_sr, wav1) = self._fn(wav_filename,"Midpoint",32,0.5)
+            print('Denoised')
+            buffer = io.BytesIO()
+            sf.write(buffer, wav1, new_sr, format='WAV')
+            print(new_sr)
+            buffer.seek(0)
+        else:
+            wav1, sr = librosa.load(wav_filename, sr=None)
+            buffer = io.BytesIO()
+            sf.write(buffer, wav1, sr, format='WAV')
+            buffer.seek(0)
+
+        print("uploading")
+        content_type = "audio/wav"
+        try:
+            self.s3_client.put_object(Bucket=bucket_name, Key=wav_filename, Body=buffer, ContentType=content_type)
+            print("uploaded")
+        except Exception as e:
+            print(f"Error uploading to S3: {e}")
+            return {"error": str(e)}
+
+        os.remove(wav_filename)
+        return {"url": file_url}
+
+
+    def predict(self,s3_url,passage,process_audio,method_type='voice_clone'):
         output_dir = 'processed'
         gen_id = str(uuid.uuid4())
         os.makedirs(output_dir,exist_ok=True)
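For reference, createvoice() detects the container purely from magic bytes: RIFF/WAVE files carry b'WAVE' at bytes 8-12, and bare MP3 frames begin with an 0xFFEx/0xFFFx sync word. A minimal standalone sketch of the same check (the helper name is illustrative, not part of this commit); note that an MP3 starting with an ID3 tag would fall through to 'unknown' here:

    import base64

    def sniff_audio_format(audio_base_64: str) -> str:
        # Hypothetical helper mirroring createvoice()'s header check.
        file_bytes = base64.b64decode(audio_base_64)
        header = file_bytes[:12]
        if b'WAVE' in header:  # 'RIFF....WAVE' container
            return 'wav'
        if header.startswith((b'\xff\xfb', b'\xff\xf3', b'\xff\xe3', b'\xff\xfa')):
            return 'mp3'       # raw MP3 frame sync
        return 'unknown'       # ID3-tagged MP3s also land here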
@@ -159,29 +230,32 @@ class Predictor:
             #voice_clone with styletts2
             model,sampler = self.model,self.sampler
             processed_seg_dir = os.path.join(segments_dir,s3_key.split('.')[0],'wavs')
-            result = self.process_audio_file(processed_seg_dir,passage,model,sampler)
+            result = self.process_audio_file(local_file_path,passage,model,sampler)
             final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
+
             sf.write(final_output,result,24000)
+            if process_audio:
+                (new_sr, wav1) = self._fn(final_output,"Midpoint",32,0.5)
+                sf.write(final_output,wav1,new_sr)
 
             base_speaker_tts,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
             reference_speaker = local_file_path
             target_se, audio_name = se_extractor.get_se(reference_speaker, tone_color_converter, target_dir=openvoice_dir, vad=False)
             src_path = os.path.join(results_dir,f"{gen_id}-tmp.wav")
-            openvoice_output = os.path.join(results_dir,f"{gen_id}-2.wav")
+            openvoice_output = os.path.join(results_dir,f"{gen_id}-voice-clone-2.wav")
             base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0)
 
             source_se = torch.load(f'{self.ckpt_base}/en_default_se.pth').to(self.device)
             tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
-            (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
-            denoised_openvoice_output = os.path.join(results_dir,f"{gen_id}-voice-clone-2.wav")
-            sf.write(denoised_openvoice_output,wav1,new_sr)
-
-
+            if process_audio:
+                (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
+                sf.write(openvoice_output,wav1,new_sr)
+
 
             mp3_final_output_1 = str(final_output).replace('wav','mp3')
-            mp3_final_output_2 = str(denoised_openvoice_output).replace('wav','mp3')
+            mp3_final_output_2 = str(openvoice_output).replace('wav','mp3')
             self.convert_wav_to_mp3(final_output,mp3_final_output_1)
-            self.convert_wav_to_mp3(denoised_openvoice_output,mp3_final_output_2)
+            self.convert_wav_to_mp3(openvoice_output,mp3_final_output_2)
             print(mp3_final_output_1)
             print(mp3_final_output_2)
 
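One caveat with the mp3 path derivation above: str(path).replace('wav','mp3') rewrites every occurrence of 'wav' in the path, including directory names such as 'wavs'. A suffix-only swap avoids that; a small sketch (helper name illustrative, not part of this commit):

    import os

    def wav_to_mp3_path(wav_path: str) -> str:
        # Swap only the extension, leaving directory names untouched.
        root, _ = os.path.splitext(wav_path)
        return root + '.mp3'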
@@ -200,19 +274,19 @@ class Predictor:
             base_speaker_tts.tts(passage,src_path,speaker='default',language='English',speed=1.0,use_emotions=True)
             source_se = torch.load(f'{self.ckpt_base}/en_style_se.pth').to(self.device)
             tone_color_converter.convert(audio_src_path=src_path,src_se=source_se,tgt_se=target_se,output_path=openvoice_output,message='')
-            (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
-            denoised_openvoice_output = os.path.join(results_dir,f"{gen_id}-with-emotions.wav")
-            sf.write(denoised_openvoice_output,wav1,new_sr)
+            if process_audio:
+                (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
+                sf.write(openvoice_output,wav1,new_sr)
 
-            mp3_final_output_1 = str(denoised_openvoice_output).replace('wav','mp3')
-            self.convert_wav_to_mp3(denoised_openvoice_output,mp3_final_output_1)
+            mp3_final_output_1 = str(openvoice_output).replace('wav','mp3')
+            self.convert_wav_to_mp3(openvoice_output,mp3_final_output_1)
             print(mp3_final_output_1)
             self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-with-emotions.mp3")
             shutil.rmtree(os.path.join(output_dir,gen_id))
             return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-with-emotions.mp3"
             }
 
-        if method_type == 'voice_clone_multi_lang':
+        if method_type == 'voice_clone_with_multi_lang':
             #voice clone with multi-lingual
             _,tone_color_converter = self.base_speaker_tts,self.tone_color_converter
             reference_speaker = local_file_path
@@ -236,9 +310,10 @@ class Predictor:
             self.tone_color_converter.convert(audio_src_path=openai_multi_lang_path, src_se=source_se, tgt_se=target_se, output_path=multi_lang_with_voice_clone_path,message='')
 
             mp3_final_output_1 = str(multi_lang_with_voice_clone_path).replace('wav','mp3')
-            convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
+            self.convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
             print(mp3_final_output_1)
-            upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
+            self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
+            shutil.rmtree(os.path.join(output_dir,gen_id))
             return {"voice_clone_with_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-multi-lang.mp3"
             }
 
@@ -249,7 +324,7 @@ class Predictor:
 
         solver = solver.lower()
         nfe = int(nfe)
-        lambd = 0.9
+        lambd = 0.1 # lets remove denoise
 
         dwav, sr = torchaudio.load(path)
         dwav = dwav.mean(dim=0)
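Context for the lambd change: if _fn follows the usual resemble-enhance convention, lambd is the denoise strength, i.e. the denoised signal is linearly blended back into the original. Under that assumption (not verified against this repo's _fn), 0.9 -> 0.1 keeps roughly 90% of the original signal, consistent with the 'lets remove denoise' comment:

    import numpy as np

    def blend_denoise(original: np.ndarray, denoised: np.ndarray, lambd: float) -> np.ndarray:
        # Assumed semantics of lambd (resemble-enhance style), shown for
        # orientation only; lambd=0.1 leaves the output mostly original.
        return lambd * denoised + (1.0 - lambd) * original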
@@ -380,14 +455,9 @@ class Predictor:
 
         return torch.cat([ref_s, ref_p], dim=1)
 
-    def process_audio_file(self,file_dir,passage,model,sampler):
-        print(file_dir)
-        audio_segs = glob(f'{file_dir}/*.wav')
-        print(audio_segs)
-        if len(audio_segs) >= 1:
-            s_ref = self.compute_style(audio_segs[0], model)
-        else:
-            raise NotImplementedError('No audio segments found!')
+    def process_audio_file(self,local_file_path,passage,model,sampler):
+        print(local_file_path)
+        s_ref = self.compute_style(local_file_path, model)
         sentences = split_and_recombine_text(passage)
         wavs = []
         s_prev = None
@@ -398,7 +468,7 @@ class Predictor:
                 s_prev,
                 s_ref,
                 alpha = 0,
-                beta = 0.1,
+                beta = 0.3,
                 t = 0.7,
                 diffusion_steps=10, embedding_scale=1)
             wavs.append(wav)
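For orientation on the beta bump: StyleTTS2-style inference typically blends the diffusion-sampled style with the style computed from the reference audio, with lower alpha/beta staying closer to the reference speaker. A sketch of that common convention, stated as an assumption rather than this repo's exact code; with alpha = 0 the timbre half comes entirely from the reference, and beta = 0.3 lets text-predicted prosody contribute more than 0.1 did:

    import torch

    def blend_styles(s_pred: torch.Tensor, ref_s: torch.Tensor,
                     alpha: float, beta: float) -> torch.Tensor:
        # Assumed StyleTTS2 convention: first 128 dims are the acoustic
        # (timbre) style, the rest prosodic; higher alpha/beta favors the
        # sampled style over the reference.
        ref = alpha * s_pred[:, :128] + (1 - alpha) * ref_s[:, :128]
        pro = beta * s_pred[:, 128:] + (1 - beta) * ref_s[:, 128:]
        return torch.cat([ref, pro], dim=1)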
@@ -437,4 +507,5 @@ class Predictor:
             return False
         except Exception as e:
             print(f"Error uploading file: {e}")
-            return False
+            return False
+
 
 
src/processed/4d651a78-ccbd-4f66-96b1-0e0ede048d77/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c25a5bee8b60933b09cc779d942fa5c219f437e455bf64b08c2623f1c833ccfe
+size 322856
src/processed/69b28271-7198-4307-8501-e3969bbebef4/raw/631a27e2-8466-463e-a6ca-a2afd468c5a3.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c25a5bee8b60933b09cc779d942fa5c219f437e455bf64b08c2623f1c833ccfe
+size 322856
src/rp_handler.py CHANGED
@@ -21,13 +21,36 @@ MODEL.setup()
 @rp_debugger.FunctionTimer
 def run_voice_clone_job(job):
     job_input = job['input']
-    method_type = job_input['method_type']
-    assert method_type in ["create_voice","voice_clone","voice_clone_with_emotions","voice_clone_with_multi_lang"]
-    s3_url = job_input['s3_url']
-    passage = job_input['passage']
-    processed_urls = MODEL.predict(s3_url,passage,method_type)
-
-    return processed_urls
+    method_type = job_input.get('method_type')
+
+    if method_type not in ["create_voice","voice_clone","voice_clone_with_emotions","voice_clone_with_multi_lang"]:
+        return {"error":"Please set method_type; available options: create_voice, voice_clone, voice_clone_with_emotions, voice_clone_with_multi_lang"}
+
+    if method_type == "create_voice":
+        audio_base64 = job_input.get('audio_base64')
+        if audio_base64 is None:
+            return {"error":"Needs audio file as base64"}
+        cut_audio = job_input.get('cut_audio')
+        process_audio = job_input.get('process_audio')
+        print(process_audio)
+        if process_audio is None:
+            process_audio = False
+        if cut_audio is None:
+            cut_audio = 0
+
+        processed_urls = MODEL.createvoice(audio_base64,cut_audio,process_audio)
+        return processed_urls
+    else:
+        s3_url = job_input.get('s3_url')
+        passage = job_input.get('passage')
+        process_audio = job_input.get('process_audio')
+        print(process_audio)
+        if process_audio is None:
+            process_audio = False
+
+        result = MODEL.predict(s3_url,passage,process_audio,method_type)
+
+        return result
 
 
 runpod.serverless.start({"handler": run_voice_clone_job})
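The handler now accepts two input shapes. Minimal example payloads for both paths (all values illustrative; cut_audio is a duration in seconds, consumed by se_extractor.extract_segments_to_cut_audio):

    # create_voice: register a reference sample sent inline as base64
    create_voice_job = {"input": {
        "method_type": "create_voice",
        "audio_base64": "<base64-encoded wav or mp3 bytes>",
        "cut_audio": 30,        # optional; 0 (the default) skips trimming
        "process_audio": True,  # optional; run the enhancer before upload
    }}

    # voice_clone / voice_clone_with_emotions / voice_clone_with_multi_lang:
    # synthesize a passage against a previously uploaded reference
    voice_clone_job = {"input": {
        "method_type": "voice_clone",
        "s3_url": "https://demovidelyuseruploads.s3.amazonaws.com/<key>.wav",
        "passage": "Text to synthesize in the cloned voice.",
        "process_audio": False,  # optional; defaults to False in the handler
    }}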
 
src/rp_schema.py CHANGED
@@ -14,5 +14,20 @@ INPUT_VALIDATIONS = {
         'required': False,
         'default': 'None'
     },
-
+    'audio_base64': {
+        'type': str,
+        'required': False,
+        'default': 'None'
+    },
+    'cut_audio': {
+        'type': int,
+        'required': False,
+        'default': 0
+    },
+    'process_audio': {
+        'type': bool,
+        'required': False,
+        'default': False
+    }
+
 }
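All three new fields are optional, so malformed requests surface inside the handler rather than at validation time. If the schema is ever enforced, the usual RunPod pattern looks like this sketch (assuming the rp_validator helper shipped with the runpod SDK):

    from runpod.serverless.utils.rp_validator import validate
    from rp_schema import INPUT_VALIDATIONS

    def validated_input(job_input: dict) -> dict:
        # Returns {'validated_input': {...}} on success, {'errors': [...]} on failure.
        return validate(job_input, INPUT_VALIDATIONS)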
 
src/se_extractor.py CHANGED
@@ -10,7 +10,7 @@ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
 model_size = "medium"
 # Run on GPU with FP16
 model = None
-def split_audio_whisper(audio_path, target_dir='processed'):
+def split_audio_whisper(audio_path, target_dir='processed',needs_offset=True):
     global model
     if model is None:
         model = WhisperModel(model_size, device="cuda", compute_type="float16")
@@ -62,13 +62,18 @@ def split_audio_whisper(audio_path, target_dir='processed'):
         output_file = os.path.join(wavs_folder, fname)
         audio_seg.export(output_file, format='wav')
 
+        offset = 0.0
+        if needs_offset:
+            offset = 0.08
         if k < len(segments) - 1:
-            start_time = max(0, segments[k+1].start - 0.08)
+            start_time = max(0, segments[k+1].start - offset)
 
         s_ind = s_ind + 1
     return wavs_folder
 
 
+
+
 def split_audio_vad(audio_path, target_dir, split_seconds=10.0):
     SAMPLE_RATE = 16000
     audio_vad = get_audio_tensor(audio_path)
@@ -155,3 +160,24 @@ def generate_voice_segments(audio_path, target_dir='processed', vad=True):
 def load_model():
     model = WhisperModel(model_size, device="cpu", compute_type="int8")
 
+
+def extract_segments_to_cut_audio(max_duration,audio_path,target_dir='processed'):
+    global model
+    if model is None:
+        model = WhisperModel(model_size, device="cuda", compute_type="float16")
+    audio = AudioSegment.from_file(audio_path)
+    max_len = len(audio)
+
+    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
+    segments = list(segments)
+    start_time = 0.0
+    end_time = max_len
+    for segment in segments:
+        print(segment.end)
+        if segment.end > max_duration:
+            end_time = segment.end * 1000
+            break
+    max_duration_audio = audio[start_time:end_time]
+    max_duration_audio.export(audio_path,format='wav')
+
+
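extract_segments_to_cut_audio trims a reference recording in place to roughly max_duration: it transcribes with word timestamps and cuts at the end of the first segment whose end crosses the limit. segment.end is in seconds while pydub slices in milliseconds, hence the * 1000; if no segment crosses the limit the file is left at full length. A usage sketch, assuming a WAV file on disk:

    import se_extractor

    # Trim reference.wav in place to about 30 seconds,
    # ending on a Whisper segment boundary.
    se_extractor.extract_segments_to_cut_audio(30, 'reference.wav')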