StormblessedKal committed on
Commit
e340491
1 Parent(s): e13b6d4

output_extension support

Browse files
Files changed (3) hide show
  1. src/predict.py +24 -24
  2. src/rp_handler.py +8 -4
  3. src/rp_schema.py +5 -0
src/predict.py CHANGED
@@ -204,7 +204,7 @@ class Predictor:
204
  return {"url": file_url}
205
 
206
 
207
- def predict(self,s3_url,passage,process_audio,run_type='styletts2'):
208
  output_dir = 'processed'
209
  gen_id = str(uuid.uuid4())
210
  os.makedirs(output_dir,exist_ok=True)
@@ -225,14 +225,14 @@ class Predictor:
225
  if run_type == 'styletts2':
226
  model,sampler = self.model,self.sampler
227
  result = self.process_audio_file(local_file_path,passage,model,sampler)
228
- final_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
229
 
230
- sf.write(final_output,result,24000)
231
  if process_audio:
232
- (new_sr, wav1) = self._fn(final_output,"Midpoint",32,0.5)
233
- sf.write(final_output,wav1,new_sr)
234
- mp3_final_output = str(final_output).replace('wav','mp3')
235
- self.convert_wav_to_mp3(final_output,mp3_final_output)
236
 
237
  if run_type == 'openvoice':
238
  s_ref = self.compute_style(local_file_path, self.model)
@@ -248,18 +248,18 @@ class Predictor:
248
  if process_audio:
249
  (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
250
  sf.write(openvoice_output,wav1,new_sr)
251
- mp3_final_output = str(openvoice_output).replace('wav','mp3')
252
- self.convert_wav_to_mp3(openvoice_output,mp3_final_output)
253
 
254
- self.upload_file_to_s3(mp3_final_output,'demovidelyusergenerations',f"{gen_id}-voice-clone.mp3")
255
  shutil.rmtree(os.path.join(output_dir,gen_id))
256
- return {"voice_clone":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone.mp3"
257
  }
258
 
259
 
260
 
261
 
262
- def predict_with_emotions(self,s3_url,passage,process_audio):
263
  output_dir = 'processed'
264
  gen_id = str(uuid.uuid4())
265
  os.makedirs(output_dir,exist_ok=True)
@@ -292,15 +292,15 @@ class Predictor:
292
  if process_audio:
293
  (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
294
  sf.write(openvoice_output,wav1,new_sr)
295
- mp3_final_output_2 = str(openvoice_output).replace('wav','mp3')
296
- self.convert_wav_to_mp3(openvoice_output,mp3_final_output_2)
297
- self.upload_file_to_s3(mp3_final_output_2,'demovidelyusergenerations',f"{gen_id}-voice-clone-emotions.mp3")
298
  shutil.rmtree(os.path.join(output_dir,gen_id))
299
- return {"voice_clone_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-emotions.mp3",
300
  }
301
 
302
 
303
- def predict_with_multi_lang(self,s3_url,passage,process_audio):
304
  print("In multi lang voice cloning")
305
  output_dir = 'processed'
306
  gen_id = str(uuid.uuid4())
@@ -342,12 +342,12 @@ class Predictor:
342
  source_se, audio_name = se_extractor.get_se(src_path, tone_color_converter, vad=True)
343
  self.tone_color_converter.convert(audio_src_path=openai_multi_lang_path, src_se=source_se, tgt_se=target_se, output_path=multi_lang_with_voice_clone_path,message='')
344
 
345
- mp3_final_output_1 = str(multi_lang_with_voice_clone_path).replace('wav','mp3')
346
- self.convert_wav_to_mp3(multi_lang_with_voice_clone_path,mp3_final_output_1)
347
- print(mp3_final_output_1)
348
- self.upload_file_to_s3(mp3_final_output_1,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.mp3")
349
  shutil.rmtree(os.path.join(output_dir,gen_id))
350
- return {"voice_clone_with_multi_lang":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-multi-lang.mp3"
351
  }
352
 
353
 
@@ -520,8 +520,8 @@ class Predictor:
520
  print(f"Error downloading file: {e}")
521
 
522
 
523
- def convert_wav_to_mp3(self,wav_file, mp3_file):
524
- command = ['ffmpeg', '-i', wav_file, '-q:a', '0', '-map', 'a', mp3_file]
525
  subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
526
 
527
 
 
204
  return {"url": file_url}
205
 
206
 
207
+ def predict(self,s3_url,passage,process_audio,output_extension,run_type='styletts2'):
208
  output_dir = 'processed'
209
  gen_id = str(uuid.uuid4())
210
  os.makedirs(output_dir,exist_ok=True)
 
225
  if run_type == 'styletts2':
226
  model,sampler = self.model,self.sampler
227
  result = self.process_audio_file(local_file_path,passage,model,sampler)
228
+ generated_output = os.path.join(results_dir,f"{gen_id}-voice-clone-1.wav")
229
 
230
+ sf.write(generated_output,result,24000)
231
  if process_audio:
232
+ (new_sr, wav1) = self._fn(generated_output,"Midpoint",32,0.5)
233
+ sf.write(generated_output,wav1,new_sr)
234
+ final_output = str(generated_output).replace('wav',output_extension)
235
+ self.convert_wav_to_output_extension(generated_output,final_output)
236
 
237
  if run_type == 'openvoice':
238
  s_ref = self.compute_style(local_file_path, self.model)
 
248
  if process_audio:
249
  (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
250
  sf.write(openvoice_output,wav1,new_sr)
251
+ final_output = str(openvoice_output).replace('wav',output_extension)
252
+ self.convert_wav_to_output_extension(openvoice_output,final_output)
253
 
254
+ self.upload_file_to_s3(final_output,'demovidelyusergenerations',f"{gen_id}-voice-clone.{output_extension}")
255
  shutil.rmtree(os.path.join(output_dir,gen_id))
256
+ return {"voice_clone":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone.{output_extension}"
257
  }
258
 
259
 
260
 
261
 
262
+ def predict_with_emotions(self,s3_url,passage,output_extension,process_audio):
263
  output_dir = 'processed'
264
  gen_id = str(uuid.uuid4())
265
  os.makedirs(output_dir,exist_ok=True)
 
292
  if process_audio:
293
  (new_sr, wav1) = self._fn(openvoice_output,"Midpoint",32,0.5)
294
  sf.write(openvoice_output,wav1,new_sr)
295
+ final_ouput = str(openvoice_output).replace('wav',output_extension)
296
+ self.convert_wav_to_output_extension(openvoice_output,final_ouput)
297
+ self.upload_file_to_s3(final_ouput,'demovidelyusergenerations',f"{gen_id}-voice-clone-emotions.{output_extension}")
298
  shutil.rmtree(os.path.join(output_dir,gen_id))
299
+ return {"voice_clone_emotions":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-emotions.{output_extension}",
300
  }
301
 
302
 
303
+ def predict_with_multi_lang(self,s3_url,passage,output_extension,process_audio):
304
  print("In multi lang voice cloning")
305
  output_dir = 'processed'
306
  gen_id = str(uuid.uuid4())
 
342
  source_se, audio_name = se_extractor.get_se(src_path, tone_color_converter, vad=True)
343
  self.tone_color_converter.convert(audio_src_path=openai_multi_lang_path, src_se=source_se, tgt_se=target_se, output_path=multi_lang_with_voice_clone_path,message='')
344
 
345
+ final_output = str(multi_lang_with_voice_clone_path).replace('wav',output_extension)
346
+ self.convert_wav_to_output_extension(multi_lang_with_voice_clone_path,final_output)
347
+ print(final_output)
348
+ self.upload_file_to_s3(final_output,'demovidelyusergenerations',f"{gen_id}-voice-clone-multi-lang.{output_extension}")
349
  shutil.rmtree(os.path.join(output_dir,gen_id))
350
+ return {"voice_clone_with_multi_lang":f"https://demovidelyusergenerations.s3.amazonaws.com/{gen_id}-voice-clone-multi-lang.{output_extension}"
351
  }
352
 
353
 
 
520
  print(f"Error downloading file: {e}")
521
 
522
 
523
def convert_wav_to_output_extension(self, wav_file, output_file):
    """Transcode ``wav_file`` into ``output_file`` by invoking ffmpeg.

    No format option is passed explicitly; the target format is whatever
    ffmpeg selects for ``output_file`` (normally from its extension).

    Args:
        wav_file: Path of the source audio file handed to ffmpeg via ``-i``.
        output_file: Destination path written by ffmpeg.

    Returns:
        None. A non-zero ffmpeg exit status is reported on stdout instead of
        being silently dropped; no exception is raised for it.
    """
    # '-map a' selects only the audio streams of the input.
    # NOTE(review): '-q:a 0' is the best VBR level for libmp3lame, but for
    # Vorbis encoders a higher value means better quality, so 0 is presumably
    # near the low end for .ogg output - confirm this is intended.
    command = ['ffmpeg', '-i', wav_file, '-q:a', '0', '-map', 'a', output_file]
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if completed.returncode != 0:
        # ffmpeg writes its diagnostics to stderr; surface them so a failed
        # conversion is visible before later steps try to use output_file.
        details = completed.stderr.decode(errors='replace')
        print(f"Error converting {wav_file} to {output_file}: {details}")
526
 
527
 
src/rp_handler.py CHANGED
@@ -45,20 +45,24 @@ def run_voice_clone_job(job):
45
  s3_url = job_input.get('s3_url')
46
  passage = job_input.get('passage')
47
  process_audio = job_input.get('process_audio')
 
48
  print(process_audio)
 
 
 
49
  if process_audio is None:
50
  process_audio = False
51
 
52
  if method_type == 'voice_clone':
53
  run_type = job_input.get('run_type')
54
  if run_type is not None:
55
- result = MODEL.predict(s3_url,passage,process_audio,run_type)
56
  else:
57
- result = MODEL.predict(s3_url,passage,process_audio)
58
  if method_type == 'voice_clone_with_emotions':
59
- result = MODEL.predict_with_emotions(s3_url,passage,process_audio)
60
  if method_type == 'voice_clone_with_multi_lang':
61
- result = MODEL.predict_with_multi_lang(s3_url,passage,process_audio)
62
 
63
  return result
64
 
 
45
  s3_url = job_input.get('s3_url')
46
  passage = job_input.get('passage')
47
  process_audio = job_input.get('process_audio')
48
+ output_extension = job_input.get('output_extension')
49
  print(process_audio)
50
+ if output_extension not in ["mp3","ogg"]:
51
+ return {"error" : "only supports mp3 and ogg as output_extension"}
52
+ print(output_extension)
53
  if process_audio is None:
54
  process_audio = False
55
 
56
  if method_type == 'voice_clone':
57
  run_type = job_input.get('run_type')
58
  if run_type is not None:
59
+ result = MODEL.predict(s3_url,passage,process_audio,output_extension,run_type)
60
  else:
61
+ result = MODEL.predict(s3_url,passage,process_audio,output_extension)
62
  if method_type == 'voice_clone_with_emotions':
63
+ result = MODEL.predict_with_emotions(s3_url,passage,process_audio,output_extension)
64
  if method_type == 'voice_clone_with_multi_lang':
65
+ result = MODEL.predict_with_multi_lang(s3_url,passage,process_audio,output_extension)
66
 
67
  return result
68
 
src/rp_schema.py CHANGED
@@ -33,5 +33,10 @@ INPUT_VALIDATIONS = {
33
  'type': str,
34
  'required': False,
35
  'default': False
 
 
 
 
 
36
  }
37
  }
 
33
  'type': str,
34
  'required': False,
35
  'default': False
36
+ },
37
+ 'output_extension': {
38
+ 'type': str,
39
+ 'required': False,
40
+ 'default': 'ogg'
41
  }
42
  }