Spanicin committed on
Commit 8c41ef2 · verified · 1 Parent(s): b0c735f

Update app.py

Files changed (1)
  1. app.py +128 -264
app.py CHANGED
@@ -3,19 +3,13 @@ import torch
  import shutil
  import os
  import sys
- from argparse import ArgumentParser
- from time import strftime
- from argparse import Namespace
  from src.utils.preprocess import CropAndExtract
  from src.test_audio2coeff import Audio2Coeff
  from src.facerender.animate import AnimateFromCoeff
  from src.generate_batch import get_data
  from src.generate_facerender_batch import get_facerender_data
- # from src.utils.init_path import init_path
  import tempfile
  from openai import OpenAI
- import threading
- import elevenlabs
  from elevenlabs import set_api_key, generate, play, clone, Voice, VoiceSettings
  from flask_cors import CORS, cross_origin
  # from flask_swagger_ui import get_swaggerui_blueprint
@@ -29,6 +23,12 @@ import pickle
  # from videoretalking import inference_function
  # import base64
  # import gfpgan_enhancer
@@ -80,216 +80,71 @@ app.config['text_prompt'] = None
  app.config['final_video_path'] = None
  app.config['final_video_duration'] = None

-
-
- def main(args):
-     pic_path = args.source_image
-     audio_path = args.driven_audio
-     save_dir = args.result_dir
-     pose_style = args.pose_style
-     device = args.device
-     batch_size = args.batch_size
-     input_yaw_list = args.input_yaw
-     input_pitch_list = args.input_pitch
-     input_roll_list = args.input_roll
-     ref_eyeblink = args.ref_eyeblink
-     ref_pose = args.ref_pose
-     preprocess = args.preprocess
-     image_hardcoded = args.image_hardcoded
-
-     dir_path = os.path.dirname(os.path.realpath(__file__))
-     current_root_path = dir_path
-     print('current_root_path ',current_root_path)
-
-     # sadtalker_paths = init_path(args.checkpoint_dir, os.path.join(current_root_path, 'src/config'), args.size, args.old_version, args.preprocess)
-
-     path_of_lm_croper = os.path.join(current_root_path, args.checkpoint_dir, 'shape_predictor_68_face_landmarks.dat')
-     path_of_net_recon_model = os.path.join(current_root_path, args.checkpoint_dir, 'epoch_20.pth')
-     dir_of_BFM_fitting = os.path.join(current_root_path, args.checkpoint_dir, 'BFM_Fitting')
-     wav2lip_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'wav2lip.pth')
-
-     audio2pose_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2pose_00140-model.pth')
-     audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')
-
-     audio2exp_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'auido2exp_00300-model.pth')
-     audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')
-
-     free_view_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'facevid2vid_00189-model.pth.tar')
-
-     if preprocess == 'full':
-         mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00109-model.pth.tar')
          facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
      else:
-         mapping_checkpoint = os.path.join(current_root_path, args.checkpoint_dir, 'mapping_00229-model.pth.tar')
          facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')

-     # preprocess_model = CropAndExtract(sadtalker_paths, device)
-     #init model
-     print(path_of_net_recon_model)
-     preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, device)
-
-     # audio_to_coeff = Audio2Coeff(sadtalker_paths, device)
      audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
                                   audio2exp_checkpoint, audio2exp_yaml_path,
-                                  wav2lip_checkpoint, device)
-     # animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device)
-     animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
-                                           facerender_yaml_path, device)
-
-     first_frame_dir = os.path.join(save_dir, 'first_frame_dir')
-     os.makedirs(first_frame_dir, exist_ok=True)
-     # first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess,\
-     # source_image_flag=True, pic_size=args.size)
-
-
-     fixed_temp_dir = "/tmp/preprocess_data"
-     os.makedirs(fixed_temp_dir, exist_ok=True)
-     preprocessed_data_path = os.path.join(fixed_temp_dir, "preprocessed_data.pkl")
-
-     if os.path.exists(preprocessed_data_path) and image_hardcoded == "yes":
-         print("Loading preprocessed data...")
-         with open(preprocessed_data_path, "rb") as f:
-             preprocessed_data = pickle.load(f)
-         first_coeff_new_path = preprocessed_data["first_coeff_path"]
-         crop_pic_new_path = preprocessed_data["crop_pic_path"]
-         crop_info_path = preprocessed_data["crop_info_path"]
-         with open(crop_info_path, "rb") as f:
-             crop_info = pickle.load(f)
-
-         print(f"Loaded existing preprocessed data from: {preprocessed_data_path}")
-
-     else:
-         print("Running preprocessing...")
-         first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(pic_path, first_frame_dir, args.preprocess, source_image_flag=True)
-         first_coeff_new_path = os.path.join(fixed_temp_dir, os.path.basename(first_coeff_path))
-         crop_pic_new_path = os.path.join(fixed_temp_dir, os.path.basename(crop_pic_path))
-         crop_info_new_path = os.path.join(fixed_temp_dir, "crop_info.pkl")
-         shutil.move(first_coeff_path, first_coeff_new_path)
-         shutil.move(crop_pic_path, crop_pic_new_path)
-
-         with open(crop_info_new_path, "wb") as f:
-             pickle.dump(crop_info, f)
-
-         preprocessed_data = {"first_coeff_path": first_coeff_new_path,
-                              "crop_pic_path": crop_pic_new_path,
-                              "crop_info_path": crop_info_new_path}
-
-         with open(preprocessed_data_path, "wb") as f:
-             pickle.dump(preprocessed_data, f)
-         print(f"Preprocessed data saved to: {preprocessed_data_path}")

-     print('first_coeff_path ',first_coeff_new_path)
-     print('crop_pic_path ',crop_pic_new_path)
-     print('crop_info ',crop_info)
-
-     if first_coeff_new_path is None:
-         print("Can't get the coeffs of the input")
-         return
-
-     if ref_eyeblink is not None:
-         ref_eyeblink_videoname = os.path.splitext(os.path.split(ref_eyeblink)[-1])[0]
-         ref_eyeblink_frame_dir = os.path.join(save_dir, ref_eyeblink_videoname)
-         os.makedirs(ref_eyeblink_frame_dir, exist_ok=True)
-         # ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir, args.preprocess, source_image_flag=False)
-         ref_eyeblink_coeff_path, _, _ = preprocess_model.generate(ref_eyeblink, ref_eyeblink_frame_dir)
-     else:
-         ref_eyeblink_coeff_path=None
-     print('ref_eyeblink_coeff_path',ref_eyeblink_coeff_path)
-
-     if ref_pose is not None:
-         if ref_pose == ref_eyeblink:
-             ref_pose_coeff_path = ref_eyeblink_coeff_path
-         else:
-             ref_pose_videoname = os.path.splitext(os.path.split(ref_pose)[-1])[0]
-             ref_pose_frame_dir = os.path.join(save_dir, ref_pose_videoname)
-             os.makedirs(ref_pose_frame_dir, exist_ok=True)
-             # ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir, args.preprocess, source_image_flag=False)
-             ref_pose_coeff_path, _, _ = preprocess_model.generate(ref_pose, ref_pose_frame_dir)
-     else:
-         ref_pose_coeff_path=None
-     print('ref_eyeblink_coeff_path',ref_pose_coeff_path)
-
-     batch = get_data(first_coeff_new_path, audio_path, device, ref_eyeblink_coeff_path, still=args.still)
-     coeff_path = audio_to_coeff.generate(batch, save_dir, pose_style, ref_pose_coeff_path)
-
-
-     if args.face3dvis:
-         from src.face3d.visualize import gen_composed_video
-         gen_composed_video(args, device, first_coeff_new_path, coeff_path, audio_path, os.path.join(save_dir, '3dface.mp4'))
-
-     # data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path,
-     #                            batch_size, input_yaw_list, input_pitch_list, input_roll_list,
-     #                            expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess, size=args.size)

-     data = get_facerender_data(coeff_path, crop_pic_new_path, first_coeff_new_path, audio_path,
-                                batch_size, input_yaw_list, input_pitch_list, input_roll_list,
                                 expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)

-     # result, base64_video,temp_file_path= animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
-     #                                      enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess, img_size=args.size)
-
-
-     result, base64_video,temp_file_path,new_audio_path = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, \
-                                            enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)
-
-     # face_path = temp_file_path
-     # audio_path = new_audio_path
-     # temp_file = tempfile.NamedTemporaryFile(delete=False, dir=TEMP_DIR.name, suffix='.mp4')
-     # video_lipsync_file_path = temp_file.name
-     # output_path = video_lipsync_file_path
-
-     # # Call the function
-     # inference_function.video_lipsync_correctness(
-     #     face=face_path,
-     #     audio_path=audio_path,
-     #     face3d_net_path = path_of_net_recon_model,
-     #     outfile=output_path,
-     #     tmp_dir="temp",
-     #     crop=[0, -1, 0, -1],
-     #     re_preprocess=True,  # Set to True if you want to reprocess; False otherwise
-     #     exp_img="neutral",  # Can be 'smile', 'neutral', or path to an expression image
-     #     one_shot=False,
-     #     up_face="original",  # Options: 'original', 'sad', 'angry', 'surprise'
-     #     LNet_batch_size=16,
-     #     without_rl1=False
-     # )
-
-     # print('The video with lip sync is generated')
-     # print("GFPGAN Activated")
-
-     # gfpgan_enhancer.process_video_with_gfpgan(output_path, output_path)
-     # audio_clip = mp.AudioFileClip(new_audio_path)
-     # video_clip = mp.VideoFileClip(output_path)
-     # # Combine audio and video
-     # final_clip = video_clip.set_audio(audio_clip)
-
-     # temp_file = tempfile.NamedTemporaryFile(suffix='.mp4', dir=TEMP_DIR.name, delete=False)
-     # temp_file.close()
-     # final_video_path = temp_file.name
-     # final_clip.write_videofile(final_video_path)
-
-     # with open(final_video_path, 'rb') as f:
-     #     video_content = f.read()
-
-     # base64_lipsync_video = base64.b64encode(video_content).decode('utf-8')
-
-     video_clip = mp.VideoFileClip(temp_file_path)
-     duration = video_clip.duration

      app.config['temp_response'] = base64_video
      app.config['final_video_path'] = temp_file_path
-     app.config['final_video_duration'] = duration
-
-     return base64_video, temp_file_path, duration
-
-     # shutil.move(result, save_dir+'.mp4')
-

-     if not args.verbose:
-         shutil.rmtree(save_dir)

  def create_temp_dir():
      return tempfile.TemporaryDirectory()
@@ -305,8 +160,8 @@ client = OpenAI(api_key="sk-proj-04146TPzEmvdV6DzSxsvNM7jxOnzys5TnB7iZB0tp59B-jM
  def openai_chat_avatar(text_prompt):
      response = client.chat.completions.create(
          model="gpt-4o-mini",
-         messages=[{"role": "system", "content": "Answer in English language always using the minimum words you can ever use."},
-                   {"role": "user", "content": f"Hi! I need help with something. Can you assist me with the following: {text_prompt}"},
          ],
          max_tokens = len(text_prompt) + 300 # Use the length of the input text
          # temperature=0.3,
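For scale, max_tokens here tracks the character length of the input rather than its token count: a 40-character prompt yields a completion budget of 40 + 300 = 340 tokens, so this is only a rough cap, not an exact sizing.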
@@ -359,6 +214,72 @@ def custom_cleanup(temp_dir, exclude_dir):
          except Exception as e:
              print(f"Failed to delete {file_path}. Reason: {e}")

  @app.route("/run", methods=['POST'])
  def generate_video():
      global start_time
@@ -430,64 +351,8 @@ def generate_video():
      source_image_path = save_uploaded_file(source_image, 'source_image.png',TEMP_DIR)
      print(source_image_path)

-     if voice_cloning == 'no':
-         if voice_gender == 'male':
-             voice = 'echo'
-             print('Entering Audio creation using elevenlabs')
-             set_api_key("92e149985ea2732b4359c74346c3daee")
-
-             audio = generate(text = text_prompt, voice = "George", model = "eleven_multilingual_v2",stream=True, latency=4)
-             with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="text_to_speech_",dir=TEMP_DIR.name, delete=False) as temp_file:
-                 for chunk in audio:
-                     temp_file.write(chunk)
-                 driven_audio_path = temp_file.name
-             print('driven_audio_path',driven_audio_path)
-             print('Audio file saved using elevenlabs')
-
-         else:
-             voice = 'nova'
-
-             print('Entering Audio creation using whisper')
-             response = client.audio.speech.create(model="tts-1-hd",
-                                                   voice=voice,
-                                                   input = text_prompt)
-
-             print('Audio created using whisper')
-             with tempfile.NamedTemporaryFile(suffix=".wav", prefix="text_to_speech_",dir=TEMP_DIR.name, delete=False) as temp_file:
-                 driven_audio_path = temp_file.name
-
-             response.write_to_file(driven_audio_path)
-             print('Audio file saved using whisper')
-
-     elif voice_cloning == 'yes':
-         # user_voice = request.files['user_voice']
-         # user_voice = '/home/user/app/images/marc_voice.mp3'
-
-         # with tempfile.NamedTemporaryFile(suffix=".wav", prefix="user_voice_",dir=TEMP_DIR.name, delete=False) as temp_file:
-         #     with open(user_voice, 'rb') as source_file:
-         #         file_contents = source_file.read()
-         #         temp_file.write(file_contents)
-
-         #     temp_file.flush()
-         #     user_voice_path = temp_file.name
-         # user_voice.save(user_voice_path)
-         # print('user_voice_path',user_voice_path)
-
-         set_api_key("92e149985ea2732b4359c74346c3daee")
-         # voice = clone(name = "User Cloned Voice",
-         #               files = [user_voice_path] )
-         voice = Voice(voice_id="DeZH4ash9IU9gUcNjVXh",name="Marc",settings=VoiceSettings(
-             stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True),)
-
-         audio = generate(text = text_prompt, voice = voice, model = "eleven_multilingual_v2",stream=True, latency=4)
-         with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="cloned_audio_",dir=TEMP_DIR.name, delete=False) as temp_file:
-             for chunk in audio:
-                 temp_file.write(chunk)
-             driven_audio_path = temp_file.name
-         print('driven_audio_path',driven_audio_path)
-
-         # elevenlabs.save(audio, driven_audio_path)
-
      save_dir = tempfile.mkdtemp(dir=TEMP_DIR.name)
      result_folder = os.path.join(save_dir, "results")
      os.makedirs(result_folder, exist_ok=True)
@@ -503,7 +368,6 @@ def generate_video():
          app.logger.error(f"An error occurred: {e}")
          return "An error occurred", 500

-     # Example of using the class with some hypothetical paths
      args = AnimationConfig(driven_audio_path=driven_audio_path, source_image_path=source_image_path, result_folder=result_folder, pose_style=pose_style, expression_scale=expression_scale,enhancer=enhancer,still=still,preprocess=preprocess,ref_pose_video_path=ref_pose_video_path, image_hardcoded=image_hardcoded)

      if torch.cuda.is_available() and not args.cpu:
@@ -518,7 +382,7 @@ def generate_video():
      # "process_id": generation_thread.ident}

      try:
-         base64_video, temp_file_path, duration = main(args)
          final_video_path = app.config['final_video_path']
          print('final_video_path',final_video_path)

  import shutil
  import os
  import sys
  from src.utils.preprocess import CropAndExtract
  from src.test_audio2coeff import Audio2Coeff
  from src.facerender.animate import AnimateFromCoeff
  from src.generate_batch import get_data
  from src.generate_facerender_batch import get_facerender_data
  import tempfile
  from openai import OpenAI
  from elevenlabs import set_api_key, generate, play, clone, Voice, VoiceSettings
  from flask_cors import CORS, cross_origin
  # from flask_swagger_ui import get_swaggerui_blueprint

  # from videoretalking import inference_function
  # import base64
  # import gfpgan_enhancer
+ # import threading
+ # import elevenlabs
+ # from argparse import Namespace
+ # from argparse import ArgumentParser
+ # from time import strftime
+ # from src.utils.init_path import init_path

  app.config['final_video_path'] = None
  app.config['final_video_duration'] = None

+ # Global paths
+ dir_path = os.path.dirname(os.path.realpath(__file__))
+ current_root_path = dir_path
+
+ path_of_lm_croper = os.path.join(current_root_path, 'checkpoints', 'shape_predictor_68_face_landmarks.dat')
+ path_of_net_recon_model = os.path.join(current_root_path, 'checkpoints', 'epoch_20.pth')
+ dir_of_BFM_fitting = os.path.join(current_root_path, 'checkpoints', 'BFM_Fitting')
+ wav2lip_checkpoint = os.path.join(current_root_path, 'checkpoints', 'wav2lip.pth')
+ audio2pose_checkpoint = os.path.join(current_root_path, 'checkpoints', 'auido2pose_00140-model.pth')
+ audio2pose_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2pose.yaml')
+ audio2exp_checkpoint = os.path.join(current_root_path, 'checkpoints', 'auido2exp_00300-model.pth')
+ audio2exp_yaml_path = os.path.join(current_root_path, 'src', 'config', 'auido2exp.yaml')
+ free_view_checkpoint = os.path.join(current_root_path, 'checkpoints', 'facevid2vid_00189-model.pth.tar')
+
+ # Function for running the actual task (using preprocessed data)
+ def process_chunk(audio_chunk, preprocessed_data, args):
+     print("Entered Process Chunk Function")
+     global audio2pose_checkpoint, audio2pose_yaml_path, audio2exp_checkpoint, audio2exp_yaml_path, wav2lip_checkpoint
+     global free_view_checkpoint
+     if args.preprocess == 'full':
+         mapping_checkpoint = os.path.join(current_root_path, 'checkpoints', 'mapping_00109-model.pth.tar')
          facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender_still.yaml')
      else:
+         mapping_checkpoint = os.path.join(current_root_path, 'checkpoints', 'mapping_00229-model.pth.tar')
          facerender_yaml_path = os.path.join(current_root_path, 'src', 'config', 'facerender.yaml')

+     first_coeff_path = preprocessed_data["first_coeff_path"]
+     crop_pic_path = preprocessed_data["crop_pic_path"]
+     crop_info_path = "/home/user/app/preprocess_data/crop_info.json"
+     with open(crop_info_path , "rb") as f:
+         crop_info = json.load(f)
+
+     print(f"Loaded existing preprocessed data")
+     print("first_coeff_path",first_coeff_path)
+     print("crop_pic_path",crop_pic_path)
+     print("crop_info",crop_info)
+     torch.cuda.empty_cache()
+     batch = get_data(first_coeff_path, audio_chunk, args.device, ref_eyeblink_coeff_path=None, still=args.still)
      audio_to_coeff = Audio2Coeff(audio2pose_checkpoint, audio2pose_yaml_path,
                                   audio2exp_checkpoint, audio2exp_yaml_path,
+                                  wav2lip_checkpoint, args.device)
+     coeff_path = audio_to_coeff.generate(batch, args.result_dir, args.pose_style, ref_pose_coeff_path=None)

+     # Further processing with animate_from_coeff using the coeff_path
+     animate_from_coeff = AnimateFromCoeff(free_view_checkpoint, mapping_checkpoint,
+                                           facerender_yaml_path, args.device)

+     torch.cuda.empty_cache()
+     data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_chunk,
+                                args.batch_size, args.input_yaw, args.input_pitch, args.input_roll,
                                 expression_scale=args.expression_scale, still_mode=args.still, preprocess=args.preprocess)
+     torch.cuda.empty_cache()
+     print("Will Enter Animation")
+     result, base64_video, temp_file_path, _ = animate_from_coeff.generate(data, args.result_dir, args.source_image, crop_info,
+                                               enhancer=args.enhancer, background_enhancer=args.background_enhancer, preprocess=args.preprocess)

+     # video_clip = mp.VideoFileClip(temp_file_path)
+     # duration = video_clip.duration

      app.config['temp_response'] = base64_video
      app.config['final_video_path'] = temp_file_path
+     # app.config['final_video_duration'] = duration
+     torch.cuda.empty_cache()
+     return base64_video, temp_file_path

  def create_temp_dir():
      return tempfile.TemporaryDirectory()
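For reference, process_chunk only reads a handful of attributes from the args object it receives (device, preprocess, still, pose_style, result_dir, batch_size, input_yaw/input_pitch/input_roll, expression_scale, enhancer, background_enhancer, source_image). A minimal sketch of driving it outside the Flask route, assuming a plain namespace instead of the app's AnimationConfig class; all paths below are hypothetical placeholders, not values from this commit:

    # Sketch only: a plain namespace carrying the attributes process_chunk reads.
    import pickle
    from types import SimpleNamespace

    args = SimpleNamespace(
        device="cuda",            # or "cpu"
        preprocess="crop",        # 'full' switches to the facerender_still.yaml config
        still=True,
        pose_style=0,
        result_dir="/tmp/results",
        batch_size=2,
        input_yaw=None, input_pitch=None, input_roll=None,
        expression_scale=1.0,
        enhancer=None,
        background_enhancer=None,
        source_image="/tmp/source_image.png",   # hypothetical source image
    )

    with open("/home/user/app/preprocess_data/preprocessed_data.pkl", "rb") as f:
        preprocessed_data = pickle.load(f)

    base64_video, temp_file_path = process_chunk("/tmp/driven_audio.wav", preprocessed_data, args)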
 
  def openai_chat_avatar(text_prompt):
      response = client.chat.completions.create(
          model="gpt-4o-mini",
+         messages=[{"role": "system", "content": "Ensure answers are concise, human-like, and clear while maintaining quality. Use the fewest possible words, avoiding unnecessary articles, prepositions, and adjectives. Responses should be short but still address the question thoroughly without being verbose.Keep them to one sentence only"},
+                   {"role": "user", "content": f"Hi! I need help with something. {text_prompt}"},
          ],
          max_tokens = len(text_prompt) + 300 # Use the length of the input text
          # temperature=0.3,
 
          except Exception as e:
              print(f"Failed to delete {file_path}. Reason: {e}")

+
+ def generate_audio(voice_cloning, voice_gender, text_prompt):
+     print("generate_audio")
+     if voice_cloning == 'no':
+         if voice_gender == 'male':
+             voice = 'echo'
+             print('Entering Audio creation using elevenlabs')
+             set_api_key('92e149985ea2732b4359c74346c3daee')
+
+             audio = generate(text = text_prompt, voice = "Daniel", model = "eleven_multilingual_v2",stream=True, latency=4)
+             with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="text_to_speech_",dir=TEMP_DIR.name, delete=False) as temp_file:
+                 for chunk in audio:
+                     temp_file.write(chunk)
+                 driven_audio_path = temp_file.name
+             print('driven_audio_path',driven_audio_path)
+             print('Audio file saved using elevenlabs')
+
+         else:
+             voice = 'nova'
+
+             print('Entering Audio creation using whisper')
+             response = client.audio.speech.create(model="tts-1-hd",
+                                                   voice=voice,
+                                                   input = text_prompt)
+
+             print('Audio created using whisper')
+             with tempfile.NamedTemporaryFile(suffix=".wav", prefix="text_to_speech_",dir=TEMP_DIR.name, delete=False) as temp_file:
+                 driven_audio_path = temp_file.name
+
+             response.write_to_file(driven_audio_path)
+             print('Audio file saved using whisper')
+
+     elif voice_cloning == 'yes':
+         set_api_key('92e149985ea2732b4359c74346c3daee')
+         # voice = clone(name = "User Cloned Voice",
+         #               files = [user_voice_path] )
+         voice = Voice(voice_id="CEii8R8RxmB0zhAiloZg",name="Marc",settings=VoiceSettings(
+             stability=0.71, similarity_boost=0.5, style=0.0, use_speaker_boost=True),)
+
+         audio = generate(text = text_prompt, voice = voice, model = "eleven_multilingual_v2",stream=True, latency=4)
+         with tempfile.NamedTemporaryFile(suffix=".mp3", prefix="cloned_audio_",dir=TEMP_DIR.name, delete=False) as temp_file:
+             for chunk in audio:
+                 temp_file.write(chunk)
+             driven_audio_path = temp_file.name
+         print('driven_audio_path',driven_audio_path)
+         # audio_duration = get_audio_duration(driven_audio_path)
+         # print('Total Audio Duration in seconds',audio_duration)
+
+     return driven_audio_path
+
+ def run_preprocessing(args):
+     global path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting
+     first_frame_dir = os.path.join(args.result_dir, 'first_frame_dir')
+     os.makedirs(first_frame_dir, exist_ok=True)
+     fixed_temp_dir = "/home/user/app/preprocess_data/"
+     os.makedirs(fixed_temp_dir, exist_ok=True)
+     preprocessed_data_path = os.path.join(fixed_temp_dir, "preprocessed_data.pkl")
+
+     if os.path.exists(preprocessed_data_path) and args.image_hardcoded == "yes":
+         print("Loading preprocessed data...")
+         with open(preprocessed_data_path, "rb") as f:
+             preprocessed_data = pickle.load(f)
+         print("Loaded existing preprocessed data from:", preprocessed_data_path)
+
+     return preprocessed_data
+
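Note that run_preprocessing now only loads an existing preprocessed_data.pkl; the code that originally produced it (the CropAndExtract call in the removed main) is no longer in this file. A minimal sketch of rebuilding that cache offline, modeled on the removed code but not part of this commit: the source-image path is a hypothetical placeholder, crop_info is assumed to be JSON-serializable, and writing it as JSON matches the path process_chunk reads.

    # Sketch only: regenerate the cache that run_preprocessing()/process_chunk() expect.
    import json, os, pickle, shutil
    from src.utils.preprocess import CropAndExtract

    fixed_temp_dir = "/home/user/app/preprocess_data/"
    os.makedirs(fixed_temp_dir, exist_ok=True)

    preprocess_model = CropAndExtract(path_of_lm_croper, path_of_net_recon_model, dir_of_BFM_fitting, "cuda")
    first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(
        "/home/user/app/images/source_image.png",   # hypothetical source image
        fixed_temp_dir, "crop", source_image_flag=True)

    # Move the artifacts next to the pickle and record their final locations.
    first_coeff_path = shutil.move(first_coeff_path, os.path.join(fixed_temp_dir, os.path.basename(first_coeff_path)))
    crop_pic_path = shutil.move(crop_pic_path, os.path.join(fixed_temp_dir, os.path.basename(crop_pic_path)))

    with open(os.path.join(fixed_temp_dir, "crop_info.json"), "w") as f:
        json.dump(crop_info, f)      # process_chunk loads this JSON file
    with open(os.path.join(fixed_temp_dir, "preprocessed_data.pkl"), "wb") as f:
        pickle.dump({"first_coeff_path": first_coeff_path, "crop_pic_path": crop_pic_path}, f)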
  @app.route("/run", methods=['POST'])
  def generate_video():
      global start_time
 
      source_image_path = save_uploaded_file(source_image, 'source_image.png',TEMP_DIR)
      print(source_image_path)

+     driven_audio_path = generate_audio(voice_cloning, voice_gender, text_prompt)
+
      save_dir = tempfile.mkdtemp(dir=TEMP_DIR.name)
      result_folder = os.path.join(save_dir, "results")
      os.makedirs(result_folder, exist_ok=True)
 
          app.logger.error(f"An error occurred: {e}")
          return "An error occurred", 500

      args = AnimationConfig(driven_audio_path=driven_audio_path, source_image_path=source_image_path, result_folder=result_folder, pose_style=pose_style, expression_scale=expression_scale,enhancer=enhancer,still=still,preprocess=preprocess,ref_pose_video_path=ref_pose_video_path, image_hardcoded=image_hardcoded)

      if torch.cuda.is_available() and not args.cpu:
 
      # "process_id": generation_thread.ident}

      try:
+         base64_video, temp_file_path, duration = process_chunk(driven_audio_path, preprocessed_data, args)
          final_video_path = app.config['final_video_path']
          print('final_video_path',final_video_path)
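For context, generate_video is a Flask endpoint driven by multipart form data. A hedged sketch of calling it follows; the URL/port and the field names (source_image, text_prompt, voice_cloning, voice_gender, and the animation options) are assumptions inferred from the variable names used inside the route, since the request-parsing code falls outside this diff.

    # Illustrative client call only; field names and URL are assumptions, not confirmed by this diff.
    import requests

    with open("face.png", "rb") as img:
        resp = requests.post(
            "http://localhost:5000/run",
            files={"source_image": img},
            data={
                "text_prompt": "Hello, welcome to the demo.",
                "voice_cloning": "no",
                "voice_gender": "male",
                "pose_style": 0,
                "expression_scale": 1.0,
                "preprocess": "crop",
                "image_hardcoded": "yes",
            },
        )
    print(resp.status_code)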