Spaces:

fudan-generative-ai
/

hallo

Running

App Files Files Community

leeway.zlw committed on Jun 20, 2024

Commit

69c71b8

1 Parent(s): ca33a23

update

Browse files

Files changed (1) hide show

app.py +27 -6

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] els
 if(not is_shared_ui):
     hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
-def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
     if is_shared_ui:
         raise gr.Error("This Space only works in duplicated instances")
@@ -23,10 +23,10 @@ def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=T
         source_image=source_image,
         driving_audio=driving_audio,
         output=f'output-{unique_id}.mp4',
-        pose_weight=1.0,
-        face_weight=1.0,
-        lip_weight=1.0,
-        face_expand_ratio=1.2,
         checkpoint=None
     )
@@ -91,17 +91,38 @@ with gr.Blocks(css=css) as demo:
         ''', elem_id="warning-duplicate")
     gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
     gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
     with gr.Row():
         with gr.Column():
             avatar_face = gr.Image(type="filepath", label="Face")
             driving_audio = gr.Audio(type="filepath", label="Driving audio")
             generate = gr.Button("Generate")
         with gr.Column():
             output_video = gr.Video(label="Your talking head")
     generate.click(
         fn=run_inference,
-        inputs=[avatar_face, driving_audio],
         outputs=output_video
     )

 if(not is_shared_ui):
     hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
+def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
     if is_shared_ui:
         raise gr.Error("This Space only works in duplicated instances")
         source_image=source_image,
         driving_audio=driving_audio,
         output=f'output-{unique_id}.mp4',
+        pose_weight=pose_weight,
+        face_weight=face_weight,
+        lip_weight=lip_weight,
+        face_expand_ratio=face_expand_ratio,
         checkpoint=None
     )
         ''', elem_id="warning-duplicate")
     gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
     gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
+    gr.Markdown("""
+Hallo has a few simple requirements for input data:
+For the source image:
+1. It should be cropped into squares.
+2. The face should be the main focus, making up 50%-70% of the image.
+3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
+For the driving audio:
+1. It must be in WAV format.
+2. It must be in English since our training datasets are only in this language.
+3. Ensure the vocals are clear; background music is acceptable.
+We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
+                """)
     with gr.Row():
         with gr.Column():
             avatar_face = gr.Image(type="filepath", label="Face")
             driving_audio = gr.Audio(type="filepath", label="Driving audio")
+            pose_weight = gr.Number(label="pose weight", value=1.0),
+            face_weight = gr.Number(label="face weight", value=1.0),
+            lip_weight = gr.Number(label="lip weight", value=1.0),
+            face_expand_ratio = gr.Number(label="face expand ratio", value=1.2),
             generate = gr.Button("Generate")
         with gr.Column():
             output_video = gr.Video(label="Your talking head")
     generate.click(
         fn=run_inference,
+        inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
         outputs=output_video
     )