Spaces:
Running
Running
leeway.zlw
committed on
Commit
•
69c71b8
1
Parent(s):
ca33a23
update
Browse files
app.py
CHANGED
@@ -12,7 +12,7 @@ is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] els
|
|
12 |
if(not is_shared_ui):
|
13 |
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
|
14 |
|
15 |
-
def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
|
16 |
if is_shared_ui:
|
17 |
raise gr.Error("This Space only works in duplicated instances")
|
18 |
|
@@ -23,10 +23,10 @@ def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=T
|
|
23 |
source_image=source_image,
|
24 |
driving_audio=driving_audio,
|
25 |
output=f'output-{unique_id}.mp4',
|
26 |
-
pose_weight=
|
27 |
-
face_weight=
|
28 |
-
lip_weight=
|
29 |
-
face_expand_ratio=
|
30 |
checkpoint=None
|
31 |
)
|
32 |
|
@@ -91,17 +91,38 @@ with gr.Blocks(css=css) as demo:
|
|
91 |
''', elem_id="warning-duplicate")
|
92 |
gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
|
93 |
gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
with gr.Row():
|
95 |
with gr.Column():
|
96 |
avatar_face = gr.Image(type="filepath", label="Face")
|
97 |
driving_audio = gr.Audio(type="filepath", label="Driving audio")
|
|
|
|
|
|
|
|
|
98 |
generate = gr.Button("Generate")
|
99 |
with gr.Column():
|
100 |
output_video = gr.Video(label="Your talking head")
|
101 |
|
102 |
generate.click(
|
103 |
fn=run_inference,
|
104 |
-
inputs=[avatar_face, driving_audio],
|
105 |
outputs=output_video
|
106 |
)
|
107 |
|
|
|
12 |
if(not is_shared_ui):
|
13 |
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
|
14 |
|
15 |
+
def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
|
16 |
if is_shared_ui:
|
17 |
raise gr.Error("This Space only works in duplicated instances")
|
18 |
|
|
|
23 |
source_image=source_image,
|
24 |
driving_audio=driving_audio,
|
25 |
output=f'output-{unique_id}.mp4',
|
26 |
+
pose_weight=pose_weight,
|
27 |
+
face_weight=face_weight,
|
28 |
+
lip_weight=lip_weight,
|
29 |
+
face_expand_ratio=face_expand_ratio,
|
30 |
checkpoint=None
|
31 |
)
|
32 |
|
|
|
91 |
''', elem_id="warning-duplicate")
|
92 |
gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
|
93 |
gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
|
94 |
+
gr.Markdown("""
|
95 |
+
Hallo has a few simple requirements for input data:
|
96 |
+
|
97 |
+
For the source image:
|
98 |
+
|
99 |
+
1. It should be cropped into squares.
|
100 |
+
2. The face should be the main focus, making up 50%-70% of the image.
|
101 |
+
3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).
|
102 |
+
|
103 |
+
For the driving audio:
|
104 |
+
|
105 |
+
1. It must be in WAV format.
|
106 |
+
2. It must be in English since our training datasets are only in this language.
|
107 |
+
3. Ensure the vocals are clear; background music is acceptable.
|
108 |
+
|
109 |
+
We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
|
110 |
+
""")
|
111 |
with gr.Row():
|
112 |
with gr.Column():
|
113 |
avatar_face = gr.Image(type="filepath", label="Face")
|
114 |
driving_audio = gr.Audio(type="filepath", label="Driving audio")
|
115 |
+
# Pose-weight input. No trailing comma here: `name = gr.Number(...),` would bind
# a 1-tuple instead of the component that the click handler's `inputs` list needs.
pose_weight = gr.Number(label="pose weight", value=1.0)
|
116 |
+
# Face-weight input. No trailing comma here: `name = gr.Number(...),` would bind
# a 1-tuple instead of the component that the click handler's `inputs` list needs.
face_weight = gr.Number(label="face weight", value=1.0)
|
117 |
+
# Lip-weight input. No trailing comma here: `name = gr.Number(...),` would bind
# a 1-tuple instead of the component that the click handler's `inputs` list needs.
lip_weight = gr.Number(label="lip weight", value=1.0)
|
118 |
+
# Face-expand-ratio input. No trailing comma here: `name = gr.Number(...),` would
# bind a 1-tuple instead of the component that the click handler's `inputs` list needs.
face_expand_ratio = gr.Number(label="face expand ratio", value=1.2)
|
119 |
generate = gr.Button("Generate")
|
120 |
with gr.Column():
|
121 |
output_video = gr.Video(label="Your talking head")
|
122 |
|
123 |
generate.click(
|
124 |
fn=run_inference,
|
125 |
+
inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
|
126 |
outputs=output_video
|
127 |
)
|
128 |
|