File size: 5,554 Bytes
63f899c
 
 
 
59d9186
63f899c
 
59d9186
63f899c
ca33a23
bd786ec
c39b894
 
63f899c
69c71b8
a6075c0
 
 
59d9186
 
63f899c
59d9186
751c5b7
 
59d9186
69c71b8
 
 
 
59d9186
63f899c
 
 
59d9186
63f899c
 
a6075c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
baa1646
 
 
 
a6075c0
bd786ec
a6075c0
 
 
 
 
 
 
 
e4e4cf1
a6075c0
 
 
 
 
 
 
 
 
 
e4e4cf1
69c71b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6075c0
 
 
 
69c71b8
 
 
 
a6075c0
 
 
 
 
 
69c71b8
a6075c0
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import os
import shutil
from huggingface_hub import snapshot_download
import gradio as gr
os.chdir(os.path.dirname(os.path.abspath(__file__)))
from scripts.inference import inference_process
import argparse
import uuid

is_shared_ui = True if "fudan-generative-ai/hallo" in os.environ['SPACE_ID'] else False

if(not is_shared_ui):
    hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")

def run_inference(source_image, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio, progress=gr.Progress(track_tqdm=True)):
    if is_shared_ui:
        raise gr.Error("This Space only works in duplicated instances")
        
    unique_id = uuid.uuid4()
    
    args = argparse.Namespace(
        config='configs/inference/default.yaml',
        source_image=source_image,
        driving_audio=driving_audio,
        output=f'output-{unique_id}.mp4',
        pose_weight=pose_weight,
        face_weight=face_weight,
        lip_weight=lip_weight,
        face_expand_ratio=face_expand_ratio,
        checkpoint=None
    )
    
    inference_process(args)
    return f'output-{unique_id}.mp4' 


css = '''
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857!important;
}
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592!important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
.dark #warning-duplicate {
    background-color: #0c0c0c !important;
    border: 1px solid white !important;
}
'''

with gr.Blocks(css=css) as demo:
    if is_shared_ui:
        top_description = gr.HTML(f'''
            <div class="gr-prose">
                <h2 class="custom-color"><svg xmlns="http://www.w3.org/2000/svg" width="18px" height="18px" style="margin-right: 0px;display: inline-block;"fill="none"><path fill="#fff" d="M7 13.2a6.3 6.3 0 0 0 4.4-10.7A6.3 6.3 0 0 0 .6 6.9 6.3 6.3 0 0 0 7 13.2Z"/><path fill="#fff" fill-rule="evenodd" d="M7 0a6.9 6.9 0 0 1 4.8 11.8A6.9 6.9 0 0 1 0 7 6.9 6.9 0 0 1 7 0Zm0 0v.7V0ZM0 7h.6H0Zm7 6.8v-.6.6ZM13.7 7h-.6.6ZM9.1 1.7c-.7-.3-1.4-.4-2.2-.4a5.6 5.6 0 0 0-4 1.6 5.6 5.6 0 0 0-1.6 4 5.6 5.6 0 0 0 1.6 4 5.6 5.6 0 0 0 4 1.7 5.6 5.6 0 0 0 4-1.7 5.6 5.6 0 0 0 1.7-4 5.6 5.6 0 0 0-1.7-4c-.5-.5-1.1-.9-1.8-1.2Z" clip-rule="evenodd"/><path fill="#000" fill-rule="evenodd" d="M7 2.9a.8.8 0 1 1 0 1.5A.8.8 0 0 1 7 3ZM5.8 5.7c0-.4.3-.6.6-.6h.7c.3 0 .6.2.6.6v3.7h.5a.6.6 0 0 1 0 1.3H6a.6.6 0 0 1 0-1.3h.4v-3a.6.6 0 0 1-.6-.7Z" clip-rule="evenodd"/></svg>
                Attention: this Space need to be duplicated to work</h2>
                <p class="main-message custom-color">
                    To make it work, <strong>duplicate the Space</strong> and run it on your own profile using a <strong>private</strong> GPU.<br />
                    An L4 costs <strong>US$0.80/h</strong>
                </p>
                <p class="actions custom-color">
                    <a href="https://huggingface.co/spaces/{os.environ['SPACE_ID']}?duplicate=true">
                        <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-lg-dark.svg" alt="Duplicate this Space" />
                    </a>
                    to start generate your talking head
                </p>
            </div>
        ''', elem_id="warning-duplicate")
    gr.Markdown("# Demo for Hallo: Hierarchical Audio-Driven Visual Synthesis for Portrait Image Animation")
    gr.Markdown("Generate talking head avatars driven from audio. **5 seconds of audio takes >10 minutes to generate on an L4** - duplicate the space for private use or try for free on Google Colab")
    gr.Markdown("""
Hallo has a few simple requirements for input data:

For the source image:

1. It should be cropped into squares.
2. The face should be the main focus, making up 50%-70% of the image.
3. The face should be facing forward, with a rotation angle of less than 30° (no side profiles).

For the driving audio:

1. It must be in WAV format.
2. It must be in English since our training datasets are only in this language.
3. Ensure the vocals are clear; background music is acceptable.

We have provided some [samples](https://huggingface.co/datasets/fudan-generative-ai/hallo_inference_samples) for your reference.
                """)
    with gr.Row():
        with gr.Column():
            avatar_face = gr.Image(type="filepath", label="Face")
            driving_audio = gr.Audio(type="filepath", label="Driving audio")
            pose_weight = gr.Number(label="pose weight", value=1.0),
            face_weight = gr.Number(label="face weight", value=1.0),
            lip_weight = gr.Number(label="lip weight", value=1.0),
            face_expand_ratio = gr.Number(label="face expand ratio", value=1.2),
            generate = gr.Button("Generate")
        with gr.Column():
            output_video = gr.Video(label="Your talking head")

    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio, pose_weight, face_weight, lip_weight, face_expand_ratio],
        outputs=output_video
    )
    
demo.launch()