Spaces:

fffiloni
/

Meigen-MultiTalk

Running on L40S

App Files Files Community

fffiloni committed on Jun 24

Commit

743662a

verified ·

1 Parent(s): d7bf027

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -10

app.py CHANGED Viewed

@@ -107,7 +107,7 @@ GPU_TO_VRAM_PARAMS = {
     "NVIDIA A100-SXM4-40GB": 11000000000,
     "NVIDIA A100-SXM4-80GB": 22000000000,
     "NVIDIA L4": 5000000000,
-    "NVIDIA L40S": 22000000000
 }
 USED_VRAM_PARAMS = GPU_TO_VRAM_PARAMS[gpu_name]
 print("Using", USED_VRAM_PARAMS, "for num_persistent_param_in_dit")
@@ -138,7 +138,7 @@ def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path: s
     return temp_json_path
-def infer(prompt, cond_image_path, cond_audio_path):
     if is_shared_ui:
         trimmed_audio_path = trim_audio_to_5s_temp(cond_audio_path)
@@ -152,7 +152,7 @@ def infer(prompt, cond_image_path, cond_audio_path):
         "--ckpt_dir", "weights/Wan2.1-I2V-14B-480P",
         "--wav2vec_dir", "weights/chinese-wav2vec2-base",
         "--input_json", input_json_path,
-        "--sample_steps", "6",
         "--mode", "streaming",
         "--use_teacache",
         "--save_file", "multi_long_multigpu_exp"
@@ -164,11 +164,16 @@ def infer(prompt, cond_image_path, cond_audio_path):
             f"--nproc_per_node={num_gpus}",
             "--standalone",
             "generate_multitalk.py",
             "--dit_fsdp", "--t5_fsdp",
             "--ulysses_size", str(num_gpus),
         ] + common_args
     else:
-        cmd = ["python3", "generate_multitalk.py"] + common_args
     try:
         # Log to file and stream
@@ -196,14 +201,27 @@ def infer(prompt, cond_image_path, cond_audio_path):
 with gr.Blocks(title="MultiTalk Inference") as demo:
-    gr.Markdown("## 🎤 MultiTalk Inference Demo")
     with gr.Row():
-        with gr.Column():
             prompt_input = gr.Textbox(
                 label="Text Prompt",
                 placeholder="Describe the scene...",
-                lines=4
             )
             image_input = gr.Image(
@@ -213,9 +231,19 @@ with gr.Blocks(title="MultiTalk Inference") as demo:
             audio_input = gr.Audio(
                 type="filepath",
-                label="Conditioning Audio (.wav)"
             )
             submit_btn = gr.Button("Generate")
             gr.Examples(
@@ -225,12 +253,12 @@ with gr.Blocks(title="MultiTalk Inference") as demo:
                 inputs = [prompt_input, image_input, audio_input]
             )
-        with gr.Column():
             output_video = gr.Video(label="Generated Video")
     submit_btn.click(
         fn=infer,
-        inputs=[prompt_input, image_input, audio_input],
         outputs=output_video
     )

     "NVIDIA A100-SXM4-40GB": 11000000000,
     "NVIDIA A100-SXM4-80GB": 22000000000,
     "NVIDIA L4": 5000000000,
+    "NVIDIA L40S": 11000000000
 }
 USED_VRAM_PARAMS = GPU_TO_VRAM_PARAMS[gpu_name]
 print("Using", USED_VRAM_PARAMS, "for num_persistent_param_in_dit")
     return temp_json_path
+def infer(prompt, cond_image_path, cond_audio_path, sample_steps):
     if is_shared_ui:
         trimmed_audio_path = trim_audio_to_5s_temp(cond_audio_path)
         "--ckpt_dir", "weights/Wan2.1-I2V-14B-480P",
         "--wav2vec_dir", "weights/chinese-wav2vec2-base",
         "--input_json", input_json_path,
+        "--sample_steps", str(sample_steps),
         "--mode", "streaming",
         "--use_teacache",
         "--save_file", "multi_long_multigpu_exp"
             f"--nproc_per_node={num_gpus}",
             "--standalone",
             "generate_multitalk.py",
+            "--num_persistent_param_in_dit", "22000000000", # On 4xL40S
             "--dit_fsdp", "--t5_fsdp",
             "--ulysses_size", str(num_gpus),
         ] + common_args
     else:
+        cmd = [
+            "python3",
+            "generate_multitalk.py",
+            "--num_persistent_param_in_dit", str(USED_VRAM_PARAMS),
+        ] + common_args
     try:
         # Log to file and stream
 with gr.Blocks(title="MultiTalk Inference") as demo:
+    gr.Markdown("## 🎤 Meigen MultiTalk Inference Demo")
+    gr.Markdown("Audio will be trimmed to max 5 seconds on fffiloni's shared UI. Duplicate to skip the queue and work with longer audio inference. ")
+    gr.HTML("""
+    <div style="display:flex;column-gap:4px;">
+        <a href="https://github.com/MeiGen-AI/MultiTalk">
+            <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+        </a>
+        <a href='https://meigen-ai.github.io/multi-talk/'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
+        <a href='https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a>
+        <a href='https://arxiv.org/abs/2505.22647'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
+        <a href="https://huggingface.co/spaces/fffiloni/KDTalker?duplicate=true">
+            <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
+        </a>
+    </div>
+    """)
     with gr.Row():
+        with gr.Column(scale=1):
             prompt_input = gr.Textbox(
                 label="Text Prompt",
                 placeholder="Describe the scene...",
             )
             image_input = gr.Image(
             audio_input = gr.Audio(
                 type="filepath",
+                label="Conditioning Audio (.wav)",
+                info
             )
+            with gr.Accordion("Advanced settings", open=False):
+                sample_steps = gr.Slider(
+                    value=6,
+                    minimum=2,
+                    maximum=25,
+                    step=1,
+                    interactive=True # False if is_shared_ui else True
+                )
             submit_btn = gr.Button("Generate")
             gr.Examples(
                 inputs = [prompt_input, image_input, audio_input]
             )
+        with gr.Column(scale=3):
             output_video = gr.Video(label="Generated Video")
     submit_btn.click(
         fn=infer,
+        inputs=[prompt_input, image_input, audio_input, sample_steps],
         outputs=output_video
     )