Spaces: Running on L40S
Set examples
app.py CHANGED
@@ -114,19 +114,31 @@ print("Using", USED_VRAM_PARAMS, "for num_persistent_param_in_dit")



-def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path: str) -> str:
+def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path_spk1: str, cond_audio_path_spk2: str) -> str:
     """
     Create a temporary JSON file with the user-provided prompt, image, and audio paths.
     Returns the path to the temporary JSON file.
     """
     # Structure based on your original JSON format
-    data = {
-        "prompt": prompt,
-        "cond_image": cond_image_path,
-        "cond_audio": {
-            "person1": cond_audio_path
+    if cond_audio_path_spk2 is None:
+        data = {
+            "prompt": prompt,
+            "cond_image": cond_image_path,
+            "cond_audio": {
+                "person1": cond_audio_path_spk1
+            }
+        }
+
+    else:
+        data = {
+            "prompt": prompt,
+            "cond_image": cond_image_path,
+            "audio_type": "para",
+            "cond_audio": {
+                "person1": cond_audio_path_spk1,
+                "person2": cond_audio_path_spk2
+            }
         }
-    }

     # Create a temp file
     temp_json = tempfile.NamedTemporaryFile(delete=False, suffix=".json", mode='w', encoding='utf-8')
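Note: for reference, the dual-speaker payload this function now builds is sketched below. The keys ("prompt", "cond_image", "audio_type", "cond_audio") come straight from the hunk above, but the actual json.dump call falls outside the visible diff, so the write step here is an assumption.

import json
import tempfile

# Dual-speaker payload, as built in the else branch above; "audio_type": "para"
# appears to mark the two audio tracks as parallel (simultaneous) speakers.
data = {
    "prompt": "A man and a woman sing together in a studio.",
    "cond_image": "examples/multi/3/multi3.png",
    "audio_type": "para",
    "cond_audio": {
        "person1": "examples/multi/3/1-man.WAV",
        "person2": "examples/multi/3/1-woman.WAV",
    },
}

# Assumed continuation of the hunk: serialize to the temp file, keep its path.
with tempfile.NamedTemporaryFile(delete=False, suffix=".json",
                                 mode="w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)
    temp_json_path = f.name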
@@ -138,14 +150,19 @@ def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path: str) -> str:
     return temp_json_path


-def infer(prompt, cond_image_path, cond_audio_path, sample_steps):
+def infer(prompt, cond_image_path, cond_audio_path_spk1, cond_audio_path_spk2, sample_steps):

     if is_shared_ui:
-        trimmed_audio_path = trim_audio_to_5s_temp(cond_audio_path)
-        cond_audio_path = trimmed_audio_path
+
+        trimmed_audio_path_spk1 = trim_audio_to_5s_temp(cond_audio_path_spk1)
+        cond_audio_path_spk1 = trimmed_audio_path_spk1
+
+        if cond_audio_path_spk2 is not None:
+            trimmed_audio_path_spk2 = trim_audio_to_5s_temp(cond_audio_path_spk2)
+            cond_audio_path_spk2 = trimmed_audio_path_spk2

     # Prepare input JSON
-    input_json_path = create_temp_input_json(prompt, cond_image_path, cond_audio_path)
+    input_json_path = create_temp_input_json(prompt, cond_image_path, cond_audio_path_spk1, cond_audio_path_spk2)

     # Base args
     common_args = [
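Note: trim_audio_to_5s_temp is called on the shared UI to cap clip length, but its definition lies outside the visible hunks. A minimal sketch of what such a helper could look like, assuming the soundfile package; the Space may implement it differently:

import tempfile
import soundfile as sf

def trim_audio_to_5s_temp(audio_path: str) -> str:
    """Write the first 5 seconds of audio_path to a temp .wav and return its path."""
    audio, sr = sf.read(audio_path)   # audio: (n_samples,) or (n_samples, n_channels)
    trimmed = audio[: 5 * sr]         # slicing past the end is safe for short clips
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    sf.write(tmp.name, trimmed, sr)
    return tmp.name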
@@ -229,13 +246,19 @@ with gr.Blocks(title="MultiTalk Inference") as demo:
                 label="Conditioning Image"
             )

-            audio_input = gr.Audio(
+            audio_input_spk1 = gr.Audio(
+                type="filepath",
+                label="Conditioning Audio for speaker 1 (.wav)"
+            )
+
+            audio_input_spk2 = gr.Audio(
                 type="filepath",
-                label="Conditioning Audio (.wav)"
+                label="Conditioning Audio for speaker 2 (.wav)"
             )

             with gr.Accordion("Advanced settings", open=False):
                 sample_steps = gr.Slider(
+                    label="sample steps",
                     value=6,
                     minimum=2,
                     maximum=25,
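Note: the None checks in infer and create_temp_input_json work because gr.Audio(type="filepath") passes the handler a filesystem path as a str, or None when the field is left empty. A minimal illustration with a hypothetical handler, not code from the Space:

import gradio as gr

def describe(audio_path_spk1, audio_path_spk2):
    # Each value is a str path to the uploaded file, or None if empty.
    if audio_path_spk2 is None:
        return f"single speaker: {audio_path_spk1}"
    return f"two speakers: {audio_path_spk1} + {audio_path_spk2}"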
@@ -247,9 +270,14 @@ with gr.Blocks(title="MultiTalk Inference") as demo:

             gr.Examples(
                 examples=[
-                    ["A woman sings passionately in a dimly lit studio.", "examples/single/single1.png", "examples/single/1.wav"]
+                    ["A woman sings passionately in a dimly lit studio.", "examples/single/single1.png", "examples/single/1.wav", None, 6],
+                    ["In a cozy recording studio, a man and a woman are singing together. The man, with tousled brown hair, stands to the left, wearing a light green button-down shirt. His gaze is directed towards the woman, who is smiling warmly. She, with wavy dark hair, is dressed in a black floral dress and stands to the right, her eyes closed in enjoyment. Between them is a professional microphone, capturing their harmonious voices. The background features wooden panels and various audio equipment, creating an intimate and focused atmosphere. The lighting is soft and warm, highlighting their expressions and the intimate setting. A medium shot captures their interaction closely.", "examples/multi/3/multi3.png", "examples/multi/3/1-man.WAV", "examples/multi/3/1-woman.WAV", 6],
                 ],
-                inputs=[prompt_input, image_input, audio_input],
+                fn=infer,
+                inputs=[prompt_input, image_input, audio_input_spk1, audio_input_spk2, sample_steps],
+                outputs=output_video,
+                cache_examples=True,
+                cache_mode="lazy"
             )

         with gr.Column(scale=3):
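Note: cache_examples=True together with cache_mode="lazy" (available in recent Gradio releases) defers running infer on an example until a user first clicks it, instead of precomputing every output at startup; this mode needs fn and outputs set on gr.Examples. The None in the single-speaker row fills the audio_input_spk2 slot. A self-contained toy example of the same pattern:

import gradio as gr

def shout(text):
    return text.upper()

with gr.Blocks() as demo:
    inp = gr.Textbox(label="in")
    out = gr.Textbox(label="out")
    gr.Examples(
        examples=[["hello"]],
        fn=shout,
        inputs=[inp],
        outputs=[out],
        cache_examples=True,  # reuse computed outputs instead of re-running fn
        cache_mode="lazy",    # compute on first click rather than at startup
    )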
@@ -257,8 +285,8 @@ with gr.Blocks(title="MultiTalk Inference") as demo:

     submit_btn.click(
         fn=infer,
-        inputs=[prompt_input, image_input, audio_input, sample_steps],
+        inputs=[prompt_input, image_input, audio_input_spk1, audio_input_spk2, sample_steps],
         outputs=output_video
     )

-demo.launch()
+demo.launch(ssr_mode=False, show_error=True, show_api=False)
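Note: the new launch flags are standard Blocks.launch() parameters: ssr_mode=False turns off Gradio 5's server-side rendering (a common workaround on Spaces), show_error=True surfaces handler exceptions in the UI, and show_api=False hides the "Use via API" docs link. Annotated:

demo.launch(
    ssr_mode=False,   # disable server-side rendering (Gradio 5+)
    show_error=True,  # show Python exceptions as error modals in the browser
    show_api=False,   # hide the auto-generated API docs link in the footer
)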