fffiloni committed · verified
Commit 743662a · Parent: d7bf027

Update app.py

Files changed (1):
  1. app.py +38 -10
app.py CHANGED
@@ -107,7 +107,7 @@ GPU_TO_VRAM_PARAMS = {
     "NVIDIA A100-SXM4-40GB": 11000000000,
     "NVIDIA A100-SXM4-80GB": 22000000000,
     "NVIDIA L4": 5000000000,
-    "NVIDIA L40S": 22000000000
+    "NVIDIA L40S": 11000000000
 }
 USED_VRAM_PARAMS = GPU_TO_VRAM_PARAMS[gpu_name]
 print("Using", USED_VRAM_PARAMS, "for num_persistent_param_in_dit")
@@ -138,7 +138,7 @@ def create_temp_input_json(prompt: str, cond_image_path: str, cond_audio_path: s
     return temp_json_path


-def infer(prompt, cond_image_path, cond_audio_path):
+def infer(prompt, cond_image_path, cond_audio_path, sample_steps):

     if is_shared_ui:
         trimmed_audio_path = trim_audio_to_5s_temp(cond_audio_path)
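`trim_audio_to_5s_temp` is called here but defined outside this hunk; the 5-second cap matches the shared-UI notice added further down. A sketch of what such a helper typically looks like, assuming pydub; the real implementation in app.py may differ:

```python
import tempfile

from pydub import AudioSegment  # assumption: the real helper could equally use ffmpeg

def trim_audio_to_5s_temp(audio_path: str) -> str:
    """Hypothetical helper: write the first 5 s of audio_path to a temp .wav."""
    clip = AudioSegment.from_file(audio_path)[:5000]  # pydub slices in milliseconds
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    clip.export(tmp.name, format="wav")
    return tmp.name
```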
@@ -152,7 +152,7 @@ def infer(prompt, cond_image_path, cond_audio_path):
         "--ckpt_dir", "weights/Wan2.1-I2V-14B-480P",
         "--wav2vec_dir", "weights/chinese-wav2vec2-base",
         "--input_json", input_json_path,
-        "--sample_steps", "6",
+        "--sample_steps", str(sample_steps),
         "--mode", "streaming",
         "--use_teacache",
         "--save_file", "multi_long_multigpu_exp"
@@ -164,11 +164,16 @@ def infer(prompt, cond_image_path, cond_audio_path):
             f"--nproc_per_node={num_gpus}",
             "--standalone",
             "generate_multitalk.py",
+            "--num_persistent_param_in_dit", "22000000000",  # On 4xL40S
             "--dit_fsdp", "--t5_fsdp",
             "--ulysses_size", str(num_gpus),
         ] + common_args
     else:
-        cmd = ["python3", "generate_multitalk.py"] + common_args
+        cmd = [
+            "python3",
+            "generate_multitalk.py",
+            "--num_persistent_param_in_dit", str(USED_VRAM_PARAMS),
+        ] + common_args

     try:
         # Log to file and stream
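The `# Log to file and stream` comment refers to code below this hunk. A minimal sketch of that pattern; the function name and log path are hypothetical:

```python
import subprocess

def run_and_stream(cmd: list[str], log_path: str = "inference.log") -> int:
    """Run cmd, echoing its output live while also writing it to a log file."""
    with open(log_path, "w") as log_file:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
        )
        for line in proc.stdout:  # stream line by line as the process runs
            print(line, end="")
            log_file.write(line)
        proc.wait()
    return proc.returncode
```

Merging stderr into stdout keeps the two streams ordered in the log, which matters when torchrun interleaves output from several workers.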
@@ -196,14 +201,27 @@ def infer(prompt, cond_image_path, cond_audio_path):


 with gr.Blocks(title="MultiTalk Inference") as demo:
-    gr.Markdown("## 🎤 MultiTalk Inference Demo")
+    gr.Markdown("## 🎤 MeiGen MultiTalk Inference Demo")
+    gr.Markdown("Audio will be trimmed to max 5 seconds on fffiloni's shared UI. Duplicate to skip the queue and run inference on longer audio.")
+    gr.HTML("""
+    <div style="display:flex;column-gap:4px;">
+        <a href="https://github.com/MeiGen-AI/MultiTalk">
+            <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+        </a>
+        <a href='https://meigen-ai.github.io/multi-talk/'><img src='https://img.shields.io/badge/Project-Page-blue'></a>
+        <a href='https://huggingface.co/MeiGen-AI/MeiGen-MultiTalk'><img src='https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow'></a>
+        <a href='https://arxiv.org/abs/2505.22647'><img src='https://img.shields.io/badge/Paper-Arxiv-red'></a>
+        <a href="https://huggingface.co/spaces/fffiloni/KDTalker?duplicate=true">
+            <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
+        </a>
+    </div>
+    """)

     with gr.Row():
-        with gr.Column():
+        with gr.Column(scale=1):
             prompt_input = gr.Textbox(
                 label="Text Prompt",
                 placeholder="Describe the scene...",
-                lines=4
             )

             image_input = gr.Image(
@@ -213,9 +231,19 @@ with gr.Blocks(title="MultiTalk Inference") as demo:

             audio_input = gr.Audio(
                 type="filepath",
-                label="Conditioning Audio (.wav)"
+                label="Conditioning Audio (.wav)",
+                info="Audio will be trimmed to max 5 seconds on the shared UI"
             )

+            with gr.Accordion("Advanced settings", open=False):
+                sample_steps = gr.Slider(
+                    value=6,
+                    minimum=2,
+                    maximum=25,
+                    step=1,
+                    interactive=True  # False if is_shared_ui else True
+                )
+
             submit_btn = gr.Button("Generate")

             gr.Examples(
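The commented-out alternative on `interactive` suggests the slider was meant to be locked on the shared UI. One way to express that intent (my phrasing, not what the commit does):

```python
import gradio as gr

is_shared_ui = True  # placeholder; app.py computes this elsewhere

# Hypothetical: lock step tuning on the shared UI, allow it on duplicated Spaces.
sample_steps = gr.Slider(value=6, minimum=2, maximum=25, step=1,
                         interactive=not is_shared_ui)
```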
@@ -225,12 +253,12 @@ with gr.Blocks(title="MultiTalk Inference") as demo:
                 inputs = [prompt_input, image_input, audio_input]
             )

-        with gr.Column():
+        with gr.Column(scale=3):
             output_video = gr.Video(label="Generated Video")

     submit_btn.click(
         fn=infer,
-        inputs=[prompt_input, image_input, audio_input],
+        inputs=[prompt_input, image_input, audio_input, sample_steps],
         outputs=output_video
     )
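The diff ends before the script's closing lines; presumably app.py finishes by launching the Blocks app, along these lines (an assumption, not shown in this commit):

```python
# Not part of this diff; a typical closing line for a Gradio Space (assumption).
demo.queue().launch(show_error=True)
```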
 
 