ginipick committed
Commit 927c274 · verified
1 Parent(s): 9e8432c

Update app.py

Files changed (1)
  1. app.py +130 -189
app.py CHANGED
@@ -1,13 +1,12 @@
+import types
 import torch
-from diffusers import AutoencoderKLWan, WanImageToVideoPipeline, UniPCMultistepScheduler
+from diffusers import AutoencoderKLWan, UniPCMultistepScheduler
 from diffusers.utils import export_to_video
-from transformers import CLIPVisionModel
 import gradio as gr
 import tempfile
 import spaces
 from huggingface_hub import hf_hub_download
 import numpy as np
-from PIL import Image
 import random
 import logging
 import torchaudio
@@ -23,7 +22,7 @@ except ImportError:
 
 # Set environment variables for better memory management
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
-os.environ['HF_HUB_CACHE'] = '/tmp/hub' # Use temp directory to avoid filling persistent storage
+os.environ['HF_HUB_CACHE'] = '/tmp/hub'
 
 from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
                                 setup_eval_logging)
@@ -32,6 +31,10 @@ from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
 
+# NAG imports
+from src.pipeline_wan_nag import NAGWanPipeline
+from src.transformer_wan_nag import NagWanTransformer3DModel
+
 # Clean up temp files periodically
 def cleanup_temp_files():
     """Clean up temporary files to save storage"""
@@ -44,23 +47,23 @@ def cleanup_temp_files():
         except:
             pass
 
-# Video generation model setup
-MODEL_ID = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
-LORA_REPO_ID = "Kijai/WanVideo_comfy"
-LORA_FILENAME = "Wan21_CausVid_14B_T2V_lora_rank32.safetensors"
+# Video generation model setup (NAG)
+MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
+SUB_MODEL_ID = "vrgamedevgirl84/Wan14BT2VFusioniX"
+SUB_MODEL_FILENAME = "Wan14BT2VFusioniX_fp16_.safetensors"
 
-image_encoder = CLIPVisionModel.from_pretrained(MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32)
 vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
-pipe = WanImageToVideoPipeline.from_pretrained(
-    MODEL_ID, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16
+wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
+transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
+pipe = NAGWanPipeline.from_pretrained(
+    MODEL_ID, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
 )
-pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=8.0)
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
 pipe.to("cuda")
 
-causvid_path = hf_hub_download(repo_id=LORA_REPO_ID, filename=LORA_FILENAME)
-pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
-pipe.set_adapters(["causvid_lora"], adapter_weights=[0.95])
-pipe.fuse_lora()
+pipe.transformer.__class__.attn_processors = NagWanTransformer3DModel.attn_processors
+pipe.transformer.__class__.set_attn_processor = NagWanTransformer3DModel.set_attn_processor
+pipe.transformer.__class__.forward = NagWanTransformer3DModel.forward
 
 # Audio generation model setup
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -81,7 +84,7 @@ def load_audio_model():
     global audio_model, audio_net, audio_feature_utils, audio_seq_cfg
 
     if audio_net is None:
-        audio_model = all_model_cfg['small_16k'] # Use smaller model
+        audio_model = all_model_cfg['small_16k']
         audio_model.download_if_needed()
         setup_eval_logging()
 
@@ -106,20 +109,22 @@ def load_audio_model():
 
 # Constants
 MOD_VALUE = 32
-DEFAULT_H_SLIDER_VALUE = 320
-DEFAULT_W_SLIDER_VALUE = 560
-NEW_FORMULA_MAX_AREA = 480.0 * 832.0
+DEFAULT_DURATION_SECONDS = 4
+DEFAULT_STEPS = 4
+DEFAULT_SEED = 2025
+DEFAULT_H_SLIDER_VALUE = 480
+DEFAULT_W_SLIDER_VALUE = 832
+NEW_FORMULA_MAX_AREA = 480.0 * 832.0
 
 SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
 SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
 MAX_SEED = np.iinfo(np.int32).max
 
-FIXED_FPS = 24
+FIXED_FPS = 16
 MIN_FRAMES_MODEL = 8
-MAX_FRAMES_MODEL = 120
+MAX_FRAMES_MODEL = 129
 
-default_prompt_i2v = "make this image come alive, cinematic motion, smooth animation"
-default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
+DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
 default_audio_prompt = ""
 default_audio_negative_prompt = "music"
 
@@ -243,19 +248,6 @@ label {
     margin-bottom: 5px !important;
 }
 
-/* Image upload area */
-.image-upload {
-    border: 2px dashed rgba(255, 255, 255, 0.3) !important;
-    border-radius: 15px !important;
-    background: rgba(255, 255, 255, 0.05) !important;
-    transition: all 0.3s ease !important;
-}
-
-.image-upload:hover {
-    border-color: rgba(255, 255, 255, 0.5) !important;
-    background: rgba(255, 255, 255, 0.1) !important;
-}
-
 /* Video output area */
 video {
     border-radius: 15px !important;
@@ -287,41 +279,6 @@ input[type="radio"] {
 }
 """
 
-def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area,
-                                  min_slider_h, max_slider_h,
-                                  min_slider_w, max_slider_w,
-                                  default_h, default_w):
-    orig_w, orig_h = pil_image.size
-    if orig_w <= 0 or orig_h <= 0:
-        return default_h, default_w
-
-    aspect_ratio = orig_h / orig_w
-
-    calc_h = round(np.sqrt(calculation_max_area * aspect_ratio))
-    calc_w = round(np.sqrt(calculation_max_area / aspect_ratio))
-
-    calc_h = max(mod_val, (calc_h // mod_val) * mod_val)
-    calc_w = max(mod_val, (calc_w // mod_val) * mod_val)
-
-    new_h = int(np.clip(calc_h, min_slider_h, (max_slider_h // mod_val) * mod_val))
-    new_w = int(np.clip(calc_w, min_slider_w, (max_slider_w // mod_val) * mod_val))
-
-    return new_h, new_w
-
-def handle_image_upload_for_dims_wan(uploaded_pil_image, current_h_val, current_w_val):
-    if uploaded_pil_image is None:
-        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
-    try:
-        new_h, new_w = _calculate_new_dimensions_wan(
-            uploaded_pil_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
-            SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
-            DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE
-        )
-        return gr.update(value=new_h), gr.update(value=new_w)
-    except Exception as e:
-        gr.Warning("Error attempting to calculate new dimensions")
-        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
-
 def clear_cache():
     """Clear GPU and CPU cache to free memory"""
     if torch.cuda.is_available():
@@ -329,18 +286,13 @@ def clear_cache():
         torch.cuda.synchronize()
     gc.collect()
 
-def get_duration(input_image, prompt, height, width,
-                 negative_prompt, duration_seconds,
-                 guidance_scale, steps,
-                 seed, randomize_seed,
-                 audio_mode, audio_prompt, audio_negative_prompt,
-                 audio_seed, audio_steps, audio_cfg_strength,
-                 progress):
-    base_duration = 60
-    if steps > 4 and duration_seconds > 2:
-        base_duration = 90
-    elif steps > 4 or duration_seconds > 2:
-        base_duration = 75
+def get_duration(prompt, nag_negative_prompt, nag_scale,
+                 height, width, duration_seconds,
+                 steps, seed, randomize_seed,
+                 audio_mode, audio_prompt, audio_negative_prompt,
+                 audio_seed, audio_steps, audio_cfg_strength,
+                 progress):
+    base_duration = int(duration_seconds) * int(steps) * 2.25 + 5
 
     # Add extra time for audio generation
     if audio_mode == "Enable Audio":
@@ -387,39 +339,38 @@ def add_audio_to_video(video_path, duration_sec, audio_prompt, audio_negative_pr
     return video_with_audio_path
 
 @spaces.GPU(duration=get_duration)
-def generate_video(input_image, prompt, height, width,
-                   negative_prompt, duration_seconds,
-                   guidance_scale, steps,
-                   seed, randomize_seed,
+def generate_video(prompt, nag_negative_prompt, nag_scale,
+                   height, width, duration_seconds,
+                   steps, seed, randomize_seed,
                    audio_mode, audio_prompt, audio_negative_prompt,
                    audio_seed, audio_steps, audio_cfg_strength,
                    progress=gr.Progress(track_tqdm=True)):
 
-    if input_image is None:
-        raise gr.Error("Please upload an input image.")
-
     target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
     target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
 
-    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
+    num_frames = np.clip(int(round(int(duration_seconds) * FIXED_FPS) + 1), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
 
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
 
-    resized_image = input_image.resize((target_w, target_h))
-
-    # Generate video
+    # Generate video using NAG
    with torch.inference_mode():
-        output_frames_list = pipe(
-            image=resized_image, prompt=prompt, negative_prompt=negative_prompt,
+        nag_output_frames_list = pipe(
+            prompt=prompt,
+            nag_negative_prompt=nag_negative_prompt,
+            nag_scale=nag_scale,
+            nag_tau=3.5,
+            nag_alpha=0.5,
             height=target_h, width=target_w, num_frames=num_frames,
-            guidance_scale=float(guidance_scale), num_inference_steps=int(steps),
+            guidance_scale=0.,
+            num_inference_steps=int(steps),
             generator=torch.Generator(device="cuda").manual_seed(current_seed)
         ).frames[0]
 
     # Save video without audio
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         video_path = tmpfile.name
-    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
+    export_to_video(nag_output_frames_list, video_path, fps=FIXED_FPS)
 
     # Generate audio if enabled
     video_with_audio_path = None
@@ -433,7 +384,7 @@ def generate_video(input_image, prompt, height, width,
 
     # Clear cache to free memory
     clear_cache()
-    cleanup_temp_files() # Clean up temp files
+    cleanup_temp_files()
 
     return video_path, video_with_audio_path, current_seed
 
@@ -443,9 +394,9 @@ def update_audio_visibility(audio_mode):
 
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     with gr.Column(elem_classes=["main-container"]):
-        gr.Markdown("# ✨ Fast 4 steps Wan 2.1 I2V (14B) with CausVid LoRA + Audio")
+        gr.Markdown("# ✨ Fast NAG T2V (14B) with Audio Generation")
 
-        # Add badges side by side
+        # Add badges
         gr.HTML("""
         <div class="badge-container">
             <a href="https://huggingface.co/spaces/Heartsync/WAN2-1-fast-T2V-FusioniX" target="_blank">
@@ -453,31 +404,39 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
             </a>
             <a href="https://huggingface.co/spaces/Heartsync/WAN2-1-fast-T2V-FusioniX2" target="_blank">
                 <img src="https://img.shields.io/static/v1?label=BASE&message=WAN%202.1%20T2V-Fusioni2X&color=%23008080&labelColor=%23533a7d&logo=huggingface&logoColor=%23ffffff&style=for-the-badge" alt="Base Model">
-            </a>
-            <a href="https://huggingface.co/spaces/Heartsync/wan2-1-fast-security" target="_blank">
-                <img src="https://img.shields.io/static/v1?label=WAN%202.1&message=FAST%20%26%20Furios&color=%23008080&labelColor=%230000ff&logo=huggingface&logoColor=%23ffa500&style=for-the-badge" alt="badge">
             </a>
         </div>
         """)
 
         with gr.Row():
             with gr.Column(elem_classes=["input-container"]):
-                input_image_component = gr.Image(
-                    type="pil",
-                    label="🖼️ Input Image (auto-resized to target H/W)",
-                    elem_classes=["image-upload"]
-                )
                 prompt_input = gr.Textbox(
-                    label="✏️ Prompt",
-                    value=default_prompt_i2v,
-                    lines=2
+                    label="✏️ Video Prompt",
+                    placeholder="Describe your video scene in detail...",
+                    lines=3
                 )
+
+                with gr.Accordion("🎨 NAG Settings", open=False):
+                    nag_negative_prompt = gr.Textbox(
+                        label="❌ NAG Negative Prompt",
+                        value=DEFAULT_NAG_NEGATIVE_PROMPT,
+                        lines=2
+                    )
+                    nag_scale = gr.Slider(
+                        label="🎯 NAG Scale",
+                        minimum=1.0,
+                        maximum=20.0,
+                        step=0.25,
+                        value=11.0,
+                        info="Higher values = stronger guidance"
+                    )
+
                 duration_seconds_input = gr.Slider(
-                    minimum=round(MIN_FRAMES_MODEL/FIXED_FPS,1),
-                    maximum=round(MAX_FRAMES_MODEL/FIXED_FPS,1),
-                    step=0.1,
-                    value=2,
-                    label="⏱️ Duration (seconds)",
+                    minimum=1,
+                    maximum=8,
+                    step=1,
+                    value=DEFAULT_DURATION_SECONDS,
+                    label="⏱️ Duration (seconds)",
                     info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
                 )
 
@@ -525,65 +484,53 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
                 )
 
                 with gr.Accordion("⚙️ Advanced Settings", open=False):
-                    negative_prompt_input = gr.Textbox(
-                        label="❌ Negative Prompt",
-                        value=default_negative_prompt,
-                        lines=3
-                    )
-                    seed_input = gr.Slider(
-                        label="🎲 Seed",
-                        minimum=0,
-                        maximum=MAX_SEED,
-                        step=1,
-                        value=42,
-                        interactive=True
-                    )
-                    randomize_seed_checkbox = gr.Checkbox(
-                        label="🔀 Randomize seed",
-                        value=True,
-                        interactive=True
-                    )
                     with gr.Row():
                         height_input = gr.Slider(
-                            minimum=SLIDER_MIN_H,
-                            maximum=SLIDER_MAX_H,
-                            step=MOD_VALUE,
-                            value=DEFAULT_H_SLIDER_VALUE,
-                            label=f"📏 Output Height (multiple of {MOD_VALUE})"
+                            minimum=SLIDER_MIN_H,
+                            maximum=SLIDER_MAX_H,
+                            step=MOD_VALUE,
+                            value=DEFAULT_H_SLIDER_VALUE,
+                            label=f"📏 Output Height (×{MOD_VALUE})"
                         )
                         width_input = gr.Slider(
-                            minimum=SLIDER_MIN_W,
-                            maximum=SLIDER_MAX_W,
-                            step=MOD_VALUE,
-                            value=DEFAULT_W_SLIDER_VALUE,
-                            label=f"📐 Output Width (multiple of {MOD_VALUE})"
+                            minimum=SLIDER_MIN_W,
+                            maximum=SLIDER_MAX_W,
+                            step=MOD_VALUE,
+                            value=DEFAULT_W_SLIDER_VALUE,
+                            label=f"📐 Output Width (×{MOD_VALUE})"
+                        )
+                    with gr.Row():
+                        steps_slider = gr.Slider(
+                            minimum=1,
+                            maximum=8,
+                            step=1,
+                            value=DEFAULT_STEPS,
+                            label="🚀 Inference Steps"
+                        )
+                        seed_input = gr.Slider(
+                            label="🎲 Seed",
+                            minimum=0,
+                            maximum=MAX_SEED,
+                            step=1,
+                            value=DEFAULT_SEED,
+                            interactive=True
                         )
-                        steps_slider = gr.Slider(
-                            minimum=1,
-                            maximum=30,
-                            step=1,
-                            value=4,
-                            label="🚀 Inference Steps"
-                        )
-                        guidance_scale_input = gr.Slider(
-                            minimum=0.0,
-                            maximum=20.0,
-                            step=0.5,
-                            value=1.0,
-                            label="🎯 Guidance Scale",
-                            visible=False
+                        randomize_seed_checkbox = gr.Checkbox(
+                            label="🔀 Randomize seed",
+                            value=True,
+                            interactive=True
                         )
 
                 generate_button = gr.Button(
-                    "🎬 Generate Video",
+                    "🎬 Generate Video",
                     variant="primary",
                     elem_classes=["generate-btn"]
                 )
 
             with gr.Column(elem_classes=["output-container"]):
                 video_output = gr.Video(
-                    label="🎥 Generated Video",
-                    autoplay=True,
+                    label="🎥 Generated Video",
+                    autoplay=True,
                     interactive=False
                 )
                 video_with_audio_output = gr.Video(
@@ -600,44 +547,38 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
         outputs=[audio_settings, video_with_audio_output]
     )
 
-    input_image_component.upload(
-        fn=handle_image_upload_for_dims_wan,
-        inputs=[input_image_component, height_input, width_input],
-        outputs=[height_input, width_input]
-    )
-
-    input_image_component.clear(
-        fn=handle_image_upload_for_dims_wan,
-        inputs=[input_image_component, height_input, width_input],
-        outputs=[height_input, width_input]
-    )
-
     ui_inputs = [
-        input_image_component, prompt_input, height_input, width_input,
-        negative_prompt_input, duration_seconds_input,
-        guidance_scale_input, steps_slider, seed_input, randomize_seed_checkbox,
+        prompt_input, nag_negative_prompt, nag_scale,
+        height_input, width_input, duration_seconds_input,
+        steps_slider, seed_input, randomize_seed_checkbox,
         audio_mode, audio_prompt, audio_negative_prompt,
        audio_seed, audio_steps, audio_cfg_strength
     ]
     generate_button.click(
-        fn=generate_video,
-        inputs=ui_inputs,
+        fn=generate_video,
+        inputs=ui_inputs,
         outputs=[video_output, video_with_audio_output, seed_input]
     )
 
     with gr.Column():
         gr.Examples(
-            examples=[
-                ["peng.png", "a penguin playfully dancing in the snow, Antarctica", 896, 896,
-                 default_negative_prompt, 2, 1.0, 4, 42, False,
+            examples=[
+                ["A ginger cat passionately plays electric guitar with intensity and emotion on a stage. The background is shrouded in deep darkness. Spotlights cast dramatic shadows.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
+                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
+                 DEFAULT_STEPS, DEFAULT_SEED, False,
+                 "Enable Audio", "electric guitar riffs, cat meowing", default_audio_negative_prompt, -1, 25, 4.5],
+                ["A red vintage Porsche convertible flying over a rugged coastal cliff. Monstrous waves violently crashing against the rocks below. A lighthouse stands tall atop the cliff.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
+                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
+                 DEFAULT_STEPS, DEFAULT_SEED, False,
+                 "Enable Audio", "car engine, ocean waves crashing, wind", default_audio_negative_prompt, -1, 25, 4.5],
+                ["Enormous glowing jellyfish float slowly across a sky filled with soft clouds. Their tentacles shimmer with iridescent light as they drift above a peaceful mountain landscape. Magical and dreamlike, captured in a wide shot. Surreal realism style with detailed textures.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
+                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
+                 DEFAULT_STEPS, DEFAULT_SEED, False,
                  "Video Only", "", default_audio_negative_prompt, -1, 25, 4.5],
-                ["forg.jpg", "the frog jumps around", 832, 832,
-                 default_negative_prompt, 2, 1.0, 4, 42, False,
-                 "Enable Audio", "frog croaking, water splashing", default_audio_negative_prompt, -1, 25, 4.5],
             ],
-            inputs=ui_inputs,
-            outputs=[video_output, video_with_audio_output, seed_input],
-            fn=generate_video,
+            inputs=ui_inputs,
+            outputs=[video_output, video_with_audio_output, seed_input],
+            fn=generate_video,
             cache_examples="lazy",
             label="🌟 Example Gallery"
         )
 