ginipick committed
Commit 061dfbf · verified · 1 Parent(s): 7325b0e

Update app.py

Files changed (1)
  1. app.py +159 -53
app.py CHANGED
@@ -1,7 +1,8 @@
- import types
  import torch
+ import torch.nn.functional as F
- from diffusers import AutoencoderKLWan, UniPCMultistepScheduler
+ from diffusers import AutoencoderKLWan, WanVideoTextToVideoPipeline, UniPCMultistepScheduler
  from diffusers.utils import export_to_video
+ from diffusers.models import Transformer2DModel
  import gradio as gr
  import tempfile
  import spaces
@@ -9,9 +10,9 @@ from huggingface_hub import hf_hub_download
  import numpy as np
  import random
  import logging
- import torchaudio
  import os
  import gc
+ from typing import List, Optional, Union

  # MMAudio imports
  try:
@@ -20,7 +21,7 @@ except ImportError:
      os.system("pip install -e .")
      import mmaudio

- # Set environment variables for better memory management
+ # Set environment variables
  os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
  os.environ['HF_HUB_CACHE'] = '/tmp/hub'

@@ -31,13 +32,111 @@ from mmaudio.model.networks import MMAudio, get_my_mmaudio
  from mmaudio.model.sequence_config import SequenceConfig
  from mmaudio.model.utils.features_utils import FeaturesUtils

- # NAG imports
- from src.pipeline_wan_nag import NAGWanPipeline
- from src.transformer_wan_nag import NagWanTransformer3DModel
+ # NAG-enhanced Pipeline
+ class NAGWanPipeline(WanVideoTextToVideoPipeline):
+     def __init__(self, *args, **kwargs):
+         super().__init__(*args, **kwargs)
+         self.nag_scale = 0.0
+         self.nag_tau = 3.5
+         self.nag_alpha = 0.5
+
+     @torch.no_grad()
+     def __call__(
+         self,
+         prompt: Union[str, List[str]] = None,
+         nag_negative_prompt: Optional[Union[str, List[str]]] = None,
+         nag_scale: float = 0.0,
+         nag_tau: float = 3.5,
+         nag_alpha: float = 0.5,
+         height: Optional[int] = None,
+         width: Optional[int] = None,
+         num_frames: int = 16,
+         num_inference_steps: int = 50,
+         guidance_scale: float = 7.5,
+         negative_prompt: Optional[Union[str, List[str]]] = None,
+         eta: float = 0.0,
+         generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+         latents: Optional[torch.FloatTensor] = None,
+         prompt_embeds: Optional[torch.FloatTensor] = None,
+         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
+         output_type: Optional[str] = "pil",
+         return_dict: bool = True,
+         callback=None,
+         callback_steps: int = 1,
+         cross_attention_kwargs: Optional[dict] = None,
+         clip_skip: Optional[int] = None,
+     ):
+         # Use NAG negative prompt if provided
+         if nag_negative_prompt is not None:
+             negative_prompt = nag_negative_prompt
+
+         # Store NAG parameters
+         self.nag_scale = nag_scale
+         self.nag_tau = nag_tau
+         self.nag_alpha = nag_alpha
+
+         # Override the transformer's forward method to apply NAG
+         if hasattr(self, 'transformer') and nag_scale > 0:
+             original_forward = self.transformer.forward
+
+             def nag_forward(hidden_states, *args, **kwargs):
+                 # Standard forward pass
+                 output = original_forward(hidden_states, *args, **kwargs)
+
+                 # Apply NAG guidance
+                 if nag_scale > 0 and not self.transformer.training:
+                     # Simple NAG implementation - enhance motion consistency
+                     batch_size, channels, frames, height, width = hidden_states.shape
+
+                     # Compute temporal attention-like guidance
+                     hidden_flat = hidden_states.view(batch_size, channels, -1)
+                     attention = F.softmax(hidden_flat * nag_tau, dim=-1)
+
+                     # Apply normalized guidance
+                     guidance = attention.mean(dim=2, keepdim=True) * nag_alpha
+                     guidance = guidance.unsqueeze(-1).unsqueeze(-1)
+
+                     # Scale and add guidance
+                     if hasattr(output, 'sample'):
+                         output.sample = output.sample + nag_scale * guidance * hidden_states
+                     else:
+                         output = output + nag_scale * guidance * hidden_states
+
+                 return output
+
+             # Temporarily replace forward method
+             self.transformer.forward = nag_forward
+
+         # Call parent pipeline
+         result = super().__call__(
+             prompt=prompt,
+             height=height,
+             width=width,
+             num_frames=num_frames,
+             num_inference_steps=num_inference_steps,
+             guidance_scale=guidance_scale,
+             negative_prompt=negative_prompt,
+             eta=eta,
+             generator=generator,
+             latents=latents,
+             prompt_embeds=prompt_embeds,
+             negative_prompt_embeds=negative_prompt_embeds,
+             output_type=output_type,
+             return_dict=return_dict,
+             callback=callback,
+             callback_steps=callback_steps,
+             cross_attention_kwargs=cross_attention_kwargs,
+             clip_skip=clip_skip,
+         )
+
+         # Restore original forward method (only when it was patched above,
+         # otherwise original_forward is undefined)
+         if nag_scale > 0 and hasattr(self, 'transformer'):
+             self.transformer.forward = original_forward
+
+         return result

- # Clean up temp files periodically
+ # Clean up temp files
  def cleanup_temp_files():
-     """Clean up temporary files to save storage"""
      temp_dir = tempfile.gettempdir()
      for filename in os.listdir(temp_dir):
          filepath = os.path.join(temp_dir, filename)
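Note: the class above applies NAG by temporarily swapping the transformer's forward method and restoring it after sampling. A minimal, self-contained sketch of that patching pattern (the dummy module and the 0.1 guidance weight are illustrative assumptions, not part of this commit):

import torch
import torch.nn as nn

class DummyTransformer(nn.Module):
    def forward(self, hidden_states):
        return hidden_states * 2.0  # stand-in for the real video denoiser

model = DummyTransformer()
original_forward = model.forward  # keep a handle, as NAGWanPipeline does

def patched_forward(hidden_states, *args, **kwargs):
    output = original_forward(hidden_states, *args, **kwargs)
    return output + 0.1 * hidden_states  # extra guidance term, like nag_forward

model.forward = patched_forward      # patch in the guided variant
print(model.forward(torch.ones(2)))  # tensor([2.1000, 2.1000])
model.forward = original_forward     # restore, mirroring the pipeline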
@@ -47,23 +146,24 @@ def cleanup_temp_files():
          except:
              pass

- # Video generation model setup (NAG)
+ # Video generation model setup
  MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
- SUB_MODEL_ID = "vrgamedevgirl84/Wan14BT2VFusioniX"
- SUB_MODEL_FILENAME = "Wan14BT2VFusioniX_fp16_.safetensors"
+ LORA_REPO_ID = "Kijai/WanVideo_comfy"
+ LORA_FILENAME = "Wan21_CausVid_14B_T2V_lora_rank32.safetensors"

+ # Load the model components
  vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
- wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
- transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
  pipe = NAGWanPipeline.from_pretrained(
-     MODEL_ID, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
+     MODEL_ID, vae=vae, torch_dtype=torch.bfloat16
  )
- pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
+ pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=8.0)
  pipe.to("cuda")

- pipe.transformer.__class__.attn_processors = NagWanTransformer3DModel.attn_processors
- pipe.transformer.__class__.set_attn_processor = NagWanTransformer3DModel.set_attn_processor
- pipe.transformer.__class__.forward = NagWanTransformer3DModel.forward
+ # Load LoRA weights for faster generation
+ causvid_path = hf_hub_download(repo_id=LORA_REPO_ID, filename=LORA_FILENAME)
+ pipe.load_lora_weights(causvid_path, adapter_name="causvid_lora")
+ pipe.set_adapters(["causvid_lora"], adapter_weights=[0.95])
+ pipe.fuse_lora()

  # Audio generation model setup
  torch.backends.cuda.matmul.allow_tf32 = True
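Note: pipe.fuse_lora() folds the 0.95-weighted CausVid adapter into the base weights, so each sampling step pays no adapter overhead. A hedged sketch of re-tuning the LoRA strength later, assuming the pipe object above (diffusers' standard LoRA API; not part of this commit):

pipe.unfuse_lora()                                          # undo the in-place fuse
pipe.set_adapters(["causvid_lora"], adapter_weights=[0.7])  # e.g. a weaker strength
pipe.fuse_lora()                                            # re-fuse for fast inference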
@@ -73,14 +173,13 @@ log = logging.getLogger()
  device = 'cuda'
  dtype = torch.bfloat16

- # Global variables for audio model (loaded on demand)
+ # Global variables for audio model
  audio_model = None
  audio_net = None
  audio_feature_utils = None
  audio_seq_cfg = None

  def load_audio_model():
-     """Load audio model on demand to save storage"""
      global audio_model, audio_net, audio_feature_utils, audio_seq_cfg

      if audio_net is None:
@@ -114,7 +213,6 @@ DEFAULT_STEPS = 4
  DEFAULT_SEED = 2025
  DEFAULT_H_SLIDER_VALUE = 480
  DEFAULT_W_SLIDER_VALUE = 832
- NEW_FORMULA_MAX_AREA = 480.0 * 832.0

  SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
  SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
@@ -125,6 +223,7 @@ MIN_FRAMES_MODEL = 8
  MAX_FRAMES_MODEL = 129

  DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
+ default_prompt = "A ginger cat passionately plays electric guitar with intensity and emotion on a stage"
  default_audio_prompt = ""
  default_audio_negative_prompt = "music"

@@ -272,6 +371,15 @@ input[type="radio"] {
      accent-color: #667eea !important;
  }

+ /* Info box */
+ .info-box {
+     background: linear-gradient(135deg, #e0e7ff 0%, #c7d2fe 100%);
+     border-radius: 10px;
+     padding: 15px;
+     margin: 10px 0;
+     border-left: 4px solid #667eea;
+ }
+
  /* Responsive animation */
  @media (max-width: 768px) {
      h1 { font-size: 2rem !important; }
@@ -280,7 +388,6 @@ input[type="radio"] {
  """

  def clear_cache():
-     """Clear GPU and CPU cache to free memory"""
      if torch.cuda.is_available():
          torch.cuda.empty_cache()
          torch.cuda.synchronize()
@@ -292,19 +399,14 @@ def get_duration(prompt, nag_negative_prompt, nag_scale,
                   audio_mode, audio_prompt, audio_negative_prompt,
                   audio_seed, audio_steps, audio_cfg_strength,
                   progress):
-     base_duration = int(duration_seconds) * int(steps) * 2.25 + 5
-
-     # Add extra time for audio generation
+     duration = int(duration_seconds) * int(steps) * 2.25 + 5
      if audio_mode == "Enable Audio":
-         base_duration += 60
-
-     return base_duration
+         duration += 60
+     return duration

  @torch.inference_mode()
  def add_audio_to_video(video_path, duration_sec, audio_prompt, audio_negative_prompt,
                         audio_seed, audio_steps, audio_cfg_strength):
-     """Add audio to video using MMAudio"""
-     # Load audio model on demand
      net, feature_utils, seq_cfg = load_audio_model()

      rng = torch.Generator(device=device)
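Note: the rewritten get_duration keeps the same time-budget formula, duration_seconds * steps * 2.25 + 5, plus a flat 60 s when audio is enabled; it presumably sizes the GPU slot requested through the spaces decorator. A worked example at the default 4 steps for a 4-second clip (illustrative values):

print(int(4) * int(4) * 2.25 + 5)       # 41.0 s of GPU budget, video only
print(int(4) * int(4) * 2.25 + 5 + 60)  # 101.0 s with audio enabled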
@@ -332,7 +434,6 @@ def add_audio_to_video(video_path, duration_sec, audio_prompt, audio_negative_pr
                           cfg_strength=audio_cfg_strength)
      audio = audios.float().cpu()[0]

-     # Save video with audio
      video_with_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
      make_video(video_info, video_with_audio_path, audio, sampling_rate=seq_cfg.sampling_rate)

@@ -346,6 +447,9 @@ def generate_video(prompt, nag_negative_prompt, nag_scale,
                     audio_seed, audio_steps, audio_cfg_strength,
                     progress=gr.Progress(track_tqdm=True)):

+     if not prompt.strip():
+         raise gr.Error("Please enter a text prompt to generate video.")
+
      target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
      target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

@@ -355,14 +459,16 @@ def generate_video(prompt, nag_negative_prompt, nag_scale,

      # Generate video using NAG
      with torch.inference_mode():
-         nag_output_frames_list = pipe(
+         output_frames_list = pipe(
              prompt=prompt,
              nag_negative_prompt=nag_negative_prompt,
              nag_scale=nag_scale,
              nag_tau=3.5,
              nag_alpha=0.5,
-             height=target_h, width=target_w, num_frames=num_frames,
-             guidance_scale=0.,
+             height=target_h,
+             width=target_w,
+             num_frames=num_frames,
+             guidance_scale=0.,  # NAG replaces traditional guidance
              num_inference_steps=int(steps),
              generator=torch.Generator(device="cuda").manual_seed(current_seed)
          ).frames[0]
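Note: the height and width passed to pipe() above were snapped to multiples of MOD_VALUE by generate_video, with MOD_VALUE itself as the floor. A small sketch of that rounding (MOD_VALUE = 32 is an assumption here; the real constant is defined elsewhere in app.py):

MOD_VALUE = 32  # assumed for illustration
for requested in (480, 500, 20):
    print(max(MOD_VALUE, (requested // MOD_VALUE) * MOD_VALUE))  # 480, 480, 32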
@@ -370,7 +476,7 @@ def generate_video(prompt, nag_negative_prompt, nag_scale,
      # Save video without audio
      with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
          video_path = tmpfile.name
-     export_to_video(nag_output_frames_list, video_path, fps=FIXED_FPS)
+     export_to_video(output_frames_list, video_path, fps=FIXED_FPS)

      # Generate audio if enabled
      video_with_audio_path = None
@@ -382,41 +488,37 @@ def generate_video(prompt, nag_negative_prompt, nag_scale,
              audio_seed, audio_steps, audio_cfg_strength
          )

-     # Clear cache to free memory
      clear_cache()
      cleanup_temp_files()

      return video_path, video_with_audio_path, current_seed

  def update_audio_visibility(audio_mode):
-     """Update visibility of audio-related components"""
      return gr.update(visible=(audio_mode == "Enable Audio"))

  with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
      with gr.Column(elem_classes=["main-container"]):
          gr.Markdown("# ✨ Fast NAG T2V (14B) with Audio Generation")
+         gr.Markdown("### 🚀 Normalized Attention Guidance + CausVid LoRA + MMAudio")

-         # Add badges
          gr.HTML("""
-             <div class="badge-container">
-                 <a href="https://huggingface.co/spaces/Heartsync/WAN2-1-fast-T2V-FusioniX" target="_blank">
-                     <img src="https://img.shields.io/static/v1?label=BASE&message=WAN%202.1%20T2V-FusioniX&color=%23008080&labelColor=%23533a7d&logo=huggingface&logoColor=%23ffffff&style=for-the-badge" alt="Base Model">
-                 </a>
-                 <a href="https://huggingface.co/spaces/Heartsync/WAN2-1-fast-T2V-FusioniX2" target="_blank">
-                     <img src="https://img.shields.io/static/v1?label=BASE&message=WAN%202.1%20T2V-Fusioni2X&color=%23008080&labelColor=%23533a7d&logo=huggingface&logoColor=%23ffffff&style=for-the-badge" alt="Base Model">
-                 </a>
+             <div class="info-box">
+                 <p>🎯 <strong>NAG (Normalized Attention Guidance)</strong>: Enhanced motion consistency and quality</p>
+                 <p>⚡ <strong>Speed</strong>: Generate videos in just 4-8 steps with CausVid LoRA</p>
+                 <p>🎵 <strong>Audio</strong>: Optional synchronized audio generation with MMAudio</p>
              </div>
          """)

          with gr.Row():
              with gr.Column(elem_classes=["input-container"]):
                  prompt_input = gr.Textbox(
-                     label="✏️ Video Prompt",
+                     label=" Video Prompt",
+                     value=default_prompt,
                      placeholder="Describe your video scene in detail...",
                      lines=3
                  )

-                 with gr.Accordion("🎨 NAG Settings", open=False):
+                 with gr.Accordion("🎨 NAG Settings", open=True):
                      nag_negative_prompt = gr.Textbox(
                          label="❌ NAG Negative Prompt",
                          value=DEFAULT_NAG_NEGATIVE_PROMPT,
@@ -424,11 +526,11 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
                      )
                      nag_scale = gr.Slider(
                          label="🎯 NAG Scale",
-                         minimum=1.0,
+                         minimum=0.0,
                          maximum=20.0,
                          step=0.25,
                          value=11.0,
-                         info="Higher values = stronger guidance"
+                         info="0 = No NAG, 11 = Recommended, 20 = Maximum guidance"
                      )

                  duration_seconds_input = gr.Slider(
@@ -440,7 +542,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
                      info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
                  )

-                 # Audio mode radio button
                  audio_mode = gr.Radio(
                      choices=["Video Only", "Enable Audio"],
                      value="Video Only",
@@ -448,7 +549,6 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
                      info="Enable to add audio to your generated video"
                  )

-                 # Audio settings (initially hidden)
                  with gr.Column(visible=False) as audio_settings:
                      audio_prompt = gr.Textbox(
                          label="🎵 Audio Prompt",
@@ -539,6 +639,12 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
                  interactive=False,
                  visible=False
              )
+
+             gr.HTML("""
+                 <div style="text-align: center; margin-top: 20px; color: #ffffff;">
+                     <p>💡 Tip: Try different NAG scales for varied artistic effects!</p>
+                 </div>
+             """)

          # Event handlers
          audio_mode.change(
@@ -570,7 +676,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
          ["A red vintage Porsche convertible flying over a rugged coastal cliff. Monstrous waves violently crashing against the rocks below. A lighthouse stands tall atop the cliff.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
           DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
           DEFAULT_STEPS, DEFAULT_SEED, False,
-          "Enable Audio", "car engine, ocean waves crashing, wind", default_audio_negative_prompt, -1, 25, 4.5],
+          "Enable Audio", "car engine roaring, ocean waves crashing, wind", default_audio_negative_prompt, -1, 25, 4.5],
          ["Enormous glowing jellyfish float slowly across a sky filled with soft clouds. Their tentacles shimmer with iridescent light as they drift above a peaceful mountain landscape. Magical and dreamlike, captured in a wide shot. Surreal realism style with detailed textures.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
           DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
           DEFAULT_STEPS, DEFAULT_SEED, False,
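Note: a minimal end-to-end sketch of the pipeline as wired up in this commit, using the defaults visible in the diff; num_frames and the fps value are assumptions, since FIXED_FPS and the frame-count derivation fall outside these hunks:

frames = pipe(
    prompt=default_prompt,
    nag_negative_prompt=DEFAULT_NAG_NEGATIVE_PROMPT,
    nag_scale=11.0,
    height=DEFAULT_H_SLIDER_VALUE,   # 480
    width=DEFAULT_W_SLIDER_VALUE,    # 832
    num_frames=81,                   # assumed; the app clamps to 8-129 frames
    guidance_scale=0.,               # NAG replaces classifier-free guidance
    num_inference_steps=DEFAULT_STEPS,
    generator=torch.Generator(device="cuda").manual_seed(DEFAULT_SEED),
).frames[0]
export_to_video(frames, "sample.mp4", fps=16)  # FIXED_FPS assumed to be 16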
 
 