Update app.py
app.py CHANGED
@@ -1,13 +1,12 @@
+import types
 import torch
-from diffusers import AutoencoderKLWan,
+from diffusers import AutoencoderKLWan, UniPCMultistepScheduler
 from diffusers.utils import export_to_video
-from transformers import CLIPVisionModel
 import gradio as gr
 import tempfile
 import spaces
 from huggingface_hub import hf_hub_download
 import numpy as np
-from PIL import Image
 import random
 import logging
 import torchaudio
@@ -23,7 +22,7 @@ except ImportError:
 
 # Set environment variables for better memory management
 os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'
-os.environ['HF_HUB_CACHE'] = '/tmp/hub'
+os.environ['HF_HUB_CACHE'] = '/tmp/hub'
 
 from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
                                 setup_eval_logging)
@@ -32,6 +31,10 @@ from mmaudio.model.networks import MMAudio, get_my_mmaudio
 from mmaudio.model.sequence_config import SequenceConfig
 from mmaudio.model.utils.features_utils import FeaturesUtils
 
+# NAG imports
+from src.pipeline_wan_nag import NAGWanPipeline
+from src.transformer_wan_nag import NagWanTransformer3DModel
+
 # Clean up temp files periodically
 def cleanup_temp_files():
     """Clean up temporary files to save storage"""
@@ -44,23 +47,23 @@ def cleanup_temp_files():
     except:
         pass
 
-# Video generation model setup
-MODEL_ID = "Wan-AI/Wan2.1-
-
-
+# Video generation model setup (NAG)
+MODEL_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers"
+SUB_MODEL_ID = "vrgamedevgirl84/Wan14BT2VFusioniX"
+SUB_MODEL_FILENAME = "Wan14BT2VFusioniX_fp16_.safetensors"
 
-image_encoder = CLIPVisionModel.from_pretrained(MODEL_ID, subfolder="image_encoder", torch_dtype=torch.float32)
 vae = AutoencoderKLWan.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=torch.float32)
-
-
+wan_path = hf_hub_download(repo_id=SUB_MODEL_ID, filename=SUB_MODEL_FILENAME)
+transformer = NagWanTransformer3DModel.from_single_file(wan_path, torch_dtype=torch.bfloat16)
+pipe = NAGWanPipeline.from_pretrained(
+    MODEL_ID, vae=vae, transformer=transformer, torch_dtype=torch.bfloat16
 )
-pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config, flow_shift=5.0)
 pipe.to("cuda")
 
-
-pipe.
-pipe.
-pipe.fuse_lora()
+pipe.transformer.__class__.attn_processors = NagWanTransformer3DModel.attn_processors
+pipe.transformer.__class__.set_attn_processor = NagWanTransformer3DModel.set_attn_processor
+pipe.transformer.__class__.forward = NagWanTransformer3DModel.forward
 
 # Audio generation model setup
 torch.backends.cuda.matmul.allow_tf32 = True
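Note on the patching above: the new code assigns `forward` and the attention-processor hooks on `pipe.transformer.__class__` rather than on the instance, so every reference to the class, including ones diffusers holds internally, picks up the NAG-aware implementations. A minimal, self-contained sketch of that idiom, with hypothetical names (`Base`, `nag_forward`), not the app's actual classes:

```python
# Hypothetical stand-ins to illustrate class-level monkey-patching.
class Base:
    def forward(self, x):
        return x            # original behavior

def nag_forward(self, x):
    return 2 * x            # replacement behavior

obj = Base()
Base.forward = nag_forward  # patch the class, not the instance
assert obj.forward(3) == 6  # existing instances see the new method
```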
@@ -81,7 +84,7 @@ def load_audio_model():
     global audio_model, audio_net, audio_feature_utils, audio_seq_cfg
 
     if audio_net is None:
-        audio_model = all_model_cfg['small_16k']
+        audio_model = all_model_cfg['small_16k']
         audio_model.download_if_needed()
         setup_eval_logging()
@@ -106,20 +109,22 @@ def load_audio_model():
 
 # Constants
 MOD_VALUE = 32
-
-
-
+DEFAULT_DURATION_SECONDS = 4
+DEFAULT_STEPS = 4
+DEFAULT_SEED = 2025
+DEFAULT_H_SLIDER_VALUE = 480
+DEFAULT_W_SLIDER_VALUE = 832
+NEW_FORMULA_MAX_AREA = 480.0 * 832.0
 
 SLIDER_MIN_H, SLIDER_MAX_H = 128, 896
 SLIDER_MIN_W, SLIDER_MAX_W = 128, 896
 MAX_SEED = np.iinfo(np.int32).max
 
-FIXED_FPS =
+FIXED_FPS = 16
 MIN_FRAMES_MODEL = 8
-MAX_FRAMES_MODEL =
+MAX_FRAMES_MODEL = 129
 
-
-default_negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards, watermark, text, signature"
+DEFAULT_NAG_NEGATIVE_PROMPT = "Static, motionless, still, ugly, bad quality, worst quality, poorly drawn, low resolution, blurry, lack of details"
 default_audio_prompt = ""
 default_audio_negative_prompt = "music"
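With these constants, the 1-8 s duration slider maps directly onto the model's frame budget at 16 fps; the 8 s ceiling lands exactly on the 129-frame cap (8 × 16 + 1 = 129). A quick check of that arithmetic, mirroring the clamping used later in `generate_video`:

```python
import numpy as np

FIXED_FPS = 16
MIN_FRAMES_MODEL, MAX_FRAMES_MODEL = 8, 129

def frames_for(duration_seconds):
    # duration * fps + 1, clamped to the model's supported frame range
    return int(np.clip(int(round(int(duration_seconds) * FIXED_FPS)) + 1,
                       MIN_FRAMES_MODEL, MAX_FRAMES_MODEL))

assert frames_for(1) == 17
assert frames_for(8) == 129  # the slider maximum exactly hits the frame cap
```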
@@ -243,19 +248,6 @@ label {
     margin-bottom: 5px !important;
 }
 
-/* Image upload area */
-.image-upload {
-    border: 2px dashed rgba(255, 255, 255, 0.3) !important;
-    border-radius: 15px !important;
-    background: rgba(255, 255, 255, 0.05) !important;
-    transition: all 0.3s ease !important;
-}
-
-.image-upload:hover {
-    border-color: rgba(255, 255, 255, 0.5) !important;
-    background: rgba(255, 255, 255, 0.1) !important;
-}
-
 /* Video output area */
 video {
     border-radius: 15px !important;
@@ -287,41 +279,6 @@ input[type="radio"] {
 }
 """
 
-def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area,
-                                  min_slider_h, max_slider_h,
-                                  min_slider_w, max_slider_w,
-                                  default_h, default_w):
-    orig_w, orig_h = pil_image.size
-    if orig_w <= 0 or orig_h <= 0:
-        return default_h, default_w
-
-    aspect_ratio = orig_h / orig_w
-
-    calc_h = round(np.sqrt(calculation_max_area * aspect_ratio))
-    calc_w = round(np.sqrt(calculation_max_area / aspect_ratio))
-
-    calc_h = max(mod_val, (calc_h // mod_val) * mod_val)
-    calc_w = max(mod_val, (calc_w // mod_val) * mod_val)
-
-    new_h = int(np.clip(calc_h, min_slider_h, (max_slider_h // mod_val) * mod_val))
-    new_w = int(np.clip(calc_w, min_slider_w, (max_slider_w // mod_val) * mod_val))
-
-    return new_h, new_w
-
-def handle_image_upload_for_dims_wan(uploaded_pil_image, current_h_val, current_w_val):
-    if uploaded_pil_image is None:
-        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
-    try:
-        new_h, new_w = _calculate_new_dimensions_wan(
-            uploaded_pil_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
-            SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
-            DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE
-        )
-        return gr.update(value=new_h), gr.update(value=new_w)
-    except Exception as e:
-        gr.Warning("Error attempting to calculate new dimensions")
-        return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
-
 def clear_cache():
     """Clear GPU and CPU cache to free memory"""
     if torch.cuda.is_available():
@@ -329,18 +286,13 @@ def clear_cache():
         torch.cuda.synchronize()
     gc.collect()
 
-def get_duration(
-
-
-
-
-
-
-    base_duration = 60
-    if steps > 4 and duration_seconds > 2:
-        base_duration = 90
-    elif steps > 4 or duration_seconds > 2:
-        base_duration = 75
+def get_duration(prompt, nag_negative_prompt, nag_scale,
+                 height, width, duration_seconds,
+                 steps, seed, randomize_seed,
+                 audio_mode, audio_prompt, audio_negative_prompt,
+                 audio_seed, audio_steps, audio_cfg_strength,
+                 progress):
+    base_duration = int(duration_seconds) * int(steps) * 2.25 + 5
 
     # Add extra time for audio generation
     if audio_mode == "Enable Audio":
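The rewritten `get_duration` replaces the old three-bucket estimate (60/75/90 s) with a linear heuristic: roughly 2.25 GPU-seconds per (video-second × inference step), plus 5 s of overhead, before the audio surcharge added just below this hunk. For the defaults (4 s, 4 steps) that is 4 × 4 × 2.25 + 5 = 41 s. A sketch of the cost model; the audio surcharge value here is an assumption, since the real number sits outside the captured hunk:

```python
def estimate_gpu_seconds(duration_seconds, steps, audio_mode="Video Only"):
    # Linear cost model from the diff: 2.25 s per (video-second x step) + 5 s.
    base = int(duration_seconds) * int(steps) * 2.25 + 5
    if audio_mode == "Enable Audio":
        base += 60  # assumed surcharge; the real value is not in this hunk
    return base

print(estimate_gpu_seconds(4, 4))  # 41.0 for the default settings
```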
@@ -387,39 +339,38 @@ def add_audio_to_video(video_path, duration_sec, audio_prompt, audio_negative_pr
     return video_with_audio_path
 
 @spaces.GPU(duration=get_duration)
-def generate_video(
-
-
-                   seed, randomize_seed,
+def generate_video(prompt, nag_negative_prompt, nag_scale,
+                   height, width, duration_seconds,
+                   steps, seed, randomize_seed,
                    audio_mode, audio_prompt, audio_negative_prompt,
                    audio_seed, audio_steps, audio_cfg_strength,
                    progress=gr.Progress(track_tqdm=True)):
 
-    if input_image is None:
-        raise gr.Error("Please upload an input image.")
-
     target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
     target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)
 
-    num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
+    num_frames = np.clip(int(round(int(duration_seconds) * FIXED_FPS) + 1), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
 
     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
 
-
-
-    # Generate video
+    # Generate video using NAG
     with torch.inference_mode():
-
-
+        nag_output_frames_list = pipe(
+            prompt=prompt,
+            nag_negative_prompt=nag_negative_prompt,
+            nag_scale=nag_scale,
+            nag_tau=3.5,
+            nag_alpha=0.5,
             height=target_h, width=target_w, num_frames=num_frames,
-            guidance_scale=
+            guidance_scale=0.,
+            num_inference_steps=int(steps),
             generator=torch.Generator(device="cuda").manual_seed(current_seed)
         ).frames[0]
 
     # Save video without audio
     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
         video_path = tmpfile.name
-    export_to_video(
+    export_to_video(nag_output_frames_list, video_path, fps=FIXED_FPS)
 
     # Generate audio if enabled
     video_with_audio_path = None
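`generate_video` still snaps the requested height and width down to multiples of `MOD_VALUE` (32) before inference, since the model operates on 32-pixel-aligned dimensions. A standalone restatement of the two `target_h`/`target_w` lines above, not new app code:

```python
MOD_VALUE = 32

def snap(dim):
    # Floor to the nearest multiple of 32, never below 32.
    return max(MOD_VALUE, (int(dim) // MOD_VALUE) * MOD_VALUE)

assert snap(480) == 480  # already aligned
assert snap(500) == 480  # floored to a multiple of 32
assert snap(20) == 32    # clamped up to the minimum
```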
@@ -433,7 +384,7 @@ def generate_video(input_image, prompt, height, width,
 
     # Clear cache to free memory
     clear_cache()
-    cleanup_temp_files()
+    cleanup_temp_files()
 
     return video_path, video_with_audio_path, current_seed
 
@@ -443,9 +394,9 @@ def update_audio_visibility(audio_mode):
 
 with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
     with gr.Column(elem_classes=["main-container"]):
-        gr.Markdown("# ✨ Fast
+        gr.Markdown("# ✨ Fast NAG T2V (14B) with Audio Generation")
 
-        # Add badges
+        # Add badges
         gr.HTML("""
         <div class="badge-container">
             <a href="https://huggingface.co/spaces/Heartsync/WAN2-1-fast-T2V-FusioniX" target="_blank">
@@ -453,31 +404,39 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
             </a>
             <a href="https://huggingface.co/spaces/Heartsync/WAN2-1-fast-T2V-FusioniX2" target="_blank">
                 <img src="https://img.shields.io/static/v1?label=BASE&message=WAN%202.1%20T2V-Fusioni2X&color=%23008080&labelColor=%23533a7d&logo=huggingface&logoColor=%23ffffff&style=for-the-badge" alt="Base Model">
-            </a>
-            <a href="https://huggingface.co/spaces/Heartsync/wan2-1-fast-security" target="_blank">
-                <img src="https://img.shields.io/static/v1?label=WAN%202.1&message=FAST%20%26%20Furios&color=%23008080&labelColor=%230000ff&logo=huggingface&logoColor=%23ffa500&style=for-the-badge" alt="badge">
             </a>
         </div>
         """)
 
         with gr.Row():
             with gr.Column(elem_classes=["input-container"]):
-                input_image_component = gr.Image(
-                    type="pil",
-                    label="🖼️ Input Image (auto-resized to target H/W)",
-                    elem_classes=["image-upload"]
-                )
                 prompt_input = gr.Textbox(
-                    label="✏️ Prompt",
-
-                    lines=
+                    label="✏️ Video Prompt",
+                    placeholder="Describe your video scene in detail...",
+                    lines=3
                 )
+
+                with gr.Accordion("🎨 NAG Settings", open=False):
+                    nag_negative_prompt = gr.Textbox(
+                        label="❌ NAG Negative Prompt",
+                        value=DEFAULT_NAG_NEGATIVE_PROMPT,
+                        lines=2
+                    )
+                    nag_scale = gr.Slider(
+                        label="🎯 NAG Scale",
+                        minimum=1.0,
+                        maximum=20.0,
+                        step=0.25,
+                        value=11.0,
+                        info="Higher values = stronger guidance"
+                    )
+
                 duration_seconds_input = gr.Slider(
-                    minimum=
-                    maximum=
-                    step=
-                    value=
-                    label="⏱️ Duration (seconds)",
+                    minimum=1,
+                    maximum=8,
+                    step=1,
+                    value=DEFAULT_DURATION_SECONDS,
+                    label="⏱️ Duration (seconds)",
                     info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
                 )
 
@@ -525,65 +484,53 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
                 )
 
                 with gr.Accordion("⚙️ Advanced Settings", open=False):
-                    negative_prompt_input = gr.Textbox(
-                        label="❌ Negative Prompt",
-                        value=default_negative_prompt,
-                        lines=3
-                    )
-                    seed_input = gr.Slider(
-                        label="🎲 Seed",
-                        minimum=0,
-                        maximum=MAX_SEED,
-                        step=1,
-                        value=42,
-                        interactive=True
-                    )
-                    randomize_seed_checkbox = gr.Checkbox(
-                        label="🔀 Randomize seed",
-                        value=True,
-                        interactive=True
-                    )
                     with gr.Row():
                         height_input = gr.Slider(
-                            minimum=SLIDER_MIN_H,
-                            maximum=SLIDER_MAX_H,
-                            step=MOD_VALUE,
-                            value=DEFAULT_H_SLIDER_VALUE,
-                            label=f"📏 Output Height (
+                            minimum=SLIDER_MIN_H,
+                            maximum=SLIDER_MAX_H,
+                            step=MOD_VALUE,
+                            value=DEFAULT_H_SLIDER_VALUE,
+                            label=f"📏 Output Height (×{MOD_VALUE})"
                         )
                         width_input = gr.Slider(
-                            minimum=SLIDER_MIN_W,
-                            maximum=SLIDER_MAX_W,
-                            step=MOD_VALUE,
-                            value=DEFAULT_W_SLIDER_VALUE,
-                            label=f"📐 Output Width (
+                            minimum=SLIDER_MIN_W,
+                            maximum=SLIDER_MAX_W,
+                            step=MOD_VALUE,
+                            value=DEFAULT_W_SLIDER_VALUE,
+                            label=f"📐 Output Width (×{MOD_VALUE})"
+                        )
+                    with gr.Row():
+                        steps_slider = gr.Slider(
+                            minimum=1,
+                            maximum=8,
+                            step=1,
+                            value=DEFAULT_STEPS,
+                            label="🚀 Inference Steps"
+                        )
+                    seed_input = gr.Slider(
+                        label="🎲 Seed",
+                        minimum=0,
+                        maximum=MAX_SEED,
+                        step=1,
+                        value=DEFAULT_SEED,
+                        interactive=True
                     )
-
-
-
-
-                            value=4,
-                            label="🚀 Inference Steps"
-                        )
-                    guidance_scale_input = gr.Slider(
-                        minimum=0.0,
-                        maximum=20.0,
-                        step=0.5,
-                        value=1.0,
-                        label="🎯 Guidance Scale",
-                        visible=False
+                    randomize_seed_checkbox = gr.Checkbox(
+                        label="🔀 Randomize seed",
+                        value=True,
+                        interactive=True
                     )
 
                 generate_button = gr.Button(
-                    "🎬 Generate Video",
+                    "🎬 Generate Video",
                     variant="primary",
                     elem_classes=["generate-btn"]
                 )
 
             with gr.Column(elem_classes=["output-container"]):
                 video_output = gr.Video(
-                    label="🎥 Generated Video",
-                    autoplay=True,
+                    label="🎥 Generated Video",
+                    autoplay=True,
                     interactive=False
                 )
                 video_with_audio_output = gr.Video(
@@ -600,44 +547,38 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
             outputs=[audio_settings, video_with_audio_output]
         )
 
-    input_image_component.upload(
-        fn=handle_image_upload_for_dims_wan,
-        inputs=[input_image_component, height_input, width_input],
-        outputs=[height_input, width_input]
-    )
-
-    input_image_component.clear(
-        fn=handle_image_upload_for_dims_wan,
-        inputs=[input_image_component, height_input, width_input],
-        outputs=[height_input, width_input]
-    )
-
     ui_inputs = [
-
-
-
+        prompt_input, nag_negative_prompt, nag_scale,
+        height_input, width_input, duration_seconds_input,
+        steps_slider, seed_input, randomize_seed_checkbox,
         audio_mode, audio_prompt, audio_negative_prompt,
         audio_seed, audio_steps, audio_cfg_strength
     ]
     generate_button.click(
-        fn=generate_video,
-        inputs=ui_inputs,
+        fn=generate_video,
+        inputs=ui_inputs,
         outputs=[video_output, video_with_audio_output, seed_input]
     )
 
     with gr.Column():
        gr.Examples(
-            examples=[
-                ["
-
+            examples=[
+                ["A ginger cat passionately plays electric guitar with intensity and emotion on a stage. The background is shrouded in deep darkness. Spotlights cast dramatic shadows.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
+                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
+                 DEFAULT_STEPS, DEFAULT_SEED, False,
+                 "Enable Audio", "electric guitar riffs, cat meowing", default_audio_negative_prompt, -1, 25, 4.5],
+                ["A red vintage Porsche convertible flying over a rugged coastal cliff. Monstrous waves violently crashing against the rocks below. A lighthouse stands tall atop the cliff.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
+                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
+                 DEFAULT_STEPS, DEFAULT_SEED, False,
+                 "Enable Audio", "car engine, ocean waves crashing, wind", default_audio_negative_prompt, -1, 25, 4.5],
+                ["Enormous glowing jellyfish float slowly across a sky filled with soft clouds. Their tentacles shimmer with iridescent light as they drift above a peaceful mountain landscape. Magical and dreamlike, captured in a wide shot. Surreal realism style with detailed textures.", DEFAULT_NAG_NEGATIVE_PROMPT, 11,
+                 DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE, DEFAULT_DURATION_SECONDS,
+                 DEFAULT_STEPS, DEFAULT_SEED, False,
                 "Video Only", "", default_audio_negative_prompt, -1, 25, 4.5],
-                ["forg.jpg", "the frog jumps around", 832, 832,
-                 default_negative_prompt, 2, 1.0, 4, 42, False,
-                 "Enable Audio", "frog croaking, water splashing", default_audio_negative_prompt, -1, 25, 4.5],
             ],
-            inputs=ui_inputs,
-            outputs=[video_output, video_with_audio_output, seed_input],
-            fn=generate_video,
+            inputs=ui_inputs,
+            outputs=[video_output, video_with_audio_output, seed_input],
+            fn=generate_video,
            cache_examples="lazy",
            label="🌟 Example Gallery"
        )
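One thing to keep in mind when editing the example gallery: `gr.Examples` rows are positional, so every row must supply one value per component in `ui_inputs`, in the same order (prompt, NAG negative prompt, NAG scale, height, width, duration, steps, seed, randomize flag, then the six audio fields). A small, hypothetical consistency check, not part of the app:

```python
def check_example_rows(examples, ui_inputs):
    # Guard against example rows drifting out of sync with ui_inputs.
    for i, row in enumerate(examples):
        if len(row) != len(ui_inputs):
            raise ValueError(
                f"example row {i}: {len(row)} values, expected {len(ui_inputs)}"
            )
```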