dangthr committed on
Commit 2a7f487 · verified · 1 Parent(s): f55df05

Update app.py

Files changed (1): app.py +137 -217
app.py CHANGED
@@ -1,189 +1,123 @@
  import os
  import sys
- sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-
- # wan2.2-main/gradio_ti2v.py
- import gradio as gr
  import torch
  from huggingface_hub import snapshot_download
  from PIL import Image
- import random
- import numpy as np
- import spaces
-
- import wan
- from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS, SUPPORTED_SIZES
- from wan.utils.utils import cache_video
-
  import gc

- # --- 1. Global Setup and Model Loading ---
-
- print("Starting Gradio App for Wan 2.2 TI2V-5B...")
-
- # Download model snapshots from Hugging Face Hub
- repo_id = "Wan-AI/Wan2.2-TI2V-5B"
- print(f"Downloading/loading checkpoints for {repo_id}...")
- ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
- print(f"Using checkpoints from {ckpt_dir}")
-
- # Load the model configuration
- TASK_NAME = 'ti2v-5B'
- cfg = WAN_CONFIGS[TASK_NAME]
- FIXED_FPS = 24
- MIN_FRAMES_MODEL = 8
- MAX_FRAMES_MODEL = 121
-
- # Dimension calculation constants
- MOD_VALUE = 32
- DEFAULT_H_SLIDER_VALUE = 704
- DEFAULT_W_SLIDER_VALUE = 1280
- NEW_FORMULA_MAX_AREA = 1280.0 * 704.0
-
- SLIDER_MIN_H, SLIDER_MAX_H = 128, 1280
- SLIDER_MIN_W, SLIDER_MAX_W = 128, 1280
-
- # Instantiate the pipeline in the global scope
- print("Initializing WanTI2V pipeline...")
- device = "cuda" if torch.cuda.is_available() else "cpu"
- device_id = 0 if torch.cuda.is_available() else -1
- pipeline = wan.WanTI2V(
-     config=cfg,
-     checkpoint_dir=ckpt_dir,
-     device_id=device_id,
-     rank=0,
-     t5_fsdp=False,
-     dit_fsdp=False,
-     use_sp=False,
-     t5_cpu=False,
-     init_on_cpu=False,
-     convert_model_dtype=True,
- )
- print("Pipeline initialized and ready.")
-
- # --- Helper Functions (from Wan 2.1 Fast demo) ---
- def _calculate_new_dimensions_wan(pil_image, mod_val, calculation_max_area,
-                                   min_slider_h, max_slider_h,
-                                   min_slider_w, max_slider_w,
-                                   default_h, default_w):
-     orig_w, orig_h = pil_image.size
-     if orig_w <= 0 or orig_h <= 0:
-         return default_h, default_w

-     aspect_ratio = orig_h / orig_w
-
-     calc_h = round(np.sqrt(calculation_max_area * aspect_ratio))
-     calc_w = round(np.sqrt(calculation_max_area / aspect_ratio))

-     calc_h = max(mod_val, (calc_h // mod_val) * mod_val)
-     calc_w = max(mod_val, (calc_w // mod_val) * mod_val)
-
-     new_h = int(np.clip(calc_h, min_slider_h, (max_slider_h // mod_val) * mod_val))
-     new_w = int(np.clip(calc_w, min_slider_w, (max_slider_w // mod_val) * mod_val))
-
-     return new_h, new_w

- def handle_image_upload_for_dims_wan(uploaded_pil_image, current_h_val, current_w_val):
      """
-     Handle image upload and calculate appropriate dimensions for video generation.
-
-     Args:
-         uploaded_pil_image: The uploaded image (PIL Image or numpy array)
-         current_h_val: Current height slider value
-         current_w_val: Current width slider value
-
-     Returns:
-         Tuple of gr.update objects for height and width sliders
      """
-     if uploaded_pil_image is None:
-         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)
      try:
-         # Convert numpy array to PIL Image if needed
-         if hasattr(uploaded_pil_image, 'shape'):  # numpy array
-             pil_image = Image.fromarray(uploaded_pil_image).convert("RGB")
-         else:  # already PIL Image
-             pil_image = uploaded_pil_image
-
-         new_h, new_w = _calculate_new_dimensions_wan(
-             pil_image, MOD_VALUE, NEW_FORMULA_MAX_AREA,
-             SLIDER_MIN_H, SLIDER_MAX_H, SLIDER_MIN_W, SLIDER_MAX_W,
-             DEFAULT_H_SLIDER_VALUE, DEFAULT_W_SLIDER_VALUE
-         )
-         return gr.update(value=new_h), gr.update(value=new_w)
      except Exception as e:
-         gr.Warning("Error attempting to calculate new dimensions")
-         return gr.update(value=DEFAULT_H_SLIDER_VALUE), gr.update(value=DEFAULT_W_SLIDER_VALUE)

- def get_duration(image,
-                  prompt,
-                  height,
-                  width,
-                  duration_seconds,
-                  sampling_steps,
-                  guide_scale,
-                  shift,
-                  seed,
-                  progress):
-     """Calculate dynamic GPU duration based on parameters."""
-     return sampling_steps * 15

- # --- 2. Gradio Inference Function ---
- @spaces.GPU(duration=get_duration)
- def generate_video(
-     image,
-     prompt,
-     height,
-     width,
-     duration_seconds,
-     sampling_steps=38,
-     guide_scale=cfg.sample_guide_scale,
-     shift=cfg.sample_shift,
-     seed=42,
-     progress=gr.Progress(track_tqdm=True)
- ):
      """
-     Generate a video from text prompt and optional image using the Wan 2.2 TI2V model.
-
-     Args:
-         image: Optional input image (numpy array) for image-to-video generation
-         prompt: Text prompt describing the desired video
-         height: Target video height in pixels
-         width: Target video width in pixels
-         duration_seconds: Desired video duration in seconds
-         sampling_steps: Number of denoising steps for video generation
-         guide_scale: Guidance scale for classifier-free guidance
-         shift: Sample shift parameter for the model
-         seed: Random seed for reproducibility (-1 for random)
-         progress: Gradio progress tracker
-
-     Returns:
-         Path to the generated video file
      """
      if seed == -1:
          seed = random.randint(0, sys.maxsize)

-     # Ensure dimensions are multiples of MOD_VALUE
      target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
      target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

-     input_image = None
-     if image is not None:
-         input_image = Image.fromarray(image).convert("RGB")
-         # Resize image to match target dimensions
-         input_image = input_image.resize((target_w, target_h))
-
-     # Calculate number of frames based on duration
      num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)

-     # Create size string for the pipeline
-     size_str = f"{target_h}*{target_w}"

      video_tensor = pipeline.generate(
          input_prompt=prompt,
-         img=input_image,  # Pass None for T2V, an Image for I2V
          size=SIZE_CONFIGS.get(size_str, (target_h, target_w)),
          max_area=MAX_AREA_CONFIGS.get(size_str, target_h * target_w),
-         frame_num=num_frames,  # Use calculated frames instead of cfg.frame_num
          shift=shift,
          sample_solver='unipc',
          sampling_steps=int(sampling_steps),
@@ -192,83 +126,69 @@ def generate_video(
          offload_model=True
      )

-     # Save the video to a temporary file
      video_path = cache_video(
-         tensor=video_tensor[None],  # Add a batch dimension
-         save_file=None,  # cache_video will create a temp file
          fps=cfg.sample_fps,
          normalize=True,
          value_range=(-1, 1)
      )
      del video_tensor
      gc.collect()
-     return video_path
-

- # --- 3. Gradio Interface ---
- css = ".gradio-container {max-width: 1100px !important; margin: 0 auto} #output_video {height: 500px;} #input_image {height: 500px;}"

- with gr.Blocks(css=css, theme=gr.themes.Soft(), delete_cache=(60, 900)) as demo:
-     gr.Markdown("# Wan 2.2 TI2V 5B")
-     gr.Markdown("generate high quality videos using **Wan 2.2 5B Text-Image-to-Video model**,[[model]](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B),[[paper]](https://arxiv.org/abs/2503.20314)")

-     with gr.Row():
-         with gr.Column(scale=2):
-             image_input = gr.Image(type="numpy", label="Optional (blank = text-to-image)", elem_id="input_image")
-             prompt_input = gr.Textbox(label="Prompt", value="A beautiful waterfall in a lush jungle, cinematic.", lines=3)
-             duration_input = gr.Slider(
-                 minimum=round(MIN_FRAMES_MODEL/FIXED_FPS, 1),
-                 maximum=round(MAX_FRAMES_MODEL/FIXED_FPS, 1),
-                 step=0.1,
-                 value=2.0,
-                 label="Duration (seconds)",
-                 info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps."
-             )
-
-             with gr.Accordion("Advanced Settings", open=False):
-                 with gr.Row():
-                     height_input = gr.Slider(minimum=SLIDER_MIN_H, maximum=SLIDER_MAX_H, step=MOD_VALUE, value=DEFAULT_H_SLIDER_VALUE, label=f"Output Height (multiple of {MOD_VALUE})")
-                     width_input = gr.Slider(minimum=SLIDER_MIN_W, maximum=SLIDER_MAX_W, step=MOD_VALUE, value=DEFAULT_W_SLIDER_VALUE, label=f"Output Width (multiple of {MOD_VALUE})")
-                 steps_input = gr.Slider(label="Sampling Steps", minimum=10, maximum=50, value=38, step=1)
-                 scale_input = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, value=cfg.sample_guide_scale, step=0.1)
-                 shift_input = gr.Slider(label="Sample Shift", minimum=1.0, maximum=20.0, value=cfg.sample_shift, step=0.1)
-                 seed_input = gr.Number(label="Seed (-1 for random)", value=-1, precision=0)

-         with gr.Column(scale=2):
-             video_output = gr.Video(label="Generated Video", elem_id="output_video")
-             run_button = gr.Button("Generate Video", variant="primary")
-
-     # Add image upload handler
-     image_input.upload(
-         fn=handle_image_upload_for_dims_wan,
-         inputs=[image_input, height_input, width_input],
-         outputs=[height_input, width_input]
      )

-     image_input.clear(
-         fn=handle_image_upload_for_dims_wan,
-         inputs=[image_input, height_input, width_input],
-         outputs=[height_input, width_input]
      )
-
-     example_image_path = os.path.join(os.path.dirname(__file__), "examples/i2v_input.JPG")
-     gr.Examples(
-         examples=[
-             [example_image_path, "The cat removes the glasses from its eyes.", 1088, 800, 1.5],
-             [None, "A cinematic shot of a boat sailing on a calm sea at sunset.", 704, 1280, 2.0],
-             [None, "Drone footage flying over a futuristic city with flying cars.", 704, 1280, 2.0],
-         ],
-         inputs=[image_input, prompt_input, height_input, width_input, duration_input],
-         outputs=video_output,
-         fn=generate_video,
-         cache_examples="lazy",
      )

-     run_button.click(
-         fn=generate_video,
-         inputs=[image_input, prompt_input, height_input, width_input, duration_input, steps_input, scale_input, shift_input, seed_input],
-         outputs=video_output
-     )

  if __name__ == "__main__":
-     demo.launch(mcp_server=True)
 
  import os
  import sys
+ import argparse
+ import random
+ import numpy as np
  import torch
  from huggingface_hub import snapshot_download
  from PIL import Image
  import gc

+ # Add the directory containing this file to the Python path
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

+ # Import the required modules from the 'wan' library
+ import wan
+ from wan.configs import WAN_CONFIGS, SIZE_CONFIGS, MAX_AREA_CONFIGS
+ from wan.utils.utils import cache_video

+ # --- 1. Model downloader ---

+ def download_models():
      """
+     Download and cache the required model from the Hugging Face Hub.
      """
+     repo_id = "Wan-AI/Wan2.2-TI2V-5B"
+     print(f"Downloading model checkpoints for {repo_id}...")
      try:
+         ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
+         print(f"✅ Model downloaded successfully to: {ckpt_dir}")
      except Exception as e:
+         print(f"❌ Error while downloading the model: {e}")
+         sys.exit(1)

+ # --- 2. Video generation function ---

+ def generate_video_cli(prompt: str):
      """
+     Generate a video from a text prompt using command-line settings.
      """
+     print("🎬 Starting the video generation process...")
+
+     # --- Setup ---
+     print("Loading model configuration...")
+     repo_id = "Wan-AI/Wan2.2-TI2V-5B"
+     # Make sure the model has been downloaded; otherwise download it now.
+     try:
+         # snapshot_download checks the local cache and skips the download if the files already exist
+         ckpt_dir = snapshot_download(repo_id, local_dir_use_symlinks=False)
+     except Exception as e:
+         print("❌ Could not find or download the model. Run `python app.py --downloader` first.")
+         print(f"Error details: {e}")
+         sys.exit(1)
+
+     print(f"Using checkpoints from {ckpt_dir}")
+
+     TASK_NAME = 'ti2v-5B'
+     cfg = WAN_CONFIGS[TASK_NAME]
+
+     # --- Generation parameters (defaults taken from the original script) ---
+     height = 704
+     width = 1280
+     duration_seconds = 2.0
+     sampling_steps = 38
+     guide_scale = cfg.sample_guide_scale
+     shift = cfg.sample_shift
+     seed = -1  # -1 means a random seed
+     image = None  # the CLI version does not handle image input yet
+
+     # --- Processing ---
      if seed == -1:
          seed = random.randint(0, sys.maxsize)
+         print(f"Using random seed: {seed}")

+     # Make sure the dimensions are valid
+     MOD_VALUE = 32
      target_h = max(MOD_VALUE, (int(height) // MOD_VALUE) * MOD_VALUE)
      target_w = max(MOD_VALUE, (int(width) // MOD_VALUE) * MOD_VALUE)

+     # Compute the number of frames
+     FIXED_FPS = 24
+     MIN_FRAMES_MODEL = 8
+     MAX_FRAMES_MODEL = 121
      num_frames = np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL)
+     print(f"Generating {num_frames} frames ({duration_seconds}s @ {FIXED_FPS}fps) at {target_w}x{target_h}.")

+     # --- Initialize the pipeline ---
+     print("Initializing the WanTI2V pipeline... (this may take a while)")
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     device_id = 0 if torch.cuda.is_available() else -1
+     if device == "cpu":
+         print("⚠️ Warning: no GPU detected. Running on the CPU will be very slow.")
+
+     try:
+         pipeline = wan.WanTI2V(
+             config=cfg,
+             checkpoint_dir=ckpt_dir,
+             device_id=device_id,
+             rank=0,
+             t5_fsdp=False,
+             dit_fsdp=False,
+             use_sp=False,
+             t5_cpu=False,
+             init_on_cpu=False,
+             convert_model_dtype=True,
+         )
+         print("Pipeline initialized.")
+     except Exception as e:
+         print(f"❌ Failed to initialize the pipeline: {e}")
+         sys.exit(1)

+     # --- Generate the video ---
+     print(f"Generating video for prompt: '{prompt}'")
+     size_str = f"{target_h}*{target_w}"
+
      video_tensor = pipeline.generate(
          input_prompt=prompt,
+         img=image,
          size=SIZE_CONFIGS.get(size_str, (target_h, target_w)),
          max_area=MAX_AREA_CONFIGS.get(size_str, target_h * target_w),
+         frame_num=num_frames,
          shift=shift,
          sample_solver='unipc',
          sampling_steps=int(sampling_steps),
          offload_model=True
      )

+     # --- Save the video ---
+     print("Saving the video...")
+
+     # Build a filesystem-safe filename from the prompt
+     safe_prompt = "".join([c for c in prompt if c.isalnum() or c == ' ']).rstrip()
+     safe_prompt = safe_prompt.replace(" ", "_")
+     output_filename = f"{safe_prompt[:50]}_{seed}.mp4"
+     output_path = os.path.join(os.getcwd(), output_filename)  # save in the current working directory
+
      video_path = cache_video(
+         tensor=video_tensor[None],
+         save_file=output_path,  # explicit save path
          fps=cfg.sample_fps,
          normalize=True,
          value_range=(-1, 1)
      )
+
+     # --- Cleanup ---
+     del pipeline
      del video_tensor
      gc.collect()
+     if torch.cuda.is_available():
+         torch.cuda.empty_cache()

+     print(f"✅ Video generation complete! Saved to: {video_path}")

+ # --- 3. Main entry point ---

+ def main():
+     """
+     Parse the command-line arguments and run the corresponding action.
+     """
+     parser = argparse.ArgumentParser(
+         description="Wan 2.2 TI2V-5B command-line tool: generate a video from text or download the model.",
+         formatter_class=argparse.RawTextHelpFormatter
      )

+     parser.add_argument(
+         '--prompt',
+         nargs='+',
+         type=str,
+         help="Text prompt for video generation.\nExample: --prompt A beautiful waterfall"
      )
+
+     parser.add_argument(
+         '--downloader',
+         action='store_true',
+         help="If set, only download the required model and then exit."
      )

+     args = parser.parse_args()
+
+     if args.downloader:
+         download_models()
+     elif args.prompt:
+         # Join the list of words into a single prompt string.
+         # This handles 'prompt text', "prompt text", and bare prompt text alike.
+         prompt_text = " ".join(args.prompt)
+         generate_video_cli(prompt_text)
+     else:
+         print("No action specified. Provide --prompt or use the --downloader flag.")
+         parser.print_help()

  if __name__ == "__main__":
+     main()
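
Usage sketch (not part of the commit; the two flags below are the ones defined by the argparse setup above, the exact prompt text is only an example):

    # download and cache the model once
    python app.py --downloader

    # generate a clip from a text prompt
    python app.py --prompt A cinematic shot of a boat sailing on a calm sea at sunset

Because --prompt is declared with nargs='+', the words following it are joined into a single prompt string, so quoting is optional. The finished clip is written to the current working directory as <sanitized-prompt>_<seed>.mp4.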