Spaces:

Sergidev
/

Huanyan-Studio

Build error

File size: 9,911 Bytes

import spaces
import gc
import gradio as gr
import numpy as np
import os
from pathlib import Path
from diffusers import GGUFQuantizationConfig, HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video
from huggingface_hub import snapshot_download
import torch
from PIL import Image

# Configuration
gc.collect()
torch.cuda.empty_cache()
torch.set_grad_enabled(False)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Load base model
model_id = "hunyuanvideo-community/HunyuanVideo"
base_path = f"/home/user/app/{model_id}"
os.makedirs(base_path, exist_ok=True)
snapshot_download(repo_id=model_id, local_dir=base_path)

# Load transformer
ckp_path = Path(base_path)
gguf_filename = "hunyuan-video-t2v-720p-Q4_0.gguf"
transformer_path = f"https://huggingface.co/city96/HunyuanVideo-gguf/blob/main/{gguf_filename}"
transformer = HunyuanVideoTransformer3DModel.from_single_file(
    transformer_path,
    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
    torch_dtype=torch.bfloat16,
).to('cuda')

# Initialize pipeline
pipe = HunyuanVideoPipeline.from_pretrained(
    ckp_path,
    transformer=transformer,
    torch_dtype=torch.float16
).to("cuda")

# Configure VAE
pipe.vae.enable_tiling()
pipe.vae.enable_slicing()
pipe.vae.eval()

# Available LORAs with display names
LORA_CHOICES = [
    ("stripe_v2.safetensors", "Stripe Style"),
    ("Top_Off.safetensors", "Top Off Effect"),
    ("huanyan_helper.safetensors", "Hunyuan Helper"),
    ("huanyan_helper_alpha.safetensors", "Hunyuan Alpha"),
    ("hunyuan-t-solo-v1.0.safetensors", "Solo Animation")
]

# Load all LORAs with hunyuanvideo-lora adapter
for weight_name, display_name in LORA_CHOICES:
    pipe.load_lora_weights(
        "Sergidev/TTV4ME",
        weight_name=weight_name,
        adapter_name=display_name.replace(" ", "_").lower(),
        token=os.environ.get("HF_TOKEN")
    )

# Memory cleanup
gc.collect()
torch.cuda.empty_cache()

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

@spaces.GPU(duration=300)
def generate(
    prompt,
    image_input,
    height,
    width,
    num_frames,
    num_inference_steps,
    seed_value,
    fps,
    selected_loras,
    lora_weights,
    progress=gr.Progress(track_tqdm=True)
):
    # Validate image resolution
    if image_input is not None:
        img = Image.open(image_input)
        if img.size != (width, height):
            raise gr.Error(f"Image resolution {img.size} must match video resolution ({width}x{height})")

    # Configure LORAs
    active_adapters = [lora[1].replace(" ", "_").lower() for lora in LORA_CHOICES if lora[1] in selected_loras]
    weights = [float(lora_weights[selected_loras.index(lora[1])]) for lora in LORA_CHOICES if lora[1] in selected_loras]
    pipe.set_adapters(active_adapters, weights)

    with torch.cuda.device(0):
        if seed_value == -1:
            seed_value = torch.randint(0, MAX_SEED, (1,)).item()
        generator = torch.Generator('cuda').manual_seed(seed_value)

        with torch.amp.autocast_mode.autocast('cuda', dtype=torch.bfloat16), torch.inference_mode(), torch.no_grad():
            # Use image input if provided, else use text prompt
            if image_input:
                output = pipe(
                    image=Image.open(image_input).convert("RGB"),
                    height=height,
                    width=width,
                    num_frames=num_frames,
                    num_inference_steps=num_inference_steps,
                    generator=generator,
                ).frames[0]
            else:
                output = pipe(
                    prompt=prompt,
                    height=height,
                    width=width,
                    num_frames=num_frames,
                    num_inference_steps=num_inference_steps,
                    generator=generator,
                ).frames[0]

        output_path = "output.mp4"
        export_to_video(output, output_path, fps=fps)
        torch.cuda.empty_cache()
        gc.collect()
        return output_path

def apply_preset(preset_name, *current_values):
    if preset_name == "Higher Resolution":
        return [608, 448, 24, 29, 12]
    elif preset_name == "More Frames":
        return [512, 320, 42, 27, 14]
    return current_values

css = """
#col-container {
    margin: 0 auto;
    max-width: 850px;
}

.dark-theme {
    background-color: #1f1f1f;
    color: #ffffff;
}

.container {
    margin: 0 auto;
    padding: 20px;
    border-radius: 10px;
    background-color: #2d2d2d;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.title {
    text-align: center;
    margin-bottom: 1em;
    color: #ffffff;
}

.description {
    text-align: center;
    margin-bottom: 2em;
    color: #cccccc;
    font-size: 0.95em;
    line-height: 1.5;
}

.prompt-container {
    background-color: #363636;
    padding: 15px;
    border-radius: 8px;
    margin-bottom: 1em;
    width: 100%;
}

.prompt-textbox {
    min-height: 80px !important;
}

.preset-buttons {
    display: flex;
    gap: 10px;
    justify-content: center;
    margin-bottom: 1em;
}

.support-text {
    text-align: center;
    margin-top: 1em;
    color: #cccccc;
    font-size: 0.9em;
}

a {
    color: #00a7e1;
    text-decoration: none;
}

a:hover {
    text-decoration: underline;
}

.lora-sliders {
    margin-top: 15px;
    border-top: 1px solid #444;
    padding-top: 15px;
}
"""

with gr.Blocks(css=css, theme="dark") as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# 🎬 Hunyuan Studio", elem_classes=["title"])
        gr.Markdown(
            """Generate videos from text or images using multiple LoRA adapters.
            Requires matching resolution between input image and output settings.""",
            elem_classes=["description"]
        )

        with gr.Column(elem_classes=["prompt-container"]):
            prompt = gr.Textbox(
                label="Prompt",
                placeholder="Enter text prompt or upload image below",
                show_label=False,
                elem_classes=["prompt-textbox"],
                lines=3
            )
            image_input = gr.Image(type="filepath", label="Upload Image (Optional)")

        with gr.Row():
            run_button = gr.Button("🎨 Generate", variant="primary", size="lg")

        with gr.Row(elem_classes=["preset-buttons"]):
            preset_high_res = gr.Button("📺 Higher Resolution Preset")
            preset_more_frames = gr.Button("🎞️ More Frames Preset")

        with gr.Row():
            result = gr.Video(label="Generated Video")

        with gr.Accordion("⚙️ Advanced Settings", open=False):
            seed = gr.Slider(
                label="Seed (-1 for random)",
                minimum=-1,
                maximum=MAX_SEED,
                step=1,
                value=-1,
            )

            with gr.Row():
                height = gr.Slider(
                    label="Height",
                    minimum=256,
                    maximum=MAX_IMAGE_SIZE,
                    step=16,
                    value=608,
                )
                width = gr.Slider(
                    label="Width",
                    minimum=256,
                    maximum=MAX_IMAGE_SIZE,
                    step=16,
                    value=448,
                )

            with gr.Row():
                num_frames = gr.Slider(
                    label="Number of frames",
                    minimum=1.0,
                    maximum=257.0,
                    step=1,
                    value=24,
                )
                num_inference_steps = gr.Slider(
                    label="Inference steps",
                    minimum=1,
                    maximum=50,
                    step=1,
                    value=29,
                )
                fps = gr.Slider(
                    label="Frames per second",
                    minimum=1,
                    maximum=60,
                    step=1,
                    value=12,
                )

            with gr.Column(elem_classes=["lora-sliders"]):
                gr.Markdown("### LoRA Adapters")
                lora_checkboxes = gr.CheckboxGroup(
                    label="Select LoRAs",
                    choices=[display for (_, display) in LORA_CHOICES],
                    value=["Stripe Style", "Top Off Effect"]
                )
                lora_weight_sliders = []
                for _, display_name in LORA_CHOICES:
                    lora_weight_sliders.append(
                        gr.Slider(
                            label=f"{display_name} Weight",
                            minimum=0.0,
                            maximum=1.0,
                            value=0.9 if "Stripe" in display_name else 0.8,
                            visible=False
                        )
                    )

    # Event handling
    run_button.click(
        fn=generate,
        inputs=[prompt, image_input, height, width, num_frames,
                num_inference_steps, seed, fps, lora_checkboxes, lora_weight_sliders],
        outputs=[result],
    )

    # Preset button handlers
    preset_high_res.click(
        fn=lambda: apply_preset("Higher Resolution"),
        outputs=[height, width, num_frames, num_inference_steps, fps]
    )
    preset_more_frames.click(
        fn=lambda: apply_preset("More Frames"),
        outputs=[height, width, num_frames, num_inference_steps, fps]
    )

    # Show/hide LORA weight sliders based on checkbox selection
    def toggle_lora_sliders(selected_loras):
        updates = []
        for lora in LORA_CHOICES:
            updates.append(gr.update(visible=lora[1] in selected_loras))
        return updates

    lora_checkboxes.change(
        fn=toggle_lora_sliders,
        inputs=lora_checkboxes,
        outputs=lora_weight_sliders
    )