# STUDIO
#
# Sleeping
#
# File size: 26,711 Bytes
#
# 3707c06

import gradio as gr
import numpy as np
from PIL import Image, ImageDraw
from gradio_client import Client, handle_file
import random
import tempfile
import os
import logging
import torch
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from huggingface_hub import hf_hub_download
from pathlib import Path
import torchaudio
from einops import rearrange
from scipy.io import wavfile
from transformers import pipeline

# Set an environment variable intended to bypass the torch.load check
# (temporary workaround).
# NOTE(review): this assignment runs after `transformers` has already been
# imported above; if the library reads the variable at import time the setting
# comes too late — confirm when the flag is actually consulted.
os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"

# Spaces GPU
try:
    import spaces
except ImportError:
    # Fallback used when the `spaces` package is not installed: a stand-in that
    # exposes a no-op `GPU` decorator so decorated functions still run.
    class spaces:
        """Minimal stand-in for the `spaces` package."""

        @staticmethod
        def GPU(func=None, duration=None):
            """No-op replacement for ``spaces.GPU``.

            Works both as a bare decorator (``@spaces.GPU``) and as a
            decorator factory (``@spaces.GPU(duration=24)``); this file uses
            both forms.

            Args:
                func: The function being decorated when used bare; anything
                    non-callable is treated as a positional ``duration``.
                duration: Accepted for signature compatibility and ignored.

            Returns:
                ``func`` unchanged when used bare, otherwise a decorator that
                returns its argument unchanged.
            """
            if callable(func):
                # Bare usage: the decorated function arrives as the first
                # positional argument and must be returned as-is.
                return func

            def decorator(inner):
                return inner
            return decorator

# MMAudio imports
try:
    import mmaudio
except ImportError:
    # The package (or one of its dependencies) is not importable yet: install
    # the current directory in editable mode, then retry the import.
    import importlib
    import subprocess
    import sys

    # Invoke pip through the running interpreter so the install targets this
    # environment, and fail loudly if pip itself reports an error instead of
    # ignoring its exit status.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "."])
    # Make the import system notice packages installed after startup.
    importlib.invalidate_caches()
    import mmaudio

from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
                                setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils

# Load the ControlNet-based outpainting stack.
# Everything happens inside one try block: any failure (missing modules,
# download errors, no usable "cuda" device, ...) is logged and leaves
# OUTPAINT_MODEL_LOADED set to False, in which case `pipe` is never defined.
try:
    from controlnet_union import ControlNetModel_Union
    from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
    
    # Download the ControlNet config and build an uninitialised model from it
    config_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="config_promax.json",
    )
    
    config = ControlNetModel_Union.load_config(config_file)
    controlnet_model = ControlNetModel_Union.from_config(config)
    
    # Download the weights and load them into the model
    model_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="diffusion_pytorch_model_promax.safetensors",
    )
    state_dict = load_state_dict(model_file)
    loaded_keys = list(state_dict.keys())
    
    # NOTE(review): _load_pretrained_model is a private (underscore-prefixed)
    # method; its positional signature and return tuple are presumably tied to
    # a specific library version — confirm against the pinned dependency.
    result = ControlNetModel_Union._load_pretrained_model(
        controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
    )
    
    # The first element of the result is used as the loaded model
    model = result[0]
    model = model.to(device="cuda", dtype=torch.float16)
    
    # Load the VAE
    vae = AutoencoderKL.from_pretrained(
        "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
    ).to("cuda")
    
    # Load the fill pipeline, wiring in the VAE and ControlNet loaded above
    pipe = StableDiffusionXLFillPipeline.from_pretrained(
        "SG161222/RealVisXL_V5.0_Lightning",
        torch_dtype=torch.float16,
        vae=vae,
        controlnet=model,
        variant="fp16",
    ).to("cuda")
    
    # Replace the pipeline's scheduler with a TCDScheduler built from the same config
    pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
    
    OUTPAINT_MODEL_LOADED = True
except Exception as e:
    logging.error(f"Failed to load outpainting models: {str(e)}")
    OUTPAINT_MODEL_LOADED = False

# MMAudio runtime configuration: pick the compute device and tensor dtype.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Enable cuDNN benchmark mode and allow TF32 in matmul / cuDNN kernels.
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cuda.matmul.allow_tf32 = True

dtype = torch.bfloat16

# MMAudio model initialisation.
# Selects the 'large_44k_v2' configuration, fetches its files if needed, tries
# to build a translation pipeline, and loads the network plus feature
# utilities. Any failure is logged and leaves MMAUDIO_MODEL_LOADED False.
try:
    model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
    model_mmaudio.download_if_needed()
    # NOTE(review): output_dir is not referenced anywhere else in the visible code.
    output_dir = Path('./output/gradio')
    setup_eval_logging()
    
    # Translator setup (Helsinki-NLP/opus-mt-ko-en, run on CPU). A failure here
    # is non-fatal: it only disables prompt translation.
    try:
        translator = pipeline("translation", 
                             model="Helsinki-NLP/opus-mt-ko-en", 
                             device="cpu",
                             use_fast=True,
                             trust_remote_code=False)
    except Exception as e:
        logging.warning(f"Failed to load translation model: {e}")
        translator = None
    
    def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
        """Build the MMAudio network and its feature utilities.

        Both objects are moved to the module-level `device` / `dtype` and put
        in eval mode.

        Returns:
            A `(net, feature_utils, seq_cfg)` tuple, where `seq_cfg` is taken
            from the selected model configuration.
        """
        # NOTE(review): torch.cuda.device() is entered even when `device` is
        # the CPU device chosen above — confirm this is valid on CPU-only
        # hosts. An exception raised here is caught by the outer handler and
        # disables MMAudio.
        with torch.cuda.device(device):
            seq_cfg = model_mmaudio.seq_cfg
            net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, dtype).eval()
            # weights_only=True restricts torch.load to loading tensor data
            net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
            logging.info(f'Loaded weights from {model_mmaudio.model_path}')

            feature_utils = FeaturesUtils(
                tod_vae_ckpt=model_mmaudio.vae_path,
                synchformer_ckpt=model_mmaudio.synchformer_ckpt,
                enable_conditions=True,
                mode=model_mmaudio.mode,
                bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
                need_vae_encoder=False
            ).to(device, dtype).eval()

            return net, feature_utils, seq_cfg

    net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
    MMAUDIO_MODEL_LOADED = True
except Exception as e:
    logging.error(f"Failed to load MMAudio models: {str(e)}")
    MMAUDIO_MODEL_LOADED = False
    # Also resets a translator that may have loaded successfully above.
    translator = None

# API URLs
# Remote endpoints passed to gradio_client.Client for text-to-image and
# image-to-video generation. Each can be overridden with an environment
# variable of the same name; the literal default is used when it is unset.
TEXT2IMG_API_URL = os.environ.get("TEXT2IMG_API_URL", "http://211.233.58.201:7896")
VIDEO_API_URL = os.environ.get("VIDEO_API_URL", "http://211.233.58.201:7875")

# Logging configuration
logging.basicConfig(level=logging.INFO)

# Image size presets: label -> {"width": ..., "height": ...} in pixels.
# Insertion order is the order shown in the preset dropdowns.
IMAGE_PRESETS = {
    label: {"width": preset_width, "height": preset_height}
    for label, preset_width, preset_height in (
        ("커스텀", 1024, 1024),
        ("1:1 정사각형", 1024, 1024),
        ("4:3 표준", 1024, 768),
        ("16:9 와이드스크린", 1024, 576),
        ("9:16 세로형", 576, 1024),
        ("6:19 특수 세로형", 324, 1024),
        ("Instagram 정사각형", 1080, 1080),
        ("Instagram 스토리", 1080, 1920),
        ("Instagram 가로형", 1080, 566),
        ("Facebook 커버", 820, 312),
        ("Twitter 헤더", 1500, 500),
        ("YouTube 썸네일", 1280, 720),
        ("LinkedIn 배너", 1584, 396),
    )
}

def update_dimensions(preset):
    """Return the ``(width, height)`` pair for a preset label.

    Labels that are not in ``IMAGE_PRESETS`` fall back to ``(1024, 1024)``.
    """
    size = IMAGE_PRESETS.get(preset)
    if size is None:
        return 1024, 1024
    return size["width"], size["height"]

def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
    """Request an image from the remote text-to-image endpoint.

    Args:
        prompt: Text description of the image; an empty prompt is rejected.
        width, height: Requested size, cast to int before sending.
        guidance: Guidance value, cast to float before sending.
        inference_steps: Step count, cast to int before sending.
        seed: Seed to use; ``-1`` means "draw a random seed".

    Returns:
        ``(image, message)``: the first and second elements of the endpoint's
        response (the second embedded in a status string), or
        ``(None, error_message)`` when the prompt is empty or the call fails.
    """
    if not prompt:
        return None, "프롬프트를 입력해주세요"

    try:
        client = Client(TEXT2IMG_API_URL)
        # -1 is the sentinel for a randomly chosen seed.
        chosen_seed = random.randint(0, 9999999) if seed == -1 else seed

        request = {
            "prompt": prompt,
            "width": int(width),
            "height": int(height),
            "guidance": float(guidance),
            "inference_steps": int(inference_steps),
            "seed": int(chosen_seed),
            "do_img2img": False,
            "init_image": None,
            "image2image_strength": 0.8,
            "resize_img": True,
            "api_name": "/generate_image",
        }
        response = client.predict(**request)
        return response[0], f"사용된 시드: {response[1]}"
    except Exception as e:
        logging.error(f"Image generation error: {str(e)}")
        return None, f"오류: {str(e)}"

def generate_video_from_image(image, prompt="", length=4.0):
    """Turn a still image into a video through the remote video endpoint.

    The image is written to a temporary PNG, uploaded, and the temporary file
    is removed afterwards whether or not the request succeeded.

    Args:
        image: Array accepted by ``Image.fromarray``; ``None`` short-circuits.
        prompt: Motion description; an empty value sends a default prompt.
        length: Requested video length, cast to float before sending.

    Returns:
        The value under the ``"video"`` key of the first response element when
        that element is a dict; otherwise ``None`` (also on any error).
    """
    if image is None:
        return None

    temp_path = None
    try:
        # Reserve a temp file name, then save after the handle is closed so the
        # write does not depend on reopening a file that is still open.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
            temp_path = fp.name
        Image.fromarray(image).save(temp_path)

        # Call the API
        client = Client(VIDEO_API_URL)
        result = client.predict(
            input_image=handle_file(temp_path),
            prompt=prompt if prompt else "Generate natural motion",
            n_prompt="",
            seed=random.randint(0, 9999999),
            use_teacache=True,
            video_length=float(length),
            api_name="/process"
        )

        if result and len(result) > 0:
            video_dict = result[0]
            return video_dict.get("video") if isinstance(video_dict, dict) else None
        return None

    except Exception as e:
        logging.error(f"Video generation error: {str(e)}")
        return None
    finally:
        # Best-effort cleanup; previously the file leaked whenever the API
        # call raised before reaching the unlink.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove temp file {temp_path}: {cleanup_error}")

def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
    """Fit an image onto a target canvas and build the matching outpaint mask.

    The image is scaled (aspect ratio kept) to fit inside ``width`` x
    ``height``, pasted onto a white canvas according to ``alignment``, and a
    mask is produced that is 255 everywhere except for a 0-filled rectangle
    inside the pasted image.

    Args:
        image: ``numpy.ndarray`` (converted to an RGB PIL image) or an object
            with the PIL image interface; ``None`` short-circuits.
        width, height: Target canvas size; cast to int.
        overlap_percentage: Percentage of the resized image's width/height
            by which the 0-filled rectangle is inset from the image edges
            (at least one pixel).
        alignment: One of "가운데", "왼쪽", "오른쪽", "위", "아래". The edge
            the image is aligned to gets no inset.

    Returns:
        ``(background, mask)`` PIL images, or ``(None, None)`` when ``image``
        is ``None``.

    Raises:
        ValueError: If ``alignment`` is not one of the supported labels.
    """
    if image is None:
        return None, None

    # Convert to a PIL image
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image).convert('RGB')

    # PIL needs integer pixel sizes; cast like the other entry points do.
    target_width, target_height = int(width), int(height)
    target_size = (target_width, target_height)

    # Scale the image so it fits inside the target while keeping its ratio
    scale_factor = min(target_width / image.width, target_height / image.height)
    new_width = int(image.width * scale_factor)
    new_height = int(image.height * scale_factor)
    source = image.resize((new_width, new_height), Image.LANCZOS)

    # Overlap in pixels, never less than one
    overlap_x = max(int(new_width * (overlap_percentage / 100)), 1)
    overlap_y = max(int(new_height * (overlap_percentage / 100)), 1)

    # Top-left paste position for each alignment
    free_x = target_width - new_width
    free_y = target_height - new_height
    margins = {
        "가운데": (free_x // 2, free_y // 2),
        "왼쪽": (0, free_y // 2),
        "오른쪽": (free_x, free_y // 2),
        "위": (free_x // 2, 0),
        "아래": (free_x // 2, free_y),
    }
    if alignment not in margins:
        # Previously this surfaced as an UnboundLocalError further down.
        raise ValueError(f"Unknown alignment: {alignment!r}")
    margin_x, margin_y = margins[alignment]

    # Background canvas with the resized image pasted in
    background = Image.new('RGB', target_size, (255, 255, 255))
    background.paste(source, (margin_x, margin_y))

    # Rectangle inset by the overlap on every side except the aligned edge
    left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
    right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
    top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
    bottom_overlap = margin_y + new_height - overlap_y if alignment != "아래" else margin_y + new_height

    mask = Image.new('L', target_size, 255)
    ImageDraw.Draw(mask).rectangle([
        (left_overlap, top_overlap),
        (right_overlap, bottom_overlap)
    ], fill=0)

    return background, mask

def preview_outpaint(image, width, height, overlap_percentage, alignment):
    """Render an outpainting preview with the masked area tinted red.

    Returns an RGBA image, or ``None`` when no image/canvas is available.
    """
    canvas, region_mask = prepare_image_and_mask(
        image, width, height, overlap_percentage, alignment
    )
    if canvas is None:
        return None

    size = canvas.size

    # Start from a fully transparent layer and copy a translucent red fill
    # into it wherever the mask is set.
    tint_layer = Image.new('RGBA', size, (0, 0, 0, 0))
    red_fill = Image.new('RGBA', size, (255, 0, 0, 64))
    tint_layer.paste(red_fill, (0, 0), region_mask)

    # Composite the tint over an RGBA copy of the canvas.
    base = canvas.copy().convert('RGBA')
    return Image.alpha_composite(base, tint_layer)

@spaces.GPU(duration=24)
def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
    """Run outpainting on an image and return the composited result.

    Args:
        image: Source image; ``None`` short-circuits.
        prompt: Optional text; ", high quality, 4k" is appended (or used alone).
        width, height: Target canvas size; cast to int where used.
        overlap_percentage, alignment: Forwarded to ``prepare_image_and_mask``.
        num_steps: Number of inference steps; cast to int.

    Returns:
        The canvas with generated content pasted into the masked area. When
        the models are not loaded, a plain grey image of the target size.
        On failure, the prepared background if it exists, otherwise ``None``.
    """
    if image is None:
        return None

    if not OUTPAINT_MODEL_LOADED:
        return Image.new('RGB', (int(width), int(height)), (200, 200, 200))

    # Defined up front so the error path can return it without inspecting locals().
    background = None
    try:
        # Prepare the canvas and mask
        background, mask = prepare_image_and_mask(
            image, int(width), int(height), overlap_percentage, alignment
        )
        if background is None:
            return None

        # Build cnet_image: black out the masked area
        cnet_image = background.copy()
        cnet_image.paste(0, (0, 0), mask)

        # Prepare the prompt
        final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"

        final_image = None
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            (
                prompt_embeds,
                negative_prompt_embeds,
                pooled_prompt_embeds,
                negative_pooled_prompt_embeds,
            ) = pipe.encode_prompt(final_prompt, "cuda", True)

            # The pipeline call is iterated; only the last yielded image is kept.
            for final_image in pipe(
                prompt_embeds=prompt_embeds,
                negative_prompt_embeds=negative_prompt_embeds,
                pooled_prompt_embeds=pooled_prompt_embeds,
                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                image=cnet_image,
                num_inference_steps=int(num_steps)
            ):
                pass

        if final_image is None:
            # Nothing was yielded; fall back to the un-outpainted canvas.
            logging.error("Outpainting error: pipeline produced no image")
            return background

        # Convert to RGBA and paste the generated content into the masked area
        final_image = final_image.convert("RGBA")
        cnet_image.paste(final_image, (0, 0), mask)

        return cnet_image

    except Exception as e:
        logging.error(f"Outpainting error: {str(e)}")
        return background

# MMAudio-related functions
def translate_prompt(text):
    """Translate a prompt with the module-level translator when applicable.

    The text is sent to the translator only if it contains at least one
    character whose code point lies in U+3131..U+D7A3. In every other case,
    including any error, the input is returned unchanged.
    """
    try:
        if translator is None or not text:
            return text

        needs_translation = any(0x3131 <= ord(ch) <= 0xD7A3 for ch in text)
        if not needs_translation:
            return text

        with torch.no_grad():
            return translator(text)[0]['translation_text']
    except Exception as e:
        logging.error(f"Translation error: {e}")
        return text

@spaces.GPU
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    """Generate audio for a video with MMAudio and mux it into a new file.

    Args:
        video: Input video as delivered by the Gradio video component.
        prompt: Description of the desired audio; passed through
            ``translate_prompt`` first.
        negative_prompt: Description of what to avoid; also passed through
            ``translate_prompt``.
        seed: Seed for the random generator; cast to int.
        num_steps: Number of flow-matching steps; cast to int.
        cfg_strength: Guidance strength forwarded to ``generate``.
        duration: Requested duration passed to ``load_video``, which returns
            the duration that is actually used.

    Returns:
        Path to a temporary ``.mp4`` containing the video with the generated
        audio, or ``None`` when the MMAudio models are not loaded.
    """
    if not MMAUDIO_MODEL_LOADED:
        return None

    # The UI's numeric fields are not restricted to integers, so normalise
    # the values that must be integral before handing them to torch.
    seed = int(seed)
    num_steps = int(num_steps)

    prompt = translate_prompt(prompt)
    negative_prompt = translate_prompt(negative_prompt)

    rng = torch.Generator(device=device)
    rng.manual_seed(seed)
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    clip_frames, sync_frames, duration = load_video(video, duration)
    clip_frames = clip_frames.unsqueeze(0)
    sync_frames = sync_frames.unsqueeze(0)
    # The shared sequence config is updated in place for this clip length.
    seq_cfg.duration = duration
    net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net_mmaudio,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength)
    audio = audios.float().cpu()[0]

    # Reserve an output path and close the handle right away; the file is kept
    # (delete=False) so it can be returned to the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as tmp:
        video_save_path = tmp.name
    make_video(video,
               video_save_path,
               audio,
               sampling_rate=seq_cfg.sampling_rate,
               duration_sec=seq_cfg.duration)
    return video_save_path

# Custom CSS passed to gr.Blocks: colour/radius variables, container width,
# panel-box cards, action-button styling and a minimum tab height.
css = """
:root {
    --primary-color: #f8c3cd;
    --secondary-color: #b3e5fc;
    --background-color: #f5f5f7;
    --card-background: #ffffff;
    --text-color: #424242;
    --accent-color: #ffb6c1;
    --success-color: #c8e6c9;
    --warning-color: #fff9c4;
    --shadow-color: rgba(0, 0, 0, 0.1);
    --border-radius: 12px;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.panel-box {
    border-radius: var(--border-radius) !important;
    box-shadow: 0 8px 16px var(--shadow-color) !important;
    background-color: var(--card-background) !important;
    padding: 20px !important;
    margin-bottom: 20px !important;
}
#generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn {
    background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
    font-size: 1.1rem !important;
    padding: 12px 24px !important;
    margin-top: 10px !important;
    width: 100% !important;
}
.tabitem {
    min-height: 700px !important;
}
"""

# Gradio Interface
# Three tabs: text -> image -> video, image outpainting, and video + audio.
# Components are declared inside the layout; event handlers are wired up at
# the end of the `with demo:` block.
demo = gr.Blocks(css=css, title="AI 이미지 & 비디오 & 오디오 생성기")

with demo:
    gr.Markdown("# 🎨 Ginigen 스튜디오")
    
    with gr.Tabs() as tabs:
        # First tab: text to image (and image to video)
        with gr.Tab("텍스트→이미지→비디오", elem_classes="tabitem"):
            with gr.Row(equal_height=True):
                # Input column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 📝 이미지 생성 설정")
                        
                        prompt = gr.Textbox(
                            label="프롬프트(한글/영어 가능)",
                            placeholder="생성하고 싶은 이미지를 설명하세요...",
                            lines=3
                        )
                        
                        size_preset = gr.Dropdown(
                            choices=list(IMAGE_PRESETS.keys()),
                            value="1:1 정사각형",
                            label="크기 프리셋"
                        )
                        
                        with gr.Row():
                            width = gr.Slider(256, 2048, 1024, step=64, label="너비")
                            height = gr.Slider(256, 2048, 1024, step=64, label="높이")
                        
                        with gr.Row():
                            guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="가이던스")
                            steps = gr.Slider(1, 50, 30, step=1, label="스텝")
                        
                        # -1 asks generate_text_to_image to pick a random seed
                        seed = gr.Number(label="시드 (-1=랜덤)", value=-1)
                        
                        generate_btn = gr.Button("🎨 이미지 생성", variant="primary", elem_id="generate-btn")
                        
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎬 비디오 생성 설정")
                        
                        video_prompt = gr.Textbox(
                            label="(선택) 비디오 프롬프트(영어로 입력)",
                            placeholder="비디오의 움직임을 설명하세요... (비워두면 기본 움직임 적용)",
                            lines=2
                        )
                        
                        video_length = gr.Slider(
                            minimum=1, 
                            maximum=60, 
                            value=4, 
                            step=0.5, 
                            label="비디오 길이 (초)",
                            info="1초에서 60초까지 선택 가능합니다"
                        )
                        
                        video_btn = gr.Button("🎬 비디오로 변환", variant="secondary", elem_id="video-btn")
                
                # Output column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ 생성 결과")
                        
                        output_image = gr.Image(label="생성된 이미지", type="numpy")
                        output_seed = gr.Textbox(label="시드 정보")
                        output_video = gr.Video(label="생성된 비디오")
        
        # Second tab: image outpainting
        with gr.Tab("이미지 비율 변경/생성", elem_classes="tabitem"):
            with gr.Row(equal_height=True):
                # Input column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ 이미지 업로드")
                        
                        input_image = gr.Image(
                            label="원본 이미지",
                            type="numpy"
                        )
                        
                        outpaint_prompt = gr.Textbox(
                            label="프롬프트 (선택)",
                            placeholder="확장할 영역에 대한 설명...",
                            lines=2
                        )
                        
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### ⚙️ 아웃페인팅 설정")
                        
                        # NOTE(review): the default preset maps to 1024x576 in
                        # IMAGE_PRESETS while the sliders below start at
                        # 1280x720 — confirm which default is intended.
                        outpaint_size_preset = gr.Dropdown(
                            choices=list(IMAGE_PRESETS.keys()),
                            value="16:9 와이드스크린",
                            label="목표 크기 프리셋"
                        )
                        
                        with gr.Row():
                            outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="목표 너비")
                            outpaint_height = gr.Slider(256, 2048, 720, step=64, label="목표 높이")
                        
                        # Choices must match the alignment labels handled in
                        # prepare_image_and_mask
                        alignment = gr.Dropdown(
                            choices=["가운데", "왼쪽", "오른쪽", "위", "아래"],
                            value="가운데",
                            label="정렬"
                        )
                        
                        overlap_percentage = gr.Slider(
                            minimum=1,
                            maximum=50,
                            value=10,
                            step=1,
                            label="마스크 오버랩 (%)"
                        )
                        
                        outpaint_steps = gr.Slider(
                            minimum=4,
                            maximum=12,
                            value=8,
                            step=1,
                            label="추론 스텝"
                        )
                        
                        preview_btn = gr.Button("👁️ 미리보기", elem_id="preview-btn")
                        outpaint_btn = gr.Button("🎨 아웃페인팅 실행", variant="primary", elem_id="outpaint-btn")
                
                # Output column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ 결과")
                        
                        preview_image = gr.Image(label="미리보기")
                        outpaint_result = gr.Image(label="아웃페인팅 결과")
        
        # Third tab: video + audio
        with gr.Tab("비디오 + 오디오", elem_classes="tabitem"):
            with gr.Row(equal_height=True):
                # Input column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎥 비디오 업로드")
                        
                        audio_video_input = gr.Video(
                            label="입력 비디오",
                            sources=["upload"]
                        )
                        
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎵 오디오 생성 설정")
                        
                        # The label mentions Korean support only when both the
                        # MMAudio models and the translator loaded
                        audio_prompt = gr.Textbox(
                            label="프롬프트 (한글 지원)" if MMAUDIO_MODEL_LOADED and translator else "프롬프트",
                            placeholder="생성하고 싶은 오디오를 설명하세요... (예: 평화로운 피아노 음악)",
                            lines=3
                        )
                        
                        audio_negative_prompt = gr.Textbox(
                            label="네거티브 프롬프트",
                            value="music",
                            placeholder="원하지 않는 요소...",
                            lines=2
                        )
                        
                        with gr.Row():
                            audio_seed = gr.Number(label="시드", value=0)
                            audio_steps = gr.Number(label="스텝", value=25)
                        
                        with gr.Row():
                            audio_cfg = gr.Number(label="가이던스 스케일", value=4.5)
                            audio_duration = gr.Number(label="지속시간 (초)", value=9999)
                        
                        audio_btn = gr.Button("🎵 오디오 생성 및 합성", variant="primary", elem_id="audio-btn")
                
                # Output column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎬 생성 결과")
                        
                        output_video_with_audio = gr.Video(
                            label="오디오가 추가된 비디오",
                            interactive=False
                        )
                        
                        # Show a warning when the MMAudio models failed to load
                        if not MMAUDIO_MODEL_LOADED:
                            gr.Markdown("⚠️ MMAudio 모델을 로드하지 못했습니다. 이 기능은 사용할 수 없습니다.")
    
    # Event wiring - first tab
    size_preset.change(update_dimensions, [size_preset], [width, height])
    
    generate_btn.click(
        generate_text_to_image,
        [prompt, width, height, guidance, steps, seed],
        [output_image, output_seed]
    )
    
    # The lambda's None check duplicates the guard already present at the top
    # of generate_video_from_image.
    video_btn.click(
        lambda img, v_prompt, length: generate_video_from_image(img, v_prompt, length) if img is not None else None,
        [output_image, video_prompt, video_length],
        [output_video]
    )
    
    # Event wiring - second tab
    outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])
    
    preview_btn.click(
        preview_outpaint,
        [input_image, outpaint_width, outpaint_height, overlap_percentage, alignment],
        [preview_image]
    )
    
    outpaint_btn.click(
        outpaint_image,
        [input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
        [outpaint_result]
    )
    
    # Event wiring - third tab
    audio_btn.click(
        video_to_audio,
        [audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
        [output_video_with_audio]
    )

# Start the app. There is no __main__ guard, so this also runs on import.
demo.launch()