openfree committed
Commit ee252b7 · verified · 1 Parent(s): 98fa5ec

Update app.py

Files changed (1):
  1. app.py +597 -319

app.py CHANGED
@@ -1,354 +1,632 @@
- import spaces
- import logging
- from datetime import datetime
- from pathlib import Path
  import gradio as gr
- import torch
- import torchaudio
- import os
- import requests
- from transformers import pipeline
- import tempfile
  import numpy as np
- from einops import rearrange
- import cv2
- from scipy.io import wavfile
- import librosa
- import json
- from typing import Optional, Tuple, List
- import atexit

- # Bypass the torch.load check via an environment variable (temporary workaround)
  os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"

  try:
      import mmaudio
  except ImportError:
-     os.system("pip install -e .")
-     import mmaudio
-
- from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
-                                 setup_eval_logging)
- from mmaudio.model.flow_matching import FlowMatching
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
- from mmaudio.model.sequence_config import SequenceConfig
- from mmaudio.model.utils.features_utils import FeaturesUtils

- # Logging setup
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
- )
- log = logging.getLogger()
-
- # CUDA setup
- if torch.cuda.is_available():
-     device = torch.device("cuda")
-     torch.backends.cuda.matmul.allow_tf32 = True
-     torch.backends.cudnn.allow_tf32 = True
-     torch.backends.cudnn.benchmark = True
- else:
-     device = torch.device("cpu")
-
- dtype = torch.bfloat16
-
- # Model configuration
- model: ModelConfig = all_model_cfg['large_44k_v2']
- model.download_if_needed()
- output_dir = Path('./output/gradio')
-
- setup_eval_logging()
-
- # Translator setup - try safetensors first
  try:
-     # First check whether a safetensors build is available
-     translator = pipeline("translation",
-                           model="Helsinki-NLP/opus-mt-ko-en",
-                           device="cpu",
-                           use_fast=True,  # use the fast tokenizer
-                           trust_remote_code=False)
  except Exception as e:
-     log.warning(f"Failed to load translation model with safetensors: {e}")
-     # Fallback: load after setting the environment variable
-     try:
-         translator = pipeline("translation",
-                               model="Helsinki-NLP/opus-mt-ko-en",
-                               device="cpu")
-     except Exception as e2:
-         log.error(f"Failed to load translation model: {e2}")
-         translator = None
-
- PIXABAY_API_KEY = "33492762-a28a596ec4f286f84cd328b17"
-
- def cleanup_temp_files():
-     temp_dir = tempfile.gettempdir()
-     for file in os.listdir(temp_dir):
-         if file.endswith(('.mp4', '.flac')):
-             try:
-                 os.remove(os.path.join(temp_dir, file))
-             except:
-                 pass
-
- atexit.register(cleanup_temp_files)

- def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
-     with torch.cuda.device(device):
-         seq_cfg = model.seq_cfg
-         net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
-         net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
-         log.info(f'Loaded weights from {model.model_path}')

-         feature_utils = FeaturesUtils(
-             tod_vae_ckpt=model.vae_path,
-             synchformer_ckpt=model.synchformer_ckpt,
-             enable_conditions=True,
-             mode=model.mode,
-             bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
-             need_vae_encoder=False
-         ).to(device, dtype).eval()

-         return net, feature_utils, seq_cfg

- net, feature_utils, seq_cfg = get_model()

- # Modified translate_prompt function
- def translate_prompt(text):
      try:
-         # If no translator is available, return the original text
-         if translator is None:
-             return text

-         if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
-             # Run the translation on CPU
-             with torch.no_grad():
-                 translation = translator(text)[0]['translation_text']
-                 return translation
-         return text
      except Exception as e:
-         logging.error(f"Translation error: {e}")
-         return text

- # Modified search_videos function
- @torch.no_grad()
- def search_videos(query):
      try:
-         # Run the translation on CPU
-         query = translate_prompt(query)
-         return search_pixabay_videos(query, PIXABAY_API_KEY)
      except Exception as e:
-         logging.error(f"Video search error: {e}")
-         return []

- def search_pixabay_videos(query, api_key):
      try:
-         base_url = "https://pixabay.com/api/videos/"
-         params = {
-             "key": api_key,
-             "q": query,
-             "per_page": 40
-         }

-         response = requests.get(base_url, params=params)
-         if response.status_code == 200:
-             data = response.json()
-             return [video['videos']['large']['url'] for video in data.get('hits', [])]
-         return []
      except Exception as e:
-         logging.error(f"Pixabay API error: {e}")
-         return []
-
- @spaces.GPU
- @torch.inference_mode()
- def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
-                    cfg_strength: float, duration: float):
-     prompt = translate_prompt(prompt)
-     negative_prompt = translate_prompt(negative_prompt)
-
-     rng = torch.Generator(device=device)
-     rng.manual_seed(seed)
-     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-
-     clip_frames, sync_frames, duration = load_video(video, duration)
-     clip_frames = clip_frames.unsqueeze(0)
-     sync_frames = sync_frames.unsqueeze(0)
-     seq_cfg.duration = duration
-     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-     audios = generate(clip_frames,
-                       sync_frames, [prompt],
-                       negative_text=[negative_prompt],
-                       feature_utils=feature_utils,
-                       net=net,
-                       fm=fm,
-                       rng=rng,
-                       cfg_strength=cfg_strength)
-     audio = audios.float().cpu()[0]
-
-     video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
-     make_video(video,
-                video_save_path,
-                audio,
-                sampling_rate=seq_cfg.sampling_rate,
-                duration_sec=seq_cfg.duration)
-     return video_save_path
-
- @spaces.GPU
- @torch.inference_mode()
- def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
-                   duration: float):
-     prompt = translate_prompt(prompt)
-     negative_prompt = translate_prompt(negative_prompt)
-
-     rng = torch.Generator(device=device)
-     rng.manual_seed(seed)
-     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-
-     clip_frames = sync_frames = None
-     seq_cfg.duration = duration
-     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-     audios = generate(clip_frames,
-                       sync_frames, [prompt],
-                       negative_text=[negative_prompt],
-                       feature_utils=feature_utils,
-                       net=net,
-                       fm=fm,
-                       rng=rng,
-                       cfg_strength=cfg_strength)
-     audio = audios.float().cpu()[0]
-
-     audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
-     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
-     return audio_save_path
-
- # CSS styles
- custom_css = """
- .gradio-container {
-     background: linear-gradient(45deg, #1a1a1a, #2a2a2a);
-     border-radius: 15px;
-     box-shadow: 0 8px 32px rgba(0,0,0,0.3);
-     color: #e0e0e0;
- }
-
- .input-container, .output-container {
-     background: rgba(40, 40, 40, 0.95);
-     backdrop-filter: blur(10px);
-     border-radius: 10px;
-     padding: 20px;
-     transform-style: preserve-3d;
-     transition: transform 0.3s ease;
-     border: 1px solid rgba(255, 255, 255, 0.1);
- }
-
- .input-container:hover {
-     transform: translateZ(20px);
-     box-shadow: 0 8px 32px rgba(0,0,0,0.5);
- }
-
- .gallery-item {
-     transition: transform 0.3s ease;
-     border-radius: 8px;
-     overflow: hidden;
-     background: #2a2a2a;
- }
-
- .gallery-item:hover {
-     transform: scale(1.05);
-     box-shadow: 0 4px 15px rgba(0,0,0,0.4);
- }
-
- .tabs {
-     background: rgba(30, 30, 30, 0.95);
-     border-radius: 10px;
-     padding: 10px;
-     border: 1px solid rgba(255, 255, 255, 0.05);
- }

- button {
-     background: linear-gradient(45deg, #2196F3, #1976D2);
-     border: none;
-     border-radius: 5px;
-     transition: all 0.3s ease;
-     color: white
  }
-
- button:hover {
-     transform: translateY(-2px);
-     box-shadow: 0 4px 15px rgba(33,150,243,0.3);
  }
-
- textarea, input[type="text"], input[type="number"] {
-     background: rgba(30, 30, 30, 0.95) !important;
-     border: 1px solid rgba(255, 255, 255, 0.1) !important;
-     color: #e0e0e0 !important;
-     border-radius: 5px !important;
  }
-
- label {
-     color: #e0e0e0 !important;
  }
-
- .gallery {
-     background: rgba(30, 30, 30, 0.95);
-     padding: 15px;
-     border-radius: 10px;
-     border: 1px solid rgba(255, 255, 255, 0.05);
  }
  """

- css = """
- footer {
-     visibility: hidden;
- }
- """ + custom_css
-
- # Create the Gradio interface
- text_to_audio_tab = gr.Interface(
-     fn=text_to_audio,
-     inputs=[
-         gr.Textbox(label="Prompt(한글지원)" if translator else "Prompt"),
-         gr.Textbox(label="Negative Prompt"),
-         gr.Number(label="Seed", value=0),
-         gr.Number(label="Steps", value=25),
-         gr.Number(label="Guidance Scale", value=4.5),
-         gr.Number(label="Duration (sec)", value=8),
-     ],
-     outputs=gr.Audio(label="Generated Audio"),
-     css=custom_css
- )
-
- video_to_audio_tab = gr.Interface(
-     fn=video_to_audio,
-     inputs=[
-         gr.Video(label="Input Video"),
-         gr.Textbox(label="Prompt(한글지원)" if translator else "Prompt"),
-         gr.Textbox(label="Negative Prompt", value="music"),
-         gr.Number(label="Seed", value=0),
-         gr.Number(label="Steps", value=25),
-         gr.Number(label="Guidance Scale", value=4.5),
-         gr.Number(label="Duration (sec)", value=8),
-     ],
-     outputs=gr.Video(label="Generated Result"),
-     css=custom_css
- )
-
- video_search_tab = gr.Interface(
-     fn=search_videos,
-     inputs=gr.Textbox(label="Search Query(한글지원)" if translator else "Search Query"),
-     outputs=gr.Gallery(label="Search Results", columns=4, rows=20),
-     css=custom_css,
-     api_name=False
- )
-
- # Main entry point
- if __name__ == "__main__":
-     # Warn if the translator failed to load
-     if translator is None:
-         log.warning("Translation model failed to load. Korean translation will be disabled.")

-     gr.TabbedInterface(
-         [video_search_tab, video_to_audio_tab, text_to_audio_tab],
-         ["Video Search", "Video-to-Audio", "Text-to-Audio"],
-         theme="soft",
-         css=css
-     ).launch(allowed_paths=[output_dir])
  import gradio as gr
  import numpy as np
+ from PIL import Image, ImageDraw
+ from gradio_client import Client, handle_file
+ import random
+ import tempfile
+ import os
+ import logging
+ import torch
+ from diffusers import AutoencoderKL, TCDScheduler
+ from diffusers.models.model_loading_utils import load_state_dict
+ from huggingface_hub import hf_hub_download

+ # Spaces GPU
+ try:
+     import spaces
+ except:
+     # Dummy decorator for when the GPU decorator is unavailable
+     class spaces:
+         @staticmethod
+         def GPU(duration=None):
+             def decorator(func):
+                 return func
+             return decorator
+
+ # Environment variables
  os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"

+ # MMAudio imports
  try:
      import mmaudio
+     from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
+                                     setup_eval_logging)
+     from mmaudio.model.flow_matching import FlowMatching
+     from mmaudio.model.networks import MMAudio, get_my_mmaudio
+     from mmaudio.model.sequence_config import SequenceConfig
+     from mmaudio.model.utils.features_utils import FeaturesUtils
+     MMAUDIO_AVAILABLE = True
  except ImportError:
+     MMAUDIO_AVAILABLE = False
+     logging.warning("MMAudio not available. Sound generation will be disabled.")

+ # Load the ControlNet model
  try:
+     from controlnet_union import ControlNetModel_Union
+     from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
+
+     # Configure and load ControlNet
+     config_file = hf_hub_download(
+         "xinsir/controlnet-union-sdxl-1.0",
+         filename="config_promax.json",
+     )
+
+     config = ControlNetModel_Union.load_config(config_file)
+     controlnet_model = ControlNetModel_Union.from_config(config)
+
+     model_file = hf_hub_download(
+         "xinsir/controlnet-union-sdxl-1.0",
+         filename="diffusion_pytorch_model_promax.safetensors",
+     )
+     state_dict = load_state_dict(model_file)
+     loaded_keys = list(state_dict.keys())
+
+     result = ControlNetModel_Union._load_pretrained_model(
+         controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
+     )
+
+     model = result[0]
+     model = model.to(device="cuda", dtype=torch.float16)
+
+     # Load the VAE
+     vae = AutoencoderKL.from_pretrained(
+         "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
+     ).to("cuda")
+
+     # Load the pipeline
+     pipe = StableDiffusionXLFillPipeline.from_pretrained(
+         "SG161222/RealVisXL_V5.0_Lightning",
+         torch_dtype=torch.float16,
+         vae=vae,
+         controlnet=model,
+         variant="fp16",
+     ).to("cuda")
+
+     pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+     OUTPAINT_MODEL_LOADED = True
  except Exception as e:
+     logging.error(f"Failed to load outpainting models: {str(e)}")
+     OUTPAINT_MODEL_LOADED = False

+ # Load the MMAudio model configuration
+ if MMAUDIO_AVAILABLE:
+     try:
+         # CUDA setup
+         if torch.cuda.is_available():
+             device = torch.device("cuda")
+             torch.backends.cuda.matmul.allow_tf32 = True
+             torch.backends.cudnn.allow_tf32 = True
+             torch.backends.cudnn.benchmark = True
+         else:
+             device = torch.device("cpu")
+
+         dtype = torch.bfloat16
+
+         # Model configuration
+         model_cfg: ModelConfig = all_model_cfg['large_44k_v2']
+         model_cfg.download_if_needed()
+
+         setup_eval_logging()
+
+         # Load the model
+         def get_mmaudio_model():
+             with torch.cuda.device(device):
+                 seq_cfg = model_cfg.seq_cfg
+                 net: MMAudio = get_my_mmaudio(model_cfg.model_name).to(device, dtype).eval()
+                 net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
+                 logging.info(f'Loaded MMAudio weights from {model_cfg.model_path}')
+
+                 feature_utils = FeaturesUtils(
+                     tod_vae_ckpt=model_cfg.vae_path,
+                     synchformer_ckpt=model_cfg.synchformer_ckpt,
+                     enable_conditions=True,
+                     mode=model_cfg.mode,
+                     bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
+                     need_vae_encoder=False
+                 ).to(device, dtype).eval()
+
+                 return net, feature_utils, seq_cfg
+
+         mmaudio_net, mmaudio_feature_utils, mmaudio_seq_cfg = get_mmaudio_model()
+         MMAUDIO_LOADED = True
+     except Exception as e:
+         logging.error(f"Failed to load MMAudio models: {str(e)}")
+         MMAUDIO_LOADED = False
+ else:
+     MMAUDIO_LOADED = False

+ # API URLs
+ TEXT2IMG_API_URL = "http://211.233.58.201:7896"
+ VIDEO_API_URL = "http://211.233.58.201:7875"

+ # Logging setup
+ logging.basicConfig(level=logging.INFO)
+
+ # Image size presets
+ IMAGE_PRESETS = {
+     "커스텀": {"width": 1024, "height": 1024},
+     "1:1 정사각형": {"width": 1024, "height": 1024},
+     "4:3 표준": {"width": 1024, "height": 768},
+     "16:9 와이드스크린": {"width": 1024, "height": 576},
+     "9:16 세로형": {"width": 576, "height": 1024},
+     "6:19 특수 세로형": {"width": 324, "height": 1024},
+     "Instagram 정사각형": {"width": 1080, "height": 1080},
+     "Instagram 스토리": {"width": 1080, "height": 1920},
+     "Instagram 가로형": {"width": 1080, "height": 566},
+     "Facebook 커버": {"width": 820, "height": 312},
+     "Twitter 헤더": {"width": 1500, "height": 500},
+     "YouTube 썸네일": {"width": 1280, "height": 720},
+     "LinkedIn 배너": {"width": 1584, "height": 396},
+ }

+ def update_dimensions(preset):
+     if preset in IMAGE_PRESETS:
+         return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
+     return 1024, 1024

+ def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
+     if not prompt:
+         return None, "프롬프트를 입력해주세요"
+
      try:
+         client = Client(TEXT2IMG_API_URL)
+         if seed == -1:
+             seed = random.randint(0, 9999999)

+         result = client.predict(
+             prompt=prompt,
+             width=int(width),
+             height=int(height),
+             guidance=float(guidance),
+             inference_steps=int(inference_steps),
+             seed=int(seed),
+             do_img2img=False,
+             init_image=None,
+             image2image_strength=0.8,
+             resize_img=True,
+             api_name="/generate_image"
+         )
+         return result[0], f"사용된 시드: {result[1]}"
      except Exception as e:
+         logging.error(f"Image generation error: {str(e)}")
+         return None, f"오류: {str(e)}"

+ @spaces.GPU(duration=60)
+ @torch.inference_mode()
+ def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_steps=25, cfg_strength=4.5, duration=8.0):
+     """Add generated sound to a video."""
+     if not MMAUDIO_LOADED:
+         logging.error("MMAudio model not loaded")
+         return video_path
+
      try:
+         rng = torch.Generator(device=device)
+         rng.manual_seed(seed)
+         fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+
+         # Load the video
+         clip_frames, sync_frames, actual_duration = load_video(video_path, duration)
+         clip_frames = clip_frames.unsqueeze(0)
+         sync_frames = sync_frames.unsqueeze(0)
+         mmaudio_seq_cfg.duration = actual_duration
+         mmaudio_net.update_seq_lengths(mmaudio_seq_cfg.latent_seq_len, mmaudio_seq_cfg.clip_seq_len, mmaudio_seq_cfg.sync_seq_len)
+
+         # Generate the audio
+         audios = generate(clip_frames,
+                           sync_frames, [prompt],
+                           negative_text=[negative_prompt],
+                           feature_utils=mmaudio_feature_utils,
+                           net=mmaudio_net,
+                           fm=fm,
+                           rng=rng,
+                           cfg_strength=cfg_strength)
+         audio = audios.float().cpu()[0]
+
+         # Mux the audio into the video
+         video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
+         make_video(video_path,
+                    video_save_path,
+                    audio,
+                    sampling_rate=mmaudio_seq_cfg.sampling_rate,
+                    duration_sec=mmaudio_seq_cfg.duration)
+
+         return video_save_path
      except Exception as e:
+         logging.error(f"Video to audio error: {str(e)}")
+         return video_path

+ def generate_video_from_image(image, prompt="", length=4.0, sound_generation="사운드 없음", sound_prompt="", sound_negative_prompt="music"):
+     if image is None:
+         return None
+
      try:
+         # Save the image to a temporary file
+         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
+             temp_path = fp.name
+             Image.fromarray(image).save(temp_path)

+         # Call the video generation API
+         client = Client(VIDEO_API_URL)
+         result = client.predict(
+             input_image=handle_file(temp_path),
+             prompt=prompt if prompt else "Generate natural motion",
+             n_prompt="",
+             seed=random.randint(0, 9999999),
+             use_teacache=True,
+             video_length=float(length),
+             api_name="/process"
+         )
+
+         os.unlink(temp_path)
+
+         if result and len(result) > 0:
+             video_dict = result[0]
+             video_path = video_dict.get("video") if isinstance(video_dict, dict) else None
+
+             # If the sound generation option was selected
+             if video_path and sound_generation == "사운드 생성" and MMAUDIO_LOADED:
+                 # Fall back to a default when the sound prompt is empty
+                 if not sound_prompt:
+                     sound_prompt = prompt if prompt else "ambient sound"
+
+                 # Add sound to the video
+                 video_with_sound = video_to_audio(
+                     video_path,
+                     sound_prompt,
+                     sound_negative_prompt,
+                     duration=length
+                 )
+                 return video_with_sound
+
+             return video_path
+
      except Exception as e:
+         logging.error(f"Video generation error: {str(e)}")
+         return None
+
+ def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
+     """Prepare the background image and outpainting mask."""
+     if image is None:
+         return None, None
+
+     # Convert to a PIL image
+     if isinstance(image, np.ndarray):
+         image = Image.fromarray(image).convert('RGB')
+
+     target_size = (width, height)
+
+     # Scale the image to fit inside the target size
+     scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
+     new_width = int(image.width * scale_factor)
+     new_height = int(image.height * scale_factor)
+
+     # Resize the image
+     source = image.resize((new_width, new_height), Image.LANCZOS)
+
+     # Compute the overlap
+     overlap_x = int(new_width * (overlap_percentage / 100))
+     overlap_y = int(new_height * (overlap_percentage / 100))
+     overlap_x = max(overlap_x, 1)
+     overlap_y = max(overlap_y, 1)
+
+     # Compute margins according to the alignment
+     if alignment == "가운데":
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "왼쪽":
+         margin_x = 0
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "오른쪽":
+         margin_x = target_size[0] - new_width
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "위":
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = 0
+     elif alignment == "아래":
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = target_size[1] - new_height
+
+     # Create the background image
+     background = Image.new('RGB', target_size, (255, 255, 255))
+     background.paste(source, (margin_x, margin_y))
+
+     # Create the mask
+     mask = Image.new('L', target_size, 255)
+     mask_draw = ImageDraw.Draw(mask)
+
+     # Draw the mask region
+     white_gaps_patch = 2
+
+     left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
+     right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
+     top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
+     bottom_overlap = margin_y + new_height - overlap_y if alignment != "아래" else margin_y + new_height
+
+     mask_draw.rectangle([
+         (left_overlap, top_overlap),
+         (right_overlap, bottom_overlap)
+     ], fill=0)
+
+     return background, mask
+
+ @spaces.GPU(duration=24)
+ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
+     """Run image outpainting."""
+     if image is None:
+         return None
+
+     if not OUTPAINT_MODEL_LOADED:
+         return Image.new('RGB', (width, height), (200, 200, 200))
+
+     try:
+         # Prepare the image and mask
+         background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
+         if background is None:
+             return None
+
+         # Build cnet_image (mask region filled with black)
+         cnet_image = background.copy()
+         cnet_image.paste(0, (0, 0), mask)
+
+         # Prepare the prompt
+         final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
+
+         # Run on the GPU
+         with torch.autocast(device_type="cuda", dtype=torch.float16):
+             (
+                 prompt_embeds,
+                 negative_prompt_embeds,
+                 pooled_prompt_embeds,
+                 negative_pooled_prompt_embeds,
+             ) = pipe.encode_prompt(final_prompt, "cuda", True)
+
+             # Generation loop
+             for generated_image in pipe(
+                 prompt_embeds=prompt_embeds,
+                 negative_prompt_embeds=negative_prompt_embeds,
+                 pooled_prompt_embeds=pooled_prompt_embeds,
+                 negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                 image=cnet_image,
+                 num_inference_steps=num_steps
+             ):
+                 # Intermediate results (use if needed)
+                 pass
+
+             # Final image
+             final_image = generated_image
+
+         # Convert to RGBA and paste through the mask
+         final_image = final_image.convert("RGBA")
+         cnet_image.paste(final_image, (0, 0), mask)
+
+         return cnet_image
+
+     except Exception as e:
+         logging.error(f"Outpainting error: {str(e)}")
+         return background if 'background' in locals() else None

+ # CSS
+ css = """
+ :root {
+     --primary-color: #f8c3cd;
+     --secondary-color: #b3e5fc;
+     --background-color: #f5f5f7;
+     --card-background: #ffffff;
+     --text-color: #424242;
+     --accent-color: #ffb6c1;
+     --success-color: #c8e6c9;
+     --warning-color: #fff9c4;
+     --shadow-color: rgba(0, 0, 0, 0.1);
+     --border-radius: 12px;
  }
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: 0 auto !important;
  }
+ .panel-box {
+     border-radius: var(--border-radius) !important;
+     box-shadow: 0 8px 16px var(--shadow-color) !important;
+     background-color: var(--card-background) !important;
+     padding: 20px !important;
+     margin-bottom: 20px !important;
  }
+ #generate-btn, #video-btn, #outpaint-btn {
+     background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
+     font-size: 1.1rem !important;
+     padding: 12px 24px !important;
+     margin-top: 10px !important;
+     width: 100% !important;
  }
+ .tabitem {
+     min-height: 700px !important;
  }
  """

+ # Gradio Interface
+ demo = gr.Blocks(css=css, title="AI 이미지 & 비디오 생성기")
+
+ with demo:
+     gr.Markdown("# 🎨 Ginigen 스튜디오")
+
+     with gr.Tabs() as tabs:
+         # First tab: text to image
+         with gr.Tab("텍스트→이미지→비디오", elem_classes="tabitem"):
+             with gr.Row(equal_height=True):
+                 # Input column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 📝 이미지 생성 설정")
+
+                         prompt = gr.Textbox(
+                             label="프롬프트(한글/영어 가능)",
+                             placeholder="생성하고 싶은 이미지를 설명하세요...",
+                             lines=3
+                         )
+
+                         size_preset = gr.Dropdown(
+                             choices=list(IMAGE_PRESETS.keys()),
+                             value="1:1 정사각형",
+                             label="크기 프리셋"
+                         )
+
+                         with gr.Row():
+                             width = gr.Slider(256, 2048, 1024, step=64, label="너비")
+                             height = gr.Slider(256, 2048, 1024, step=64, label="높이")
+
+                         with gr.Row():
+                             guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="가이던스")
+                             steps = gr.Slider(1, 50, 30, step=1, label="스텝")
+
+                         seed = gr.Number(label="시드 (-1=랜덤)", value=-1)
+
+                         generate_btn = gr.Button("🎨 이미지 생성", variant="primary", elem_id="generate-btn")
+
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🎬 비디오 생성 설정")
+
+                         video_prompt = gr.Textbox(
+                             label="(선택) 비디오 프롬프트(영어로 입력)",
+                             placeholder="비디오의 움직임을 설명하세요... (비워두면 기본 움직임 적용)",
+                             lines=2
+                         )
+
+                         video_length = gr.Slider(
+                             minimum=1,
+                             maximum=60,
+                             value=4,
+                             step=0.5,
+                             label="비디오 길이 (초)",
+                             info="1초에서 60초까지 선택 가능합니다"
+                         )
+
+                         # Sound generation option
+                         sound_generation = gr.Radio(
+                             choices=["사운드 없음", "사운드 생성"],
+                             value="사운드 없음",
+                             label="사운드 옵션",
+                             info="비디오에 사운드를 추가할지 선택하세요"
+                         )
+
+                         # Sound-related inputs (shown conditionally)
+                         with gr.Column(visible=False) as sound_options:
+                             sound_prompt = gr.Textbox(
+                                 label="사운드 프롬프트 (선택)",
+                                 placeholder="생성할 사운드를 설명하세요... (비워두면 비디오 프롬프트 사용)",
+                                 lines=2
+                             )
+                             sound_negative_prompt = gr.Textbox(
+                                 label="사운드 네거티브 프롬프트",
+                                 value="music",
+                                 lines=1
+                             )
+
+                         video_btn = gr.Button("🎬 비디오로 변환", variant="secondary", elem_id="video-btn")
+
+                 # Output column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ 생성 결과")
+
+                         output_image = gr.Image(label="생성된 이미지", type="numpy")
+                         output_seed = gr.Textbox(label="시드 정보")
+                         output_video = gr.Video(label="생성된 비디오")

+         # Second tab: image outpainting
+         with gr.Tab("이미지 비율 변경/생성", elem_classes="tabitem"):
+             with gr.Row(equal_height=True):
+                 # Input column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ 이미지 업로드")
+
+                         input_image = gr.Image(
+                             label="원본 이미지",
+                             type="numpy"
+                         )
+
+                         outpaint_prompt = gr.Textbox(
+                             label="프롬프트 (선택)",
+                             placeholder="확장할 영역에 대한 설명...",
+                             lines=2
+                         )
+
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### ⚙️ 아웃페인팅 설정")
+
+                         outpaint_size_preset = gr.Dropdown(
+                             choices=list(IMAGE_PRESETS.keys()),
+                             value="16:9 와이드스크린",
+                             label="목표 크기 프리셋"
+                         )
+
+                         with gr.Row():
+                             outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="목표 너비")
+                             outpaint_height = gr.Slider(256, 2048, 720, step=64, label="목표 높이")
+
+                         alignment = gr.Dropdown(
+                             choices=["가운데", "왼쪽", "오른쪽", "위", "아래"],
+                             value="가운데",
+                             label="정렬"
+                         )
+
+                         overlap_percentage = gr.Slider(
+                             minimum=1,
+                             maximum=50,
+                             value=10,
+                             step=1,
+                             label="마스크 오버랩 (%)"
+                         )
+
+                         outpaint_steps = gr.Slider(
+                             minimum=4,
+                             maximum=12,
+                             value=8,
+                             step=1,
+                             label="추론 스텝"
+                         )
+
+                         outpaint_btn = gr.Button("🎨 아웃페인팅 실행", variant="primary", elem_id="outpaint-btn")
+
+                 # Output column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ 결과")
+
+                         outpaint_result = gr.Image(label="아웃페인팅 결과")
+
+     # Event wiring - first tab
+     size_preset.change(update_dimensions, [size_preset], [width, height])
+
+     generate_btn.click(
+         generate_text_to_image,
+         [prompt, width, height, guidance, steps, seed],
+         [output_image, output_seed]
+     )
+
+     # Show/hide the sound options
+     def toggle_sound_options(choice):
+         return gr.update(visible=(choice == "사운드 생성"))
+
+     sound_generation.change(
+         toggle_sound_options,
+         [sound_generation],
+         [sound_options]
+     )
+
+     video_btn.click(
+         generate_video_from_image,
+         [output_image, video_prompt, video_length, sound_generation, sound_prompt, sound_negative_prompt],
+         [output_video]
+     )
+
+     # Event wiring - second tab
+     outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])
+
+     outpaint_btn.click(
+         outpaint_image,
+         [input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
+         [outpaint_result]
+     )
+
+ demo.launch()
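
Note: the outpainting mask geometry this commit adds in prepare_image_and_mask can be sanity-checked standalone. The sketch below reuses the same formulas for a hypothetical 512x512 input centered ("가운데") on the 1280x720 "YouTube 썸네일"-sized canvas with 10% overlap; the concrete numbers are illustrative assumptions, not values recorded in the commit.

    from PIL import Image, ImageDraw

    src_w, src_h = 512, 512          # hypothetical input size
    target_w, target_h = 1280, 720   # hypothetical target canvas
    overlap_pct = 10

    scale = min(target_w / src_w, target_h / src_h)        # 1.40625
    new_w, new_h = int(src_w * scale), int(src_h * scale)  # 720 x 720
    overlap_x = max(int(new_w * overlap_pct / 100), 1)     # 72
    overlap_y = max(int(new_h * overlap_pct / 100), 1)     # 72
    margin_x = (target_w - new_w) // 2                     # 280 (center alignment)
    margin_y = (target_h - new_h) // 2                     # 0

    # White (255) marks pixels the pipeline repaints; the black rectangle is the
    # preserved core of the source, inset by the overlap on every side.
    mask = Image.new('L', (target_w, target_h), 255)
    ImageDraw.Draw(mask).rectangle(
        [(margin_x + overlap_x, margin_y + overlap_y),
         (margin_x + new_w - overlap_x, margin_y + new_h - overlap_y)],
        fill=0,
    )
    # Preserved region: (352, 72) to (928, 648)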