import gradio as gr
import numpy as np
from PIL import Image, ImageDraw
from gradio_client import Client, handle_file
import random
import tempfile
import os
import logging
import torch
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from huggingface_hub import hf_hub_download
from pathlib import Path
import torchaudio
from einops import rearrange
from scipy.io import wavfile
from transformers import pipeline

# Environment variable to bypass the torch.load safety check (temporary workaround)
os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"

# Spaces GPU
try:
    import spaces
except ImportError:
    # Dummy decorator for when the Spaces GPU decorator is unavailable
    class spaces:
        @staticmethod
        def GPU(duration=None):
            # Support both @spaces.GPU and @spaces.GPU(duration=...) usage
            if callable(duration):
                return duration

            def decorator(func):
                return func
            return decorator

# MMAudio imports
try:
    import mmaudio
except ImportError:
    os.system("pip install -e .")
    import mmaudio

from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
                                setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils

# Load the ControlNet / outpainting models
try:
    from controlnet_union import ControlNetModel_Union
    from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline

    # ControlNet configuration and loading
    config_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="config_promax.json",
    )

    config = ControlNetModel_Union.load_config(config_file)
    controlnet_model = ControlNetModel_Union.from_config(config)

    model_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="diffusion_pytorch_model_promax.safetensors",
    )
    state_dict = load_state_dict(model_file)
    loaded_keys = list(state_dict.keys())

    result = ControlNetModel_Union._load_pretrained_model(
        controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
    )
    model = result[0]
    model = model.to(device="cuda", dtype=torch.float16)

    # Load the VAE
    vae = AutoencoderKL.from_pretrained(
        "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
    ).to("cuda")

    # Load the pipeline
    pipe = StableDiffusionXLFillPipeline.from_pretrained(
        "SG161222/RealVisXL_V5.0_Lightning",
        torch_dtype=torch.float16,
        vae=vae,
        controlnet=model,
        variant="fp16",
    ).to("cuda")

    pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)

    OUTPAINT_MODEL_LOADED = True
except Exception as e:
    logging.error(f"Failed to load outpainting models: {str(e)}")
    OUTPAINT_MODEL_LOADED = False

# MMAudio device setup
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")
dtype = torch.bfloat16

# MMAudio model initialization
try:
    model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
    model_mmaudio.download_if_needed()
    output_dir = Path('./output/gradio')
    setup_eval_logging()

    # Translator setup (Korean -> English)
    try:
        translator = pipeline("translation",
                              model="Helsinki-NLP/opus-mt-ko-en",
                              device="cpu",
                              use_fast=True,
                              trust_remote_code=False)
    except Exception as e:
        logging.warning(f"Failed to load translation model: {e}")
        translator = None

    def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
        with torch.cuda.device(device):
            seq_cfg = model_mmaudio.seq_cfg

            net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, dtype).eval()
            net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
            logging.info(f'Loaded weights from {model_mmaudio.model_path}')
            feature_utils = FeaturesUtils(
                tod_vae_ckpt=model_mmaudio.vae_path,
                synchformer_ckpt=model_mmaudio.synchformer_ckpt,
                enable_conditions=True,
                mode=model_mmaudio.mode,
                bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
                need_vae_encoder=False
            ).to(device, dtype).eval()

            return net, feature_utils, seq_cfg

    net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
    MMAUDIO_MODEL_LOADED = True
except Exception as e:
    logging.error(f"Failed to load MMAudio models: {str(e)}")
    MMAUDIO_MODEL_LOADED = False
    translator = None

# API URLs
TEXT2IMG_API_URL = "http://211.233.58.201:7896"
VIDEO_API_URL = "http://211.233.58.201:7875"

# Logging configuration
logging.basicConfig(level=logging.INFO)

# Image size presets
IMAGE_PRESETS = {
    "커스텀": {"width": 1024, "height": 1024},
    "1:1 정사각형": {"width": 1024, "height": 1024},
    "4:3 표준": {"width": 1024, "height": 768},
    "16:9 와이드스크린": {"width": 1024, "height": 576},
    "9:16 세로형": {"width": 576, "height": 1024},
    "6:19 특수 세로형": {"width": 324, "height": 1024},
    "Instagram 정사각형": {"width": 1080, "height": 1080},
    "Instagram 스토리": {"width": 1080, "height": 1920},
    "Instagram 가로형": {"width": 1080, "height": 566},
    "Facebook 커버": {"width": 820, "height": 312},
    "Twitter 헤더": {"width": 1500, "height": 500},
    "YouTube 썸네일": {"width": 1280, "height": 720},
    "LinkedIn 배너": {"width": 1584, "height": 396},
}

def update_dimensions(preset):
    if preset in IMAGE_PRESETS:
        return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
    return 1024, 1024

def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
    if not prompt:
        return None, "프롬프트를 입력해주세요"
    try:
        client = Client(TEXT2IMG_API_URL)
        if seed == -1:
            seed = random.randint(0, 9999999)
        result = client.predict(
            prompt=prompt,
            width=int(width),
            height=int(height),
            guidance=float(guidance),
            inference_steps=int(inference_steps),
            seed=int(seed),
            do_img2img=False,
            init_image=None,
            image2image_strength=0.8,
            resize_img=True,
            api_name="/generate_image"
        )
        return result[0], f"사용된 시드: {result[1]}"
    except Exception as e:
        logging.error(f"Image generation error: {str(e)}")
        return None, f"오류: {str(e)}"

def generate_video_from_image(image, prompt="", length=4.0):
    if image is None:
        return None
    try:
        # Save the image to a temporary file
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
            temp_path = fp.name
            Image.fromarray(image).save(temp_path)

        # Call the video API
        client = Client(VIDEO_API_URL)
        result = client.predict(
            input_image=handle_file(temp_path),
            prompt=prompt if prompt else "Generate natural motion",
            n_prompt="",
            seed=random.randint(0, 9999999),
            use_teacache=True,
            video_length=float(length),
            api_name="/process"
        )

        os.unlink(temp_path)

        if result and len(result) > 0:
            video_dict = result[0]
            return video_dict.get("video") if isinstance(video_dict, dict) else None

    except Exception as e:
        logging.error(f"Video generation error: {str(e)}")
        return None

def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
    """Prepare the padded background image and the outpainting mask."""
    if image is None:
        return None, None

    # Convert to a PIL image
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image).convert('RGB')

    target_size = (width, height)

    # Scale the image to fit inside the target size
    scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
    new_width = int(image.width * scale_factor)
    new_height = int(image.height * scale_factor)

    # Resize the image
    source = image.resize((new_width, new_height), Image.LANCZOS)

    # Compute the overlap in pixels (at least 1 px)
    overlap_x = int(new_width * (overlap_percentage / 100))
    overlap_y = int(new_height * (overlap_percentage / 100))
    overlap_x = max(overlap_x, 1)
    overlap_y = max(overlap_y, 1)
    # Compute margins based on the alignment
    if alignment == "가운데":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "왼쪽":
        margin_x = 0
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "오른쪽":
        margin_x = target_size[0] - new_width
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "위":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = 0
    elif alignment == "아래":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = target_size[1] - new_height
    else:
        # Fall back to center alignment for unknown values
        margin_x = (target_size[0] - new_width) // 2
        margin_y = (target_size[1] - new_height) // 2

    # Create the background image
    background = Image.new('RGB', target_size, (255, 255, 255))
    background.paste(source, (margin_x, margin_y))

    # Create the mask (255 = area to generate, 0 = area to keep)
    mask = Image.new('L', target_size, 255)
    mask_draw = ImageDraw.Draw(mask)

    # Draw the protected region; keep an overlap band except on the aligned edge
    white_gaps_patch = 2

    left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
    right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
    top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
    bottom_overlap = margin_y + new_height - overlap_y if alignment != "아래" else margin_y + new_height

    mask_draw.rectangle([
        (left_overlap, top_overlap),
        (right_overlap, bottom_overlap)
    ], fill=0)

    return background, mask

def preview_outpaint(image, width, height, overlap_percentage, alignment):
    """Preview the outpainting area as a red overlay."""
    background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
    if background is None:
        return None

    # Build the preview image
    preview = background.copy().convert('RGBA')

    # Semi-transparent red overlay
    red_overlay = Image.new('RGBA', background.size, (255, 0, 0, 64))

    # Apply the mask
    red_mask = Image.new('RGBA', background.size, (0, 0, 0, 0))
    red_mask.paste(red_overlay, (0, 0), mask)

    # Composite the overlay
    preview = Image.alpha_composite(preview, red_mask)

    return preview

@spaces.GPU(duration=24)
def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
    """Run image outpainting."""
    if image is None:
        return None

    if not OUTPAINT_MODEL_LOADED:
        return Image.new('RGB', (width, height), (200, 200, 200))

    try:
        # Prepare the image and mask
        background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
        if background is None:
            return None

        # Create cnet_image (masked region painted black)
        cnet_image = background.copy()
        cnet_image.paste(0, (0, 0), mask)

        # Prepare the prompt
        final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"

        # Run on the GPU
        with torch.autocast(device_type="cuda", dtype=torch.float16):
            (
                prompt_embeds,
                negative_prompt_embeds,
                pooled_prompt_embeds,
                negative_pooled_prompt_embeds,
            ) = pipe.encode_prompt(final_prompt, "cuda", True)

            # Generation loop (the pipeline yields intermediate images)
            for generated_image in pipe(
                prompt_embeds=prompt_embeds,
                negative_prompt_embeds=negative_prompt_embeds,
                pooled_prompt_embeds=pooled_prompt_embeds,
                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                image=cnet_image,
                num_inference_steps=num_steps
            ):
                # Intermediate results (use if needed)
                pass

            # Final image
            final_image = generated_image

        # Convert to RGBA and paste through the mask
        final_image = final_image.convert("RGBA")
        cnet_image.paste(final_image, (0, 0), mask)

        return cnet_image

    except Exception as e:
        logging.error(f"Outpainting error: {str(e)}")
        return background if 'background' in locals() else None

# MMAudio helper functions
def translate_prompt(text):
    try:
        if translator is None:
            return text

        # Translate only when the text contains Hangul characters
        if text and any(0x3131 <= ord(char) <= 0xD7A3 for char in text):
            with torch.no_grad():
                translation = translator(text)[0]['translation_text']
                return translation
        return text
    except Exception as e:
        logging.error(f"Translation error: {e}")
        return text

@spaces.GPU
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    if not MMAUDIO_MODEL_LOADED:
        return None

    prompt = translate_prompt(prompt)
    negative_prompt = translate_prompt(negative_prompt)

    rng = torch.Generator(device=device)
    rng.manual_seed(seed)
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    clip_frames, sync_frames, duration = load_video(video, duration)
    clip_frames = clip_frames.unsqueeze(0)
    sync_frames = sync_frames.unsqueeze(0)
    seq_cfg.duration = duration
    net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net_mmaudio,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength)
    audio = audios.float().cpu()[0]

    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
    make_video(video, video_save_path, audio, sampling_rate=seq_cfg.sampling_rate,
               duration_sec=seq_cfg.duration)

    return video_save_path

# CSS
css = """
:root {
    --primary-color: #f8c3cd;
    --secondary-color: #b3e5fc;
    --background-color: #f5f5f7;
    --card-background: #ffffff;
    --text-color: #424242;
    --accent-color: #ffb6c1;
    --success-color: #c8e6c9;
    --warning-color: #fff9c4;
    --shadow-color: rgba(0, 0, 0, 0.1);
    --border-radius: 12px;
}

.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}

.panel-box {
    border-radius: var(--border-radius) !important;
    box-shadow: 0 8px 16px var(--shadow-color) !important;
    background-color: var(--card-background) !important;
    padding: 20px !important;
    margin-bottom: 20px !important;
}

#generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn {
    background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
    font-size: 1.1rem !important;
    padding: 12px 24px !important;
    margin-top: 10px !important;
    width: 100% !important;
}

.tabitem {
    min-height: 700px !important;
}
"""

# Gradio Interface
demo = gr.Blocks(css=css, title="AI 이미지 & 비디오 & 오디오 생성기")

with demo:
    gr.Markdown("# 🎨 Ginigen 스튜디오")

    with gr.Tabs() as tabs:
        # Tab 1: text -> image -> video
        with gr.Tab("텍스트→이미지→비디오", elem_classes="tabitem"):
            with gr.Row(equal_height=True):
                # Input column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 📝 이미지 생성 설정")

                        prompt = gr.Textbox(
                            label="프롬프트(한글/영어 가능)",
                            placeholder="생성하고 싶은 이미지를 설명하세요...",
                            lines=3
                        )

                        size_preset = gr.Dropdown(
                            choices=list(IMAGE_PRESETS.keys()),
                            value="1:1 정사각형",
                            label="크기 프리셋"
                        )

                        with gr.Row():
                            width = gr.Slider(256, 2048, 1024, step=64, label="너비")
                            height = gr.Slider(256, 2048, 1024, step=64, label="높이")

                        with gr.Row():
                            guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="가이던스")
                            steps = gr.Slider(1, 50, 30, step=1, label="스텝")

                        seed = gr.Number(label="시드 (-1=랜덤)", value=-1)

                        generate_btn = gr.Button("🎨 이미지 생성", variant="primary", elem_id="generate-btn")

                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎬 비디오 생성 설정")

                        video_prompt = gr.Textbox(
                            label="(선택) 비디오 프롬프트(영어로 입력)",
                            placeholder="비디오의 움직임을 설명하세요... (비워두면 기본 움직임 적용)",
                            lines=2
                        )
                        video_length = gr.Slider(
                            minimum=1,
                            maximum=60,
                            value=4,
                            step=0.5,
                            label="비디오 길이 (초)",
                            info="1초에서 60초까지 선택 가능합니다"
                        )

                        video_btn = gr.Button("🎬 비디오로 변환", variant="secondary", elem_id="video-btn")

                # Output column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ 생성 결과")

                        output_image = gr.Image(label="생성된 이미지", type="numpy")
                        output_seed = gr.Textbox(label="시드 정보")
                        output_video = gr.Video(label="생성된 비디오")

        # Tab 2: image outpainting
        with gr.Tab("이미지 비율 변경/생성", elem_classes="tabitem"):
            with gr.Row(equal_height=True):
                # Input column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ 이미지 업로드")

                        input_image = gr.Image(
                            label="원본 이미지",
                            type="numpy"
                        )

                        outpaint_prompt = gr.Textbox(
                            label="프롬프트 (선택)",
                            placeholder="확장할 영역에 대한 설명...",
                            lines=2
                        )

                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### ⚙️ 아웃페인팅 설정")

                        outpaint_size_preset = gr.Dropdown(
                            choices=list(IMAGE_PRESETS.keys()),
                            value="16:9 와이드스크린",
                            label="목표 크기 프리셋"
                        )

                        with gr.Row():
                            outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="목표 너비")
                            outpaint_height = gr.Slider(256, 2048, 720, step=64, label="목표 높이")

                        alignment = gr.Dropdown(
                            choices=["가운데", "왼쪽", "오른쪽", "위", "아래"],
                            value="가운데",
                            label="정렬"
                        )

                        overlap_percentage = gr.Slider(
                            minimum=1,
                            maximum=50,
                            value=10,
                            step=1,
                            label="마스크 오버랩 (%)"
                        )

                        outpaint_steps = gr.Slider(
                            minimum=4,
                            maximum=12,
                            value=8,
                            step=1,
                            label="추론 스텝"
                        )

                        preview_btn = gr.Button("👁️ 미리보기", elem_id="preview-btn")
                        outpaint_btn = gr.Button("🎨 아웃페인팅 실행", variant="primary", elem_id="outpaint-btn")

                # Output column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ 결과")

                        preview_image = gr.Image(label="미리보기")
                        outpaint_result = gr.Image(label="아웃페인팅 결과")

        # Tab 3: video + audio
        with gr.Tab("비디오 + 오디오", elem_classes="tabitem"):
            with gr.Row(equal_height=True):
                # Input column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎥 비디오 업로드")

                        audio_video_input = gr.Video(
                            label="입력 비디오",
                            sources=["upload"]
                        )

                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎵 오디오 생성 설정")

                        audio_prompt = gr.Textbox(
                            label="프롬프트 (한글 지원)" if MMAUDIO_MODEL_LOADED and translator else "프롬프트",
                            placeholder="생성하고 싶은 오디오를 설명하세요... (예: 평화로운 피아노 음악)",
                            lines=3
                        )

                        audio_negative_prompt = gr.Textbox(
                            label="네거티브 프롬프트",
                            value="music",
                            placeholder="원하지 않는 요소...",
                            lines=2
                        )

                        with gr.Row():
                            audio_seed = gr.Number(label="시드", value=0)
                            audio_steps = gr.Number(label="스텝", value=25)

                        with gr.Row():
                            audio_cfg = gr.Number(label="가이던스 스케일", value=4.5)
                            audio_duration = gr.Number(label="지속시간 (초)", value=9999)

                        audio_btn = gr.Button("🎵 오디오 생성 및 합성", variant="primary", elem_id="audio-btn")

                # Output column
                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎬 생성 결과")

                        output_video_with_audio = gr.Video(
                            label="오디오가 추가된 비디오",
                            interactive=False
                        )

                        if not MMAUDIO_MODEL_LOADED:
                            gr.Markdown("⚠️ MMAudio 모델을 로드하지 못했습니다. 이 기능은 사용할 수 없습니다.")
    # Event wiring - Tab 1
    size_preset.change(update_dimensions, [size_preset], [width, height])

    generate_btn.click(
        generate_text_to_image,
        [prompt, width, height, guidance, steps, seed],
        [output_image, output_seed]
    )

    video_btn.click(
        lambda img, v_prompt, length: generate_video_from_image(img, v_prompt, length) if img is not None else None,
        [output_image, video_prompt, video_length],
        [output_video]
    )

    # Event wiring - Tab 2
    outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])

    preview_btn.click(
        preview_outpaint,
        [input_image, outpaint_width, outpaint_height, overlap_percentage, alignment],
        [preview_image]
    )

    outpaint_btn.click(
        outpaint_image,
        [input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
        [outpaint_result]
    )

    # Event wiring - Tab 3
    audio_btn.click(
        video_to_audio,
        [audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
        [output_video_with_audio]
    )

demo.launch()