# STUDIO / app-backup.py
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw
from gradio_client import Client, handle_file
import random
import tempfile
import os
import logging
import torch
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from huggingface_hub import hf_hub_download
from pathlib import Path
import torchaudio
from einops import rearrange
from scipy.io import wavfile
from transformers import pipeline
# Bypass the torch.load safety check via an environment variable (temporary workaround)
os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
# Spaces GPU
try:
    import spaces
except ImportError:
    # Dummy stand-in so @spaces.GPU is a no-op when the spaces package is unavailable
    class spaces:
        @staticmethod
        def GPU(duration=None):
            # Support both bare @spaces.GPU and parameterized @spaces.GPU(duration=...)
            if callable(duration):
                return duration
            def decorator(func):
                return func
            return decorator
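# On Hugging Face Spaces the real `spaces` package provides @spaces.GPU, which requests a
# GPU for the duration of the decorated call; the shim above keeps the same code importable
# and runnable (CPU-only) outside Spaces.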
# MMAudio imports
try:
import mmaudio
except ImportError:
os.system("pip install -e .")
import mmaudio
from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils
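# MMAudio pieces used below: get_my_mmaudio() builds the network, FlowMatching drives the
# sampler, FeaturesUtils extracts the visual/sync conditioning features, and make_video()
# muxes the generated audio back onto the input video (see video_to_audio()).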
# Load the ControlNet model
try:
from controlnet_union import ControlNetModel_Union
from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
# ControlNet configuration and loading
config_file = hf_hub_download(
"xinsir/controlnet-union-sdxl-1.0",
filename="config_promax.json",
)
config = ControlNetModel_Union.load_config(config_file)
controlnet_model = ControlNetModel_Union.from_config(config)
model_file = hf_hub_download(
"xinsir/controlnet-union-sdxl-1.0",
filename="diffusion_pytorch_model_promax.safetensors",
)
state_dict = load_state_dict(model_file)
loaded_keys = list(state_dict.keys())
result = ControlNetModel_Union._load_pretrained_model(
controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
)
model = result[0]
model = model.to(device="cuda", dtype=torch.float16)
# Load the VAE
vae = AutoencoderKL.from_pretrained(
"madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
).to("cuda")
# Load the pipeline
pipe = StableDiffusionXLFillPipeline.from_pretrained(
"SG161222/RealVisXL_V5.0_Lightning",
torch_dtype=torch.float16,
vae=vae,
controlnet=model,
variant="fp16",
).to("cuda")
pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
OUTPAINT_MODEL_LOADED = True
except Exception as e:
logging.error(f"Failed to load outpainting models: {str(e)}")
OUTPAINT_MODEL_LOADED = False
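# Outpainting stack (when available): ControlNet-Union "promax" conditioning on top of
# RealVisXL V5.0 Lightning with the fp16-fixed SDXL VAE; the TCD scheduler is what makes
# the low step counts used in outpaint_image() (4-12, default 8) practical.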
# MMAudio model setup
if torch.cuda.is_available():
device = torch.device("cuda")
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
else:
device = torch.device("cpu")
dtype = torch.bfloat16
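# TF32 (enabled above when CUDA is available) speeds up float32 matmuls on Ampere and newer
# GPUs; the MMAudio network itself runs in bfloat16 (see get_mmaudio_model below).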
# MMAudio model initialization
try:
model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
model_mmaudio.download_if_needed()
output_dir = Path('./output/gradio')
setup_eval_logging()
# Translator setup (Korean to English)
try:
translator = pipeline("translation",
model="Helsinki-NLP/opus-mt-ko-en",
device="cpu",
use_fast=True,
trust_remote_code=False)
except Exception as e:
logging.warning(f"Failed to load translation model: {e}")
translator = None
def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
with torch.cuda.device(device):
seq_cfg = model_mmaudio.seq_cfg
net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, dtype).eval()
net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
logging.info(f'Loaded weights from {model_mmaudio.model_path}')
feature_utils = FeaturesUtils(
tod_vae_ckpt=model_mmaudio.vae_path,
synchformer_ckpt=model_mmaudio.synchformer_ckpt,
enable_conditions=True,
mode=model_mmaudio.mode,
bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
need_vae_encoder=False
).to(device, dtype).eval()
return net, feature_utils, seq_cfg
net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
MMAUDIO_MODEL_LOADED = True
except Exception as e:
logging.error(f"Failed to load MMAudio models: {str(e)}")
MMAUDIO_MODEL_LOADED = False
translator = None
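# MMAUDIO_MODEL_LOADED and `translator` gate the "video + audio" tab below: if loading fails
# the tab still renders, but a warning is shown and video_to_audio() returns None.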
# API URLs
TEXT2IMG_API_URL = "http://211.233.58.201:7896"
VIDEO_API_URL = "http://211.233.58.201:7875"
# Logging setup
logging.basicConfig(level=logging.INFO)
# Image size presets
IMAGE_PRESETS = {
"์ปค์Šคํ…€": {"width": 1024, "height": 1024},
"1:1 ์ •์‚ฌ๊ฐํ˜•": {"width": 1024, "height": 1024},
"4:3 ํ‘œ์ค€": {"width": 1024, "height": 768},
"16:9 ์™€์ด๋“œ์Šคํฌ๋ฆฐ": {"width": 1024, "height": 576},
"9:16 ์„ธ๋กœํ˜•": {"width": 576, "height": 1024},
"6:19 ํŠน์ˆ˜ ์„ธ๋กœํ˜•": {"width": 324, "height": 1024},
"Instagram ์ •์‚ฌ๊ฐํ˜•": {"width": 1080, "height": 1080},
"Instagram ์Šคํ† ๋ฆฌ": {"width": 1080, "height": 1920},
"Instagram ๊ฐ€๋กœํ˜•": {"width": 1080, "height": 566},
"Facebook ์ปค๋ฒ„": {"width": 820, "height": 312},
"Twitter ํ—ค๋”": {"width": 1500, "height": 500},
"YouTube ์ธ๋„ค์ผ": {"width": 1280, "height": 720},
"LinkedIn ๋ฐฐ๋„ˆ": {"width": 1584, "height": 396},
}
def update_dimensions(preset):
if preset in IMAGE_PRESETS:
return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
return 1024, 1024
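# Example: the "16:9" preset maps to (1024, 576); the .change() handlers wired below push
# these values into the width/height sliders, and unknown preset names fall back to 1024x1024.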
def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
if not prompt:
return None, "ํ”„๋กฌํ”„ํŠธ๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”"
try:
client = Client(TEXT2IMG_API_URL)
if seed == -1:
seed = random.randint(0, 9999999)
result = client.predict(
prompt=prompt,
width=int(width),
height=int(height),
guidance=float(guidance),
inference_steps=int(inference_steps),
seed=int(seed),
do_img2img=False,
init_image=None,
image2image_strength=0.8,
resize_img=True,
api_name="/generate_image"
)
return result[0], f"์‚ฌ์šฉ๋œ ์‹œ๋“œ: {result[1]}"
except Exception as e:
logging.error(f"Image generation error: {str(e)}")
return None, f"์˜ค๋ฅ˜: {str(e)}"
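# For reference, the same backend can be queried directly with gradio_client; a minimal
# sketch (not executed here, example values are illustrative and assume the remote Space
# keeps the /generate_image signature used above):
#
#   client = Client(TEXT2IMG_API_URL)
#   result = client.predict(
#       prompt="a watercolor fox", width=1024, height=1024,
#       guidance=3.5, inference_steps=30, seed=42,
#       do_img2img=False, init_image=None, image2image_strength=0.8,
#       resize_img=True, api_name="/generate_image",
#   )
#   image_path, used_seed = result[0], result[1]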
def generate_video_from_image(image, prompt="", length=4.0):
if image is None:
return None
try:
# Save the input image to a temporary file
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
temp_path = fp.name
Image.fromarray(image).save(temp_path)
# Call the video generation API
client = Client(VIDEO_API_URL)
result = client.predict(
input_image=handle_file(temp_path),
prompt=prompt if prompt else "Generate natural motion",
n_prompt="",
seed=random.randint(0, 9999999),
use_teacache=True,
video_length=float(length),
api_name="/process"
)
os.unlink(temp_path)
if result and len(result) > 0:
video_dict = result[0]
return video_dict.get("video") if isinstance(video_dict, dict) else None
except Exception as e:
logging.error(f"Video generation error: {str(e)}")
return None
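# Note: the /process endpoint is assumed to return a list whose first element is a Gradio
# video payload (a dict with a "video" path); any other shape results in None above.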
def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
"""์ด๋ฏธ์ง€์™€ ๋งˆ์Šคํฌ๋ฅผ ์ค€๋น„ํ•˜๋Š” ํ•จ์ˆ˜"""
if image is None:
return None, None
# Convert to a PIL image
if isinstance(image, np.ndarray):
image = Image.fromarray(image).convert('RGB')
target_size = (width, height)
# Scale the image to fit within the target size
scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
new_width = int(image.width * scale_factor)
new_height = int(image.height * scale_factor)
# Resize the image
source = image.resize((new_width, new_height), Image.LANCZOS)
# Compute the overlap
overlap_x = int(new_width * (overlap_percentage / 100))
overlap_y = int(new_height * (overlap_percentage / 100))
overlap_x = max(overlap_x, 1)
overlap_y = max(overlap_y, 1)
# Compute margins according to the alignment
if alignment == "๊ฐ€์šด๋ฐ":
margin_x = (target_size[0] - new_width) // 2
margin_y = (target_size[1] - new_height) // 2
elif alignment == "์™ผ์ชฝ":
margin_x = 0
margin_y = (target_size[1] - new_height) // 2
elif alignment == "์˜ค๋ฅธ์ชฝ":
margin_x = target_size[0] - new_width
margin_y = (target_size[1] - new_height) // 2
elif alignment == "์œ„":
margin_x = (target_size[0] - new_width) // 2
margin_y = 0
elif alignment == "์•„๋ž˜":
margin_x = (target_size[0] - new_width) // 2
margin_y = target_size[1] - new_height
# Create the background image
background = Image.new('RGB', target_size, (255, 255, 255))
background.paste(source, (margin_x, margin_y))
# Create the mask
mask = Image.new('L', target_size, 255)
mask_draw = ImageDraw.Draw(mask)
# Draw the protected mask region (keyed to the selected alignment)
white_gaps_patch = 2
left_overlap = margin_x + overlap_x if alignment != "์™ผ์ชฝ" else margin_x
right_overlap = margin_x + new_width - overlap_x if alignment != "์˜ค๋ฅธ์ชฝ" else margin_x + new_width
top_overlap = margin_y + overlap_y if alignment != "์œ„" else margin_y
bottom_overlap = margin_y + new_height - overlap_y if alignment != "์•„๋ž˜" else margin_y + new_height
mask_draw.rectangle([
(left_overlap, top_overlap),
(right_overlap, bottom_overlap)
], fill=0)
return background, mask
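# Worked example (illustrative numbers): a 512x512 source into a 1280x720 target with 10%
# overlap and centered alignment scales by min(1280/512, 720/512) = 1.40625, giving a
# 720x720 paste at margins (280, 0). The mask starts fully white (255 = area to generate)
# and a black rectangle from (352, 72) to (928, 648) protects the source interior, so the
# model repaints the new canvas plus a 72 px overlap band around the original content.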
def preview_outpaint(image, width, height, overlap_percentage, alignment):
"""์•„์›ƒํŽ˜์ธํŒ… ๋ฏธ๋ฆฌ๋ณด๊ธฐ"""
background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
if background is None:
return None
# Build the preview image
preview = background.copy().convert('RGBA')
# Semi-transparent red overlay
red_overlay = Image.new('RGBA', background.size, (255, 0, 0, 64))
# Apply the mask
red_mask = Image.new('RGBA', background.size, (0, 0, 0, 0))
red_mask.paste(red_overlay, (0, 0), mask)
# Composite the overlay
preview = Image.alpha_composite(preview, red_mask)
return preview
@spaces.GPU(duration=24)
def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
"""์ด๋ฏธ์ง€ ์•„์›ƒํŽ˜์ธํŒ… ์‹คํ–‰"""
if image is None:
return None
if not OUTPAINT_MODEL_LOADED:
return Image.new('RGB', (width, height), (200, 200, 200))
try:
# Prepare the image and mask
background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
if background is None:
return None
# Build cnet_image (masked area filled with black)
cnet_image = background.copy()
cnet_image.paste(0, (0, 0), mask)
# Prepare the prompt
final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
# Run on the GPU
with torch.autocast(device_type="cuda", dtype=torch.float16):
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = pipe.encode_prompt(final_prompt, "cuda", True)
# Generation loop
for generated_image in pipe(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
image=cnet_image,
num_inference_steps=num_steps
):
# Intermediate results (use if needed)
pass
# Final image
final_image = generated_image
# Convert to RGBA and apply the mask
final_image = final_image.convert("RGBA")
cnet_image.paste(final_image, (0, 0), mask)
return cnet_image
except Exception as e:
logging.error(f"Outpainting error: {str(e)}")
return background if 'background' in locals() else None
# MMAudio helper functions
def translate_prompt(text):
try:
if translator is None:
return text
if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
with torch.no_grad():
translation = translator(text)[0]['translation_text']
return translation
return text
except Exception as e:
logging.error(f"Translation error: {e}")
return text
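# The 0x3131-0xD7A3 range spans from Hangul compatibility jamo through Hangul syllables
# (and, coarsely, other CJK blocks in between), so translation is only attempted when the
# prompt appears to contain Korean text.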
@spaces.GPU
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
cfg_strength: float, duration: float):
if not MMAUDIO_MODEL_LOADED:
return None
prompt = translate_prompt(prompt)
negative_prompt = translate_prompt(negative_prompt)
rng = torch.Generator(device=device)
rng.manual_seed(seed)
fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
clip_frames, sync_frames, duration = load_video(video, duration)
clip_frames = clip_frames.unsqueeze(0)
sync_frames = sync_frames.unsqueeze(0)
seq_cfg.duration = duration
net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
audios = generate(clip_frames,
sync_frames, [prompt],
negative_text=[negative_prompt],
feature_utils=feature_utils,
net=net_mmaudio,
fm=fm,
rng=rng,
cfg_strength=cfg_strength)
audio = audios.float().cpu()[0]
video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
make_video(video,
video_save_path,
audio,
sampling_rate=seq_cfg.sampling_rate,
duration_sec=seq_cfg.duration)
return video_save_path
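# Summary of video_to_audio(): load_video() yields CLIP-rate and Synchformer-rate frame
# streams, generate() samples audio with flow matching conditioned on those frames and the
# (optionally translated) prompts, and make_video() muxes the result into a new .mp4.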
# CSS
css = """
:root {
--primary-color: #f8c3cd;
--secondary-color: #b3e5fc;
--background-color: #f5f5f7;
--card-background: #ffffff;
--text-color: #424242;
--accent-color: #ffb6c1;
--success-color: #c8e6c9;
--warning-color: #fff9c4;
--shadow-color: rgba(0, 0, 0, 0.1);
--border-radius: 12px;
}
.gradio-container {
max-width: 1200px !important;
margin: 0 auto !important;
}
.panel-box {
border-radius: var(--border-radius) !important;
box-shadow: 0 8px 16px var(--shadow-color) !important;
background-color: var(--card-background) !important;
padding: 20px !important;
margin-bottom: 20px !important;
}
#generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn {
background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
font-size: 1.1rem !important;
padding: 12px 24px !important;
margin-top: 10px !important;
width: 100% !important;
}
.tabitem {
min-height: 700px !important;
}
"""
# Gradio Interface
demo = gr.Blocks(css=css, title="AI ์ด๋ฏธ์ง€ & ๋น„๋””์˜ค & ์˜ค๋””์˜ค ์ƒ์„ฑ๊ธฐ")
with demo:
gr.Markdown("# ๐ŸŽจ Ginigen ์ŠคํŠœ๋””์˜ค")
with gr.Tabs() as tabs:
# First tab: text -> image -> video
with gr.Tab("ํ…์ŠคํŠธโ†’์ด๋ฏธ์ง€โ†’๋น„๋””์˜ค", elem_classes="tabitem"):
with gr.Row(equal_height=True):
# Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐Ÿ“ ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์„ค์ •")
prompt = gr.Textbox(
label="ํ”„๋กฌํ”„ํŠธ(ํ•œ๊ธ€/์˜์–ด ๊ฐ€๋Šฅ)",
placeholder="์ƒ์„ฑํ•˜๊ณ  ์‹ถ์€ ์ด๋ฏธ์ง€๋ฅผ ์„ค๋ช…ํ•˜์„ธ์š”...",
lines=3
)
size_preset = gr.Dropdown(
choices=list(IMAGE_PRESETS.keys()),
value="1:1 ์ •์‚ฌ๊ฐํ˜•",
label="ํฌ๊ธฐ ํ”„๋ฆฌ์…‹"
)
with gr.Row():
width = gr.Slider(256, 2048, 1024, step=64, label="๋„ˆ๋น„")
height = gr.Slider(256, 2048, 1024, step=64, label="๋†’์ด")
with gr.Row():
guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="๊ฐ€์ด๋˜์Šค")
steps = gr.Slider(1, 50, 30, step=1, label="์Šคํ…")
seed = gr.Number(label="์‹œ๋“œ (-1=๋žœ๋ค)", value=-1)
generate_btn = gr.Button("๐ŸŽจ ์ด๋ฏธ์ง€ ์ƒ์„ฑ", variant="primary", elem_id="generate-btn")
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฌ ๋น„๋””์˜ค ์ƒ์„ฑ ์„ค์ •")
video_prompt = gr.Textbox(
label="(์„ ํƒ) ๋น„๋””์˜ค ํ”„๋กฌํ”„ํŠธ(์˜์–ด๋กœ ์ž…๋ ฅ)",
placeholder="๋น„๋””์˜ค์˜ ์›€์ง์ž„์„ ์„ค๋ช…ํ•˜์„ธ์š”... (๋น„์›Œ๋‘๋ฉด ๊ธฐ๋ณธ ์›€์ง์ž„ ์ ์šฉ)",
lines=2
)
video_length = gr.Slider(
minimum=1,
maximum=60,
value=4,
step=0.5,
label="๋น„๋””์˜ค ๊ธธ์ด (์ดˆ)",
info="1์ดˆ์—์„œ 60์ดˆ๊นŒ์ง€ ์„ ํƒ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค"
)
video_btn = gr.Button("๐ŸŽฌ ๋น„๋””์˜ค๋กœ ๋ณ€ํ™˜", variant="secondary", elem_id="video-btn")
# Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐Ÿ–ผ๏ธ ์ƒ์„ฑ ๊ฒฐ๊ณผ")
output_image = gr.Image(label="์ƒ์„ฑ๋œ ์ด๋ฏธ์ง€", type="numpy")
output_seed = gr.Textbox(label="์‹œ๋“œ ์ •๋ณด")
output_video = gr.Video(label="์ƒ์„ฑ๋œ ๋น„๋””์˜ค")
# Second tab: image outpainting
with gr.Tab("์ด๋ฏธ์ง€ ๋น„์œจ ๋ณ€๊ฒฝ/์ƒ์„ฑ", elem_classes="tabitem"):
with gr.Row(equal_height=True):
# Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐Ÿ–ผ๏ธ ์ด๋ฏธ์ง€ ์—…๋กœ๋“œ")
input_image = gr.Image(
label="์›๋ณธ ์ด๋ฏธ์ง€",
type="numpy"
)
outpaint_prompt = gr.Textbox(
label="ํ”„๋กฌํ”„ํŠธ (์„ ํƒ)",
placeholder="ํ™•์žฅํ•  ์˜์—ญ์— ๋Œ€ํ•œ ์„ค๋ช…...",
lines=2
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### โš™๏ธ ์•„์›ƒํŽ˜์ธํŒ… ์„ค์ •")
outpaint_size_preset = gr.Dropdown(
choices=list(IMAGE_PRESETS.keys()),
value="16:9 ์™€์ด๋“œ์Šคํฌ๋ฆฐ",
label="๋ชฉํ‘œ ํฌ๊ธฐ ํ”„๋ฆฌ์…‹"
)
with gr.Row():
outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="๋ชฉํ‘œ ๋„ˆ๋น„")
outpaint_height = gr.Slider(256, 2048, 720, step=64, label="๋ชฉํ‘œ ๋†’์ด")
alignment = gr.Dropdown(
choices=["๊ฐ€์šด๋ฐ", "์™ผ์ชฝ", "์˜ค๋ฅธ์ชฝ", "์œ„", "์•„๋ž˜"],
value="๊ฐ€์šด๋ฐ",
label="์ •๋ ฌ"
)
overlap_percentage = gr.Slider(
minimum=1,
maximum=50,
value=10,
step=1,
label="๋งˆ์Šคํฌ ์˜ค๋ฒ„๋žฉ (%)"
)
outpaint_steps = gr.Slider(
minimum=4,
maximum=12,
value=8,
step=1,
label="์ถ”๋ก  ์Šคํ…"
)
preview_btn = gr.Button("๐Ÿ‘๏ธ ๋ฏธ๋ฆฌ๋ณด๊ธฐ", elem_id="preview-btn")
outpaint_btn = gr.Button("๐ŸŽจ ์•„์›ƒํŽ˜์ธํŒ… ์‹คํ–‰", variant="primary", elem_id="outpaint-btn")
# Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐Ÿ–ผ๏ธ ๊ฒฐ๊ณผ")
preview_image = gr.Image(label="๋ฏธ๋ฆฌ๋ณด๊ธฐ")
outpaint_result = gr.Image(label="์•„์›ƒํŽ˜์ธํŒ… ๊ฒฐ๊ณผ")
# Third tab: video + audio
with gr.Tab("๋น„๋””์˜ค + ์˜ค๋””์˜ค", elem_classes="tabitem"):
with gr.Row(equal_height=True):
# Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฅ ๋น„๋””์˜ค ์—…๋กœ๋“œ")
audio_video_input = gr.Video(
label="์ž…๋ ฅ ๋น„๋””์˜ค",
sources=["upload"]
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽต ์˜ค๋””์˜ค ์ƒ์„ฑ ์„ค์ •")
audio_prompt = gr.Textbox(
label="ํ”„๋กฌํ”„ํŠธ (ํ•œ๊ธ€ ์ง€์›)" if MMAUDIO_MODEL_LOADED and translator else "ํ”„๋กฌํ”„ํŠธ",
placeholder="์ƒ์„ฑํ•˜๊ณ  ์‹ถ์€ ์˜ค๋””์˜ค๋ฅผ ์„ค๋ช…ํ•˜์„ธ์š”... (์˜ˆ: ํ‰ํ™”๋กœ์šด ํ”ผ์•„๋…ธ ์Œ์•…)",
lines=3
)
audio_negative_prompt = gr.Textbox(
label="๋„ค๊ฑฐํ‹ฐ๋ธŒ ํ”„๋กฌํ”„ํŠธ",
value="music",
placeholder="์›ํ•˜์ง€ ์•Š๋Š” ์š”์†Œ...",
lines=2
)
with gr.Row():
audio_seed = gr.Number(label="์‹œ๋“œ", value=0)
audio_steps = gr.Number(label="์Šคํ…", value=25)
with gr.Row():
audio_cfg = gr.Number(label="๊ฐ€์ด๋˜์Šค ์Šค์ผ€์ผ", value=4.5)
audio_duration = gr.Number(label="์ง€์†์‹œ๊ฐ„ (์ดˆ)", value=9999)
audio_btn = gr.Button("๐ŸŽต ์˜ค๋””์˜ค ์ƒ์„ฑ ๋ฐ ํ•ฉ์„ฑ", variant="primary", elem_id="audio-btn")
# Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฌ ์ƒ์„ฑ ๊ฒฐ๊ณผ")
output_video_with_audio = gr.Video(
label="์˜ค๋””์˜ค๊ฐ€ ์ถ”๊ฐ€๋œ ๋น„๋””์˜ค",
interactive=False
)
if not MMAUDIO_MODEL_LOADED:
gr.Markdown("โš ๏ธ MMAudio ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์ด ๊ธฐ๋Šฅ์€ ์‚ฌ์šฉํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
# Event wiring - first tab
size_preset.change(update_dimensions, [size_preset], [width, height])
generate_btn.click(
generate_text_to_image,
[prompt, width, height, guidance, steps, seed],
[output_image, output_seed]
)
video_btn.click(
lambda img, v_prompt, length: generate_video_from_image(img, v_prompt, length) if img is not None else None,
[output_image, video_prompt, video_length],
[output_video]
)
# Event wiring - second tab
outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])
preview_btn.click(
preview_outpaint,
[input_image, outpaint_width, outpaint_height, overlap_percentage, alignment],
[preview_image]
)
outpaint_btn.click(
outpaint_image,
[input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
[outpaint_result]
)
# Event wiring - third tab
audio_btn.click(
video_to_audio,
[audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
[output_video_with_audio]
)
demo.launch()
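# A possible variant (assumption, not part of the original app): enable request queuing for
# long-running GPU jobs before launching, e.g.
#   demo.queue().launch()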