# Spaces GPU - must be imported before anything else!
import os
IS_SPACES = os.environ.get("SPACE_ID") is not None
if IS_SPACES:
import spaces
else:
# GPU λ°μ½”λ ˆμ΄ν„°κ°€ 없을 λ•Œλ₯Ό μœ„ν•œ 더미 λ°μ½”λ ˆμ΄ν„°
class spaces:
@staticmethod
def GPU(duration=None):
def decorator(func):
return func
return decorator
# Now import the remaining libraries
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw
from gradio_client import Client, handle_file
import random
import tempfile
import logging
import torch
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from huggingface_hub import hf_hub_download
from pathlib import Path
import torchaudio
from einops import rearrange
from scipy.io import wavfile
from transformers import pipeline
# λΉ„λ””μ˜€ 배경제거 κ΄€λ ¨ import
from transformers import AutoModelForImageSegmentation
from torchvision import transforms
# ── moviepy import ──────────────────────────────────────────
# moviepy 1.x exposes its public API via the editor submodule;
# moviepy 2.x moved everything to the top-level package.
try:
    from moviepy.editor import (
        VideoFileClip, concatenate_videoclips, vfx,
        ImageSequenceClip, concatenate_audioclips,
        AudioFileClip, CompositeAudioClip,
        CompositeVideoClip, ColorClip,
    )
except ImportError:
    from moviepy import (
        vfx,                     # effect functions (speed, resize, ...)
        VideoFileClip,
        concatenate_videoclips,
        ImageSequenceClip,       # image sequence -> video
        concatenate_audioclips,  # join audio clips
        AudioFileClip,           # audio clip
        CompositeAudioClip,      # audio compositing
    )
    from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
    from moviepy.video.VideoClip import ColorClip
# ────────────────────────────────────────────────────────────
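# ── moviepy 1.x / 2.x compatibility helpers ─────────────────
# A minimal shim, assuming the usual v2 renames (set_audio -> with_audio,
# subclip -> subclipped, volumex -> with_volume_scaled, and effect classes
# applied via with_effects instead of clip.fx). The underscore-prefixed
# helper names below are our own additions, not moviepy API.
def _set_audio(clip, audio):
    """Attach an audio track on either moviepy version."""
    return clip.set_audio(audio) if hasattr(clip, "set_audio") else clip.with_audio(audio)

def _subclip(clip, start, end):
    """Cut a clip to [start, end] on either moviepy version."""
    return clip.subclip(start, end) if hasattr(clip, "subclip") else clip.subclipped(start, end)

def _scale_volume(audio_clip, factor):
    """Scale an audio clip's volume on either moviepy version."""
    if hasattr(audio_clip, "volumex"):
        return audio_clip.volumex(factor)
    return audio_clip.with_volume_scaled(factor)

def _change_speed(clip, factor):
    """Change playback speed (new_duration = duration / factor)."""
    if hasattr(vfx, "speedx"):  # moviepy 1.x effect function
        return clip.fx(vfx.speedx, factor)
    return clip.with_effects([vfx.MultiplySpeed(factor)])  # moviepy 2.x effect class

def _resize(clip, new_size):
    """Resize a clip to (width, height) on either moviepy version."""
    if hasattr(clip, "resized"):  # moviepy 2.x
        return clip.resized(new_size=new_size)
    return clip.resize(newsize=new_size)  # moviepy 1.x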
import time
import contextlib  # for a no-op device context on CPU
from concurrent.futures import ThreadPoolExecutor
# ν™˜κ²½ λ³€μˆ˜ μ„€μ •μœΌλ‘œ torch.load 체크 우회 (μž„μ‹œ ν•΄κ²°μ±…)
os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
# GPU μ΄ˆκΈ°ν™”λ₯Ό μœ„ν•œ κ°„λ‹¨ν•œ ν•¨μˆ˜ (Spaces ν™˜κ²½μ—μ„œ ν•„μˆ˜)
@spaces.GPU(duration=1)
def gpu_warmup():
"""GPU μ›Œλ°μ—… ν•¨μˆ˜ - Spaces ν™˜κ²½μ—μ„œ GPU μ‚¬μš©μ„ μœ„ν•΄ ν•„μš”"""
if torch.cuda.is_available():
dummy = torch.zeros(1).cuda()
del dummy
return "GPU ready"
# MMAudio imports - must come after the spaces import
try:
import mmaudio
except ImportError:
os.system("pip install -e .")
import mmaudio
from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils
# λ‘œκΉ… μ„€μ •
logging.basicConfig(level=logging.INFO)
# κΈ°μ‘΄ μ½”λ“œμ˜ λͺ¨λ“  μ„€μ •κ³Ό μ΄ˆκΈ°ν™” λΆ€λΆ„ μœ μ§€
torch.set_float32_matmul_precision("medium")
# Device 섀정을 더 λͺ…ν™•ν•˜κ²Œ
if torch.cuda.is_available():
device = torch.device("cuda")
torch_dtype = torch.float16
else:
device = torch.device("cpu")
torch_dtype = torch.float32
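# fp16 halves memory and bandwidth on CUDA; CPU stays at fp32, where half precision is poorly supported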
logging.info(f"Using device: {device}")
# μ „μ—­ λ³€μˆ˜λ‘œ λͺ¨λΈ μƒνƒœ 관리
MODELS_LOADED = False
BIREFNET_MODEL = None
BIREFNET_LITE_MODEL = None
OUTPAINT_PIPE = None
MMAUDIO_NET = None
MMAUDIO_FEATURE_UTILS = None
MMAUDIO_SEQ_CFG = None
TRANSLATOR = None
# API URLs
TEXT2IMG_API_URL = "http://211.233.58.201:7896"
VIDEO_API_URL = "http://211.233.58.201:7875"
# Image size presets
IMAGE_PRESETS = {
"μ»€μŠ€ν…€": {"width": 1024, "height": 1024},
"1:1 μ •μ‚¬κ°ν˜•": {"width": 1024, "height": 1024},
"4:3 ν‘œμ€€": {"width": 1024, "height": 768},
"16:9 μ™€μ΄λ“œμŠ€ν¬λ¦°": {"width": 1024, "height": 576},
"9:16 μ„Έλ‘œν˜•": {"width": 576, "height": 1024},
"6:19 특수 μ„Έλ‘œν˜•": {"width": 324, "height": 1024},
"Instagram μ •μ‚¬κ°ν˜•": {"width": 1080, "height": 1080},
"Instagram μŠ€ν† λ¦¬": {"width": 1080, "height": 1920},
"Instagram κ°€λ‘œν˜•": {"width": 1080, "height": 566},
"Facebook 컀버": {"width": 820, "height": 312},
"Twitter 헀더": {"width": 1500, "height": 500},
"YouTube 썸넀일": {"width": 1280, "height": 720},
"LinkedIn λ°°λ„ˆ": {"width": 1584, "height": 396},
}
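# Note: some presets (the 1080/820/1500 px social-media sizes) are not multiples
# of 64; the generation backend may round them to its nearest supported size.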
# Transform for BiRefNet
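# (768x768 inputs normalized with the standard ImageNet mean/std values below)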
transform_image = transforms.Compose([
transforms.Resize((768, 768)),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
@spaces.GPU(duration=60)
def load_models():
"""λͺ¨λ“  λͺ¨λΈμ„ λ‘œλ“œν•˜λŠ” ν•¨μˆ˜"""
global MODELS_LOADED, BIREFNET_MODEL, BIREFNET_LITE_MODEL, OUTPAINT_PIPE
global MMAUDIO_NET, MMAUDIO_FEATURE_UTILS, MMAUDIO_SEQ_CFG, TRANSLATOR
if MODELS_LOADED:
return True
try:
# BiRefNet λͺ¨λΈ λ‘œλ“œ
logging.info("Loading BiRefNet models...")
BIREFNET_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
BIREFNET_MODEL.to(device)
BIREFNET_LITE_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet_lite", trust_remote_code=True)
BIREFNET_LITE_MODEL.to(device)
        # Load the ControlNet and outpainting models
logging.info("Loading ControlNet models...")
from controlnet_union import ControlNetModel_Union
from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
config_file = hf_hub_download(
"xinsir/controlnet-union-sdxl-1.0",
filename="config_promax.json",
)
config = ControlNetModel_Union.load_config(config_file)
controlnet_model = ControlNetModel_Union.from_config(config)
model_file = hf_hub_download(
"xinsir/controlnet-union-sdxl-1.0",
filename="diffusion_pytorch_model_promax.safetensors",
)
state_dict = load_state_dict(model_file)
loaded_keys = list(state_dict.keys())
result = ControlNetModel_Union._load_pretrained_model(
controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
)
model = result[0]
model = model.to(device=device, dtype=torch_dtype)
# VAE λ‘œλ“œ
vae = AutoencoderKL.from_pretrained(
"madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch_dtype
).to(device)
# νŒŒμ΄ν”„λΌμΈ λ‘œλ“œ
OUTPAINT_PIPE = StableDiffusionXLFillPipeline.from_pretrained(
"SG161222/RealVisXL_V5.0_Lightning",
torch_dtype=torch_dtype,
vae=vae,
controlnet=model,
variant="fp16" if device.type == "cuda" else None,
).to(device)
OUTPAINT_PIPE.scheduler = TCDScheduler.from_config(OUTPAINT_PIPE.scheduler.config)
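        # TCD scheduling pairs with the few-step Lightning checkpoint loaded above
        # (consistent with the 4-12 step range exposed in the UI).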
# MMAudio λͺ¨λΈ λ‘œλ“œ
logging.info("Loading MMAudio models...")
model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
model_mmaudio.download_if_needed()
setup_eval_logging()
# λ²ˆμ—­κΈ° μ„€μ •
try:
TRANSLATOR = pipeline("translation",
model="Helsinki-NLP/opus-mt-ko-en",
device="cpu",
use_fast=True,
trust_remote_code=False)
except Exception as e:
logging.warning(f"Failed to load translation model: {e}")
TRANSLATOR = None
# MMAudio λͺ¨λΈ μ΄ˆκΈ°ν™”
if torch.cuda.is_available():
mmaudio_dtype = torch.bfloat16
else:
mmaudio_dtype = torch.float32
        # torch.cuda.device() rejects CPU devices, so only enter it when CUDA is active
        device_ctx = torch.cuda.device(device) if device.type == "cuda" else contextlib.nullcontext()
        with device_ctx:
MMAUDIO_SEQ_CFG = model_mmaudio.seq_cfg
MMAUDIO_NET = get_my_mmaudio(model_mmaudio.model_name).to(device, mmaudio_dtype).eval()
MMAUDIO_NET.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
logging.info(f'Loaded weights from {model_mmaudio.model_path}')
MMAUDIO_FEATURE_UTILS = FeaturesUtils(
tod_vae_ckpt=model_mmaudio.vae_path,
synchformer_ckpt=model_mmaudio.synchformer_ckpt,
enable_conditions=True,
mode=model_mmaudio.mode,
bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
need_vae_encoder=False
).to(device, mmaudio_dtype).eval()
MODELS_LOADED = True
logging.info("All models loaded successfully!")
return True
except Exception as e:
logging.error(f"Failed to load models: {str(e)}")
return False
# κΈ°μ‘΄ ν•¨μˆ˜λ“€ λͺ¨λ‘ μœ μ§€
def update_dimensions(preset):
if preset in IMAGE_PRESETS:
return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
return 1024, 1024
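# Example: update_dimensions("16:9 와이드스크린") -> (1024, 576)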
def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
if not prompt:
return None, "ν”„λ‘¬ν”„νŠΈλ₯Ό μž…λ ₯ν•΄μ£Όμ„Έμš”"
try:
client = Client(TEXT2IMG_API_URL)
if seed == -1:
seed = random.randint(0, 9999999)
result = client.predict(
prompt=prompt,
width=int(width),
height=int(height),
guidance=float(guidance),
inference_steps=int(inference_steps),
seed=int(seed),
do_img2img=False,
init_image=None,
image2image_strength=0.8,
resize_img=True,
api_name="/generate_image"
)
return result[0], f"μ‚¬μš©λœ μ‹œλ“œ: {result[1]}"
except Exception as e:
logging.error(f"Image generation error: {str(e)}")
return None, f"였λ₯˜: {str(e)}"
def generate_video_from_image(image, prompt="", length=4.0):
if image is None:
return None
try:
# 이미지 μ €μž₯
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
temp_path = fp.name
Image.fromarray(image).save(temp_path)
# API 호좜
client = Client(VIDEO_API_URL)
result = client.predict(
input_image=handle_file(temp_path),
prompt=prompt if prompt else "Generate natural motion",
n_prompt="",
seed=random.randint(0, 9999999),
use_teacache=True,
video_length=float(length),
api_name="/process"
)
os.unlink(temp_path)
if result and len(result) > 0:
video_dict = result[0]
return video_dict.get("video") if isinstance(video_dict, dict) else None
except Exception as e:
logging.error(f"Video generation error: {str(e)}")
return None
def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
"""이미지와 마슀크λ₯Ό μ€€λΉ„ν•˜λŠ” ν•¨μˆ˜"""
if image is None:
return None, None
# PIL μ΄λ―Έμ§€λ‘œ λ³€ν™˜
if isinstance(image, np.ndarray):
image = Image.fromarray(image).convert('RGB')
target_size = (width, height)
# 이미지λ₯Ό νƒ€κ²Ÿ 크기에 맞게 μ‘°μ •
scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
new_width = int(image.width * scale_factor)
new_height = int(image.height * scale_factor)
# 이미지 λ¦¬μ‚¬μ΄μ¦ˆ
source = image.resize((new_width, new_height), Image.LANCZOS)
# μ˜€λ²„λž© 계산
overlap_x = int(new_width * (overlap_percentage / 100))
overlap_y = int(new_height * (overlap_percentage / 100))
overlap_x = max(overlap_x, 1)
overlap_y = max(overlap_y, 1)
# 정렬에 λ”°λ₯Έ λ§ˆμ§„ 계산
if alignment == "κ°€μš΄λ°":
margin_x = (target_size[0] - new_width) // 2
margin_y = (target_size[1] - new_height) // 2
elif alignment == "μ™Όμͺ½":
margin_x = 0
margin_y = (target_size[1] - new_height) // 2
elif alignment == "였λ₯Έμͺ½":
margin_x = target_size[0] - new_width
margin_y = (target_size[1] - new_height) // 2
elif alignment == "μœ„":
margin_x = (target_size[0] - new_width) // 2
margin_y = 0
elif alignment == "μ•„λž˜":
margin_x = (target_size[0] - new_width) // 2
margin_y = target_size[1] - new_height
# λ°°κ²½ 이미지 생성
background = Image.new('RGB', target_size, (255, 255, 255))
background.paste(source, (margin_x, margin_y))
# 마슀크 생성
mask = Image.new('L', target_size, 255)
mask_draw = ImageDraw.Draw(mask)
# 마슀크 μ˜μ—­ 그리기
left_overlap = margin_x + overlap_x if alignment != "μ™Όμͺ½" else margin_x
right_overlap = margin_x + new_width - overlap_x if alignment != "였λ₯Έμͺ½" else margin_x + new_width
top_overlap = margin_y + overlap_y if alignment != "μœ„" else margin_y
bottom_overlap = margin_y + new_height - overlap_y if alignment != "μ•„λž˜" else margin_y + new_height
mask_draw.rectangle([
(left_overlap, top_overlap),
(right_overlap, bottom_overlap)
], fill=0)
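    # Mask semantics: 255 (white) = area for the model to generate, 0 (black) =
    # preserved source; the overlap band stays white so new content blends in.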
return background, mask
def preview_outpaint(image, width, height, overlap_percentage, alignment):
"""μ•„μ›ƒνŽ˜μΈνŒ… 미리보기"""
background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
if background is None:
return None
    # Build the preview image
preview = background.copy().convert('RGBA')
# 반투λͺ… 빨간색 μ˜€λ²„λ ˆμ΄
red_overlay = Image.new('RGBA', background.size, (255, 0, 0, 64))
# 마슀크 적용
red_mask = Image.new('RGBA', background.size, (0, 0, 0, 0))
red_mask.paste(red_overlay, (0, 0), mask)
# μ˜€λ²„λ ˆμ΄ ν•©μ„±
preview = Image.alpha_composite(preview, red_mask)
return preview
@spaces.GPU(duration=120)
def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
"""이미지 μ•„μ›ƒνŽ˜μΈνŒ… μ‹€ν–‰"""
if image is None:
return None
# λͺ¨λΈ λ‘œλ“œ 확인
if not MODELS_LOADED:
load_models()
if OUTPAINT_PIPE is None:
return Image.new('RGB', (width, height), (200, 200, 200))
try:
# 이미지와 마슀크 μ€€λΉ„
background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
if background is None:
return None
        # Build cnet_image (mask region painted black)
cnet_image = background.copy()
cnet_image.paste(0, (0, 0), mask)
# ν”„λ‘¬ν”„νŠΈ μ€€λΉ„
final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
# GPUμ—μ„œ μ‹€ν–‰
with torch.autocast(device_type=device.type, dtype=torch_dtype):
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = OUTPAINT_PIPE.encode_prompt(final_prompt, str(device), True)
# 생성 ν”„λ‘œμ„ΈμŠ€
for generated_image in OUTPAINT_PIPE(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
image=cnet_image,
num_inference_steps=num_steps
):
# 쀑간 κ²°κ³Ό (ν•„μš”μ‹œ μ‚¬μš©)
pass
# μ΅œμ’… 이미지
final_image = generated_image
# RGBA둜 λ³€ν™˜ν•˜κ³  마슀크 적용
final_image = final_image.convert("RGBA")
cnet_image.paste(final_image, (0, 0), mask)
return cnet_image
except Exception as e:
logging.error(f"Outpainting error: {str(e)}")
return background if 'background' in locals() else None
# MMAudio κ΄€λ ¨ ν•¨μˆ˜λ“€
def translate_prompt(text):
try:
if TRANSLATOR is None:
return text
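        # U+3131..U+D7A3 spans Hangul Compatibility Jamo through the Hangul Syllables block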
        if text and any(0x3131 <= ord(char) <= 0xD7A3 for char in text):
with torch.no_grad():
translation = TRANSLATOR(text)[0]['translation_text']
return translation
return text
except Exception as e:
logging.error(f"Translation error: {e}")
return text
@spaces.GPU(duration=120)
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
cfg_strength: float, duration: float):
# λͺ¨λΈ λ‘œλ“œ 확인
if not MODELS_LOADED:
load_models()
if MMAUDIO_NET is None:
return None
prompt = translate_prompt(prompt)
negative_prompt = translate_prompt(negative_prompt)
rng = torch.Generator(device=device)
rng.manual_seed(seed)
fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
clip_frames, sync_frames, duration = load_video(video, duration)
clip_frames = clip_frames.unsqueeze(0)
sync_frames = sync_frames.unsqueeze(0)
MMAUDIO_SEQ_CFG.duration = duration
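    # Propagate the clip duration into the latent/CLIP/sync sequence lengths before generating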
MMAUDIO_NET.update_seq_lengths(MMAUDIO_SEQ_CFG.latent_seq_len, MMAUDIO_SEQ_CFG.clip_seq_len, MMAUDIO_SEQ_CFG.sync_seq_len)
audios = generate(clip_frames,
sync_frames, [prompt],
negative_text=[negative_prompt],
feature_utils=MMAUDIO_FEATURE_UTILS,
net=MMAUDIO_NET,
fm=fm,
rng=rng,
cfg_strength=cfg_strength)
audio = audios.float().cpu()[0]
video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
make_video(video,
video_save_path,
audio,
sampling_rate=MMAUDIO_SEQ_CFG.sampling_rate,
duration_sec=MMAUDIO_SEQ_CFG.duration)
return video_save_path
# λΉ„λ””μ˜€ 배경제거 κ΄€λ ¨ ν•¨μˆ˜λ“€
def process_bg_image(image, bg, fast_mode=False):
"""단일 이미지 λ°°κ²½ 처리"""
if BIREFNET_MODEL is None or BIREFNET_LITE_MODEL is None:
return image
image_size = image.size
input_images = transform_image(image).unsqueeze(0).to(device)
model = BIREFNET_LITE_MODEL if fast_mode else BIREFNET_MODEL
with torch.no_grad():
preds = model(input_images)[-1].sigmoid().cpu()
pred = preds[0].squeeze()
pred_pil = transforms.ToPILImage()(pred)
mask = pred_pil.resize(image_size)
if isinstance(bg, str) and bg.startswith("#"):
color_rgb = tuple(int(bg[i:i+2], 16) for i in (1, 3, 5))
background = Image.new("RGBA", image_size, color_rgb + (255,))
elif isinstance(bg, Image.Image):
background = bg.convert("RGBA").resize(image_size)
else:
background = Image.open(bg).convert("RGBA").resize(image_size)
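    # Image.composite keeps the foreground where the mask is white and the background where it is black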
image = Image.composite(image, background, mask)
return image
def process_video_frame(frame, bg_type, bg, fast_mode, bg_frame_index, background_frames, color):
"""λΉ„λ””μ˜€ ν”„λ ˆμž„ 처리"""
try:
pil_image = Image.fromarray(frame)
if bg_type == "색상":
processed_image = process_bg_image(pil_image, color, fast_mode)
elif bg_type == "이미지":
processed_image = process_bg_image(pil_image, bg, fast_mode)
elif bg_type == "λΉ„λ””μ˜€":
background_frame = background_frames[bg_frame_index]
bg_frame_index += 1
background_image = Image.fromarray(background_frame)
processed_image = process_bg_image(pil_image, background_image, fast_mode)
else:
processed_image = pil_image
return np.array(processed_image), bg_frame_index
except Exception as e:
print(f"Error processing frame: {e}")
return frame, bg_frame_index
@spaces.GPU(duration=300)
def process_video_bg(vid, bg_type="색상", bg_image=None, bg_video=None, color="#00FF00",
fps=0, video_handling="slow_down", fast_mode=True, max_workers=10):
"""λΉ„λ””μ˜€ λ°°κ²½ 처리 메인 ν•¨μˆ˜"""
# λͺ¨λΈ λ‘œλ“œ 확인
if not MODELS_LOADED:
load_models()
if BIREFNET_MODEL is None:
yield gr.update(visible=False), gr.update(visible=True), "BiRefNet λͺ¨λΈμ„ λ‘œλ“œν•˜μ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€."
yield None, None, "BiRefNet λͺ¨λΈμ„ λ‘œλ“œν•˜μ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€."
return
try:
start_time = time.time()
video = VideoFileClip(vid)
if fps == 0:
fps = video.fps
audio = video.audio
frames = list(video.iter_frames(fps=fps))
processed_frames = []
yield gr.update(visible=True), gr.update(visible=False), f"처리 μ‹œμž‘... κ²½κ³Ό μ‹œκ°„: 0초"
if bg_type == "λΉ„λ””μ˜€":
background_video = VideoFileClip(bg_video)
if background_video.duration < video.duration:
if video_handling == "slow_down":
background_video = background_video.fx(vfx.speedx, factor=video.duration / background_video.duration)
else: # video_handling == "loop"
background_video = concatenate_videoclips([background_video] * int(video.duration / background_video.duration + 1))
background_frames = list(background_video.iter_frames(fps=fps))
else:
background_frames = None
bg_frame_index = 0
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(process_video_frame, frames[i], bg_type, bg_image, fast_mode,
bg_frame_index + i, background_frames, color) for i in range(len(frames))]
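            # Iterating the futures in submission order keeps the output frames ordered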
for i, future in enumerate(futures):
result, _ = future.result()
processed_frames.append(result)
elapsed_time = time.time() - start_time
yield result, None, f"ν”„λ ˆμž„ {i+1}/{len(frames)} 처리 쀑... κ²½κ³Ό μ‹œκ°„: {elapsed_time:.2f}초"
        processed_video = ImageSequenceClip(processed_frames, fps=fps)
        processed_video = _set_audio(processed_video, audio)
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
temp_filepath = temp_file.name
processed_video.write_videofile(temp_filepath, codec="libx264")
elapsed_time = time.time() - start_time
yield gr.update(visible=False), gr.update(visible=True), f"처리 μ™„λ£Œ! κ²½κ³Ό μ‹œκ°„: {elapsed_time:.2f}초"
yield processed_frames[-1], temp_filepath, f"처리 μ™„λ£Œ! κ²½κ³Ό μ‹œκ°„: {elapsed_time:.2f}초"
except Exception as e:
print(f"Error: {e}")
elapsed_time = time.time() - start_time
yield gr.update(visible=False), gr.update(visible=True), f"λΉ„λ””μ˜€ 처리 였λ₯˜: {e}. κ²½κ³Ό μ‹œκ°„: {elapsed_time:.2f}초"
yield None, None, f"λΉ„λ””μ˜€ 처리 였λ₯˜: {e}. κ²½κ³Ό μ‹œκ°„: {elapsed_time:.2f}초"
@spaces.GPU(duration=180)
def merge_videos_with_audio(video_files, audio_file, audio_volume, output_fps):
"""μ—¬λŸ¬ λΉ„λ””μ˜€λ₯Ό λ³‘ν•©ν•˜κ³  μ˜€λ””μ˜€λ₯Ό μΆ”κ°€ν•˜λŠ” ν•¨μˆ˜"""
if not video_files:
return None, "λΉ„λ””μ˜€ νŒŒμΌμ„ μ—…λ‘œλ“œν•΄μ£Όμ„Έμš”."
if isinstance(video_files, list) and len(video_files) > 10:
return None, "μ΅œλŒ€ 10개의 λΉ„λ””μ˜€λ§Œ μ—…λ‘œλ“œ κ°€λŠ₯ν•©λ‹ˆλ‹€."
try:
# μƒνƒœ μ—…λ°μ΄νŠΈ
status = "λΉ„λ””μ˜€ 파일 μ •λ ¬ 쀑..."
# 파일 κ²½λ‘œμ™€ 파일λͺ…을 νŠœν”Œλ‘œ μ €μž₯ν•˜κ³  파일λͺ…μœΌλ‘œ μ •λ ¬
video_paths = []
if isinstance(video_files, list):
for video_file in video_files:
if video_file is not None:
video_paths.append(video_file)
else:
video_paths.append(video_files)
# 파일λͺ…μœΌλ‘œ μ •λ ¬ (κ²½λ‘œμ—μ„œ 파일λͺ…λ§Œ μΆ”μΆœν•˜μ—¬ μ •λ ¬)
video_paths.sort(key=lambda x: os.path.basename(x))
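        # Note: the sort is lexicographic, so zero-pad filenames (01.mp4 ... 10.mp4) to keep numeric order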
status = f"{len(video_paths)}개의 λΉ„λ””μ˜€ λ‘œλ“œ 쀑..."
# λΉ„λ””μ˜€ 클립 λ‘œλ“œ
video_clips = []
clip_sizes = []
for i, video_path in enumerate(video_paths):
status = f"λΉ„λ””μ˜€ {i+1}/{len(video_paths)} λ‘œλ“œ 쀑: {os.path.basename(video_path)}"
clip = VideoFileClip(video_path)
video_clips.append(clip)
            # Record each clip's size
            try:
                clip_sizes.append((clip.w, clip.h))
            except Exception:
                clip_sizes.append(clip.size)
# 첫 번째 λΉ„λ””μ˜€μ˜ 크기λ₯Ό κΈ°μ€€μœΌλ‘œ 함
target_width, target_height = clip_sizes[0]
# λͺ¨λ“  λΉ„λ””μ˜€μ˜ 크기가 같은지 확인
all_same_size = all(size == (target_width, target_height) for size in clip_sizes)
if not all_same_size:
logging.warning(f"λΉ„λ””μ˜€ 크기가 μ„œλ‘œ λ‹€λ¦…λ‹ˆλ‹€. 첫 번째 λΉ„λ””μ˜€ 크기({target_width}x{target_height})둜 μ‘°μ •ν•©λ‹ˆλ‹€.")
# 크기가 λ‹€λ₯Έ λΉ„λ””μ˜€λ“€μ„ μ‘°μ •
adjusted_clips = []
for clip, size in zip(video_clips, clip_sizes):
if size != (target_width, target_height):
                    adjusted_clip = _resize(clip, (target_width, target_height))
adjusted_clips.append(adjusted_clip)
else:
adjusted_clips.append(clip)
video_clips = adjusted_clips
# 첫 번째 λΉ„λ””μ˜€μ˜ FPSλ₯Ό κΈ°λ³Έκ°’μœΌλ‘œ μ‚¬μš©
if output_fps == 0:
output_fps = video_clips[0].fps
status = "λΉ„λ””μ˜€ 병합 쀑..."
# λΉ„λ””μ˜€ 병합
final_video = concatenate_videoclips(video_clips, method="compose")
# μ˜€λ””μ˜€ 처리
if audio_file:
status = "μ˜€λ””μ˜€ 처리 쀑..."
try:
# μ˜€λ””μ˜€ 파일 경둜 확인
if isinstance(audio_file, str):
audio_path = audio_file
else:
# gr.Audioμ—μ„œ λ°˜ν™˜λœ νŠœν”ŒμΈ 경우
audio_path = audio_file
logging.info(f"Processing audio from: {audio_path}")
# μ˜€λ””μ˜€ λ‘œλ“œ
if audio_path.endswith(('.mp4', '.avi', '.mov', '.mkv')):
# λΉ„λ””μ˜€ νŒŒμΌμ—μ„œ μ˜€λ””μ˜€ μΆ”μΆœ
temp_video = VideoFileClip(audio_path)
audio_clip = temp_video.audio
temp_video.close()
else:
# μ˜€λ””μ˜€ 파일 직접 λ‘œλ“œ
audio_clip = AudioFileClip(audio_path)
if audio_clip is None:
raise ValueError("μ˜€λ””μ˜€λ₯Ό λ‘œλ“œν•  수 μ—†μŠ΅λ‹ˆλ‹€.")
# λ³Όλ₯¨ 쑰절
if audio_volume != 100:
audio_clip = audio_clip.volumex(audio_volume / 100)
# μ˜€λ””μ˜€λ₯Ό λΉ„λ””μ˜€ 길이에 맞좀
video_duration = final_video.duration
audio_duration = audio_clip.duration
                if audio_duration > video_duration:
                    # Audio longer than the video: trim it
                    audio_clip = _subclip(audio_clip, 0, video_duration)
                elif audio_duration < video_duration:
                    # Audio shorter than the video: loop it
                    loops_needed = int(video_duration / audio_duration) + 1
                    audio_clips_list = [audio_clip] * loops_needed
                    looped_audio = concatenate_audioclips(audio_clips_list)
                    audio_clip = _subclip(looped_audio, 0, video_duration)
# κΈ°μ‘΄ μ˜€λ””μ˜€ μ œκ±°ν•˜κ³  μƒˆ μ˜€λ””μ˜€λ‘œ ꡐ체
# (κΈ°μ‘΄ μ˜€λ””μ˜€μ™€ ν•©μ„±ν•˜λ €λ©΄ μ•„λž˜ 주석 ν•΄μ œ)
final_video = final_video.set_audio(audio_clip)
# κΈ°μ‘΄ μ˜€λ””μ˜€μ™€ μƒˆ μ˜€λ””μ˜€ 합성을 μ›ν•˜λŠ” 경우:
# if final_video.audio:
# final_audio = CompositeAudioClip([final_video.audio, audio_clip])
# final_video = final_video.set_audio(final_audio)
# else:
# final_video = final_video.set_audio(audio_clip)
logging.info("Audio successfully added to video")
except Exception as e:
logging.error(f"μ˜€λ””μ˜€ 처리 쀑 였λ₯˜ λ°œμƒ: {str(e)}")
# μ˜€λ””μ˜€ 처리 μ‹€νŒ¨ν•΄λ„ λΉ„λ””μ˜€λŠ” 계속 처리
status = f"μ˜€λ””μ˜€ 처리 μ‹€νŒ¨: {str(e)}, λΉ„λ””μ˜€λ§Œ λ³‘ν•©ν•©λ‹ˆλ‹€."
status = "λΉ„λ””μ˜€ μ €μž₯ 쀑..."
# μž„μ‹œ 파일둜 μ €μž₯
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
temp_filepath = temp_file.name
# 코덱 μ„€μ • - 원본 ν’ˆμ§ˆ μœ μ§€
final_video.write_videofile(
temp_filepath,
fps=output_fps,
codec="libx264",
audio_codec="aac",
preset="medium", # ν’ˆμ§ˆ μ„€μ •
bitrate="5000k", # λΉ„νŠΈλ ˆμ΄νŠΈ μ„€μ •μœΌλ‘œ ν’ˆμ§ˆ μœ μ§€
audio_bitrate="192k"
)
# λ¦¬μ†ŒμŠ€ 정리
for clip in video_clips:
clip.close()
if 'adjusted_clips' in locals():
for clip in adjusted_clips:
if clip not in video_clips:
clip.close()
if audio_file and 'audio_clip' in locals():
audio_clip.close()
final_video.close()
return temp_filepath, f"βœ… μ„±κ³΅μ μœΌλ‘œ {len(video_paths)}개의 λΉ„λ””μ˜€λ₯Ό λ³‘ν•©ν–ˆμŠ΅λ‹ˆλ‹€! (크기: {target_width}x{target_height})"
except Exception as e:
logging.error(f"Video merge error: {str(e)}")
import traceback
traceback.print_exc()
return None, f"❌ 였λ₯˜ λ°œμƒ: {str(e)}"
# CSS
css = """
:root {
--primary-color: #f8c3cd;
--secondary-color: #b3e5fc;
--background-color: #f5f5f7;
--card-background: #ffffff;
--text-color: #424242;
--accent-color: #ffb6c1;
--success-color: #c8e6c9;
--warning-color: #fff9c4;
--shadow-color: rgba(0, 0, 0, 0.1);
--border-radius: 12px;
}
.gradio-container {
max-width: 1200px !important;
margin: 0 auto !important;
}
.panel-box {
border-radius: var(--border-radius) !important;
box-shadow: 0 8px 16px var(--shadow-color) !important;
background-color: var(--card-background) !important;
padding: 20px !important;
margin-bottom: 20px !important;
}
#generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn, #bg-remove-btn, #merge-btn {
background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
font-size: 1.1rem !important;
padding: 12px 24px !important;
margin-top: 10px !important;
width: 100% !important;
}
.tabitem {
min-height: 700px !important;
}
"""
# Gradio Interface
demo = gr.Blocks(css=css, title="AI 이미지 & λΉ„λ””μ˜€ & μ˜€λ””μ˜€ 생성기")
with demo:
gr.Markdown("# 🎨 Ginigen μŠ€νŠœλ””μ˜€")
gr.Markdown("처음 μ‚¬μš© μ‹œ λͺ¨λΈ λ‘œλ”©μ— μ‹œκ°„μ΄ 걸릴 수 μžˆμŠ΅λ‹ˆλ‹€. μž μ‹œλ§Œ κΈ°λ‹€λ €μ£Όμ„Έμš”.")
# λͺ¨λΈ λ‘œλ“œ μƒνƒœ ν‘œμ‹œ
model_status = gr.Textbox(label="λͺ¨λΈ μƒνƒœ", value="λͺ¨λΈ λ‘œλ”© λŒ€κΈ° 쀑...", interactive=False)
with gr.Tabs() as tabs:
# 첫 번째 νƒ­: ν…μŠ€νŠΈ to 이미지
with gr.Tab("ν…μŠ€νŠΈβ†’μ΄λ―Έμ§€β†’λΉ„λ””μ˜€", elem_classes="tabitem"):
with gr.Row(equal_height=True):
# μž…λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### πŸ“ 이미지 생성 μ„€μ •")
prompt = gr.Textbox(
label="ν”„λ‘¬ν”„νŠΈ(ν•œκΈ€/μ˜μ–΄ κ°€λŠ₯)",
placeholder="μƒμ„±ν•˜κ³  싢은 이미지λ₯Ό μ„€λͺ…ν•˜μ„Έμš”...",
lines=3
)
size_preset = gr.Dropdown(
choices=list(IMAGE_PRESETS.keys()),
value="1:1 μ •μ‚¬κ°ν˜•",
label="크기 프리셋"
)
with gr.Row():
width = gr.Slider(256, 2048, 1024, step=64, label="λ„ˆλΉ„")
height = gr.Slider(256, 2048, 1024, step=64, label="높이")
with gr.Row():
guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="κ°€μ΄λ˜μŠ€")
steps = gr.Slider(1, 50, 30, step=1, label="μŠ€ν…")
seed = gr.Number(label="μ‹œλ“œ (-1=랜덀)", value=-1)
generate_btn = gr.Button("🎨 이미지 생성", variant="primary", elem_id="generate-btn")
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### 🎬 λΉ„λ””μ˜€ 생성 μ„€μ •")
video_prompt = gr.Textbox(
label="(선택) λΉ„λ””μ˜€ ν”„λ‘¬ν”„νŠΈ(μ˜μ–΄λ‘œ μž…λ ₯)",
placeholder="λΉ„λ””μ˜€μ˜ μ›€μ§μž„μ„ μ„€λͺ…ν•˜μ„Έμš”... (λΉ„μ›Œλ‘λ©΄ κΈ°λ³Έ μ›€μ§μž„ 적용)",
lines=2
)
video_length = gr.Slider(
minimum=1,
maximum=60,
value=4,
step=0.5,
label="λΉ„λ””μ˜€ 길이 (초)",
info="1μ΄ˆμ—μ„œ 60μ΄ˆκΉŒμ§€ 선택 κ°€λŠ₯ν•©λ‹ˆλ‹€"
)
video_btn = gr.Button("🎬 λΉ„λ””μ˜€λ‘œ λ³€ν™˜", variant="secondary", elem_id="video-btn")
# 좜λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### πŸ–ΌοΈ 생성 κ²°κ³Ό")
output_image = gr.Image(label="μƒμ„±λœ 이미지", type="numpy")
output_seed = gr.Textbox(label="μ‹œλ“œ 정보")
output_video = gr.Video(label="μƒμ„±λœ λΉ„λ””μ˜€")
# 두 번째 νƒ­: 이미지 μ•„μ›ƒνŽ˜μΈνŒ…
with gr.Tab("이미지 λΉ„μœ¨ λ³€κ²½/생성", elem_classes="tabitem"):
with gr.Row(equal_height=True):
# μž…λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### πŸ–ΌοΈ 이미지 μ—…λ‘œλ“œ")
input_image = gr.Image(
label="원본 이미지",
type="numpy"
)
outpaint_prompt = gr.Textbox(
label="ν”„λ‘¬ν”„νŠΈ (선택)",
placeholder="ν™•μž₯ν•  μ˜μ—­μ— λŒ€ν•œ μ„€λͺ…...",
lines=2
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### βš™οΈ μ•„μ›ƒνŽ˜μΈνŒ… μ„€μ •")
outpaint_size_preset = gr.Dropdown(
choices=list(IMAGE_PRESETS.keys()),
value="16:9 μ™€μ΄λ“œμŠ€ν¬λ¦°",
label="λͺ©ν‘œ 크기 프리셋"
)
with gr.Row():
outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="λͺ©ν‘œ λ„ˆλΉ„")
outpaint_height = gr.Slider(256, 2048, 720, step=64, label="λͺ©ν‘œ 높이")
alignment = gr.Dropdown(
choices=["κ°€μš΄λ°", "μ™Όμͺ½", "였λ₯Έμͺ½", "μœ„", "μ•„λž˜"],
value="κ°€μš΄λ°",
label="μ •λ ¬"
)
overlap_percentage = gr.Slider(
minimum=1,
maximum=50,
value=10,
step=1,
label="마슀크 μ˜€λ²„λž© (%)"
)
outpaint_steps = gr.Slider(
minimum=4,
maximum=12,
value=8,
step=1,
label="μΆ”λ‘  μŠ€ν…"
)
preview_btn = gr.Button("πŸ‘οΈ 미리보기", elem_id="preview-btn")
outpaint_btn = gr.Button("🎨 μ•„μ›ƒνŽ˜μΈνŒ… μ‹€ν–‰", variant="primary", elem_id="outpaint-btn")
# 좜λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### πŸ–ΌοΈ κ²°κ³Ό")
preview_image = gr.Image(label="미리보기")
outpaint_result = gr.Image(label="μ•„μ›ƒνŽ˜μΈνŒ… κ²°κ³Ό")
# μ„Έ 번째 νƒ­: λΉ„λ””μ˜€ + μ˜€λ””μ˜€
with gr.Tab("λΉ„λ””μ˜€ + μ˜€λ””μ˜€", elem_classes="tabitem"):
with gr.Row(equal_height=True):
# μž…λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### πŸŽ₯ λΉ„λ””μ˜€ μ—…λ‘œλ“œ")
audio_video_input = gr.Video(
label="μž…λ ₯ λΉ„λ””μ˜€",
sources=["upload"]
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### 🎡 μ˜€λ””μ˜€ 생성 μ„€μ •")
audio_prompt = gr.Textbox(
label="ν”„λ‘¬ν”„νŠΈ (ν•œκΈ€ 지원)",
placeholder="μƒμ„±ν•˜κ³  싢은 μ˜€λ””μ˜€λ₯Ό μ„€λͺ…ν•˜μ„Έμš”... (예: ν‰ν™”λ‘œμš΄ ν”Όμ•„λ…Έ μŒμ•…)",
lines=3
)
audio_negative_prompt = gr.Textbox(
label="λ„€κ±°ν‹°λΈŒ ν”„λ‘¬ν”„νŠΈ",
value="music",
placeholder="μ›ν•˜μ§€ μ•ŠλŠ” μš”μ†Œ...",
lines=2
)
with gr.Row():
audio_seed = gr.Number(label="μ‹œλ“œ", value=0)
audio_steps = gr.Number(label="μŠ€ν…", value=25)
with gr.Row():
audio_cfg = gr.Number(label="κ°€μ΄λ˜μŠ€ μŠ€μΌ€μΌ", value=4.5)
audio_duration = gr.Number(label="μ§€μ†μ‹œκ°„ (초)", value=9999)
audio_btn = gr.Button("🎡 μ˜€λ””μ˜€ 생성 및 ν•©μ„±", variant="primary", elem_id="audio-btn")
# 좜λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### 🎬 생성 κ²°κ³Ό")
output_video_with_audio = gr.Video(
label="μ˜€λ””μ˜€κ°€ μΆ”κ°€λœ λΉ„λ””μ˜€",
interactive=False
)
# λ„€ 번째 νƒ­: λΉ„λ””μ˜€ νŽΈμ§‘
with gr.Tab("λΉ„λ””μ˜€ νŽΈμ§‘", elem_classes="tabitem"):
with gr.Row(equal_height=True):
# μž…λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### πŸŽ₯ λΉ„λ””μ˜€ μ—…λ‘œλ“œ (μ΅œλŒ€ 10개)")
gr.Markdown("**파일λͺ…이 μž‘μ„μˆ˜λ‘ μš°μ„ μˆœμœ„κ°€ λ†’μŠ΅λ‹ˆλ‹€** (예: 1.mp4, 2.mp4, 3.mp4)")
video_files = gr.File(
label="λΉ„λ””μ˜€ νŒŒμΌλ“€",
file_count="multiple",
file_types=["video"],
type="filepath"
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### 🎡 μ˜€λ””μ˜€ μ„€μ • (선택)")
gr.Markdown("**주의**: μ—…λ‘œλ“œν•œ μ˜€λ””μ˜€κ°€ λΉ„λ””μ˜€μ˜ κΈ°μ‘΄ μ˜€λ””μ˜€λ₯Ό μ™„μ „νžˆ λŒ€μ²΄ν•©λ‹ˆλ‹€.")
audio_file = gr.Audio(
label="μ˜€λ””μ˜€ 파일 (MP3, WAV, M4A λ“±)",
type="filepath",
sources=["upload"]
)
audio_volume = gr.Slider(
minimum=0,
maximum=200,
value=100,
step=1,
label="μ˜€λ””μ˜€ λ³Όλ₯¨ (%)",
info="100% = 원본 λ³Όλ₯¨"
)
gr.Markdown("""
**μ˜€λ””μ˜€ μ˜΅μ…˜**:
- μ˜€λ””μ˜€κ°€ λΉ„λ””μ˜€λ³΄λ‹€ 짧으면 μžλ™μœΌλ‘œ λ°˜λ³΅λ©λ‹ˆλ‹€
- μ˜€λ””μ˜€κ°€ λΉ„λ””μ˜€λ³΄λ‹€ κΈΈλ©΄ λΉ„λ””μ˜€ 길이에 맞좰 μž˜λ¦½λ‹ˆλ‹€
""")
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### βš™οΈ νŽΈμ§‘ μ„€μ •")
output_fps = gr.Slider(
minimum=0,
maximum=60,
value=0,
step=1,
label="좜λ ₯ FPS (0 = 첫 번째 λΉ„λ””μ˜€μ˜ FPS μ‚¬μš©)"
)
gr.Markdown("""
**크기 처리**:
- 첫 번째 λΉ„λ””μ˜€μ˜ 크기가 기쀀이 λ©λ‹ˆλ‹€
- λ‹€λ₯Έ 크기의 λΉ„λ””μ˜€λŠ” 첫 번째 λΉ„λ””μ˜€ 크기둜 μ‘°μ •λ©λ‹ˆλ‹€
- μ΅œμƒμ˜ κ²°κ³Όλ₯Ό μœ„ν•΄ 같은 크기의 λΉ„λ””μ˜€λ₯Ό μ‚¬μš©ν•˜μ„Έμš”
""")
merge_videos_btn = gr.Button("🎬 λΉ„λ””μ˜€ 병합", variant="primary", elem_id="merge-btn")
# 좜λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### 🎬 병합 κ²°κ³Ό")
merge_status = gr.Textbox(label="처리 μƒνƒœ", interactive=False)
merged_video = gr.Video(label="λ³‘ν•©λœ λΉ„λ””μ˜€")
gr.Markdown("""
### ℹ️ μ‚¬μš© 방법
1. μ—¬λŸ¬ λΉ„λ””μ˜€ νŒŒμΌμ„ μ—…λ‘œλ“œν•˜μ„Έμš” (μ΅œλŒ€ 10개)
2. 파일λͺ…이 μž‘μ€ μˆœμ„œλŒ€λ‘œ μžλ™ μ •λ ¬λ©λ‹ˆλ‹€
3. (선택) μ˜€λ””μ˜€ νŒŒμΌμ„ μΆ”κ°€ν•˜κ³  λ³Όλ₯¨μ„ μ‘°μ ˆν•˜μ„Έμš”
4. 'λΉ„λ””μ˜€ 병합' λ²„νŠΌμ„ ν΄λ¦­ν•˜μ„Έμš”
**νŠΉμ§•**:
- βœ… 첫 번째 λΉ„λ””μ˜€μ˜ 크기λ₯Ό κΈ°μ€€μœΌλ‘œ 톡합
- βœ… μ—…λ‘œλ“œν•œ μ˜€λ””μ˜€κ°€ 전체 λΉ„λ””μ˜€μ— μ μš©λ©λ‹ˆλ‹€
- βœ… 높은 λΉ„νŠΈλ ˆμ΄νŠΈλ‘œ ν’ˆμ§ˆ μœ μ§€
**팁**:
- 파일λͺ…을 01.mp4, 02.mp4, 03.mp4 ν˜•μ‹μœΌλ‘œ μ§€μ •ν•˜λ©΄ μˆœμ„œ 관리가 μ‰½μŠ΅λ‹ˆλ‹€
- μ˜€λ””μ˜€λ₯Ό μΆ”κ°€ν•˜λ©΄ κΈ°μ‘΄ λΉ„λ””μ˜€μ˜ μ˜€λ””μ˜€λŠ” λŒ€μ²΄λ©λ‹ˆλ‹€
""")
# λ‹€μ„― 번째 νƒ­: λΉ„λ””μ˜€ 배경제거/ν•©μ„±
with gr.Tab("λΉ„λ””μ˜€ 배경제거/ν•©μ„±", elem_classes="tabitem"):
with gr.Row(equal_height=True):
# μž…λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### πŸŽ₯ λΉ„λ””μ˜€ μ—…λ‘œλ“œ")
bg_video_input = gr.Video(
label="μž…λ ₯ λΉ„λ””μ˜€",
interactive=True
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### 🎨 λ°°κ²½ μ„€μ •")
bg_type = gr.Radio(
["색상", "이미지", "λΉ„λ””μ˜€"],
label="λ°°κ²½ μœ ν˜•",
value="색상",
interactive=True
)
color_picker = gr.ColorPicker(
label="λ°°κ²½ 색상",
value="#00FF00",
visible=True,
interactive=True
)
bg_image_input = gr.Image(
label="λ°°κ²½ 이미지",
type="filepath",
visible=False,
interactive=True
)
bg_video_bg = gr.Video(
label="λ°°κ²½ λΉ„λ””μ˜€",
visible=False,
interactive=True
)
with gr.Column(visible=False) as video_handling_options:
video_handling_radio = gr.Radio(
["slow_down", "loop"],
label="λΉ„λ””μ˜€ 처리 방식",
value="slow_down",
interactive=True,
info="slow_down: λ°°κ²½ λΉ„λ””μ˜€λ₯Ό 느리게 μž¬μƒ, loop: λ°°κ²½ λΉ„λ””μ˜€λ₯Ό 반볡"
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### βš™οΈ 처리 μ„€μ •")
fps_slider = gr.Slider(
minimum=0,
maximum=60,
step=1,
value=0,
label="좜λ ₯ FPS (0 = 원본 FPS μœ μ§€)",
interactive=True
)
fast_mode_checkbox = gr.Checkbox(
label="λΉ λ₯Έ λͺ¨λ“œ (BiRefNet_lite μ‚¬μš©)",
value=True,
interactive=True
)
max_workers_slider = gr.Slider(
minimum=1,
maximum=32,
step=1,
value=10,
label="μ΅œλŒ€ μ›Œμ»€ 수",
info="λ³‘λ ¬λ‘œ μ²˜λ¦¬ν•  ν”„λ ˆμž„ 수",
interactive=True
)
bg_remove_btn = gr.Button("🎬 λ°°κ²½ λ³€κ²½", variant="primary", elem_id="bg-remove-btn")
# 좜λ ₯ 컬럼
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### 🎬 처리 결과")
stream_image = gr.Image(label="μ‹€μ‹œκ°„ 슀트리밍", visible=False)
output_bg_video = gr.Video(label="μ΅œμ’… λΉ„λ””μ˜€")
time_textbox = gr.Textbox(label="κ²½κ³Ό μ‹œκ°„", interactive=False)
gr.Markdown("""
### ℹ️ μ‚¬μš© 방법
1. λΉ„λ””μ˜€λ₯Ό μ—…λ‘œλ“œν•˜μ„Έμš”
2. μ›ν•˜λŠ” λ°°κ²½ μœ ν˜•μ„ μ„ νƒν•˜μ„Έμš”
3. 섀정을 μ‘°μ •ν•˜κ³  'λ°°κ²½ λ³€κ²½' λ²„νŠΌμ„ ν΄λ¦­ν•˜μ„Έμš”
**μ°Έκ³ **: GPU μ œν•œμœΌλ‘œ ν•œ λ²ˆμ— μ•½ 200ν”„λ ˆμž„κΉŒμ§€ 처리 κ°€λŠ₯ν•©λ‹ˆλ‹€.
κΈ΄ λΉ„λ””μ˜€λŠ” μž‘μ€ 쑰각으둜 λ‚˜λˆ„μ–΄ μ²˜λ¦¬ν•˜μ„Έμš”.
""")
# λͺ¨λΈ λ‘œλ“œ ν•¨μˆ˜ μ‹€ν–‰
def on_demo_load():
try:
if IS_SPACES:
# Spaces ν™˜κ²½μ—μ„œ GPU μ›Œλ°μ—…
gpu_warmup()
# λͺ¨λΈ λ‘œλ“œλŠ” 첫 번째 GPU ν•¨μˆ˜ 호좜 μ‹œ μžλ™μœΌλ‘œ μˆ˜ν–‰λ¨
return "λͺ¨λΈ λ‘œλ”© μ€€λΉ„ μ™„λ£Œ"
except Exception as e:
return f"μ΄ˆκΈ°ν™” 였λ₯˜: {str(e)}"
# 이벀트 μ—°κ²° - 첫 번째 νƒ­
size_preset.change(update_dimensions, [size_preset], [width, height])
generate_btn.click(
generate_text_to_image,
[prompt, width, height, guidance, steps, seed],
[output_image, output_seed]
)
video_btn.click(
lambda img, v_prompt, length: generate_video_from_image(img, v_prompt, length) if img is not None else None,
[output_image, video_prompt, video_length],
[output_video]
)
# 이벀트 μ—°κ²° - 두 번째 νƒ­
outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])
preview_btn.click(
preview_outpaint,
[input_image, outpaint_width, outpaint_height, overlap_percentage, alignment],
[preview_image]
)
outpaint_btn.click(
outpaint_image,
[input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
[outpaint_result]
)
# 이벀트 μ—°κ²° - μ„Έ 번째 νƒ­
audio_btn.click(
video_to_audio,
[audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
[output_video_with_audio]
)
# 이벀트 μ—°κ²° - λ„€ 번째 νƒ­ (λΉ„λ””μ˜€ νŽΈμ§‘)
merge_videos_btn.click(
merge_videos_with_audio,
inputs=[video_files, audio_file, audio_volume, output_fps],
outputs=[merged_video, merge_status]
)
# 이벀트 μ—°κ²° - λ‹€μ„― 번째 νƒ­ (λΉ„λ””μ˜€ 배경제거/ν•©μ„±)
def update_bg_visibility(bg_type):
if bg_type == "색상":
return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
elif bg_type == "이미지":
return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
elif bg_type == "λΉ„λ””μ˜€":
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
else:
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
bg_type.change(
update_bg_visibility,
inputs=bg_type,
outputs=[color_picker, bg_image_input, bg_video_bg, video_handling_options]
)
bg_remove_btn.click(
process_video_bg,
inputs=[bg_video_input, bg_type, bg_image_input, bg_video_bg, color_picker,
fps_slider, video_handling_radio, fast_mode_checkbox, max_workers_slider],
outputs=[stream_image, output_bg_video, time_textbox]
)
# 데λͺ¨ λ‘œλ“œ μ‹œ μ‹€ν–‰
demo.load(on_demo_load, outputs=model_status)
if __name__ == "__main__":
# Spaces ν™˜κ²½μ—μ„œ μΆ”κ°€ 체크
if IS_SPACES:
try:
gpu_warmup()
except:
pass
demo.launch()