# Spaces GPU - must be imported before anything else!
import os
IS_SPACES = os.environ.get("SPACE_ID") is not None
if IS_SPACES:
import spaces
else:
    # Dummy no-op decorator for environments without the Spaces GPU decorator
class spaces:
@staticmethod
def GPU(duration=None):
def decorator(func):
return func
return decorator
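    # Note: this stub only supports the parameterized form @spaces.GPU(duration=...),
    # which is the only form this file uses; a bare @spaces.GPU would misbehave.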
# Now import the remaining libraries
import gradio as gr
import numpy as np
from PIL import Image, ImageDraw
from gradio_client import Client, handle_file
import random
import tempfile
import logging
import torch
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from huggingface_hub import hf_hub_download
from pathlib import Path
import torchaudio
from einops import rearrange
from scipy.io import wavfile
from transformers import pipeline
# Imports for video background removal
from transformers import AutoModelForImageSegmentation
from torchvision import transforms
# โ”€โ”€ moviepy import โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
try:
from moviepy.editor import (
VideoFileClip,
concatenate_videoclips,
ImageSequenceClip,
concatenate_audioclips,
AudioFileClip,
CompositeAudioClip,
CompositeVideoClip,
ColorClip
)
except ImportError:
    # Fall back to importing each piece individually
try:
from moviepy.video.io.VideoFileClip import VideoFileClip
except ImportError:
from moviepy import VideoFileClip
try:
from moviepy.video.compositing.concatenate import concatenate_videoclips
except ImportError:
from moviepy import concatenate_videoclips
try:
from moviepy.video.io.ImageSequenceClip import ImageSequenceClip
except ImportError:
from moviepy.editor import ImageSequenceClip
try:
from moviepy.audio.io.AudioFileClip import AudioFileClip
except ImportError:
from moviepy.editor import AudioFileClip
try:
from moviepy.audio.AudioClip import concatenate_audioclips, CompositeAudioClip
except ImportError:
from moviepy.editor import concatenate_audioclips, CompositeAudioClip
try:
from moviepy.video.compositing.CompositeVideoClip import CompositeVideoClip
except ImportError:
from moviepy.editor import CompositeVideoClip
try:
from moviepy.video.VideoClip import ColorClip
except ImportError:
from moviepy.editor import ColorClip
    # Try to import a resize helper
resize = None
try:
from moviepy.video.fx.resize import resize
except ImportError:
try:
from moviepy.video.fx.all import resize
except ImportError:
try:
                # Try importing via the editor module
from moviepy.editor import resize
except ImportError:
                pass  # no resize helper available
    # Define a fallback if no resize helper was found
if resize is None:
def resize(clip, newsize=None, height=None, width=None):
"""Fallback resize function when moviepy resize is not available"""
if hasattr(clip, 'resize'):
if newsize:
return clip.resize(newsize)
elif height:
return clip.resize(height=height)
elif width:
return clip.resize(width=width)
        # If resizing isn't possible, return the clip unchanged
return clip
    # Try to import a speedx helper
speedx = None
try:
from moviepy.video.fx.speedx import speedx
except ImportError:
try:
from moviepy.video.fx.all import speedx
except ImportError:
try:
from moviepy.editor import speedx
except ImportError:
                pass  # no speedx helper available
    # Define a fallback if no speedx helper was found
if speedx is None:
def speedx(clip, factor=1.0, final_duration=None):
"""Fallback speedx function"""
if hasattr(clip, 'fx') and hasattr(clip.fx, 'speedx'):
return clip.fx.speedx(factor, final_duration)
elif hasattr(clip, 'fl_time'):
return clip.fl_time(lambda t: t * factor)
elif hasattr(clip, 'with_fps') and factor != 1.0:
            # Approximate the speed change by adjusting the FPS
new_fps = clip.fps * factor if hasattr(clip, 'fps') else 24 * factor
return clip.with_fps(new_fps)
else:
            # Last resort: return the clip unchanged
return clip
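# Note: moviepy 2.x removed the moviepy.editor module and reorganized the fx
# helpers, which is why the imports above are all wrapped in try/except with
# pure-Python fallbacks covering both the 1.x and 2.x layouts.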
import time
from concurrent.futures import ThreadPoolExecutor
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
import httpx
from datetime import datetime
# Bypass torch.load's safety check via an environment variable (temporary workaround)
os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
# Minimal GPU warm-up function (required in the Spaces environment)
@spaces.GPU(duration=1)
def gpu_warmup():
"""GPU ์›Œ๋ฐ์—… ํ•จ์ˆ˜ - Spaces ํ™˜๊ฒฝ์—์„œ GPU ์‚ฌ์šฉ์„ ์œ„ํ•ด ํ•„์š”"""
if torch.cuda.is_available():
dummy = torch.zeros(1).cuda()
del dummy
return "GPU ready"
# MMAudio imports - must come after the spaces import
try:
import mmaudio
except ImportError:
os.system("pip install -e .")
import mmaudio
from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils
# Logging setup
logging.basicConfig(level=logging.INFO)
# Keep all settings and initialization from the original code
torch.set_float32_matmul_precision("medium")
# Make the device selection explicit
if torch.cuda.is_available():
device = torch.device("cuda")
torch_dtype = torch.float16
else:
device = torch.device("cpu")
torch_dtype = torch.float32
logging.info(f"Using device: {device}")
# Model state tracked in module-level globals
MODELS_LOADED = False
BIREFNET_MODEL = None
BIREFNET_LITE_MODEL = None
OUTPAINT_PIPE = None
MMAUDIO_NET = None
MMAUDIO_FEATURE_UTILS = None
MMAUDIO_SEQ_CFG = None
TRANSLATOR = None
# API URLs
TEXT2IMG_API_URL = "http://211.233.58.201:7896"
VIDEO_API_URL = "http://211.233.58.201:7875"
ANIM_API_URL = os.getenv("ANIM_API_URL", "http://211.233.58.201:7862/")
# HTTP timeout settings
ANIM_TIMEOUT = httpx.Timeout(connect=30.0, read=120.0, write=120.0, pool=30.0)
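# connect/read/write/pool are httpx's per-phase timeouts in seconds; read is
# generous because animation requests can take minutes to return.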
# Image size presets
IMAGE_PRESETS = {
"์ปค์Šคํ…€": {"width": 1024, "height": 1024},
"1:1 ์ •์‚ฌ๊ฐํ˜•": {"width": 1024, "height": 1024},
"4:3 ํ‘œ์ค€": {"width": 1024, "height": 768},
"16:9 ์™€์ด๋“œ์Šคํฌ๋ฆฐ": {"width": 1024, "height": 576},
"9:16 ์„ธ๋กœํ˜•": {"width": 576, "height": 1024},
"6:19 ํŠน์ˆ˜ ์„ธ๋กœํ˜•": {"width": 324, "height": 1024},
"Instagram ์ •์‚ฌ๊ฐํ˜•": {"width": 1080, "height": 1080},
"Instagram ์Šคํ† ๋ฆฌ": {"width": 1080, "height": 1920},
"Instagram ๊ฐ€๋กœํ˜•": {"width": 1080, "height": 566},
"Facebook ์ปค๋ฒ„": {"width": 820, "height": 312},
"Twitter ํ—ค๋”": {"width": 1500, "height": 500},
"YouTube ์ธ๋„ค์ผ": {"width": 1280, "height": 720},
"LinkedIn ๋ฐฐ๋„ˆ": {"width": 1584, "height": 396},
}
# Transform for BiRefNet
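# (768x768 input, normalized with the standard ImageNet mean/std)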
transform_image = transforms.Compose([
transforms.Resize((768, 768)),
transforms.ToTensor(),
transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])
@spaces.GPU(duration=60)
def load_models():
"""๋ชจ๋“  ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜๋Š” ํ•จ์ˆ˜"""
global MODELS_LOADED, BIREFNET_MODEL, BIREFNET_LITE_MODEL, OUTPAINT_PIPE
global MMAUDIO_NET, MMAUDIO_FEATURE_UTILS, MMAUDIO_SEQ_CFG, TRANSLATOR
if MODELS_LOADED:
return True
try:
        # Load the BiRefNet models
logging.info("Loading BiRefNet models...")
BIREFNET_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
BIREFNET_MODEL.to(device)
BIREFNET_LITE_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet_lite", trust_remote_code=True)
BIREFNET_LITE_MODEL.to(device)
        # Load the ControlNet and outpainting models
logging.info("Loading ControlNet models...")
from controlnet_union import ControlNetModel_Union
from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
config_file = hf_hub_download(
"xinsir/controlnet-union-sdxl-1.0",
filename="config_promax.json",
)
config = ControlNetModel_Union.load_config(config_file)
controlnet_model = ControlNetModel_Union.from_config(config)
model_file = hf_hub_download(
"xinsir/controlnet-union-sdxl-1.0",
filename="diffusion_pytorch_model_promax.safetensors",
)
state_dict = load_state_dict(model_file)
loaded_keys = list(state_dict.keys())
result = ControlNetModel_Union._load_pretrained_model(
controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
)
model = result[0]
model = model.to(device=device, dtype=torch_dtype)
        # Load the VAE
vae = AutoencoderKL.from_pretrained(
"madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch_dtype
).to(device)
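        # The fp16-fix VAE is used to avoid NaN/black-image artifacts when
        # decoding SDXL latents in float16.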
# ํŒŒ์ดํ”„๋ผ์ธ ๋กœ๋“œ
OUTPAINT_PIPE = StableDiffusionXLFillPipeline.from_pretrained(
"SG161222/RealVisXL_V5.0_Lightning",
torch_dtype=torch_dtype,
vae=vae,
controlnet=model,
variant="fp16" if device.type == "cuda" else None,
).to(device)
OUTPAINT_PIPE.scheduler = TCDScheduler.from_config(OUTPAINT_PIPE.scheduler.config)
# MMAudio ๋ชจ๋ธ ๋กœ๋“œ
logging.info("Loading MMAudio models...")
model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
model_mmaudio.download_if_needed()
setup_eval_logging()
        # Set up the Korean-to-English translator
try:
TRANSLATOR = pipeline("translation",
model="Helsinki-NLP/opus-mt-ko-en",
device="cpu",
use_fast=True,
trust_remote_code=False)
except Exception as e:
logging.warning(f"Failed to load translation model: {e}")
TRANSLATOR = None
# MMAudio ๋ชจ๋ธ ์ดˆ๊ธฐํ™”
if torch.cuda.is_available():
mmaudio_dtype = torch.bfloat16
else:
mmaudio_dtype = torch.float32
with torch.cuda.device(device):
MMAUDIO_SEQ_CFG = model_mmaudio.seq_cfg
MMAUDIO_NET = get_my_mmaudio(model_mmaudio.model_name).to(device, mmaudio_dtype).eval()
MMAUDIO_NET.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
logging.info(f'Loaded weights from {model_mmaudio.model_path}')
MMAUDIO_FEATURE_UTILS = FeaturesUtils(
tod_vae_ckpt=model_mmaudio.vae_path,
synchformer_ckpt=model_mmaudio.synchformer_ckpt,
enable_conditions=True,
mode=model_mmaudio.mode,
bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
need_vae_encoder=False
).to(device, mmaudio_dtype).eval()
MODELS_LOADED = True
logging.info("All models loaded successfully!")
return True
except Exception as e:
logging.error(f"Failed to load models: {str(e)}")
return False
# ๊ธฐ์กด ํ•จ์ˆ˜๋“ค ๋ชจ๋‘ ์œ ์ง€
def update_dimensions(preset):
if preset in IMAGE_PRESETS:
return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
return 1024, 1024
def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
if not prompt:
return None, "ํ”„๋กฌํ”„ํŠธ๋ฅผ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”"
try:
client = Client(TEXT2IMG_API_URL)
if seed == -1:
seed = random.randint(0, 9999999)
result = client.predict(
prompt=prompt,
width=int(width),
height=int(height),
guidance=float(guidance),
inference_steps=int(inference_steps),
seed=int(seed),
do_img2img=False,
init_image=None,
image2image_strength=0.8,
resize_img=True,
api_name="/generate_image"
)
return result[0], f"์‚ฌ์šฉ๋œ ์‹œ๋“œ: {result[1]}"
except Exception as e:
logging.error(f"Image generation error: {str(e)}")
return None, f"์˜ค๋ฅ˜: {str(e)}"
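# Example call (hypothetical values):
#   image, seed_info = generate_text_to_image("a sunset over the sea", 1024, 1024, 3.5, 30, -1)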
def generate_video_from_image(image, prompt="", length=4.0):
if image is None:
return None
try:
        # Save the image to a temp file
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
temp_path = fp.name
Image.fromarray(image).save(temp_path)
        # Call the API
client = Client(VIDEO_API_URL)
result = client.predict(
input_image=handle_file(temp_path),
prompt=prompt if prompt else "Generate natural motion",
n_prompt="",
seed=random.randint(0, 9999999),
use_teacache=True,
video_length=float(length),
api_name="/process"
)
os.unlink(temp_path)
if result and len(result) > 0:
video_dict = result[0]
return video_dict.get("video") if isinstance(video_dict, dict) else None
except Exception as e:
logging.error(f"Video generation error: {str(e)}")
return None
def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
"""์ด๋ฏธ์ง€์™€ ๋งˆ์Šคํฌ๋ฅผ ์ค€๋น„ํ•˜๋Š” ํ•จ์ˆ˜"""
if image is None:
return None, None
    # Convert to a PIL image
if isinstance(image, np.ndarray):
image = Image.fromarray(image).convert('RGB')
target_size = (width, height)
    # Scale the image to fit within the target size
scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
new_width = int(image.width * scale_factor)
new_height = int(image.height * scale_factor)
    # Resize the image
source = image.resize((new_width, new_height), Image.LANCZOS)
    # Compute the overlap band
overlap_x = int(new_width * (overlap_percentage / 100))
overlap_y = int(new_height * (overlap_percentage / 100))
overlap_x = max(overlap_x, 1)
overlap_y = max(overlap_y, 1)
    # Compute margins based on the alignment
if alignment == "๊ฐ€์šด๋ฐ":
margin_x = (target_size[0] - new_width) // 2
margin_y = (target_size[1] - new_height) // 2
elif alignment == "์™ผ์ชฝ":
margin_x = 0
margin_y = (target_size[1] - new_height) // 2
elif alignment == "์˜ค๋ฅธ์ชฝ":
margin_x = target_size[0] - new_width
margin_y = (target_size[1] - new_height) // 2
elif alignment == "์œ„":
margin_x = (target_size[0] - new_width) // 2
margin_y = 0
elif alignment == "์•„๋ž˜":
margin_x = (target_size[0] - new_width) // 2
margin_y = target_size[1] - new_height
# ๋ฐฐ๊ฒฝ ์ด๋ฏธ์ง€ ์ƒ์„ฑ
background = Image.new('RGB', target_size, (255, 255, 255))
background.paste(source, (margin_x, margin_y))
# ๋งˆ์Šคํฌ ์ƒ์„ฑ
mask = Image.new('L', target_size, 255)
mask_draw = ImageDraw.Draw(mask)
# ๋งˆ์Šคํฌ ์˜์—ญ ๊ทธ๋ฆฌ๊ธฐ
left_overlap = margin_x + overlap_x if alignment != "์™ผ์ชฝ" else margin_x
right_overlap = margin_x + new_width - overlap_x if alignment != "์˜ค๋ฅธ์ชฝ" else margin_x + new_width
top_overlap = margin_y + overlap_y if alignment != "์œ„" else margin_y
bottom_overlap = margin_y + new_height - overlap_y if alignment != "์•„๋ž˜" else margin_y + new_height
mask_draw.rectangle([
(left_overlap, top_overlap),
(right_overlap, bottom_overlap)
], fill=0)
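    # In the result, mask value 255 marks pixels to be generated (the new canvas
    # plus the overlap band) and 0 marks pixels kept from the source image.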
return background, mask
def preview_outpaint(image, width, height, overlap_percentage, alignment):
"""์•„์›ƒํŽ˜์ธํŒ… ๋ฏธ๋ฆฌ๋ณด๊ธฐ"""
background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
if background is None:
return None
    # Build the preview image
preview = background.copy().convert('RGBA')
    # Semi-transparent red overlay
red_overlay = Image.new('RGBA', background.size, (255, 0, 0, 64))
# ๋งˆ์Šคํฌ ์ ์šฉ
red_mask = Image.new('RGBA', background.size, (0, 0, 0, 0))
red_mask.paste(red_overlay, (0, 0), mask)
# ์˜ค๋ฒ„๋ ˆ์ด ํ•ฉ์„ฑ
preview = Image.alpha_composite(preview, red_mask)
return preview
@spaces.GPU(duration=120)
def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
"""์ด๋ฏธ์ง€ ์•„์›ƒํŽ˜์ธํŒ… ์‹คํ–‰"""
if image is None:
return None
# ๋ชจ๋ธ ๋กœ๋“œ ํ™•์ธ
if not MODELS_LOADED:
load_models()
if OUTPAINT_PIPE is None:
return Image.new('RGB', (width, height), (200, 200, 200))
try:
        # Prepare the image and mask
background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
if background is None:
return None
        # Create cnet_image (masked region painted black)
cnet_image = background.copy()
cnet_image.paste(0, (0, 0), mask)
        # Prepare the prompt
final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
# GPU์—์„œ ์‹คํ–‰
with torch.autocast(device_type=device.type, dtype=torch_dtype):
(
prompt_embeds,
negative_prompt_embeds,
pooled_prompt_embeds,
negative_pooled_prompt_embeds,
) = OUTPAINT_PIPE.encode_prompt(final_prompt, str(device), True)
# ์ƒ์„ฑ ํ”„๋กœ์„ธ์Šค
for generated_image in OUTPAINT_PIPE(
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
pooled_prompt_embeds=pooled_prompt_embeds,
negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
image=cnet_image,
num_inference_steps=num_steps
):
            # Intermediate results (available here if needed)
pass
        # Final image
final_image = generated_image
        # Convert to RGBA and paste the generated pixels through the mask
final_image = final_image.convert("RGBA")
cnet_image.paste(final_image, (0, 0), mask)
return cnet_image
except Exception as e:
logging.error(f"Outpainting error: {str(e)}")
return background if 'background' in locals() else None
# MMAudio helper functions
def translate_prompt(text):
try:
if TRANSLATOR is None:
return text
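        # Translate only when the text contains Hangul (the range U+3131-U+D7A3
        # covers compatibility jamo and precomposed syllables)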
if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
with torch.no_grad():
translation = TRANSLATOR(text)[0]['translation_text']
return translation
return text
except Exception as e:
logging.error(f"Translation error: {e}")
return text
@spaces.GPU(duration=120)
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
cfg_strength: float, duration: float):
# ๋ชจ๋ธ ๋กœ๋“œ ํ™•์ธ
if not MODELS_LOADED:
load_models()
if MMAUDIO_NET is None:
return None
prompt = translate_prompt(prompt)
negative_prompt = translate_prompt(negative_prompt)
rng = torch.Generator(device=device)
rng.manual_seed(seed)
fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
clip_frames, sync_frames, duration = load_video(video, duration)
clip_frames = clip_frames.unsqueeze(0)
sync_frames = sync_frames.unsqueeze(0)
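    # Assigning the duration below re-derives the latent/clip/sync sequence
    # lengths, which update_seq_lengths() then applies to the network.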
MMAUDIO_SEQ_CFG.duration = duration
MMAUDIO_NET.update_seq_lengths(MMAUDIO_SEQ_CFG.latent_seq_len, MMAUDIO_SEQ_CFG.clip_seq_len, MMAUDIO_SEQ_CFG.sync_seq_len)
audios = generate(clip_frames,
sync_frames, [prompt],
negative_text=[negative_prompt],
feature_utils=MMAUDIO_FEATURE_UTILS,
net=MMAUDIO_NET,
fm=fm,
rng=rng,
cfg_strength=cfg_strength)
audio = audios.float().cpu()[0]
video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
make_video(video,
video_save_path,
audio,
sampling_rate=MMAUDIO_SEQ_CFG.sampling_rate,
duration_sec=MMAUDIO_SEQ_CFG.duration)
return video_save_path
# Video background-removal functions
def process_bg_image(image, bg, fast_mode=False):
"""๋‹จ์ผ ์ด๋ฏธ์ง€ ๋ฐฐ๊ฒฝ ์ฒ˜๋ฆฌ"""
if BIREFNET_MODEL is None or BIREFNET_LITE_MODEL is None:
return image
image_size = image.size
input_images = transform_image(image).unsqueeze(0).to(device)
model = BIREFNET_LITE_MODEL if fast_mode else BIREFNET_MODEL
with torch.no_grad():
preds = model(input_images)[-1].sigmoid().cpu()
pred = preds[0].squeeze()
pred_pil = transforms.ToPILImage()(pred)
mask = pred_pil.resize(image_size)
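    # bg may be a hex color string ("#RRGGBB"), a PIL image, or an image file path.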
if isinstance(bg, str) and bg.startswith("#"):
color_rgb = tuple(int(bg[i:i+2], 16) for i in (1, 3, 5))
background = Image.new("RGBA", image_size, color_rgb + (255,))
elif isinstance(bg, Image.Image):
background = bg.convert("RGBA").resize(image_size)
else:
background = Image.open(bg).convert("RGBA").resize(image_size)
image = Image.composite(image, background, mask)
return image
def process_video_frame(frame, bg_type, bg, fast_mode, frame_index, background_frames, color):
"""๋น„๋””์˜ค ํ”„๋ ˆ์ž„ ์ฒ˜๋ฆฌ"""
try:
pil_image = Image.fromarray(frame)
if bg_type == "์ƒ‰์ƒ":
processed_image = process_bg_image(pil_image, color, fast_mode)
elif bg_type == "์ด๋ฏธ์ง€":
processed_image = process_bg_image(pil_image, bg, fast_mode)
elif bg_type == "๋น„๋””์˜ค":
# ์ธ๋ฑ์Šค ๋ฒ”์œ„ ํ™•์ธ
if background_frames and len(background_frames) > 0:
# ํ”„๋ ˆ์ž„ ์ธ๋ฑ์Šค๋ฅผ ๋ฐฐ๊ฒฝ ๋น„๋””์˜ค ๊ธธ์ด๋กœ ๋‚˜๋ˆˆ ๋‚˜๋จธ์ง€๋ฅผ ์‚ฌ์šฉ (๋ฃจํ”„ ํšจ๊ณผ)
bg_frame_index = frame_index % len(background_frames)
background_frame = background_frames[bg_frame_index]
background_image = Image.fromarray(background_frame)
processed_image = process_bg_image(pil_image, background_image, fast_mode)
else:
processed_image = pil_image
else:
processed_image = pil_image
# ์ฒ˜๋ฆฌ๋œ ์ด๋ฏธ์ง€๊ฐ€ numpy array๋กœ ๋ฐ˜ํ™˜๋˜๋Š”์ง€ ํ™•์ธ
if isinstance(processed_image, Image.Image):
return np.array(processed_image)
return processed_image
except Exception as e:
print(f"Error processing frame {frame_index}: {e}")
        # On error, fall back to the original frame (pil_image may be unbound here)
        if isinstance(frame, np.ndarray):
            return frame
        return np.array(frame)
@spaces.GPU(duration=300)
def process_video_bg(vid, bg_type="์ƒ‰์ƒ", bg_image=None, bg_video=None, color="#00FF00",
fps=0, video_handling="slow_down", fast_mode=True, max_workers=10):
"""๋น„๋””์˜ค ๋ฐฐ๊ฒฝ ์ฒ˜๋ฆฌ ๋ฉ”์ธ ํ•จ์ˆ˜"""
# ๋ชจ๋ธ ๋กœ๋“œ ํ™•์ธ
if not MODELS_LOADED:
load_models()
if BIREFNET_MODEL is None:
yield gr.update(visible=False), gr.update(visible=True), "BiRefNet ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."
yield None, None, "BiRefNet ๋ชจ๋ธ์„ ๋กœ๋“œํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."
return
try:
start_time = time.time()
video = VideoFileClip(vid)
if fps == 0:
fps = video.fps
audio = video.audio
frames = list(video.iter_frames(fps=fps))
# ํ”„๋ ˆ์ž„ ํฌ๊ธฐ ์ €์žฅ
if frames:
frame_height, frame_width = frames[0].shape[:2]
else:
yield gr.update(visible=False), gr.update(visible=True), "๋น„๋””์˜ค์— ํ”„๋ ˆ์ž„์ด ์—†์Šต๋‹ˆ๋‹ค."
yield None, None, "๋น„๋””์˜ค์— ํ”„๋ ˆ์ž„์ด ์—†์Šต๋‹ˆ๋‹ค."
return
processed_frames = []
yield gr.update(visible=True), gr.update(visible=False), f"์ฒ˜๋ฆฌ ์‹œ์ž‘... ๊ฒฝ๊ณผ ์‹œ๊ฐ„: 0์ดˆ"
        # Background video handling
background_frames = None
if bg_type == "๋น„๋””์˜ค" and bg_video:
background_video = VideoFileClip(bg_video)
# ๋ฐฐ๊ฒฝ ๋น„๋””์˜ค ๊ธธ์ด ์กฐ์ •
if video_handling == "slow_down" and background_video.duration < video.duration:
if speedx is not None:
                    # speedx(clip, factor) divides the duration by factor, so a factor
                    # below 1 stretches the background clip to the main video's length
                    factor = background_video.duration / video.duration
                    background_video = speedx(background_video, factor=factor)
else:
                    # Without speedx, loop the clip instead
loops = int(video.duration / background_video.duration) + 1
background_video = concatenate_videoclips([background_video] * loops)
elif video_handling == "loop" or background_video.duration < video.duration:
                # Loop mode
loops = int(video.duration / background_video.duration) + 1
background_video = concatenate_videoclips([background_video] * loops)
            # Extract the background frames
background_frames = list(background_video.iter_frames(fps=fps))
            # Trim if the background video is longer
if len(background_frames) > len(frames):
background_frames = background_frames[:len(frames)]
        # Process frames in parallel
with ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = []
for i in range(len(frames)):
future = executor.submit(
process_video_frame,
frames[i],
bg_type,
bg_image,
fast_mode,
                    i,  # frame index
background_frames,
color
)
futures.append(future)
            # Collect the results
for i, future in enumerate(futures):
try:
result = future.result()
                    # Verify the result has the expected size
if result.shape[:2] != (frame_height, frame_width):
                        # Resize if the size differs
pil_result = Image.fromarray(result)
pil_result = pil_result.resize((frame_width, frame_height), Image.LANCZOS)
result = np.array(pil_result)
processed_frames.append(result)
elapsed_time = time.time() - start_time
                    # Status update every 10 frames
if i % 10 == 0:
yield result, None, f"ํ”„๋ ˆ์ž„ {i+1}/{len(frames)} ์ฒ˜๋ฆฌ ์ค‘... ๊ฒฝ๊ณผ ์‹œ๊ฐ„: {elapsed_time:.2f}์ดˆ"
except Exception as e:
print(f"Error getting result for frame {i}: {e}")
                    # Fall back to the original frame on error
processed_frames.append(frames[i])
# ๋ชจ๋“  ํ”„๋ ˆ์ž„์ด ๋™์ผํ•œ ํฌ๊ธฐ์ธ์ง€ ์ตœ์ข… ํ™•์ธ
frame_sizes = [frame.shape for frame in processed_frames]
if len(set(frame_sizes)) > 1:
print(f"Warning: Different frame sizes detected: {set(frame_sizes)}")
            # Normalize everything to the first frame's size
target_size = processed_frames[0].shape
for i in range(len(processed_frames)):
if processed_frames[i].shape != target_size:
pil_frame = Image.fromarray(processed_frames[i])
pil_frame = pil_frame.resize((target_size[1], target_size[0]), Image.LANCZOS)
processed_frames[i] = np.array(pil_frame)
# ๋น„๋””์˜ค ์ƒ์„ฑ
processed_video = ImageSequenceClip(processed_frames, fps=fps)
        # Re-attach the audio
if audio:
processed_video = processed_video.set_audio(audio)
        # Save
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
temp_filepath = temp_file.name
processed_video.write_videofile(temp_filepath, codec="libx264", audio_codec="aac")
elapsed_time = time.time() - start_time
yield gr.update(visible=False), gr.update(visible=True), f"์ฒ˜๋ฆฌ ์™„๋ฃŒ! ๊ฒฝ๊ณผ ์‹œ๊ฐ„: {elapsed_time:.2f}์ดˆ"
yield processed_frames[-1], temp_filepath, f"์ฒ˜๋ฆฌ ์™„๋ฃŒ! ๊ฒฝ๊ณผ ์‹œ๊ฐ„: {elapsed_time:.2f}์ดˆ"
except Exception as e:
print(f"Error: {e}")
import traceback
traceback.print_exc()
elapsed_time = time.time() - start_time
yield gr.update(visible=False), gr.update(visible=True), f"๋น„๋””์˜ค ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}. ๊ฒฝ๊ณผ ์‹œ๊ฐ„: {elapsed_time:.2f}์ดˆ"
yield None, None, f"๋น„๋””์˜ค ์ฒ˜๋ฆฌ ์˜ค๋ฅ˜: {e}. ๊ฒฝ๊ณผ ์‹œ๊ฐ„: {elapsed_time:.2f}์ดˆ"
@spaces.GPU(duration=180)
def merge_videos_with_audio(video_files, audio_file, audio_mode, audio_volume, original_audio_volume, output_fps):
"""์—ฌ๋Ÿฌ ๋น„๋””์˜ค๋ฅผ ๋ณ‘ํ•ฉํ•˜๊ณ  ์˜ค๋””์˜ค๋ฅผ ์ถ”๊ฐ€ํ•˜๋Š” ํ•จ์ˆ˜"""
if not video_files:
return None, "๋น„๋””์˜ค ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•ด์ฃผ์„ธ์š”."
if isinstance(video_files, list) and len(video_files) > 10:
return None, "์ตœ๋Œ€ 10๊ฐœ์˜ ๋น„๋””์˜ค๋งŒ ์—…๋กœ๋“œ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค."
try:
# ์ƒํƒœ ์—…๋ฐ์ดํŠธ
status = "๋น„๋””์˜ค ํŒŒ์ผ ์ •๋ ฌ ์ค‘..."
# ํŒŒ์ผ ๊ฒฝ๋กœ์™€ ํŒŒ์ผ๋ช…์„ ํŠœํ”Œ๋กœ ์ €์žฅํ•˜๊ณ  ํŒŒ์ผ๋ช…์œผ๋กœ ์ •๋ ฌ
video_paths = []
if isinstance(video_files, list):
for video_file in video_files:
if video_file is not None:
video_paths.append(video_file)
else:
video_paths.append(video_files)
# ํŒŒ์ผ๋ช…์œผ๋กœ ์ •๋ ฌ (๊ฒฝ๋กœ์—์„œ ํŒŒ์ผ๋ช…๋งŒ ์ถ”์ถœํ•˜์—ฌ ์ •๋ ฌ)
video_paths.sort(key=lambda x: os.path.basename(x))
status = f"{len(video_paths)}๊ฐœ์˜ ๋น„๋””์˜ค ๋กœ๋“œ ์ค‘..."
# ๋น„๋””์˜ค ํด๋ฆฝ ๋กœ๋“œ
video_clips = []
clip_sizes = []
for i, video_path in enumerate(video_paths):
status = f"๋น„๋””์˜ค {i+1}/{len(video_paths)} ๋กœ๋“œ ์ค‘: {os.path.basename(video_path)}"
clip = VideoFileClip(video_path)
video_clips.append(clip)
# ๊ฐ ํด๋ฆฝ์˜ ํฌ๊ธฐ ์ €์žฅ
try:
clip_sizes.append((clip.w, clip.h))
except:
clip_sizes.append(clip.size)
# ์ฒซ ๋ฒˆ์งธ ๋น„๋””์˜ค์˜ ํฌ๊ธฐ๋ฅผ ๊ธฐ์ค€์œผ๋กœ ํ•จ
target_width, target_height = clip_sizes[0]
        # Check whether all videos share the same size
all_same_size = all(size == (target_width, target_height) for size in clip_sizes)
if not all_same_size:
logging.warning(f"๋น„๋””์˜ค ํฌ๊ธฐ๊ฐ€ ์„œ๋กœ ๋‹ค๋ฆ…๋‹ˆ๋‹ค. ์ฒซ ๋ฒˆ์งธ ๋น„๋””์˜ค ํฌ๊ธฐ({target_width}x{target_height})๋กœ ์กฐ์ •ํ•ฉ๋‹ˆ๋‹ค.")
            # Resize the mismatched videos
adjusted_clips = []
for clip, size in zip(video_clips, clip_sizes):
if size != (target_width, target_height):
if resize is not None:
adjusted_clip = resize(clip, newsize=(target_width, target_height))
else:
if hasattr(clip, 'resize'):
adjusted_clip = clip.resize((target_width, target_height))
else:
adjusted_clip = clip
logging.warning(f"Cannot resize video. Using original size.")
adjusted_clips.append(adjusted_clip)
else:
adjusted_clips.append(clip)
video_clips = adjusted_clips
# ์ฒซ ๋ฒˆ์งธ ๋น„๋””์˜ค์˜ FPS๋ฅผ ๊ธฐ๋ณธ๊ฐ’์œผ๋กœ ์‚ฌ์šฉ
if output_fps == 0:
output_fps = video_clips[0].fps
status = "๋น„๋””์˜ค ๋ณ‘ํ•ฉ ์ค‘..."
        # Concatenate the videos
final_video = concatenate_videoclips(video_clips, method="compose")
        # Audio handling
if audio_file:
status = "์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์ค‘..."
try:
# ์˜ค๋””์˜ค ํŒŒ์ผ ๊ฒฝ๋กœ ํ™•์ธ
if isinstance(audio_file, str):
audio_path = audio_file
else:
audio_path = audio_file
logging.info(f"Processing audio from: {audio_path}")
logging.info(f"Audio mode: {audio_mode}")
                # Load the audio
if audio_path.endswith(('.mp4', '.avi', '.mov', '.mkv')):
temp_video = VideoFileClip(audio_path)
audio_clip = temp_video.audio
temp_video.close()
else:
audio_clip = AudioFileClip(audio_path)
if audio_clip is None:
raise ValueError("์˜ค๋””์˜ค๋ฅผ ๋กœ๋“œํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
                # Volume adjustment
if audio_volume != 100:
audio_clip = audio_clip.volumex(audio_volume / 100)
                # Fit the audio to the video duration
video_duration = final_video.duration
audio_duration = audio_clip.duration
if audio_duration > video_duration:
audio_clip = audio_clip.subclip(0, video_duration)
elif audio_duration < video_duration:
loops_needed = int(video_duration / audio_duration) + 1
audio_clips_list = [audio_clip] * loops_needed
looped_audio = concatenate_audioclips(audio_clips_list)
audio_clip = looped_audio.subclip(0, video_duration)
                # Handle the selected audio mode
if audio_mode == "๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง":
# ๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง ๋ชจ๋“œ: ๊ธฐ์กด ์˜ค๋””์˜ค์™€ ํ•ฉ์„ฑ
if final_video.audio:
                        # Adjust the original audio volume
original_audio = final_video.audio
if original_audio_volume != 100:
original_audio = original_audio.volumex(original_audio_volume / 100)
                        # Mix the two audio tracks
final_audio = CompositeAudioClip([original_audio, audio_clip])
final_video = final_video.set_audio(final_audio)
logging.info("Background music mode: Mixed original and new audio")
else:
                        # No original audio, so just attach the new track
final_video = final_video.set_audio(audio_clip)
logging.info("No original audio found, adding new audio only")
else:
                    # Replace mode: swap out the existing audio entirely
final_video = final_video.set_audio(audio_clip)
logging.info("Replace mode: Replaced original audio")
logging.info("Audio successfully processed")
except Exception as e:
logging.error(f"์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")
status = f"์˜ค๋””์˜ค ์ฒ˜๋ฆฌ ์‹คํŒจ: {str(e)}, ๋น„๋””์˜ค๋งŒ ๋ณ‘ํ•ฉํ•ฉ๋‹ˆ๋‹ค."
status = "๋น„๋””์˜ค ์ €์žฅ ์ค‘..."
        # Save to a temporary file
with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
temp_filepath = temp_file.name
# ์ฝ”๋ฑ ์„ค์ • - ์›๋ณธ ํ’ˆ์งˆ ์œ ์ง€
final_video.write_videofile(
temp_filepath,
fps=output_fps,
codec="libx264",
audio_codec="aac",
preset="medium",
bitrate="5000k",
audio_bitrate="192k"
)
        # Clean up resources
for clip in video_clips:
clip.close()
if 'adjusted_clips' in locals():
for clip in adjusted_clips:
if clip not in video_clips:
clip.close()
if audio_file and 'audio_clip' in locals():
audio_clip.close()
final_video.close()
# ์ƒํƒœ ๋ฉ”์‹œ์ง€ ์ƒ์„ฑ
if audio_file and audio_mode == "๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง":
mode_msg = "๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง ์ถ”๊ฐ€๋จ"
elif audio_file:
mode_msg = "์˜ค๋””์˜ค ๋Œ€์ฒด๋จ"
else:
mode_msg = "์˜ค๋””์˜ค ์—†์Œ"
return temp_filepath, f"โœ… ์„ฑ๊ณต์ ์œผ๋กœ {len(video_paths)}๊ฐœ์˜ ๋น„๋””์˜ค๋ฅผ ๋ณ‘ํ•ฉํ–ˆ์Šต๋‹ˆ๋‹ค! (ํฌ๊ธฐ: {target_width}x{target_height}, {mode_msg})"
except Exception as e:
logging.error(f"Video merge error: {str(e)}")
import traceback
traceback.print_exc()
return None, f"โŒ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}"
def test_anim_api_connection():
"""์• ๋‹ˆ๋ฉ”์ด์…˜ ์„œ๋ฒ„ ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ"""
now = datetime.now().strftime("%H:%M:%S")
try:
resp = httpx.get(f"{ANIM_API_URL.rstrip('/')}/healthz", timeout=ANIM_TIMEOUT)
ready = resp.json().get("ready", False)
msg = f"[{now}] ์• ๋‹ˆ๋ฉ”์ด์…˜ ์„œ๋ฒ„ ์—ฐ๊ฒฐ ์„ฑ๊ณต โœ… (ready={ready})"
logging.info(msg)
return True, msg
except Exception as e:
msg = f"[{now}] ์• ๋‹ˆ๋ฉ”์ด์…˜ ์„œ๋ฒ„ ์—ฐ๊ฒฐ ์‹คํŒจ โŒ : {e}"
logging.error(msg)
return False, msg
def generate_avatar_animation(image, audio, guidance_scale, steps, progress=gr.Progress()):
"""์ด๋ฏธ์ง€์™€ ์˜ค๋””์˜ค๋กœ ์•„๋ฐ”ํƒ€ ์• ๋‹ˆ๋ฉ”์ด์…˜ ์ƒ์„ฑ"""
start = datetime.now().strftime("%H:%M:%S")
logs = [f"[{start}] ์š”์ฒญ ์‹œ์ž‘"]
try:
if image is None or audio is None:
raise ValueError("์ด๋ฏธ์ง€์™€ ์˜ค๋””์˜ค๋ฅผ ๋ชจ๋‘ ์—…๋กœ๋“œํ•˜์„ธ์š”.")
progress(0.05, desc="ํŒŒ์ผ ์ค€๋น„")
client = Client(ANIM_API_URL)
progress(0.15, desc="์„œ๋ฒ„ ํ˜ธ์ถœ ์ค‘โ€ฆ (์ˆ˜ ๋ถ„ ์†Œ์š” ๊ฐ€๋Šฅ)")
result = client.predict(
image_path=handle_file(image),
audio_path=handle_file(audio),
guidance_scale=guidance_scale,
steps=steps,
api_name="/generate_animation"
)
progress(0.95, desc="๊ฒฐ๊ณผ ์ •๋ฆฌ")
        # Result handling - also covers dict-shaped results
def extract_video_path(obj):
"""๋น„๋””์˜ค ๊ฐ์ฒด์—์„œ ๊ฒฝ๋กœ ์ถ”์ถœ"""
if isinstance(obj, str):
return obj
elif isinstance(obj, dict):
                # Handle Gradio's FileData dict
if 'video' in obj:
                    return obj['video']  # handles the {'video': path, 'subtitles': None} shape
elif 'path' in obj:
return obj['path']
elif 'url' in obj:
return obj['url']
elif 'name' in obj:
return obj['name']
else:
logging.warning(f"Unexpected dict structure: {obj.keys()}")
return None
else:
logging.warning(f"Unexpected type: {type(obj)}")
return None
if isinstance(result, (list, tuple)) and len(result) >= 2:
anim_path = extract_video_path(result[0])
comp_path = extract_video_path(result[1])
if anim_path and comp_path:
logs.append(f"[{datetime.now().strftime('%H:%M:%S')}] ์„ฑ๊ณต")
return anim_path, comp_path, "\n".join(logs)
else:
raise RuntimeError(f"๋น„๋””์˜ค ๊ฒฝ๋กœ ์ถ”์ถœ ์‹คํŒจ: {result}")
else:
raise RuntimeError(f"์˜ˆ์ƒ์น˜ ๋ชปํ•œ ๋ฐ˜ํ™˜ ํ˜•์‹: {type(result)}")
except Exception as e:
logs.append(f"[{datetime.now().strftime('%H:%M:%S')}] ์˜ค๋ฅ˜: {e}")
logging.error(f"Avatar animation generation error: {e}", exc_info=True)
return None, None, "\n".join(logs)
# CSS
css = """
:root {
--primary-color: #f8c3cd;
--secondary-color: #b3e5fc;
--background-color: #f5f5f7;
--card-background: #ffffff;
--text-color: #424242;
--accent-color: #ffb6c1;
--success-color: #c8e6c9;
--warning-color: #fff9c4;
--shadow-color: rgba(0, 0, 0, 0.1);
--border-radius: 12px;
}
.gradio-container {
max-width: 1200px !important;
margin: 0 auto !important;
}
.panel-box {
border-radius: var(--border-radius) !important;
box-shadow: 0 8px 16px var(--shadow-color) !important;
background-color: var(--card-background) !important;
padding: 20px !important;
margin-bottom: 20px !important;
}
#generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn, #bg-remove-btn, #merge-btn, #avatar-btn, #test-connection-btn {
background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
font-size: 1.1rem !important;
padding: 12px 24px !important;
margin-top: 10px !important;
width: 100% !important;
}
#avatar-btn, #test-connection-btn {
background: linear-gradient(135deg, #667eea, #764ba2) !important;
}
.tabitem {
min-height: 700px !important;
}
"""
# Gradio Interface
demo = gr.Blocks(css=css, title="AI ์ด๋ฏธ์ง€ & ๋น„๋””์˜ค & ์˜ค๋””์˜ค ์ƒ์„ฑ๊ธฐ")
with demo:
gr.Markdown("# ๐ŸŽจ Ginigen ์ŠคํŠœ๋””์˜ค")
gr.Markdown("์ฒ˜์Œ ์‚ฌ์šฉ ์‹œ ๋ชจ๋ธ ๋กœ๋”ฉ์— ์‹œ๊ฐ„์ด ๊ฑธ๋ฆด ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์ž ์‹œ๋งŒ ๊ธฐ๋‹ค๋ ค์ฃผ์„ธ์š”.")
# ๋ชจ๋ธ ๋กœ๋“œ ์ƒํƒœ ํ‘œ์‹œ
model_status = gr.Textbox(label="๋ชจ๋ธ ์ƒํƒœ", value="๋ชจ๋ธ ๋กœ๋”ฉ ๋Œ€๊ธฐ ์ค‘...", interactive=False)
with gr.Tabs() as tabs:
        # First tab: text -> image -> video
with gr.Tab("ํ…์ŠคํŠธโ†’์ด๋ฏธ์ง€โ†’๋น„๋””์˜ค", elem_classes="tabitem"):
with gr.Row(equal_height=True):
                # Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐Ÿ“ ์ด๋ฏธ์ง€ ์ƒ์„ฑ ์„ค์ •")
prompt = gr.Textbox(
label="ํ”„๋กฌํ”„ํŠธ(ํ•œ๊ธ€/์˜์–ด ๊ฐ€๋Šฅ)",
placeholder="์ƒ์„ฑํ•˜๊ณ  ์‹ถ์€ ์ด๋ฏธ์ง€๋ฅผ ์„ค๋ช…ํ•˜์„ธ์š”...",
lines=3
)
size_preset = gr.Dropdown(
choices=list(IMAGE_PRESETS.keys()),
value="1:1 ์ •์‚ฌ๊ฐํ˜•",
label="ํฌ๊ธฐ ํ”„๋ฆฌ์…‹"
)
with gr.Row():
width = gr.Slider(256, 2048, 1024, step=64, label="๋„ˆ๋น„")
height = gr.Slider(256, 2048, 1024, step=64, label="๋†’์ด")
with gr.Row():
guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="๊ฐ€์ด๋˜์Šค")
steps = gr.Slider(1, 50, 30, step=1, label="์Šคํ…")
seed = gr.Number(label="์‹œ๋“œ (-1=๋žœ๋ค)", value=-1)
generate_btn = gr.Button("๐ŸŽจ ์ด๋ฏธ์ง€ ์ƒ์„ฑ", variant="primary", elem_id="generate-btn")
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฌ ๋น„๋””์˜ค ์ƒ์„ฑ ์„ค์ •")
video_prompt = gr.Textbox(
label="(์„ ํƒ) ๋น„๋””์˜ค ํ”„๋กฌํ”„ํŠธ(์˜์–ด๋กœ ์ž…๋ ฅ)",
placeholder="๋น„๋””์˜ค์˜ ์›€์ง์ž„์„ ์„ค๋ช…ํ•˜์„ธ์š”... (๋น„์›Œ๋‘๋ฉด ๊ธฐ๋ณธ ์›€์ง์ž„ ์ ์šฉ)",
lines=2
)
video_length = gr.Slider(
minimum=1,
maximum=60,
value=4,
step=0.5,
label="๋น„๋””์˜ค ๊ธธ์ด (์ดˆ)",
info="1์ดˆ์—์„œ 60์ดˆ๊นŒ์ง€ ์„ ํƒ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค"
)
video_btn = gr.Button("๐ŸŽฌ ๋น„๋””์˜ค๋กœ ๋ณ€ํ™˜", variant="secondary", elem_id="video-btn")
                # Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐Ÿ–ผ๏ธ ์ƒ์„ฑ ๊ฒฐ๊ณผ")
output_image = gr.Image(label="์ƒ์„ฑ๋œ ์ด๋ฏธ์ง€", type="numpy")
output_seed = gr.Textbox(label="์‹œ๋“œ ์ •๋ณด")
output_video = gr.Video(label="์ƒ์„ฑ๋œ ๋น„๋””์˜ค")
        # Second tab: image outpainting
with gr.Tab("์ด๋ฏธ์ง€ ๋น„์œจ ๋ณ€๊ฒฝ/์ƒ์„ฑ", elem_classes="tabitem"):
with gr.Row(equal_height=True):
                # Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐Ÿ–ผ๏ธ ์ด๋ฏธ์ง€ ์—…๋กœ๋“œ")
input_image = gr.Image(
label="์›๋ณธ ์ด๋ฏธ์ง€",
type="numpy"
)
outpaint_prompt = gr.Textbox(
label="ํ”„๋กฌํ”„ํŠธ (์„ ํƒ)",
placeholder="ํ™•์žฅํ•  ์˜์—ญ์— ๋Œ€ํ•œ ์„ค๋ช…...",
lines=2
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### โš™๏ธ ์•„์›ƒํŽ˜์ธํŒ… ์„ค์ •")
outpaint_size_preset = gr.Dropdown(
choices=list(IMAGE_PRESETS.keys()),
value="16:9 ์™€์ด๋“œ์Šคํฌ๋ฆฐ",
label="๋ชฉํ‘œ ํฌ๊ธฐ ํ”„๋ฆฌ์…‹"
)
with gr.Row():
outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="๋ชฉํ‘œ ๋„ˆ๋น„")
outpaint_height = gr.Slider(256, 2048, 720, step=64, label="๋ชฉํ‘œ ๋†’์ด")
alignment = gr.Dropdown(
choices=["๊ฐ€์šด๋ฐ", "์™ผ์ชฝ", "์˜ค๋ฅธ์ชฝ", "์œ„", "์•„๋ž˜"],
value="๊ฐ€์šด๋ฐ",
label="์ •๋ ฌ"
)
overlap_percentage = gr.Slider(
minimum=1,
maximum=50,
value=10,
step=1,
label="๋งˆ์Šคํฌ ์˜ค๋ฒ„๋žฉ (%)"
)
outpaint_steps = gr.Slider(
minimum=4,
maximum=12,
value=8,
step=1,
label="์ถ”๋ก  ์Šคํ…"
)
preview_btn = gr.Button("๐Ÿ‘๏ธ ๋ฏธ๋ฆฌ๋ณด๊ธฐ", elem_id="preview-btn")
outpaint_btn = gr.Button("๐ŸŽจ ์•„์›ƒํŽ˜์ธํŒ… ์‹คํ–‰", variant="primary", elem_id="outpaint-btn")
                # Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐Ÿ–ผ๏ธ ๊ฒฐ๊ณผ")
preview_image = gr.Image(label="๋ฏธ๋ฆฌ๋ณด๊ธฐ")
outpaint_result = gr.Image(label="์•„์›ƒํŽ˜์ธํŒ… ๊ฒฐ๊ณผ")
        # Third tab: video + audio
with gr.Tab("๋น„๋””์˜ค + ์˜ค๋””์˜ค", elem_classes="tabitem"):
with gr.Row(equal_height=True):
                # Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฅ ๋น„๋””์˜ค ์—…๋กœ๋“œ")
audio_video_input = gr.Video(
label="์ž…๋ ฅ ๋น„๋””์˜ค",
sources=["upload"]
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽต ์˜ค๋””์˜ค ์ƒ์„ฑ ์„ค์ •")
audio_prompt = gr.Textbox(
label="ํ”„๋กฌํ”„ํŠธ (ํ•œ๊ธ€ ์ง€์›)",
placeholder="์ƒ์„ฑํ•˜๊ณ  ์‹ถ์€ ์˜ค๋””์˜ค๋ฅผ ์„ค๋ช…ํ•˜์„ธ์š”... (์˜ˆ: ํ‰ํ™”๋กœ์šด ํ”ผ์•„๋…ธ ์Œ์•…)",
lines=3
)
audio_negative_prompt = gr.Textbox(
label="๋„ค๊ฑฐํ‹ฐ๋ธŒ ํ”„๋กฌํ”„ํŠธ",
value="music",
placeholder="์›ํ•˜์ง€ ์•Š๋Š” ์š”์†Œ...",
lines=2
)
with gr.Row():
audio_seed = gr.Number(label="์‹œ๋“œ", value=0)
audio_steps = gr.Number(label="์Šคํ…", value=25)
with gr.Row():
audio_cfg = gr.Number(label="๊ฐ€์ด๋˜์Šค ์Šค์ผ€์ผ", value=4.5)
audio_duration = gr.Number(label="์ง€์†์‹œ๊ฐ„ (์ดˆ)", value=9999)
audio_btn = gr.Button("๐ŸŽต ์˜ค๋””์˜ค ์ƒ์„ฑ ๋ฐ ํ•ฉ์„ฑ", variant="primary", elem_id="audio-btn")
                # Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฌ ์ƒ์„ฑ ๊ฒฐ๊ณผ")
output_video_with_audio = gr.Video(
label="์˜ค๋””์˜ค๊ฐ€ ์ถ”๊ฐ€๋œ ๋น„๋””์˜ค",
interactive=False
)
        # Fourth tab: video editing
with gr.Tab("๋น„๋””์˜ค ํŽธ์ง‘", elem_classes="tabitem"):
with gr.Row(equal_height=True):
                # Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฅ ๋น„๋””์˜ค ์—…๋กœ๋“œ (์ตœ๋Œ€ 10๊ฐœ)")
gr.Markdown("**ํŒŒ์ผ๋ช…์ด ์ž‘์„์ˆ˜๋ก ์šฐ์„ ์ˆœ์œ„๊ฐ€ ๋†’์Šต๋‹ˆ๋‹ค** (์˜ˆ: 1.mp4, 2.mp4, 3.mp4)")
video_files = gr.File(
label="๋น„๋””์˜ค ํŒŒ์ผ๋“ค",
file_count="multiple",
file_types=["video"],
type="filepath"
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### โš™๏ธ ํŽธ์ง‘ ์„ค์ •")
output_fps = gr.Slider(
minimum=0,
maximum=60,
value=0,
step=1,
label="์ถœ๋ ฅ FPS (0 = ์ฒซ ๋ฒˆ์งธ ๋น„๋””์˜ค์˜ FPS ์‚ฌ์šฉ)"
)
gr.Markdown("""
**ํฌ๊ธฐ ์ฒ˜๋ฆฌ**:
- ์ฒซ ๋ฒˆ์งธ ๋น„๋””์˜ค์˜ ํฌ๊ธฐ๊ฐ€ ๊ธฐ์ค€์ด ๋ฉ๋‹ˆ๋‹ค
- ๋‹ค๋ฅธ ํฌ๊ธฐ์˜ ๋น„๋””์˜ค๋Š” ์ฒซ ๋ฒˆ์งธ ๋น„๋””์˜ค ํฌ๊ธฐ๋กœ ์กฐ์ •๋ฉ๋‹ˆ๋‹ค
- ์ตœ์ƒ์˜ ๊ฒฐ๊ณผ๋ฅผ ์œ„ํ•ด ๊ฐ™์€ ํฌ๊ธฐ์˜ ๋น„๋””์˜ค๋ฅผ ์‚ฌ์šฉํ•˜์„ธ์š”
""")
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽต ์˜ค๋””์˜ค ์„ค์ • (์„ ํƒ)")
                        # Audio mode selection
audio_mode = gr.Radio(
["๋Œ€์ฒด", "๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง"],
label="์˜ค๋””์˜ค ๋ชจ๋“œ",
value="๋Œ€์ฒด",
info="๋Œ€์ฒด: ๊ธฐ์กด ์˜ค๋””์˜ค๋ฅผ ์™„์ „ํžˆ ๊ต์ฒด | ๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง: ๊ธฐ์กด ์˜ค๋””์˜ค์™€ ํ•จ๊ป˜ ์žฌ์ƒ"
)
audio_file = gr.Audio(
label="์˜ค๋””์˜ค ํŒŒ์ผ (MP3, WAV, M4A ๋“ฑ)",
type="filepath",
sources=["upload"]
)
audio_volume = gr.Slider(
minimum=0,
maximum=200,
value=100,
step=1,
label="์ถ”๊ฐ€ ์˜ค๋””์˜ค ๋ณผ๋ฅจ (%)",
info="100% = ์›๋ณธ ๋ณผ๋ฅจ"
)
# ๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ชจ๋“œ์ผ ๋•Œ๋งŒ ๋ณด์ด๋Š” ์›๋ณธ ์˜ค๋””์˜ค ๋ณผ๋ฅจ ์กฐ์ ˆ
original_audio_volume = gr.Slider(
minimum=0,
maximum=200,
value=100,
step=1,
label="์›๋ณธ ์˜ค๋””์˜ค ๋ณผ๋ฅจ (%)",
info="๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง ๋ชจ๋“œ์—์„œ ์›๋ณธ ๋น„๋””์˜ค ์˜ค๋””์˜ค์˜ ๋ณผ๋ฅจ",
visible=False
)
gr.Markdown("""
**์˜ค๋””์˜ค ์˜ต์…˜**:
- **๋Œ€์ฒด ๋ชจ๋“œ**: ์—…๋กœ๋“œํ•œ ์˜ค๋””์˜ค๊ฐ€ ๋น„๋””์˜ค์˜ ๊ธฐ์กด ์˜ค๋””์˜ค๋ฅผ ์™„์ „ํžˆ ๋Œ€์ฒดํ•ฉ๋‹ˆ๋‹ค
- **๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง ๋ชจ๋“œ**: ์—…๋กœ๋“œํ•œ ์˜ค๋””์˜ค๊ฐ€ ๊ธฐ์กด ์˜ค๋””์˜ค์™€ ํ•จ๊ป˜ ์žฌ์ƒ๋ฉ๋‹ˆ๋‹ค
- ์˜ค๋””์˜ค๊ฐ€ ๋น„๋””์˜ค๋ณด๋‹ค ์งง์œผ๋ฉด ์ž๋™์œผ๋กœ ๋ฐ˜๋ณต๋ฉ๋‹ˆ๋‹ค
- ์˜ค๋””์˜ค๊ฐ€ ๋น„๋””์˜ค๋ณด๋‹ค ๊ธธ๋ฉด ๋น„๋””์˜ค ๊ธธ์ด์— ๋งž์ถฐ ์ž˜๋ฆฝ๋‹ˆ๋‹ค
""")
merge_videos_btn = gr.Button("๐ŸŽฌ ๋น„๋””์˜ค ๋ณ‘ํ•ฉ", variant="primary", elem_id="merge-btn")
                # Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฌ ๋ณ‘ํ•ฉ ๊ฒฐ๊ณผ")
merge_status = gr.Textbox(label="์ฒ˜๋ฆฌ ์ƒํƒœ", interactive=False)
merged_video = gr.Video(label="๋ณ‘ํ•ฉ๋œ ๋น„๋””์˜ค")
gr.Markdown("""
### โ„น๏ธ ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
1. ์—ฌ๋Ÿฌ ๋น„๋””์˜ค ํŒŒ์ผ์„ ์—…๋กœ๋“œํ•˜์„ธ์š” (์ตœ๋Œ€ 10๊ฐœ)
2. ํŒŒ์ผ๋ช…์ด ์ž‘์€ ์ˆœ์„œ๋Œ€๋กœ ์ž๋™ ์ •๋ ฌ๋ฉ๋‹ˆ๋‹ค
3. (์„ ํƒ) ์˜ค๋””์˜ค ํŒŒ์ผ์„ ์ถ”๊ฐ€ํ•˜๊ณ  ๋ณผ๋ฅจ์„ ์กฐ์ ˆํ•˜์„ธ์š”
4. '๋น„๋””์˜ค ๋ณ‘ํ•ฉ' ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜์„ธ์š”
**ํŠน์ง•**:
- โœ… ์ฒซ ๋ฒˆ์งธ ๋น„๋””์˜ค์˜ ํฌ๊ธฐ๋ฅผ ๊ธฐ์ค€์œผ๋กœ ํ†ตํ•ฉ
- โœ… ์—…๋กœ๋“œํ•œ ์˜ค๋””์˜ค๊ฐ€ ์ „์ฒด ๋น„๋””์˜ค์— ์ ์šฉ๋ฉ๋‹ˆ๋‹ค
- โœ… ๋†’์€ ๋น„ํŠธ๋ ˆ์ดํŠธ๋กœ ํ’ˆ์งˆ ์œ ์ง€
**ํŒ**:
- ํŒŒ์ผ๋ช…์„ 01.mp4, 02.mp4, 03.mp4 ํ˜•์‹์œผ๋กœ ์ง€์ •ํ•˜๋ฉด ์ˆœ์„œ ๊ด€๋ฆฌ๊ฐ€ ์‰ฝ์Šต๋‹ˆ๋‹ค
- ์˜ค๋””์˜ค๋ฅผ ์ถ”๊ฐ€ํ•˜๋ฉด ๊ธฐ์กด ๋น„๋””์˜ค์˜ ์˜ค๋””์˜ค๋Š” ๋Œ€์ฒด๋ฉ๋‹ˆ๋‹ค
""")
        # Fifth tab: video background removal/compositing
with gr.Tab("๋น„๋””์˜ค ๋ฐฐ๊ฒฝ์ œ๊ฑฐ/ํ•ฉ์„ฑ", elem_classes="tabitem"):
with gr.Row(equal_height=True):
                # Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฅ ๋น„๋””์˜ค ์—…๋กœ๋“œ")
bg_video_input = gr.Video(
label="์ž…๋ ฅ ๋น„๋””์˜ค",
interactive=True
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽจ ๋ฐฐ๊ฒฝ ์„ค์ •")
bg_type = gr.Radio(
["์ƒ‰์ƒ", "์ด๋ฏธ์ง€", "๋น„๋””์˜ค"],
label="๋ฐฐ๊ฒฝ ์œ ํ˜•",
value="์ƒ‰์ƒ",
interactive=True
)
color_picker = gr.ColorPicker(
label="๋ฐฐ๊ฒฝ ์ƒ‰์ƒ",
value="#00FF00",
visible=True,
interactive=True
)
bg_image_input = gr.Image(
label="๋ฐฐ๊ฒฝ ์ด๋ฏธ์ง€",
type="filepath",
visible=False,
interactive=True
)
bg_video_bg = gr.Video(
label="๋ฐฐ๊ฒฝ ๋น„๋””์˜ค",
visible=False,
interactive=True
)
with gr.Column(visible=False) as video_handling_options:
video_handling_radio = gr.Radio(
["slow_down", "loop"],
label="๋น„๋””์˜ค ์ฒ˜๋ฆฌ ๋ฐฉ์‹",
value="slow_down",
interactive=True,
info="slow_down: ๋ฐฐ๊ฒฝ ๋น„๋””์˜ค๋ฅผ ๋А๋ฆฌ๊ฒŒ ์žฌ์ƒ, loop: ๋ฐฐ๊ฒฝ ๋น„๋””์˜ค๋ฅผ ๋ฐ˜๋ณต"
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### โš™๏ธ ์ฒ˜๋ฆฌ ์„ค์ •")
fps_slider = gr.Slider(
minimum=0,
maximum=60,
step=1,
value=0,
label="์ถœ๋ ฅ FPS (0 = ์›๋ณธ FPS ์œ ์ง€)",
interactive=True
)
fast_mode_checkbox = gr.Checkbox(
label="๋น ๋ฅธ ๋ชจ๋“œ (BiRefNet_lite ์‚ฌ์šฉ)",
value=True,
interactive=True
)
max_workers_slider = gr.Slider(
minimum=1,
maximum=32,
step=1,
value=10,
label="์ตœ๋Œ€ ์›Œ์ปค ์ˆ˜",
info="๋ณ‘๋ ฌ๋กœ ์ฒ˜๋ฆฌํ•  ํ”„๋ ˆ์ž„ ์ˆ˜",
interactive=True
)
bg_remove_btn = gr.Button("๐ŸŽฌ ๋ฐฐ๊ฒฝ ๋ณ€๊ฒฝ", variant="primary", elem_id="bg-remove-btn")
                # Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽฌ ์ฒ˜๋ฆฌ ๊ฒฐ๊ณผ")
stream_image = gr.Image(label="์‹ค์‹œ๊ฐ„ ์ŠคํŠธ๋ฆฌ๋ฐ", visible=False)
output_bg_video = gr.Video(label="์ตœ์ข… ๋น„๋””์˜ค")
time_textbox = gr.Textbox(label="๊ฒฝ๊ณผ ์‹œ๊ฐ„", interactive=False)
gr.Markdown("""
### โ„น๏ธ ์‚ฌ์šฉ ๋ฐฉ๋ฒ•
1. ๋น„๋””์˜ค๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”
2. ์›ํ•˜๋Š” ๋ฐฐ๊ฒฝ ์œ ํ˜•์„ ์„ ํƒํ•˜์„ธ์š”
3. ์„ค์ •์„ ์กฐ์ •ํ•˜๊ณ  '๋ฐฐ๊ฒฝ ๋ณ€๊ฒฝ' ๋ฒ„ํŠผ์„ ํด๋ฆญํ•˜์„ธ์š”
**์ฐธ๊ณ **: GPU ์ œํ•œ์œผ๋กœ ํ•œ ๋ฒˆ์— ์•ฝ 200ํ”„๋ ˆ์ž„๊นŒ์ง€ ์ฒ˜๋ฆฌ ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค.
๊ธด ๋น„๋””์˜ค๋Š” ์ž‘์€ ์กฐ๊ฐ์œผ๋กœ ๋‚˜๋ˆ„์–ด ์ฒ˜๋ฆฌํ•˜์„ธ์š”.
""")
        # Sixth tab: image-to-avatar (duplicate removed, keeping a single tab)
with gr.Tab("์ด๋ฏธ์ง€to์•„๋ฐ”ํƒ€", elem_classes="tabitem"):
with gr.Row(equal_height=True):
                # Input column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽญ ์•„๋ฐ”ํƒ€ ์• ๋‹ˆ๋ฉ”์ด์…˜ ์ƒ์„ฑ")
gr.Markdown("""
ํฌํŠธ๋ ˆ์ดํŠธ ์ด๋ฏธ์ง€์™€ ์˜ค๋””์˜ค๋ฅผ ์—…๋กœ๋“œํ•˜๋ฉด ๋งํ•˜๋Š” ์•„๋ฐ”ํƒ€ ์• ๋‹ˆ๋ฉ”์ด์…˜์„ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค.
**๊ถŒ์žฅ ์‚ฌํ•ญ**:
- ์ด๋ฏธ์ง€: ์ •๋ฉด์„ ๋ณด๊ณ  ์žˆ๋Š” ์–ผ๊ตด ์‚ฌ์ง„
- ์˜ค๋””์˜ค: ๋ช…ํ™•ํ•œ ์Œ์„ฑ์ด ๋‹ด๊ธด ์˜ค๋””์˜ค ํŒŒ์ผ
""")
avatar_image = gr.Image(
label="ํฌํŠธ๋ ˆ์ดํŠธ ์ด๋ฏธ์ง€",
type="filepath",
elem_classes="panel-box"
)
avatar_audio = gr.Audio(
label="๋“œ๋ผ์ด๋น™ ์˜ค๋””์˜ค",
type="filepath",
elem_classes="panel-box"
)
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### โš™๏ธ ์ƒ์„ฑ ์„ค์ •")
guidance_scale = gr.Slider(
minimum=1.0,
maximum=10.0,
value=3.0,
step=0.1,
label="๊ฐ€์ด๋˜์Šค ์Šค์ผ€์ผ",
info="๋†’์„์ˆ˜๋ก ์˜ค๋””์˜ค์— ๋” ์ถฉ์‹คํ•œ ์›€์ง์ž„ ์ƒ์„ฑ"
)
inference_steps = gr.Slider(
minimum=5,
maximum=30,
value=10,
step=1,
label="์ถ”๋ก  ์Šคํ…",
info="๋†’์„์ˆ˜๋ก ํ’ˆ์งˆ์ด ์ข‹์•„์ง€์ง€๋งŒ ์ƒ์„ฑ ์‹œ๊ฐ„์ด ์ฆ๊ฐ€"
)
                        # Server status check
with gr.Row():
test_connection_btn = gr.Button(
"๐Ÿ”Œ ์„œ๋ฒ„ ์—ฐ๊ฒฐ ํ…Œ์ŠคํŠธ",
elem_id="test-connection-btn",
scale=1
)
anim_status = gr.Textbox(
label="์„œ๋ฒ„ ์ƒํƒœ",
interactive=False,
elem_classes="panel-box"
)
generate_avatar_btn = gr.Button(
"๐ŸŽฌ ์•„๋ฐ”ํƒ€ ์ƒ์„ฑ",
variant="primary",
elem_id="avatar-btn"
)
                # Output column
with gr.Column(scale=1):
with gr.Group(elem_classes="panel-box"):
gr.Markdown("### ๐ŸŽญ ์ƒ์„ฑ ๊ฒฐ๊ณผ")
avatar_result = gr.Video(
label="์• ๋‹ˆ๋ฉ”์ด์…˜ ๊ฒฐ๊ณผ",
elem_classes="panel-box"
)
avatar_comparison = gr.Video(
label="์›๋ณธ ๋Œ€๋น„ ๊ฒฐ๊ณผ (Side-by-Side)",
elem_classes="panel-box"
)
with gr.Accordion("์‹คํ–‰ ๋กœ๊ทธ", open=False):
avatar_logs = gr.Textbox(
label="๋กœ๊ทธ",
lines=10,
max_lines=20,
interactive=False,
elem_classes="panel-box"
)
gr.Markdown("""
### โ„น๏ธ ์‚ฌ์šฉ ์•ˆ๋‚ด
1. **ํฌํŠธ๋ ˆ์ดํŠธ ์ด๋ฏธ์ง€ ์—…๋กœ๋“œ**: ์ •๋ฉด์„ ๋ณด๊ณ  ์žˆ๋Š” ์„ ๋ช…ํ•œ ์–ผ๊ตด ์‚ฌ์ง„
2. **์˜ค๋””์˜ค ์—…๋กœ๋“œ**: ์• ๋‹ˆ๋ฉ”์ด์…˜์— ์‚ฌ์šฉํ•  ์Œ์„ฑ ํŒŒ์ผ
3. **์„ค์ • ์กฐ์ •**: ๊ฐ€์ด๋˜์Šค ์Šค์ผ€์ผ๊ณผ ์ถ”๋ก  ์Šคํ… ์กฐ์ •
4. **์ƒ์„ฑ ์‹œ์ž‘**: '์•„๋ฐ”ํƒ€ ์ƒ์„ฑ' ๋ฒ„ํŠผ ํด๋ฆญ
**์ฒ˜๋ฆฌ ์‹œ๊ฐ„**:
- ์ผ๋ฐ˜์ ์œผ๋กœ 2-5๋ถ„ ์†Œ์š”
- ๊ธด ์˜ค๋””์˜ค์ผ์ˆ˜๋ก ์ฒ˜๋ฆฌ ์‹œ๊ฐ„ ์ฆ๊ฐ€
**ํŒ**:
- ๋ฐฐ๊ฒฝ์ด ๋‹จ์ˆœํ•œ ์ด๋ฏธ์ง€๊ฐ€ ๋” ์ข‹์€ ๊ฒฐ๊ณผ๋ฅผ ์ƒ์„ฑํ•ฉ๋‹ˆ๋‹ค
- ์˜ค๋””์˜ค์˜ ์Œ์„ฑ์ด ๋ช…ํ™•ํ• ์ˆ˜๋ก ๋ฆฝ์‹ฑํฌ๊ฐ€ ์ •ํ™•ํ•ฉ๋‹ˆ๋‹ค
""")
# ๋ชจ๋ธ ๋กœ๋“œ ํ•จ์ˆ˜ ์‹คํ–‰
def on_demo_load():
try:
if IS_SPACES:
                # Warm up the GPU in the Spaces environment
gpu_warmup()
# ๋ชจ๋ธ ๋กœ๋“œ๋Š” ์ฒซ ๋ฒˆ์งธ GPU ํ•จ์ˆ˜ ํ˜ธ์ถœ ์‹œ ์ž๋™์œผ๋กœ ์ˆ˜ํ–‰๋จ
return "๋ชจ๋ธ ๋กœ๋”ฉ ์ค€๋น„ ์™„๋ฃŒ"
except Exception as e:
return f"์ดˆ๊ธฐํ™” ์˜ค๋ฅ˜: {str(e)}"
    # Event wiring - first tab
size_preset.change(update_dimensions, [size_preset], [width, height])
generate_btn.click(
generate_text_to_image,
[prompt, width, height, guidance, steps, seed],
[output_image, output_seed]
)
video_btn.click(
lambda img, v_prompt, length: generate_video_from_image(img, v_prompt, length) if img is not None else None,
[output_image, video_prompt, video_length],
[output_video]
)
    # Event wiring - second tab
outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])
preview_btn.click(
preview_outpaint,
[input_image, outpaint_width, outpaint_height, overlap_percentage, alignment],
[preview_image]
)
outpaint_btn.click(
outpaint_image,
[input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
[outpaint_result]
)
    # Event wiring - third tab
audio_btn.click(
video_to_audio,
[audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
[output_video_with_audio]
)
    # Event wiring - fourth tab (video editing)
def toggle_original_volume(mode):
return gr.update(visible=(mode == "๋ฐฑ๊ทธ๋ผ์šด๋“œ ๋ฎค์ง"))
audio_mode.change(
toggle_original_volume,
inputs=[audio_mode],
outputs=[original_audio_volume]
)
merge_videos_btn.click(
merge_videos_with_audio,
inputs=[video_files, audio_file, audio_mode, audio_volume, original_audio_volume, output_fps],
outputs=[merged_video, merge_status]
)
    # Event wiring - fifth tab (video background removal/compositing)
def update_bg_visibility(bg_type):
if bg_type == "์ƒ‰์ƒ":
return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
elif bg_type == "์ด๋ฏธ์ง€":
return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
elif bg_type == "๋น„๋””์˜ค":
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), gr.update(visible=True)
else:
return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
bg_type.change(
update_bg_visibility,
inputs=bg_type,
outputs=[color_picker, bg_image_input, bg_video_bg, video_handling_options]
)
bg_remove_btn.click(
process_video_bg,
inputs=[bg_video_input, bg_type, bg_image_input, bg_video_bg, color_picker,
fps_slider, video_handling_radio, fast_mode_checkbox, max_workers_slider],
outputs=[stream_image, output_bg_video, time_textbox]
)
    # Event wiring - sixth tab (image-to-avatar)
    test_connection_btn.click(
        # test_anim_api_connection returns (ok, message); only the message feeds
        # the status box, and a component cannot be listed twice in outputs
        lambda: test_anim_api_connection()[1],
        outputs=anim_status
    )
generate_avatar_btn.click(
generate_avatar_animation,
inputs=[avatar_image, avatar_audio, guidance_scale, inference_steps],
outputs=[avatar_result, avatar_comparison, avatar_logs]
)
    # Runs when the demo loads
demo.load(on_demo_load, outputs=model_status)
if __name__ == "__main__":
# Spaces ํ™˜๊ฒฝ์—์„œ ์ถ”๊ฐ€ ์ฒดํฌ
if IS_SPACES:
try:
gpu_warmup()
except:
pass
demo.launch()