import gradio as gr
import numpy as np
from PIL import Image, ImageDraw
from gradio_client import Client, handle_file
import random
import tempfile
import os
import logging
import torch
from diffusers import AutoencoderKL, TCDScheduler
from diffusers.models.model_loading_utils import load_state_dict
from huggingface_hub import hf_hub_download
from pathlib import Path
import torchaudio
from einops import rearrange
from scipy.io import wavfile
from transformers import pipeline

os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
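
# Fallback for environments without the Hugging Face `spaces` package
# (e.g. running outside Spaces): a no-op GPU decorator with the same call styles.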
try:
    import spaces
except ImportError:
    class spaces:
        @staticmethod
        def GPU(func=None, duration=None):
            # Supports both @spaces.GPU and @spaces.GPU(duration=...) usage.
            def decorator(fn):
                return fn
            if callable(func):
                return func
            return decorator

try:
    import mmaudio
except ImportError:
    # First run: install the local mmaudio package in editable mode.
    os.system("pip install -e .")
    import mmaudio

from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video,
                                make_video, setup_eval_logging)
from mmaudio.model.flow_matching import FlowMatching
from mmaudio.model.networks import MMAudio, get_my_mmaudio
from mmaudio.model.sequence_config import SequenceConfig
from mmaudio.model.utils.features_utils import FeaturesUtils
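
# ---------------------------------------------------------------------------
# Outpainting models: ControlNet-Union (promax) + RealVisXL Lightning + TCD
# ---------------------------------------------------------------------------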
try:
    from controlnet_union import ControlNetModel_Union
    from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline

    config_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="config_promax.json",
    )

    config = ControlNetModel_Union.load_config(config_file)
    controlnet_model = ControlNetModel_Union.from_config(config)

    model_file = hf_hub_download(
        "xinsir/controlnet-union-sdxl-1.0",
        filename="diffusion_pytorch_model_promax.safetensors",
    )
    state_dict = load_state_dict(model_file)
    loaded_keys = list(state_dict.keys())

    result = ControlNetModel_Union._load_pretrained_model(
        controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
    )

    model = result[0]
    model = model.to(device="cuda", dtype=torch.float16)

    vae = AutoencoderKL.from_pretrained(
        "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
    ).to("cuda")

    pipe = StableDiffusionXLFillPipeline.from_pretrained(
        "SG161222/RealVisXL_V5.0_Lightning",
        torch_dtype=torch.float16,
        vae=vae,
        controlnet=model,
        variant="fp16",
    ).to("cuda")

    pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)

    OUTPAINT_MODEL_LOADED = True
except Exception as e:
    logging.error(f"Failed to load outpainting models: {str(e)}")
    OUTPAINT_MODEL_LOADED = False
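
# Enable TF32 and cuDNN autotuning for faster matmuls/convolutions on CUDA.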
if torch.cuda.is_available():
    device = torch.device("cuda")
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
else:
    device = torch.device("cpu")

dtype = torch.bfloat16
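
# ---------------------------------------------------------------------------
# MMAudio (video-to-audio) model plus a Korean -> English prompt translator
# ---------------------------------------------------------------------------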
try:
    model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
    model_mmaudio.download_if_needed()
    output_dir = Path('./output/gradio')
    setup_eval_logging()

    # Lightweight CPU translator so Korean prompts can be fed to the models.
    try:
        translator = pipeline("translation",
                              model="Helsinki-NLP/opus-mt-ko-en",
                              device="cpu",
                              use_fast=True,
                              trust_remote_code=False)
    except Exception as e:
        logging.warning(f"Failed to load translation model: {e}")
        translator = None

    def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
        with torch.cuda.device(device):
            seq_cfg = model_mmaudio.seq_cfg
            net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, dtype).eval()
            net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
            logging.info(f'Loaded weights from {model_mmaudio.model_path}')

            feature_utils = FeaturesUtils(
                tod_vae_ckpt=model_mmaudio.vae_path,
                synchformer_ckpt=model_mmaudio.synchformer_ckpt,
                enable_conditions=True,
                mode=model_mmaudio.mode,
                bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
                need_vae_encoder=False
            ).to(device, dtype).eval()

            return net, feature_utils, seq_cfg

    net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
    MMAUDIO_MODEL_LOADED = True
except Exception as e:
    logging.error(f"Failed to load MMAudio models: {str(e)}")
    MMAUDIO_MODEL_LOADED = False
    translator = None
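
# Remote Gradio apps used for text-to-image and image-to-video generation.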
TEXT2IMG_API_URL = "http://211.233.58.201:7896"
VIDEO_API_URL = "http://211.233.58.201:7875"

logging.basicConfig(level=logging.INFO)
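
# Canvas presets (pixels) for common aspect ratios and social-media formats.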
IMAGE_PRESETS = {
    "Custom": {"width": 1024, "height": 1024},
    "1:1 Square": {"width": 1024, "height": 1024},
    "4:3 Standard": {"width": 1024, "height": 768},
    "16:9 Widescreen": {"width": 1024, "height": 576},
    "9:16 Portrait": {"width": 576, "height": 1024},
    "6:19 Tall Portrait": {"width": 324, "height": 1024},
    "Instagram Square": {"width": 1080, "height": 1080},
    "Instagram Story": {"width": 1080, "height": 1920},
    "Instagram Landscape": {"width": 1080, "height": 566},
    "Facebook Cover": {"width": 820, "height": 312},
    "Twitter Header": {"width": 1500, "height": 500},
    "YouTube Thumbnail": {"width": 1280, "height": 720},
    "LinkedIn Banner": {"width": 1584, "height": 396},
}

def update_dimensions(preset):
    """Return the (width, height) for a preset, defaulting to 1024x1024."""
    if preset in IMAGE_PRESETS:
        return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
    return 1024, 1024

def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
    """Call the remote text-to-image endpoint and return (image, seed info)."""
    if not prompt:
        return None, "Please enter a prompt"

    try:
        client = Client(TEXT2IMG_API_URL)
        if seed == -1:
            seed = random.randint(0, 9999999)

        result = client.predict(
            prompt=prompt,
            width=int(width),
            height=int(height),
            guidance=float(guidance),
            inference_steps=int(inference_steps),
            seed=int(seed),
            do_img2img=False,
            init_image=None,
            image2image_strength=0.8,
            resize_img=True,
            api_name="/generate_image"
        )
        return result[0], f"Seed used: {result[1]}"
    except Exception as e:
        logging.error(f"Image generation error: {str(e)}")
        return None, f"Error: {str(e)}"

def generate_video_from_image(image, prompt="", length=4.0):
    """Send an image to the remote video endpoint and return the video path."""
    if image is None:
        return None

    try:
        # The remote API expects a file, so write the array to a temporary PNG.
        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
            temp_path = fp.name
            Image.fromarray(image).save(temp_path)

        client = Client(VIDEO_API_URL)
        result = client.predict(
            input_image=handle_file(temp_path),
            prompt=prompt if prompt else "Generate natural motion",
            n_prompt="",
            seed=random.randint(0, 9999999),
            use_teacache=True,
            video_length=float(length),
            api_name="/process"
        )

        os.unlink(temp_path)

        if result and len(result) > 0:
            video_dict = result[0]
            return video_dict.get("video") if isinstance(video_dict, dict) else None

    except Exception as e:
        logging.error(f"Video generation error: {str(e)}")
        return None
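
# --- Outpainting helpers -----------------------------------------------------
# The source image is scaled to fit the target canvas and pasted at the chosen
# alignment; the mask marks pixels to generate (white) vs. keep (black), with
# the kept region shrunk inward by the overlap margin for smoother blending.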
def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
    """Prepare the padded background image and the outpainting mask."""
    if image is None:
        return None, None

    if isinstance(image, np.ndarray):
        image = Image.fromarray(image).convert('RGB')

    target_size = (width, height)

    # Scale the source to fit inside the target canvas, preserving aspect ratio.
    scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
    new_width = int(image.width * scale_factor)
    new_height = int(image.height * scale_factor)

    source = image.resize((new_width, new_height), Image.LANCZOS)

    # Overlap margin (pixels) the model is allowed to repaint over the source.
    overlap_x = max(int(new_width * (overlap_percentage / 100)), 1)
    overlap_y = max(int(new_height * (overlap_percentage / 100)), 1)

    # Position the source image on the target canvas.
    if alignment == "Center":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "Left":
        margin_x = 0
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "Right":
        margin_x = target_size[0] - new_width
        margin_y = (target_size[1] - new_height) // 2
    elif alignment == "Top":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = 0
    elif alignment == "Bottom":
        margin_x = (target_size[0] - new_width) // 2
        margin_y = target_size[1] - new_height
    else:
        # Unknown alignment: fall back to centering.
        margin_x = (target_size[0] - new_width) // 2
        margin_y = (target_size[1] - new_height) // 2

    background = Image.new('RGB', target_size, (255, 255, 255))
    background.paste(source, (margin_x, margin_y))

    # White (255) = area to generate; black (0) = area to keep.
    mask = Image.new('L', target_size, 255)
    mask_draw = ImageDraw.Draw(mask)

    # The kept region is inset by the overlap, except on edges flush with the canvas.
    left_overlap = margin_x + overlap_x if alignment != "Left" else margin_x
    right_overlap = margin_x + new_width - overlap_x if alignment != "Right" else margin_x + new_width
    top_overlap = margin_y + overlap_y if alignment != "Top" else margin_y
    bottom_overlap = margin_y + new_height - overlap_y if alignment != "Bottom" else margin_y + new_height

    mask_draw.rectangle([
        (left_overlap, top_overlap),
        (right_overlap, bottom_overlap)
    ], fill=0)

    return background, mask

def preview_outpaint(image, width, height, overlap_percentage, alignment):
    """Preview the outpaint layout: regions to be generated are tinted red."""
    background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
    if background is None:
        return None

    preview = background.copy().convert('RGBA')

    # Semi-transparent red overlay over the masked (to-be-generated) area.
    red_overlay = Image.new('RGBA', background.size, (255, 0, 0, 64))

    red_mask = Image.new('RGBA', background.size, (0, 0, 0, 0))
    red_mask.paste(red_overlay, (0, 0), mask)

    preview = Image.alpha_composite(preview, red_mask)

    return preview

@spaces.GPU(duration=24)
def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
    """Run image outpainting with the ControlNet-Union fill pipeline."""
    if image is None:
        return None

    if not OUTPAINT_MODEL_LOADED:
        return Image.new('RGB', (width, height), (200, 200, 200))

    try:
        background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
        if background is None:
            return None

        # Black out the region to be generated for the ControlNet conditioning image.
        cnet_image = background.copy()
        cnet_image.paste(0, (0, 0), mask)

        final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"

        with torch.autocast(device_type="cuda", dtype=torch.float16):
            (
                prompt_embeds,
                negative_prompt_embeds,
                pooled_prompt_embeds,
                negative_pooled_prompt_embeds,
            ) = pipe.encode_prompt(final_prompt, "cuda", True)

            # The fill pipeline yields intermediate images; keep only the last.
            for generated_image in pipe(
                prompt_embeds=prompt_embeds,
                negative_prompt_embeds=negative_prompt_embeds,
                pooled_prompt_embeds=pooled_prompt_embeds,
                negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
                image=cnet_image,
                num_inference_steps=num_steps
            ):
                pass

            final_image = generated_image

        # Composite the generated content back over the original pixels.
        final_image = final_image.convert("RGBA")
        cnet_image.paste(final_image, (0, 0), mask)

        return cnet_image

    except Exception as e:
        logging.error(f"Outpainting error: {str(e)}")
        return background if 'background' in locals() else None
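
# Detect Hangul in a prompt and translate it to English for the models.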
def translate_prompt(text):
    try:
        if translator is None:
            return text

        # Hangul jamo and syllables fall in the U+3131..U+D7A3 range.
        if text and any(0x3131 <= ord(char) <= 0xD7A3 for char in text):
            with torch.no_grad():
                translation = translator(text)[0]['translation_text']
                return translation
        return text
    except Exception as e:
        logging.error(f"Translation error: {e}")
        return text
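
# Generate a soundtrack for a video with MMAudio and mux it into a new mp4.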
@spaces.GPU
@torch.inference_mode()
def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                   cfg_strength: float, duration: float):
    if not MMAUDIO_MODEL_LOADED:
        return None

    prompt = translate_prompt(prompt)
    negative_prompt = translate_prompt(negative_prompt)

    rng = torch.Generator(device=device)
    rng.manual_seed(seed)
    fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)

    clip_frames, sync_frames, duration = load_video(video, duration)
    clip_frames = clip_frames.unsqueeze(0)
    sync_frames = sync_frames.unsqueeze(0)
    seq_cfg.duration = duration
    net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)

    audios = generate(clip_frames,
                      sync_frames, [prompt],
                      negative_text=[negative_prompt],
                      feature_utils=feature_utils,
                      net=net_mmaudio,
                      fm=fm,
                      rng=rng,
                      cfg_strength=cfg_strength)
    audio = audios.float().cpu()[0]

    # Mux the generated audio with the input video into a temporary mp4.
    video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
    make_video(video,
               video_save_path,
               audio,
               sampling_rate=seq_cfg.sampling_rate,
               duration_sec=seq_cfg.duration)
    return video_save_path
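
# Pastel-themed CSS for the Gradio interface.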
css = """
:root {
    --primary-color: #f8c3cd;
    --secondary-color: #b3e5fc;
    --background-color: #f5f5f7;
    --card-background: #ffffff;
    --text-color: #424242;
    --accent-color: #ffb6c1;
    --success-color: #c8e6c9;
    --warning-color: #fff9c4;
    --shadow-color: rgba(0, 0, 0, 0.1);
    --border-radius: 12px;
}
.gradio-container {
    max-width: 1200px !important;
    margin: 0 auto !important;
}
.panel-box {
    border-radius: var(--border-radius) !important;
    box-shadow: 0 8px 16px var(--shadow-color) !important;
    background-color: var(--card-background) !important;
    padding: 20px !important;
    margin-bottom: 20px !important;
}
#generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn {
    background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
    font-size: 1.1rem !important;
    padding: 12px 24px !important;
    margin-top: 10px !important;
    width: 100% !important;
}
.tabitem {
    min-height: 700px !important;
}
"""
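
# ---------------------------------------------------------------------------
# Gradio UI: three tabs (text -> image -> video, outpainting, video + audio)
# ---------------------------------------------------------------------------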
demo = gr.Blocks(css=css, title="AI Image & Video & Audio Generator")

with demo:
    gr.Markdown("# 🎨 Ginigen Studio")

    with gr.Tabs() as tabs:

        with gr.Tab("Text → Image → Video", elem_classes="tabitem"):
            with gr.Row(equal_height=True):

                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 📝 Image Generation Settings")

                        prompt = gr.Textbox(
                            label="Prompt (Korean/English)",
                            placeholder="Describe the image you want to generate...",
                            lines=3
                        )

                        size_preset = gr.Dropdown(
                            choices=list(IMAGE_PRESETS.keys()),
                            value="1:1 Square",
                            label="Size Preset"
                        )

                        with gr.Row():
                            width = gr.Slider(256, 2048, 1024, step=64, label="Width")
                            height = gr.Slider(256, 2048, 1024, step=64, label="Height")

                        with gr.Row():
                            guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="Guidance")
                            steps = gr.Slider(1, 50, 30, step=1, label="Steps")

                        seed = gr.Number(label="Seed (-1 = random)", value=-1)

                        generate_btn = gr.Button("🎨 Generate Image", variant="primary", elem_id="generate-btn")

                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎬 Video Generation Settings")

                        video_prompt = gr.Textbox(
                            label="(Optional) Video prompt (in English)",
                            placeholder="Describe the motion for the video... (leave empty for default motion)",
                            lines=2
                        )

                        video_length = gr.Slider(
                            minimum=1,
                            maximum=60,
                            value=4,
                            step=0.5,
                            label="Video Length (seconds)",
                            info="Choose a length between 1 and 60 seconds"
                        )

                        video_btn = gr.Button("🎬 Convert to Video", variant="secondary", elem_id="video-btn")

                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ Results")

                        output_image = gr.Image(label="Generated Image", type="numpy")
                        output_seed = gr.Textbox(label="Seed Info")
                        output_video = gr.Video(label="Generated Video")

        with gr.Tab("Image Aspect Ratio Change/Outpaint", elem_classes="tabitem"):
            with gr.Row(equal_height=True):

                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ Image Upload")

                        input_image = gr.Image(
                            label="Source Image",
                            type="numpy"
                        )

                        outpaint_prompt = gr.Textbox(
                            label="Prompt (optional)",
                            placeholder="Describe the area to fill in...",
                            lines=2
                        )

                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### ⚙️ Outpainting Settings")

                        outpaint_size_preset = gr.Dropdown(
                            choices=list(IMAGE_PRESETS.keys()),
                            value="16:9 Widescreen",
                            label="Target Size Preset"
                        )

                        with gr.Row():
                            outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="Target Width")
                            outpaint_height = gr.Slider(256, 2048, 720, step=64, label="Target Height")

                        alignment = gr.Dropdown(
                            choices=["Center", "Left", "Right", "Top", "Bottom"],
                            value="Center",
                            label="Alignment"
                        )

                        overlap_percentage = gr.Slider(
                            minimum=1,
                            maximum=50,
                            value=10,
                            step=1,
                            label="Mask Overlap (%)"
                        )

                        outpaint_steps = gr.Slider(
                            minimum=4,
                            maximum=12,
                            value=8,
                            step=1,
                            label="Inference Steps"
                        )

                        preview_btn = gr.Button("👁️ Preview", elem_id="preview-btn")
                        outpaint_btn = gr.Button("🎨 Run Outpainting", variant="primary", elem_id="outpaint-btn")

                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🖼️ Results")

                        preview_image = gr.Image(label="Preview")
                        outpaint_result = gr.Image(label="Outpainting Result")

        with gr.Tab("Video + Audio", elem_classes="tabitem"):
            with gr.Row(equal_height=True):

                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎥 Video Upload")

                        audio_video_input = gr.Video(
                            label="Input Video",
                            sources=["upload"]
                        )

                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎵 Audio Generation Settings")

                        audio_prompt = gr.Textbox(
                            label="Prompt (Korean supported)" if MMAUDIO_MODEL_LOADED and translator else "Prompt",
                            placeholder="Describe the audio you want to generate... (e.g. peaceful piano music)",
                            lines=3
                        )

                        audio_negative_prompt = gr.Textbox(
                            label="Negative Prompt",
                            value="music",
                            placeholder="Sounds you do not want...",
                            lines=2
                        )

                        with gr.Row():
                            audio_seed = gr.Number(label="Seed", value=0)
                            audio_steps = gr.Number(label="Steps", value=25)

                        with gr.Row():
                            audio_cfg = gr.Number(label="Guidance Scale", value=4.5)
                            audio_duration = gr.Number(label="Duration (seconds)", value=9999)

                        audio_btn = gr.Button("🎵 Generate & Add Audio", variant="primary", elem_id="audio-btn")

                with gr.Column(scale=1):
                    with gr.Group(elem_classes="panel-box"):
                        gr.Markdown("### 🎬 Results")

                        output_video_with_audio = gr.Video(
                            label="Video with Audio",
                            interactive=False
                        )

                        if not MMAUDIO_MODEL_LOADED:
                            gr.Markdown("⚠️ Failed to load the MMAudio model; this feature is unavailable.")
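
    # --- Event wiring --------------------------------------------------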
    size_preset.change(update_dimensions, [size_preset], [width, height])

    generate_btn.click(
        generate_text_to_image,
        [prompt, width, height, guidance, steps, seed],
        [output_image, output_seed]
    )

    video_btn.click(
        lambda img, v_prompt, length: generate_video_from_image(img, v_prompt, length) if img is not None else None,
        [output_image, video_prompt, video_length],
        [output_video]
    )

    outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])

    preview_btn.click(
        preview_outpaint,
        [input_image, outpaint_width, outpaint_height, overlap_percentage, alignment],
        [preview_image]
    )

    outpaint_btn.click(
        outpaint_image,
        [input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
        [outpaint_result]
    )

    audio_btn.click(
        video_to_audio,
        [audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
        [output_video_with_audio]
    )

demo.launch()