openfree committed on
Commit
3707c06
·
verified ·
1 Parent(s): cd0adcf

Create app-backup.py

Files changed (1)
app-backup.py +693 -0
app-backup.py ADDED
@@ -0,0 +1,693 @@
+ import gradio as gr
+ import numpy as np
+ from PIL import Image, ImageDraw
+ from gradio_client import Client, handle_file
+ import random
+ import tempfile
+ import os
+ import logging
+ import torch
+ from diffusers import AutoencoderKL, TCDScheduler
+ from diffusers.models.model_loading_utils import load_state_dict
+ from huggingface_hub import hf_hub_download
+ from pathlib import Path
+ import torchaudio
+ from einops import rearrange
+ from scipy.io import wavfile
+ from transformers import pipeline
+
+ # Bypass the torch.load safety check via an environment variable (temporary workaround)
+ os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"
+
+ # Spaces GPU
+ try:
+     import spaces
+ except ImportError:
+     # Dummy decorator for environments without the Spaces GPU decorator.
+     # Supports both the bare `@spaces.GPU` and parameterized `@spaces.GPU(duration=...)` forms.
+     class spaces:
+         @staticmethod
+         def GPU(func=None, duration=None):
+             def decorator(fn):
+                 return fn
+             if callable(func):
+                 return func
+             return decorator
+
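+ # A minimal illustration of the fallback above (illustrative names, not part of the
+ # app): both decorator forms used later in this file reduce to a no-op off-Spaces.
+ #
+ #     @spaces.GPU                 # bare form, as on video_to_audio below
+ #     def f(x): return x
+ #
+ #     @spaces.GPU(duration=24)    # parameterized form, as on outpaint_image below
+ #     def g(x): return x
+ #
+ #     assert f(1) == 1 and g(1) == 1
+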
+ # MMAudio imports
+ try:
+     import mmaudio
+ except ImportError:
+     os.system("pip install -e .")
+     import mmaudio
+
+ from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
+                                 setup_eval_logging)
+ from mmaudio.model.flow_matching import FlowMatching
+ from mmaudio.model.networks import MMAudio, get_my_mmaudio
+ from mmaudio.model.sequence_config import SequenceConfig
+ from mmaudio.model.utils.features_utils import FeaturesUtils
+
+ # Load the ControlNet model
+ try:
+     from controlnet_union import ControlNetModel_Union
+     from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
+
+     # Configure and load ControlNet
+     config_file = hf_hub_download(
+         "xinsir/controlnet-union-sdxl-1.0",
+         filename="config_promax.json",
+     )
+
+     config = ControlNetModel_Union.load_config(config_file)
+     controlnet_model = ControlNetModel_Union.from_config(config)
+
+     model_file = hf_hub_download(
+         "xinsir/controlnet-union-sdxl-1.0",
+         filename="diffusion_pytorch_model_promax.safetensors",
+     )
+     state_dict = load_state_dict(model_file)
+     loaded_keys = list(state_dict.keys())
+
+     result = ControlNetModel_Union._load_pretrained_model(
+         controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
+     )
+
+     model = result[0]
+     model = model.to(device="cuda", dtype=torch.float16)
+
+     # Load the VAE
+     vae = AutoencoderKL.from_pretrained(
+         "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
+     ).to("cuda")
+
+     # Load the pipeline
+     pipe = StableDiffusionXLFillPipeline.from_pretrained(
+         "SG161222/RealVisXL_V5.0_Lightning",
+         torch_dtype=torch.float16,
+         vae=vae,
+         controlnet=model,
+         variant="fp16",
+     ).to("cuda")
+
+     pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+     OUTPAINT_MODEL_LOADED = True
+ except Exception as e:
+     logging.error(f"Failed to load outpainting models: {str(e)}")
+     OUTPAINT_MODEL_LOADED = False
+
+ # MMAudio model setup
+ if torch.cuda.is_available():
+     device = torch.device("cuda")
+     torch.backends.cuda.matmul.allow_tf32 = True
+     torch.backends.cudnn.allow_tf32 = True
+     torch.backends.cudnn.benchmark = True
+ else:
+     device = torch.device("cpu")
+
+ dtype = torch.bfloat16
+
+ # MMAudio model initialization
+ try:
+     model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
+     model_mmaudio.download_if_needed()
+     output_dir = Path('./output/gradio')
+     setup_eval_logging()
+
+     # Translator setup (Korean -> English prompts)
+     try:
+         translator = pipeline("translation",
+                               model="Helsinki-NLP/opus-mt-ko-en",
+                               device="cpu",
+                               use_fast=True,
+                               trust_remote_code=False)
+     except Exception as e:
+         logging.warning(f"Failed to load translation model: {e}")
+         translator = None
+
+     def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
+         with torch.cuda.device(device):
+             seq_cfg = model_mmaudio.seq_cfg
+             net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, dtype).eval()
+             net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
+             logging.info(f'Loaded weights from {model_mmaudio.model_path}')
+
+             feature_utils = FeaturesUtils(
+                 tod_vae_ckpt=model_mmaudio.vae_path,
+                 synchformer_ckpt=model_mmaudio.synchformer_ckpt,
+                 enable_conditions=True,
+                 mode=model_mmaudio.mode,
+                 bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
+                 need_vae_encoder=False
+             ).to(device, dtype).eval()
+
+             return net, feature_utils, seq_cfg
+
+     net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
+     MMAUDIO_MODEL_LOADED = True
+ except Exception as e:
+     logging.error(f"Failed to load MMAudio models: {str(e)}")
+     MMAUDIO_MODEL_LOADED = False
+     translator = None
+
+ # API URLs
+ TEXT2IMG_API_URL = "http://211.233.58.201:7896"
+ VIDEO_API_URL = "http://211.233.58.201:7875"
+
+ # Logging setup
+ logging.basicConfig(level=logging.INFO)
+
+ # Image size presets
+ IMAGE_PRESETS = {
+     "Custom": {"width": 1024, "height": 1024},
+     "1:1 Square": {"width": 1024, "height": 1024},
+     "4:3 Standard": {"width": 1024, "height": 768},
+     "16:9 Widescreen": {"width": 1024, "height": 576},
+     "9:16 Portrait": {"width": 576, "height": 1024},
+     "6:19 Tall Portrait": {"width": 324, "height": 1024},
+     "Instagram Square": {"width": 1080, "height": 1080},
+     "Instagram Story": {"width": 1080, "height": 1920},
+     "Instagram Landscape": {"width": 1080, "height": 566},
+     "Facebook Cover": {"width": 820, "height": 312},
+     "Twitter Header": {"width": 1500, "height": 500},
+     "YouTube Thumbnail": {"width": 1280, "height": 720},
+     "LinkedIn Banner": {"width": 1584, "height": 396},
+ }
+
+ def update_dimensions(preset):
+     if preset in IMAGE_PRESETS:
+         return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
+     return 1024, 1024
+
+ def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
+     if not prompt:
+         return None, "Please enter a prompt"
+
+     try:
+         client = Client(TEXT2IMG_API_URL)
+         if seed == -1:
+             seed = random.randint(0, 9999999)
+
+         result = client.predict(
+             prompt=prompt,
+             width=int(width),
+             height=int(height),
+             guidance=float(guidance),
+             inference_steps=int(inference_steps),
+             seed=int(seed),
+             do_img2img=False,
+             init_image=None,
+             image2image_strength=0.8,
+             resize_img=True,
+             api_name="/generate_image"
+         )
+         return result[0], f"Seed used: {result[1]}"
+     except Exception as e:
+         logging.error(f"Image generation error: {str(e)}")
+         return None, f"Error: {str(e)}"
+
+ def generate_video_from_image(image, prompt="", length=4.0):
+     if image is None:
+         return None
+
+     try:
+         # Save the image to a temporary file
+         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
+             temp_path = fp.name
+             Image.fromarray(image).save(temp_path)
+
+         # Call the video API
+         client = Client(VIDEO_API_URL)
+         result = client.predict(
+             input_image=handle_file(temp_path),
+             prompt=prompt if prompt else "Generate natural motion",
+             n_prompt="",
+             seed=random.randint(0, 9999999),
+             use_teacache=True,
+             video_length=float(length),
+             api_name="/process"
+         )
+
+         os.unlink(temp_path)
+
+         if result and len(result) > 0:
+             video_dict = result[0]
+             return video_dict.get("video") if isinstance(video_dict, dict) else None
+
+     except Exception as e:
+         logging.error(f"Video generation error: {str(e)}")
+         return None
+
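+ # Assumption about the remote /process endpoint: it returns a list whose first element
+ # is a Gradio video payload such as {"video": "/tmp/xxx.mp4", "subtitles": None}, which
+ # is why generate_video_from_image unwraps result[0].get("video") and otherwise yields
+ # None. If the endpoint returned a bare file path instead, a tolerant unwrap would be:
+ #
+ #     payload = result[0]
+ #     video_path = payload.get("video") if isinstance(payload, dict) else payload
+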
+ def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
+     """Prepare the background image and the outpainting mask"""
+     if image is None:
+         return None, None
+
+     # Convert to a PIL image
+     if isinstance(image, np.ndarray):
+         image = Image.fromarray(image).convert('RGB')
+
+     target_size = (width, height)
+
+     # Scale the image to fit within the target size
+     scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
+     new_width = int(image.width * scale_factor)
+     new_height = int(image.height * scale_factor)
+
+     # Resize the image
+     source = image.resize((new_width, new_height), Image.LANCZOS)
+
+     # Compute the overlap
+     overlap_x = int(new_width * (overlap_percentage / 100))
+     overlap_y = int(new_height * (overlap_percentage / 100))
+     overlap_x = max(overlap_x, 1)
+     overlap_y = max(overlap_y, 1)
+
+     # Compute margins based on the alignment
+     if alignment == "Center":
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "Left":
+         margin_x = 0
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "Right":
+         margin_x = target_size[0] - new_width
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "Top":
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = 0
+     else:  # "Bottom" (also the fallback, so margins are always defined)
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = target_size[1] - new_height
+
+     # Create the background image
+     background = Image.new('RGB', target_size, (255, 255, 255))
+     background.paste(source, (margin_x, margin_y))
+
+     # Create the mask
+     mask = Image.new('L', target_size, 255)
+     mask_draw = ImageDraw.Draw(mask)
+
+     # Draw the mask region (keyed to the alignment choices)
+     white_gaps_patch = 2
+
+     left_overlap = margin_x + overlap_x if alignment != "Left" else margin_x
+     right_overlap = margin_x + new_width - overlap_x if alignment != "Right" else margin_x + new_width
+     top_overlap = margin_y + overlap_y if alignment != "Top" else margin_y
+     bottom_overlap = margin_y + new_height - overlap_y if alignment != "Bottom" else margin_y + new_height
+
+     mask_draw.rectangle([
+         (left_overlap, top_overlap),
+         (right_overlap, bottom_overlap)
+     ], fill=0)
+
+     return background, mask
+
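+ # Mask convention assumed by the fill pipeline below: white (255) = pixels to generate,
+ # black (0) = pixels to keep. The black rectangle is inset by overlap_x/overlap_y so a
+ # thin band of the source is regenerated too, which blends the seam. A quick sketch
+ # with an illustrative 512x512 black input:
+ #
+ #     bg, mask = prepare_image_and_mask(np.zeros((512, 512, 3), np.uint8),
+ #                                       1280, 720, overlap_percentage=10,
+ #                                       alignment="Center")
+ #     mask.save("mask_preview.png")  # white frame = region to outpaint
+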
+ def preview_outpaint(image, width, height, overlap_percentage, alignment):
+     """Preview the outpainting layout"""
+     background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
+     if background is None:
+         return None
+
+     # Create the preview image
+     preview = background.copy().convert('RGBA')
+
+     # Semi-transparent red overlay
+     red_overlay = Image.new('RGBA', background.size, (255, 0, 0, 64))
+
+     # Apply the mask
+     red_mask = Image.new('RGBA', background.size, (0, 0, 0, 0))
+     red_mask.paste(red_overlay, (0, 0), mask)
+
+     # Composite the overlay
+     preview = Image.alpha_composite(preview, red_mask)
+
+     return preview
+
+ @spaces.GPU(duration=24)
+ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
+     """Run image outpainting"""
+     if image is None:
+         return None
+
+     if not OUTPAINT_MODEL_LOADED:
+         return Image.new('RGB', (width, height), (200, 200, 200))
+
+     try:
+         # Prepare the image and mask
+         background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
+         if background is None:
+             return None
+
+         # Create cnet_image (masked region filled with black)
+         cnet_image = background.copy()
+         cnet_image.paste(0, (0, 0), mask)
+
+         # Prepare the prompt
+         final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
+
+         # Run on the GPU
+         with torch.autocast(device_type="cuda", dtype=torch.float16):
+             (
+                 prompt_embeds,
+                 negative_prompt_embeds,
+                 pooled_prompt_embeds,
+                 negative_pooled_prompt_embeds,
+             ) = pipe.encode_prompt(final_prompt, "cuda", True)
+
+             # Generation loop (the pipeline yields intermediate images)
+             for generated_image in pipe(
+                 prompt_embeds=prompt_embeds,
+                 negative_prompt_embeds=negative_prompt_embeds,
+                 pooled_prompt_embeds=pooled_prompt_embeds,
+                 negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                 image=cnet_image,
+                 num_inference_steps=num_steps
+             ):
+                 # Intermediate results (use if needed)
+                 pass
+
+             # Final image (last value yielded by the loop above)
+             final_image = generated_image
+
+         # Convert to RGBA and paste over the masked region
+         final_image = final_image.convert("RGBA")
+         cnet_image.paste(final_image, (0, 0), mask)
+
+         return cnet_image
+
+     except Exception as e:
+         logging.error(f"Outpainting error: {str(e)}")
+         return background if 'background' in locals() else None
+
+ # MMAudio helper functions
+ def translate_prompt(text):
+     try:
+         if translator is None:
+             return text
+
+         # Translate only if the text contains Hangul characters (U+3131 through U+D7A3)
+         if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
+             with torch.no_grad():
+                 translation = translator(text)[0]['translation_text']
+                 return translation
+         return text
+     except Exception as e:
+         logging.error(f"Translation error: {e}")
+         return text
+
+ @spaces.GPU
+ @torch.inference_mode()
+ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
+                    cfg_strength: float, duration: float):
+     if not MMAUDIO_MODEL_LOADED:
+         return None
+
+     prompt = translate_prompt(prompt)
+     negative_prompt = translate_prompt(negative_prompt)
+
+     rng = torch.Generator(device=device)
+     rng.manual_seed(seed)
+     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+
+     clip_frames, sync_frames, duration = load_video(video, duration)
+     clip_frames = clip_frames.unsqueeze(0)
+     sync_frames = sync_frames.unsqueeze(0)
+     seq_cfg.duration = duration
+     net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
+
+     audios = generate(clip_frames,
+                       sync_frames, [prompt],
+                       negative_text=[negative_prompt],
+                       feature_utils=feature_utils,
+                       net=net_mmaudio,
+                       fm=fm,
+                       rng=rng,
+                       cfg_strength=cfg_strength)
+     audio = audios.float().cpu()[0]
+
+     video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
+     make_video(video,
+                video_save_path,
+                audio,
+                sampling_rate=seq_cfg.sampling_rate,
+                duration_sec=seq_cfg.duration)
+     return video_save_path
+
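+ # The UI below passes duration=9999 by default; load_video is assumed to clamp the
+ # requested duration to the clip's actual length (it returns the effective duration,
+ # which is written back into seq_cfg above), so 9999 effectively means "use the whole
+ # video". If that assumption does not hold for a given MMAudio version, pass an
+ # explicit duration in seconds instead.
+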
+ # CSS
+ css = """
+ :root {
+     --primary-color: #f8c3cd;
+     --secondary-color: #b3e5fc;
+     --background-color: #f5f5f7;
+     --card-background: #ffffff;
+     --text-color: #424242;
+     --accent-color: #ffb6c1;
+     --success-color: #c8e6c9;
+     --warning-color: #fff9c4;
+     --shadow-color: rgba(0, 0, 0, 0.1);
+     --border-radius: 12px;
+ }
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: 0 auto !important;
+ }
+ .panel-box {
+     border-radius: var(--border-radius) !important;
+     box-shadow: 0 8px 16px var(--shadow-color) !important;
+     background-color: var(--card-background) !important;
+     padding: 20px !important;
+     margin-bottom: 20px !important;
+ }
+ #generate-btn, #video-btn, #outpaint-btn, #preview-btn, #audio-btn {
+     background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
+     font-size: 1.1rem !important;
+     padding: 12px 24px !important;
+     margin-top: 10px !important;
+     width: 100% !important;
+ }
+ .tabitem {
+     min-height: 700px !important;
+ }
+ """
+
+ # Gradio Interface
+ demo = gr.Blocks(css=css, title="AI Image & Video & Audio Generator")
+
+ with demo:
+     gr.Markdown("# 🎨 Ginigen Studio")
+
+     with gr.Tabs() as tabs:
+         # First tab: text to image to video
+         with gr.Tab("Text→Image→Video", elem_classes="tabitem"):
+             with gr.Row(equal_height=True):
+                 # Input column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 📝 Image Generation Settings")
+
+                         prompt = gr.Textbox(
+                             label="Prompt (Korean/English supported)",
+                             placeholder="Describe the image you want to generate...",
+                             lines=3
+                         )
+
+                         size_preset = gr.Dropdown(
+                             choices=list(IMAGE_PRESETS.keys()),
+                             value="1:1 Square",
+                             label="Size preset"
+                         )
+
+                         with gr.Row():
+                             width = gr.Slider(256, 2048, 1024, step=64, label="Width")
+                             height = gr.Slider(256, 2048, 1024, step=64, label="Height")
+
+                         with gr.Row():
+                             guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="Guidance")
+                             steps = gr.Slider(1, 50, 30, step=1, label="Steps")
+
+                         seed = gr.Number(label="Seed (-1 = random)", value=-1)
+
+                         generate_btn = gr.Button("🎨 Generate Image", variant="primary", elem_id="generate-btn")
+
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🎬 Video Generation Settings")
+
+                         video_prompt = gr.Textbox(
+                             label="(Optional) Video prompt (in English)",
+                             placeholder="Describe the video's motion... (leave empty for default motion)",
+                             lines=2
+                         )
+
+                         video_length = gr.Slider(
+                             minimum=1,
+                             maximum=60,
+                             value=4,
+                             step=0.5,
+                             label="Video length (seconds)",
+                             info="Choose between 1 and 60 seconds"
+                         )
+
+                         video_btn = gr.Button("🎬 Convert to Video", variant="secondary", elem_id="video-btn")
+
+                 # Output column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ Generation Results")
+
+                         output_image = gr.Image(label="Generated image", type="numpy")
+                         output_seed = gr.Textbox(label="Seed info")
+                         output_video = gr.Video(label="Generated video")
+
+         # Second tab: image outpainting
+         with gr.Tab("Image Ratio Change/Generation", elem_classes="tabitem"):
+             with gr.Row(equal_height=True):
+                 # Input column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ Image Upload")
+
+                         input_image = gr.Image(
+                             label="Original image",
+                             type="numpy"
+                         )
+
+                         outpaint_prompt = gr.Textbox(
+                             label="Prompt (optional)",
+                             placeholder="Describe the area to extend...",
+                             lines=2
+                         )
+
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### ⚙️ Outpainting Settings")
+
+                         outpaint_size_preset = gr.Dropdown(
+                             choices=list(IMAGE_PRESETS.keys()),
+                             value="16:9 Widescreen",
+                             label="Target size preset"
+                         )
+
+                         with gr.Row():
+                             outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="Target width")
+                             outpaint_height = gr.Slider(256, 2048, 720, step=64, label="Target height")
+
+                         alignment = gr.Dropdown(
+                             choices=["Center", "Left", "Right", "Top", "Bottom"],
+                             value="Center",
+                             label="Alignment"
+                         )
+
+                         overlap_percentage = gr.Slider(
+                             minimum=1,
+                             maximum=50,
+                             value=10,
+                             step=1,
+                             label="Mask overlap (%)"
+                         )
+
+                         outpaint_steps = gr.Slider(
+                             minimum=4,
+                             maximum=12,
+                             value=8,
+                             step=1,
+                             label="Inference steps"
+                         )
+
+                         preview_btn = gr.Button("👁️ Preview", elem_id="preview-btn")
+                         outpaint_btn = gr.Button("🎨 Run Outpainting", variant="primary", elem_id="outpaint-btn")
+
+                 # Output column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ Results")
+
+                         preview_image = gr.Image(label="Preview")
+                         outpaint_result = gr.Image(label="Outpainting result")
+
+         # Third tab: video + audio
+         with gr.Tab("Video + Audio", elem_classes="tabitem"):
+             with gr.Row(equal_height=True):
+                 # Input column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🎥 Video Upload")
+
+                         audio_video_input = gr.Video(
+                             label="Input video",
+                             sources=["upload"]
+                         )
+
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🎵 Audio Generation Settings")
+
+                         audio_prompt = gr.Textbox(
+                             label="Prompt (Korean supported)" if MMAUDIO_MODEL_LOADED and translator else "Prompt",
+                             placeholder="Describe the audio you want to generate... (e.g., peaceful piano music)",
+                             lines=3
+                         )
+
+                         audio_negative_prompt = gr.Textbox(
+                             label="Negative prompt",
+                             value="music",
+                             placeholder="Unwanted elements...",
+                             lines=2
+                         )
+
+                         with gr.Row():
+                             audio_seed = gr.Number(label="Seed", value=0)
+                             audio_steps = gr.Number(label="Steps", value=25)
+
+                         with gr.Row():
+                             audio_cfg = gr.Number(label="Guidance scale", value=4.5)
+                             audio_duration = gr.Number(label="Duration (seconds)", value=9999)
+
+                         audio_btn = gr.Button("🎵 Generate & Merge Audio", variant="primary", elem_id="audio-btn")
+
+                 # Output column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🎬 Generation Results")
+
+                         output_video_with_audio = gr.Video(
+                             label="Video with audio added",
+                             interactive=False
+                         )
+
+                         if not MMAUDIO_MODEL_LOADED:
+                             gr.Markdown("⚠️ Could not load the MMAudio model. This feature is unavailable.")
+
+     # Event wiring - first tab
+     size_preset.change(update_dimensions, [size_preset], [width, height])
+
+     generate_btn.click(
+         generate_text_to_image,
+         [prompt, width, height, guidance, steps, seed],
+         [output_image, output_seed]
+     )
+
+     video_btn.click(
+         lambda img, v_prompt, length: generate_video_from_image(img, v_prompt, length) if img is not None else None,
+         [output_image, video_prompt, video_length],
+         [output_video]
+     )
+
+     # Event wiring - second tab
+     outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])
+
+     preview_btn.click(
+         preview_outpaint,
+         [input_image, outpaint_width, outpaint_height, overlap_percentage, alignment],
+         [preview_image]
+     )
+
+     outpaint_btn.click(
+         outpaint_image,
+         [input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
+         [outpaint_result]
+     )
+
+     # Event wiring - third tab
+     audio_btn.click(
+         video_to_audio,
+         [audio_video_input, audio_prompt, audio_negative_prompt, audio_seed, audio_steps, audio_cfg, audio_duration],
+         [output_video_with_audio]
+     )
+
+ demo.launch()