openfree committed
Commit ee252b7 · verified · 1 Parent(s): 98fa5ec

Update app.py

Files changed (1):
  1. app.py +597 -319

app.py CHANGED
@@ -1,354 +1,632 @@
- import spaces
- import logging
- from datetime import datetime
- from pathlib import Path
  import gradio as gr
- import torch
- import torchaudio
- import os
- import requests
- from transformers import pipeline
- import tempfile
  import numpy as np
- from einops import rearrange
- import cv2
- from scipy.io import wavfile
- import librosa
- import json
- from typing import Optional, Tuple, List
- import atexit

- # Bypass the torch.load check via an environment variable (temporary workaround)
  os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"

  try:
      import mmaudio
  except ImportError:
-     os.system("pip install -e .")
-     import mmaudio
-
- from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
-                                 setup_eval_logging)
- from mmaudio.model.flow_matching import FlowMatching
- from mmaudio.model.networks import MMAudio, get_my_mmaudio
- from mmaudio.model.sequence_config import SequenceConfig
- from mmaudio.model.utils.features_utils import FeaturesUtils

- # Logging setup
- logging.basicConfig(
-     level=logging.INFO,
-     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
- )
- log = logging.getLogger()
-
- # CUDA setup
- if torch.cuda.is_available():
-     device = torch.device("cuda")
-     torch.backends.cuda.matmul.allow_tf32 = True
-     torch.backends.cudnn.allow_tf32 = True
-     torch.backends.cudnn.benchmark = True
- else:
-     device = torch.device("cpu")
-
- dtype = torch.bfloat16
-
- # Model configuration
- model: ModelConfig = all_model_cfg['large_44k_v2']
- model.download_if_needed()
- output_dir = Path('./output/gradio')
-
- setup_eval_logging()
-
- # Translator setup - try safetensors first
  try:
-     # First check whether a safetensors build is available
-     translator = pipeline("translation",
-                           model="Helsinki-NLP/opus-mt-ko-en",
-                           device="cpu",
-                           use_fast=True,  # use the fast tokenizer
-                           trust_remote_code=False)
  except Exception as e:
-     log.warning(f"Failed to load translation model with safetensors: {e}")
-     # Fallback: load after setting the environment variable
-     try:
-         translator = pipeline("translation",
-                               model="Helsinki-NLP/opus-mt-ko-en",
-                               device="cpu")
-     except Exception as e2:
-         log.error(f"Failed to load translation model: {e2}")
-         translator = None
-
- PIXABAY_API_KEY = "33492762-a28a596ec4f286f84cd328b17"
-
- def cleanup_temp_files():
-     temp_dir = tempfile.gettempdir()
-     for file in os.listdir(temp_dir):
-         if file.endswith(('.mp4', '.flac')):
-             try:
-                 os.remove(os.path.join(temp_dir, file))
-             except:
-                 pass
-
- atexit.register(cleanup_temp_files)

- def get_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
-     with torch.cuda.device(device):
-         seq_cfg = model.seq_cfg
-         net: MMAudio = get_my_mmaudio(model.model_name).to(device, dtype).eval()
-         net.load_weights(torch.load(model.model_path, map_location=device, weights_only=True))
-         log.info(f'Loaded weights from {model.model_path}')

-         feature_utils = FeaturesUtils(
-             tod_vae_ckpt=model.vae_path,
-             synchformer_ckpt=model.synchformer_ckpt,
-             enable_conditions=True,
-             mode=model.mode,
-             bigvgan_vocoder_ckpt=model.bigvgan_16k_path,
-             need_vae_encoder=False
-         ).to(device, dtype).eval()

-         return net, feature_utils, seq_cfg

- net, feature_utils, seq_cfg = get_model()

- # Modified translate_prompt function
- def translate_prompt(text):
      try:
-         # If no translator is available, return the original text
-         if translator is None:
-             return text

-         if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
-             # Run the translation on CPU
-             with torch.no_grad():
-                 translation = translator(text)[0]['translation_text']
-                 return translation
-         return text
      except Exception as e:
-         logging.error(f"Translation error: {e}")
-         return text

- # Modified search_videos function
- @torch.no_grad()
- def search_videos(query):
      try:
-         # Run the translation on CPU
-         query = translate_prompt(query)
-         return search_pixabay_videos(query, PIXABAY_API_KEY)
      except Exception as e:
-         logging.error(f"Video search error: {e}")
-         return []

- def search_pixabay_videos(query, api_key):
      try:
-         base_url = "https://pixabay.com/api/videos/"
-         params = {
-             "key": api_key,
-             "q": query,
-             "per_page": 40
-         }

-         response = requests.get(base_url, params=params)
-         if response.status_code == 200:
-             data = response.json()
-             return [video['videos']['large']['url'] for video in data.get('hits', [])]
-         return []
      except Exception as e:
-         logging.error(f"Pixabay API error: {e}")
-         return []
-
- @spaces.GPU
- @torch.inference_mode()
- def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
-                    cfg_strength: float, duration: float):
-     prompt = translate_prompt(prompt)
-     negative_prompt = translate_prompt(negative_prompt)
-
-     rng = torch.Generator(device=device)
-     rng.manual_seed(seed)
-     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-
-     clip_frames, sync_frames, duration = load_video(video, duration)
-     clip_frames = clip_frames.unsqueeze(0)
-     sync_frames = sync_frames.unsqueeze(0)
-     seq_cfg.duration = duration
-     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-     audios = generate(clip_frames,
-                       sync_frames, [prompt],
-                       negative_text=[negative_prompt],
-                       feature_utils=feature_utils,
-                       net=net,
-                       fm=fm,
-                       rng=rng,
-                       cfg_strength=cfg_strength)
-     audio = audios.float().cpu()[0]
-
-     video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
-     make_video(video,
-                video_save_path,
-                audio,
-                sampling_rate=seq_cfg.sampling_rate,
-                duration_sec=seq_cfg.duration)
-     return video_save_path
-
- @spaces.GPU
- @torch.inference_mode()
- def text_to_audio(prompt: str, negative_prompt: str, seed: int, num_steps: int, cfg_strength: float,
-                   duration: float):
-     prompt = translate_prompt(prompt)
-     negative_prompt = translate_prompt(negative_prompt)
-
-     rng = torch.Generator(device=device)
-     rng.manual_seed(seed)
-     fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
-
-     clip_frames = sync_frames = None
-     seq_cfg.duration = duration
-     net.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
-
-     audios = generate(clip_frames,
-                       sync_frames, [prompt],
-                       negative_text=[negative_prompt],
-                       feature_utils=feature_utils,
-                       net=net,
-                       fm=fm,
-                       rng=rng,
-                       cfg_strength=cfg_strength)
-     audio = audios.float().cpu()[0]
-
-     audio_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.flac').name
-     torchaudio.save(audio_save_path, audio, seq_cfg.sampling_rate)
-     return audio_save_path
-
- # CSS styles
- custom_css = """
- .gradio-container {
-     background: linear-gradient(45deg, #1a1a1a, #2a2a2a);
-     border-radius: 15px;
-     box-shadow: 0 8px 32px rgba(0,0,0,0.3);
-     color: #e0e0e0;
- }
-
- .input-container, .output-container {
-     background: rgba(40, 40, 40, 0.95);
-     backdrop-filter: blur(10px);
-     border-radius: 10px;
-     padding: 20px;
-     transform-style: preserve-3d;
-     transition: transform 0.3s ease;
-     border: 1px solid rgba(255, 255, 255, 0.1);
- }
-
- .input-container:hover {
-     transform: translateZ(20px);
-     box-shadow: 0 8px 32px rgba(0,0,0,0.5);
- }
-
- .gallery-item {
-     transition: transform 0.3s ease;
-     border-radius: 8px;
-     overflow: hidden;
-     background: #2a2a2a;
- }
-
- .gallery-item:hover {
-     transform: scale(1.05);
-     box-shadow: 0 4px 15px rgba(0,0,0,0.4);
- }
-
- .tabs {
-     background: rgba(30, 30, 30, 0.95);
-     border-radius: 10px;
-     padding: 10px;
-     border: 1px solid rgba(255, 255, 255, 0.05);
- }

- button {
-     background: linear-gradient(45deg, #2196F3, #1976D2);
-     border: none;
-     border-radius: 5px;
-     transition: all 0.3s ease;
-     color: white
  }
-
- button:hover {
-     transform: translateY(-2px);
-     box-shadow: 0 4px 15px rgba(33,150,243,0.3);
  }
-
- textarea, input[type="text"], input[type="number"] {
-     background: rgba(30, 30, 30, 0.95) !important;
-     border: 1px solid rgba(255, 255, 255, 0.1) !important;
-     color: #e0e0e0 !important;
-     border-radius: 5px !important;
  }
-
- label {
-     color: #e0e0e0 !important;
  }
-
- .gallery {
-     background: rgba(30, 30, 30, 0.95);
-     padding: 15px;
-     border-radius: 10px;
-     border: 1px solid rgba(255, 255, 255, 0.05);
  }
  """

- css = """
- footer {
-     visibility: hidden;
- }
- """ + custom_css
-
- # Create the Gradio interface
- text_to_audio_tab = gr.Interface(
-     fn=text_to_audio,
-     inputs=[
-         gr.Textbox(label="Prompt(한글지원)" if translator else "Prompt"),
-         gr.Textbox(label="Negative Prompt"),
-         gr.Number(label="Seed", value=0),
-         gr.Number(label="Steps", value=25),
-         gr.Number(label="Guidance Scale", value=4.5),
-         gr.Number(label="Duration (sec)", value=8),
-     ],
-     outputs=gr.Audio(label="Generated Audio"),
-     css=custom_css
- )
-
- video_to_audio_tab = gr.Interface(
-     fn=video_to_audio,
-     inputs=[
-         gr.Video(label="Input Video"),
-         gr.Textbox(label="Prompt(한글지원)" if translator else "Prompt"),
-         gr.Textbox(label="Negative Prompt", value="music"),
-         gr.Number(label="Seed", value=0),
-         gr.Number(label="Steps", value=25),
-         gr.Number(label="Guidance Scale", value=4.5),
-         gr.Number(label="Duration (sec)", value=8),
-     ],
-     outputs=gr.Video(label="Generated Result"),
-     css=custom_css
- )
-
- video_search_tab = gr.Interface(
-     fn=search_videos,
-     inputs=gr.Textbox(label="Search Query(한글지원)" if translator else "Search Query"),
-     outputs=gr.Gallery(label="Search Results", columns=4, rows=20),
-     css=custom_css,
-     api_name=False
- )
-
- # Main entry point
- if __name__ == "__main__":
-     # Warn if the translator failed to load
-     if translator is None:
-         log.warning("Translation model failed to load. Korean translation will be disabled.")

-     gr.TabbedInterface(
-         [video_search_tab, video_to_audio_tab, text_to_audio_tab],
-         ["Video Search", "Video-to-Audio", "Text-to-Audio"],
-         theme="soft",
-         css=css
-     ).launch(allowed_paths=[output_dir])
  import gradio as gr
  import numpy as np
+ from PIL import Image, ImageDraw
+ from gradio_client import Client, handle_file
+ import random
+ import tempfile
+ import os
+ import logging
+ import torch
+ from diffusers import AutoencoderKL, TCDScheduler
+ from diffusers.models.model_loading_utils import load_state_dict
+ from huggingface_hub import hf_hub_download

+ # Spaces GPU
+ try:
+     import spaces
+ except:
+     # Dummy decorator for when the GPU decorator is unavailable
+     class spaces:
+         @staticmethod
+         def GPU(duration=None):
+             def decorator(func):
+                 return func
+             return decorator
+
+ # Environment variables
  os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"

+ # MMAudio imports
  try:
      import mmaudio
+     from mmaudio.eval_utils import (ModelConfig, all_model_cfg, generate, load_video, make_video,
+                                     setup_eval_logging)
+     from mmaudio.model.flow_matching import FlowMatching
+     from mmaudio.model.networks import MMAudio, get_my_mmaudio
+     from mmaudio.model.sequence_config import SequenceConfig
+     from mmaudio.model.utils.features_utils import FeaturesUtils
+     MMAUDIO_AVAILABLE = True
  except ImportError:
+     MMAUDIO_AVAILABLE = False
+     logging.warning("MMAudio not available. Sound generation will be disabled.")

+ # Load the ControlNet model
  try:
+     from controlnet_union import ControlNetModel_Union
+     from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
+
+     # Configure and load ControlNet
+     config_file = hf_hub_download(
+         "xinsir/controlnet-union-sdxl-1.0",
+         filename="config_promax.json",
+     )
+
+     config = ControlNetModel_Union.load_config(config_file)
+     controlnet_model = ControlNetModel_Union.from_config(config)
+
+     model_file = hf_hub_download(
+         "xinsir/controlnet-union-sdxl-1.0",
+         filename="diffusion_pytorch_model_promax.safetensors",
+     )
+     state_dict = load_state_dict(model_file)
+     loaded_keys = list(state_dict.keys())
+
+     result = ControlNetModel_Union._load_pretrained_model(
+         controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
+     )
+
+     model = result[0]
+     model = model.to(device="cuda", dtype=torch.float16)
+
+     # Load the VAE
+     vae = AutoencoderKL.from_pretrained(
+         "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
+     ).to("cuda")
+
+     # Load the pipeline
+     pipe = StableDiffusionXLFillPipeline.from_pretrained(
+         "SG161222/RealVisXL_V5.0_Lightning",
+         torch_dtype=torch.float16,
+         vae=vae,
+         controlnet=model,
+         variant="fp16",
+     ).to("cuda")
+
+     pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
+
+     OUTPAINT_MODEL_LOADED = True
  except Exception as e:
+     logging.error(f"Failed to load outpainting models: {str(e)}")
+     OUTPAINT_MODEL_LOADED = False

+ # Load the MMAudio model configuration
+ if MMAUDIO_AVAILABLE:
+     try:
+         # CUDA setup
+         if torch.cuda.is_available():
+             device = torch.device("cuda")
+             torch.backends.cuda.matmul.allow_tf32 = True
+             torch.backends.cudnn.allow_tf32 = True
+             torch.backends.cudnn.benchmark = True
+         else:
+             device = torch.device("cpu")
+
+         dtype = torch.bfloat16
+
+         # Model configuration
+         model_cfg: ModelConfig = all_model_cfg['large_44k_v2']
+         model_cfg.download_if_needed()
+
+         setup_eval_logging()
+
+         # Load the model
+         def get_mmaudio_model():
+             with torch.cuda.device(device):
+                 seq_cfg = model_cfg.seq_cfg
+                 net: MMAudio = get_my_mmaudio(model_cfg.model_name).to(device, dtype).eval()
+                 net.load_weights(torch.load(model_cfg.model_path, map_location=device, weights_only=True))
+                 logging.info(f'Loaded MMAudio weights from {model_cfg.model_path}')
+
+                 feature_utils = FeaturesUtils(
+                     tod_vae_ckpt=model_cfg.vae_path,
+                     synchformer_ckpt=model_cfg.synchformer_ckpt,
+                     enable_conditions=True,
+                     mode=model_cfg.mode,
+                     bigvgan_vocoder_ckpt=model_cfg.bigvgan_16k_path,
+                     need_vae_encoder=False
+                 ).to(device, dtype).eval()
+
+                 return net, feature_utils, seq_cfg
+
+         mmaudio_net, mmaudio_feature_utils, mmaudio_seq_cfg = get_mmaudio_model()
+         MMAUDIO_LOADED = True
+     except Exception as e:
+         logging.error(f"Failed to load MMAudio models: {str(e)}")
+         MMAUDIO_LOADED = False
+ else:
+     MMAUDIO_LOADED = False

+ # API URLs
+ TEXT2IMG_API_URL = "http://211.233.58.201:7896"
+ VIDEO_API_URL = "http://211.233.58.201:7875"

+ # Logging setup
+ logging.basicConfig(level=logging.INFO)
+
+ # Image size presets
+ IMAGE_PRESETS = {
+     "커스텀": {"width": 1024, "height": 1024},
+     "1:1 정사각형": {"width": 1024, "height": 1024},
+     "4:3 표준": {"width": 1024, "height": 768},
+     "16:9 와이드스크린": {"width": 1024, "height": 576},
+     "9:16 세로형": {"width": 576, "height": 1024},
+     "6:19 특수 세로형": {"width": 324, "height": 1024},
+     "Instagram 정사각형": {"width": 1080, "height": 1080},
+     "Instagram 스토리": {"width": 1080, "height": 1920},
+     "Instagram 가로형": {"width": 1080, "height": 566},
+     "Facebook 커버": {"width": 820, "height": 312},
+     "Twitter 헤더": {"width": 1500, "height": 500},
+     "YouTube 썸네일": {"width": 1280, "height": 720},
+     "LinkedIn 배너": {"width": 1584, "height": 396},
+ }

+ def update_dimensions(preset):
+     if preset in IMAGE_PRESETS:
+         return IMAGE_PRESETS[preset]["width"], IMAGE_PRESETS[preset]["height"]
+     return 1024, 1024

+ def generate_text_to_image(prompt, width, height, guidance, inference_steps, seed):
+     if not prompt:
+         return None, "프롬프트를 입력해주세요"
+
      try:
+         client = Client(TEXT2IMG_API_URL)
+         if seed == -1:
+             seed = random.randint(0, 9999999)

+         result = client.predict(
+             prompt=prompt,
+             width=int(width),
+             height=int(height),
+             guidance=float(guidance),
+             inference_steps=int(inference_steps),
+             seed=int(seed),
+             do_img2img=False,
+             init_image=None,
+             image2image_strength=0.8,
+             resize_img=True,
+             api_name="/generate_image"
+         )
+         return result[0], f"사용된 시드: {result[1]}"
      except Exception as e:
+         logging.error(f"Image generation error: {str(e)}")
+         return None, f"오류: {str(e)}"

+ @spaces.GPU(duration=60)
+ @torch.inference_mode()
+ def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_steps=25, cfg_strength=4.5, duration=8.0):
+     """Add generated sound to a video."""
+     if not MMAUDIO_LOADED:
+         logging.error("MMAudio model not loaded")
+         return video_path
+
      try:
+         rng = torch.Generator(device=device)
+         rng.manual_seed(seed)
+         fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
+
+         # Load the video
+         clip_frames, sync_frames, actual_duration = load_video(video_path, duration)
+         clip_frames = clip_frames.unsqueeze(0)
+         sync_frames = sync_frames.unsqueeze(0)
+         mmaudio_seq_cfg.duration = actual_duration
+         mmaudio_net.update_seq_lengths(mmaudio_seq_cfg.latent_seq_len, mmaudio_seq_cfg.clip_seq_len, mmaudio_seq_cfg.sync_seq_len)
+
+         # Generate the audio
+         audios = generate(clip_frames,
+                           sync_frames, [prompt],
+                           negative_text=[negative_prompt],
+                           feature_utils=mmaudio_feature_utils,
+                           net=mmaudio_net,
+                           fm=fm,
+                           rng=rng,
+                           cfg_strength=cfg_strength)
+         audio = audios.float().cpu()[0]
+
+         # Mux the audio into the video
+         video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
+         make_video(video_path,
+                    video_save_path,
+                    audio,
+                    sampling_rate=mmaudio_seq_cfg.sampling_rate,
+                    duration_sec=mmaudio_seq_cfg.duration)
+
+         return video_save_path
      except Exception as e:
+         logging.error(f"Video to audio error: {str(e)}")
+         return video_path

+ def generate_video_from_image(image, prompt="", length=4.0, sound_generation="사운드 없음", sound_prompt="", sound_negative_prompt="music"):
+     if image is None:
+         return None
+
      try:
+         # Save the image to a temporary file
+         with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as fp:
+             temp_path = fp.name
+             Image.fromarray(image).save(temp_path)

+         # Call the video generation API
+         client = Client(VIDEO_API_URL)
+         result = client.predict(
+             input_image=handle_file(temp_path),
+             prompt=prompt if prompt else "Generate natural motion",
+             n_prompt="",
+             seed=random.randint(0, 9999999),
+             use_teacache=True,
+             video_length=float(length),
+             api_name="/process"
+         )
+
+         os.unlink(temp_path)
+
+         if result and len(result) > 0:
+             video_dict = result[0]
+             video_path = video_dict.get("video") if isinstance(video_dict, dict) else None
+
+             # If the sound generation option was selected
+             if video_path and sound_generation == "사운드 생성" and MMAUDIO_LOADED:
+                 # Fall back to a default when the sound prompt is empty
+                 if not sound_prompt:
+                     sound_prompt = prompt if prompt else "ambient sound"
+
+                 # Add sound to the video
+                 video_with_sound = video_to_audio(
+                     video_path,
+                     sound_prompt,
+                     sound_negative_prompt,
+                     duration=length
+                 )
+                 return video_with_sound
+
+             return video_path
+
      except Exception as e:
+         logging.error(f"Video generation error: {str(e)}")
+         return None
+
+ def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
+     """Prepare the background image and outpainting mask."""
+     if image is None:
+         return None, None
+
+     # Convert to a PIL image
+     if isinstance(image, np.ndarray):
+         image = Image.fromarray(image).convert('RGB')
+
+     target_size = (width, height)
+
+     # Scale the image to fit inside the target size
+     scale_factor = min(target_size[0] / image.width, target_size[1] / image.height)
+     new_width = int(image.width * scale_factor)
+     new_height = int(image.height * scale_factor)
+
+     # Resize the image
+     source = image.resize((new_width, new_height), Image.LANCZOS)
+
+     # Compute the overlap
+     overlap_x = int(new_width * (overlap_percentage / 100))
+     overlap_y = int(new_height * (overlap_percentage / 100))
+     overlap_x = max(overlap_x, 1)
+     overlap_y = max(overlap_y, 1)
+
+     # Compute margins according to the alignment
+     if alignment == "가운데":
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "왼쪽":
+         margin_x = 0
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "오른쪽":
+         margin_x = target_size[0] - new_width
+         margin_y = (target_size[1] - new_height) // 2
+     elif alignment == "위":
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = 0
+     elif alignment == "아래":
+         margin_x = (target_size[0] - new_width) // 2
+         margin_y = target_size[1] - new_height
+
+     # Create the background image
+     background = Image.new('RGB', target_size, (255, 255, 255))
+     background.paste(source, (margin_x, margin_y))
+
+     # Create the mask
+     mask = Image.new('L', target_size, 255)
+     mask_draw = ImageDraw.Draw(mask)
+
+     # Draw the mask region
+     white_gaps_patch = 2
+
+     left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
+     right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
+     top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
+     bottom_overlap = margin_y + new_height - overlap_y if alignment != "아래" else margin_y + new_height
+
+     mask_draw.rectangle([
+         (left_overlap, top_overlap),
+         (right_overlap, bottom_overlap)
+     ], fill=0)
+
+     return background, mask
+
+ @spaces.GPU(duration=24)
+ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment, num_steps=8):
+     """Run image outpainting."""
+     if image is None:
+         return None
+
+     if not OUTPAINT_MODEL_LOADED:
+         return Image.new('RGB', (width, height), (200, 200, 200))
+
+     try:
+         # Prepare the image and mask
+         background, mask = prepare_image_and_mask(image, width, height, overlap_percentage, alignment)
+         if background is None:
+             return None
+
+         # Build cnet_image (mask region filled with black)
+         cnet_image = background.copy()
+         cnet_image.paste(0, (0, 0), mask)
+
+         # Prepare the prompt
+         final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"
+
+         # Run on the GPU
+         with torch.autocast(device_type="cuda", dtype=torch.float16):
+             (
+                 prompt_embeds,
+                 negative_prompt_embeds,
+                 pooled_prompt_embeds,
+                 negative_pooled_prompt_embeds,
+             ) = pipe.encode_prompt(final_prompt, "cuda", True)
+
+             # Generation loop
+             for generated_image in pipe(
+                 prompt_embeds=prompt_embeds,
+                 negative_prompt_embeds=negative_prompt_embeds,
+                 pooled_prompt_embeds=pooled_prompt_embeds,
+                 negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
+                 image=cnet_image,
+                 num_inference_steps=num_steps
+             ):
+                 # Intermediate results (use if needed)
+                 pass
+
+             # Final image
+             final_image = generated_image
+
+         # Convert to RGBA and paste through the mask
+         final_image = final_image.convert("RGBA")
+         cnet_image.paste(final_image, (0, 0), mask)
+
+         return cnet_image
+
+     except Exception as e:
+         logging.error(f"Outpainting error: {str(e)}")
+         return background if 'background' in locals() else None

+ # CSS
+ css = """
+ :root {
+     --primary-color: #f8c3cd;
+     --secondary-color: #b3e5fc;
+     --background-color: #f5f5f7;
+     --card-background: #ffffff;
+     --text-color: #424242;
+     --accent-color: #ffb6c1;
+     --success-color: #c8e6c9;
+     --warning-color: #fff9c4;
+     --shadow-color: rgba(0, 0, 0, 0.1);
+     --border-radius: 12px;
  }
+ .gradio-container {
+     max-width: 1200px !important;
+     margin: 0 auto !important;
  }
+ .panel-box {
+     border-radius: var(--border-radius) !important;
+     box-shadow: 0 8px 16px var(--shadow-color) !important;
+     background-color: var(--card-background) !important;
+     padding: 20px !important;
+     margin-bottom: 20px !important;
  }
+ #generate-btn, #video-btn, #outpaint-btn {
+     background: linear-gradient(135deg, #ff9a9e, #fad0c4) !important;
+     font-size: 1.1rem !important;
+     padding: 12px 24px !important;
+     margin-top: 10px !important;
+     width: 100% !important;
  }
+ .tabitem {
+     min-height: 700px !important;
  }
  """

+ # Gradio Interface
+ demo = gr.Blocks(css=css, title="AI 이미지 & 비디오 생성기")
+
+ with demo:
+     gr.Markdown("# 🎨 Ginigen 스튜디오")
+
+     with gr.Tabs() as tabs:
+         # First tab: text to image
+         with gr.Tab("텍스트→이미지→비디오", elem_classes="tabitem"):
+             with gr.Row(equal_height=True):
+                 # Input column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 📝 이미지 생성 설정")
+
+                         prompt = gr.Textbox(
+                             label="프롬프트(한글/영어 가능)",
+                             placeholder="생성하고 싶은 이미지를 설명하세요...",
+                             lines=3
+                         )
+
+                         size_preset = gr.Dropdown(
+                             choices=list(IMAGE_PRESETS.keys()),
+                             value="1:1 정사각형",
+                             label="크기 프리셋"
+                         )
+
+                         with gr.Row():
+                             width = gr.Slider(256, 2048, 1024, step=64, label="너비")
+                             height = gr.Slider(256, 2048, 1024, step=64, label="높이")
+
+                         with gr.Row():
+                             guidance = gr.Slider(1.0, 20.0, 3.5, step=0.1, label="가이던스")
+                             steps = gr.Slider(1, 50, 30, step=1, label="스텝")
+
+                         seed = gr.Number(label="시드 (-1=랜덤)", value=-1)
+
+                         generate_btn = gr.Button("🎨 이미지 생성", variant="primary", elem_id="generate-btn")
+
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🎬 비디오 생성 설정")
+
+                         video_prompt = gr.Textbox(
+                             label="(선택) 비디오 프롬프트(영어로 입력)",
+                             placeholder="비디오의 움직임을 설명하세요... (비워두면 기본 움직임 적용)",
+                             lines=2
+                         )
+
+                         video_length = gr.Slider(
+                             minimum=1,
+                             maximum=60,
+                             value=4,
+                             step=0.5,
+                             label="비디오 길이 (초)",
+                             info="1초에서 60초까지 선택 가능합니다"
+                         )
+
+                         # Sound generation option
+                         sound_generation = gr.Radio(
+                             choices=["사운드 없음", "사운드 생성"],
+                             value="사운드 없음",
+                             label="사운드 옵션",
+                             info="비디오에 사운드를 추가할지 선택하세요"
+                         )
+
+                         # Sound-related inputs (shown conditionally)
+                         with gr.Column(visible=False) as sound_options:
+                             sound_prompt = gr.Textbox(
+                                 label="사운드 프롬프트 (선택)",
+                                 placeholder="생성할 사운드를 설명하세요... (비워두면 비디오 프롬프트 사용)",
+                                 lines=2
+                             )
+                             sound_negative_prompt = gr.Textbox(
+                                 label="사운드 네거티브 프롬프트",
+                                 value="music",
+                                 lines=1
+                             )
+
+                         video_btn = gr.Button("🎬 비디오로 변환", variant="secondary", elem_id="video-btn")
+
+                 # Output column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ 생성 결과")
+
+                         output_image = gr.Image(label="생성된 이미지", type="numpy")
+                         output_seed = gr.Textbox(label="시드 정보")
+                         output_video = gr.Video(label="생성된 비디오")

+         # Second tab: image outpainting
+         with gr.Tab("이미지 비율 변경/생성", elem_classes="tabitem"):
+             with gr.Row(equal_height=True):
+                 # Input column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ 이미지 업로드")
+
+                         input_image = gr.Image(
+                             label="원본 이미지",
+                             type="numpy"
+                         )
+
+                         outpaint_prompt = gr.Textbox(
+                             label="프롬프트 (선택)",
+                             placeholder="확장할 영역에 대한 설명...",
+                             lines=2
+                         )
+
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### ⚙️ 아웃페인팅 설정")
+
+                         outpaint_size_preset = gr.Dropdown(
+                             choices=list(IMAGE_PRESETS.keys()),
+                             value="16:9 와이드스크린",
+                             label="목표 크기 프리셋"
+                         )
+
+                         with gr.Row():
+                             outpaint_width = gr.Slider(256, 2048, 1280, step=64, label="목표 너비")
+                             outpaint_height = gr.Slider(256, 2048, 720, step=64, label="목표 높이")
+
+                         alignment = gr.Dropdown(
+                             choices=["가운데", "왼쪽", "오른쪽", "위", "아래"],
+                             value="가운데",
+                             label="정렬"
+                         )
+
+                         overlap_percentage = gr.Slider(
+                             minimum=1,
+                             maximum=50,
+                             value=10,
+                             step=1,
+                             label="마스크 오버랩 (%)"
+                         )
+
+                         outpaint_steps = gr.Slider(
+                             minimum=4,
+                             maximum=12,
+                             value=8,
+                             step=1,
+                             label="추론 스텝"
+                         )
+
+                         outpaint_btn = gr.Button("🎨 아웃페인팅 실행", variant="primary", elem_id="outpaint-btn")
+
+                 # Output column
+                 with gr.Column(scale=1):
+                     with gr.Group(elem_classes="panel-box"):
+                         gr.Markdown("### 🖼️ 결과")
+
+                         outpaint_result = gr.Image(label="아웃페인팅 결과")
+
+     # Event wiring - first tab
+     size_preset.change(update_dimensions, [size_preset], [width, height])
+
+     generate_btn.click(
+         generate_text_to_image,
+         [prompt, width, height, guidance, steps, seed],
+         [output_image, output_seed]
+     )
+
+     # Show/hide the sound options
+     def toggle_sound_options(choice):
+         return gr.update(visible=(choice == "사운드 생성"))
+
+     sound_generation.change(
+         toggle_sound_options,
+         [sound_generation],
+         [sound_options]
+     )
+
+     video_btn.click(
+         generate_video_from_image,
+         [output_image, video_prompt, video_length, sound_generation, sound_prompt, sound_negative_prompt],
+         [output_video]
+     )
+
+     # Event wiring - second tab
+     outpaint_size_preset.change(update_dimensions, [outpaint_size_preset], [outpaint_width, outpaint_height])
+
+     outpaint_btn.click(
+         outpaint_image,
+         [input_image, outpaint_prompt, outpaint_width, outpaint_height, overlap_percentage, alignment, outpaint_steps],
+         [outpaint_result]
+     )
+
+ demo.launch()
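
Note: the outpainting mask geometry this commit adds in prepare_image_and_mask can be sanity-checked standalone. The sketch below reuses the same formulas for a hypothetical 512x512 input centered ("가운데") on the 1280x720 "YouTube 썸네일"-sized canvas with 10% overlap; the concrete numbers are illustrative assumptions, not values recorded in the commit.

    from PIL import Image, ImageDraw

    src_w, src_h = 512, 512          # hypothetical input size
    target_w, target_h = 1280, 720   # hypothetical target canvas
    overlap_pct = 10

    scale = min(target_w / src_w, target_h / src_h)        # 1.40625
    new_w, new_h = int(src_w * scale), int(src_h * scale)  # 720 x 720
    overlap_x = max(int(new_w * overlap_pct / 100), 1)     # 72
    overlap_y = max(int(new_h * overlap_pct / 100), 1)     # 72
    margin_x = (target_w - new_w) // 2                     # 280 (center alignment)
    margin_y = (target_h - new_h) // 2                     # 0

    # White (255) marks pixels the pipeline repaints; the black rectangle is the
    # preserved core of the source, inset by the overlap on every side.
    mask = Image.new('L', (target_w, target_h), 255)
    ImageDraw.Draw(mask).rectangle(
        [(margin_x + overlap_x, margin_y + overlap_y),
         (margin_x + new_w - overlap_x, margin_y + new_h - overlap_y)],
        fill=0,
    )
    # Preserved region: (352, 72) to (928, 648)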