openfree committed on
Commit 5d025b7 · verified · 1 Parent(s): 63f5669

Update app.py

Files changed (1)
  1. app.py +192 -186

app.py CHANGED
@@ -1,10 +1,25 @@
+ # Spaces GPU - must be imported first!
+ import os
+ IS_SPACES = os.environ.get("SPACE_ID") is not None
+
+ if IS_SPACES:
+     import spaces
+ else:
+     # Dummy decorator for when the GPU decorator is unavailable
+     class spaces:
+         @staticmethod
+         def GPU(duration=None):
+             def decorator(func):
+                 return func
+             return decorator
+
+ # Now import the other libraries
  import gradio as gr
  import numpy as np
  from PIL import Image, ImageDraw
  from gradio_client import Client, handle_file
  import random
  import tempfile
- import os
  import logging
  import torch
  from diffusers import AutoencoderKL, TCDScheduler
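This hunk moves the environment check to the very top of the file: `spaces` has to be imported before anything that initializes CUDA, and the stub class keeps the `@spaces.GPU(...)` decorator syntax valid outside Spaces. A minimal, self-contained sketch of that fallback pattern (`heavy_fn` is a hypothetical example, not a function from app.py):

```python
import os

IS_SPACES = os.environ.get("SPACE_ID") is not None

if IS_SPACES:
    import spaces
else:
    class spaces:  # no-op stand-in so decorated code runs unchanged locally
        @staticmethod
        def GPU(duration=None):
            def decorator(func):
                return func
            return decorator

@spaces.GPU(duration=10)
def heavy_fn(x):
    return x * 2

print(heavy_fn(21))  # 42, with or without the real `spaces` package
```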
@@ -26,21 +41,6 @@ from concurrent.futures import ThreadPoolExecutor
  # Bypass the torch.load safety check via an environment variable (temporary workaround)
  os.environ["TRANSFORMERS_ALLOW_UNSAFE_DESERIALIZATION"] = "1"

- # Spaces GPU
- import os
- IS_SPACES = os.environ.get("SPACE_ID") is not None
-
- if IS_SPACES:
-     import spaces
- else:
-     # Dummy decorator for when the GPU decorator is unavailable
-     class spaces:
-         @staticmethod
-         def GPU(duration=None):
-             def decorator(func):
-                 return func
-             return decorator
-
  # Simple function for GPU initialization (required in the Spaces environment)
  @spaces.GPU(duration=1)
  def gpu_warmup():
@@ -50,7 +50,7 @@ def gpu_warmup():
      del dummy
      return "GPU ready"

- # MMAudio imports
+ # MMAudio imports - must come after the spaces import
  try:
      import mmaudio
  except ImportError:
@@ -64,6 +64,9 @@ from mmaudio.model.networks import MMAudio, get_my_mmaudio
  from mmaudio.model.sequence_config import SequenceConfig
  from mmaudio.model.utils.features_utils import FeaturesUtils

+ # Logging setup
+ logging.basicConfig(level=logging.INFO)
+
  # Keep all settings and initialization from the existing code
  torch.set_float32_matmul_precision("medium")

@@ -77,130 +80,21 @@ else:

  logging.info(f"Using device: {device}")

- # Load BiRefNet models
- try:
-     birefnet = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
-     birefnet.to(device)
-     birefnet_lite = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet_lite", trust_remote_code=True)
-     birefnet_lite.to(device)
-
-     transform_image = transforms.Compose([
-         transforms.Resize((768, 768)),
-         transforms.ToTensor(),
-         transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
-     ])
-
-     BIREFNET_MODEL_LOADED = True
- except Exception as e:
-     logging.error(f"Failed to load BiRefNet models: {str(e)}")
-     BIREFNET_MODEL_LOADED = False
-
- # Load ControlNet models (existing code)
- try:
-     from controlnet_union import ControlNetModel_Union
-     from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
-
-     # ControlNet configuration and loading
-     config_file = hf_hub_download(
-         "xinsir/controlnet-union-sdxl-1.0",
-         filename="config_promax.json",
-     )
-
-     config = ControlNetModel_Union.load_config(config_file)
-     controlnet_model = ControlNetModel_Union.from_config(config)
-
-     model_file = hf_hub_download(
-         "xinsir/controlnet-union-sdxl-1.0",
-         filename="diffusion_pytorch_model_promax.safetensors",
-     )
-     state_dict = load_state_dict(model_file)
-     loaded_keys = list(state_dict.keys())
-
-     result = ControlNetModel_Union._load_pretrained_model(
-         controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
-     )
-
-     model = result[0]
-     model = model.to(device=device, dtype=torch.float16 if device.type == "cuda" else torch.float32)
-
-     # Load VAE
-     vae = AutoencoderKL.from_pretrained(
-         "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16 if device.type == "cuda" else torch.float32
-     ).to(device)
-
-     # Load pipeline
-     pipe = StableDiffusionXLFillPipeline.from_pretrained(
-         "SG161222/RealVisXL_V5.0_Lightning",
-         torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
-         vae=vae,
-         controlnet=model,
-         variant="fp16" if device.type == "cuda" else None,
-     ).to(device)
-
-     pipe.scheduler = TCDScheduler.from_config(pipe.scheduler.config)
-
-     OUTPAINT_MODEL_LOADED = True
- except Exception as e:
-     logging.error(f"Failed to load outpainting models: {str(e)}")
-     OUTPAINT_MODEL_LOADED = False
-
- # MMAudio model settings (existing code)
- if torch.cuda.is_available():
-     mmaudio_dtype = torch.bfloat16
- else:
-     mmaudio_dtype = torch.float32
-
- # MMAudio model initialization (existing code)
- try:
-     model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
-     model_mmaudio.download_if_needed()
-     output_dir = Path('./output/gradio')
-     setup_eval_logging()
-
-     # Translator setup
-     try:
-         translator = pipeline("translation",
-                               model="Helsinki-NLP/opus-mt-ko-en",
-                               device="cpu",
-                               use_fast=True,
-                               trust_remote_code=False)
-     except Exception as e:
-         logging.warning(f"Failed to load translation model: {e}")
-         translator = None
-
-     def get_mmaudio_model() -> tuple[MMAudio, FeaturesUtils, SequenceConfig]:
-         with torch.cuda.device(device):
-             seq_cfg = model_mmaudio.seq_cfg
-             net: MMAudio = get_my_mmaudio(model_mmaudio.model_name).to(device, mmaudio_dtype).eval()
-             net.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
-             logging.info(f'Loaded weights from {model_mmaudio.model_path}')
-
-             feature_utils = FeaturesUtils(
-                 tod_vae_ckpt=model_mmaudio.vae_path,
-                 synchformer_ckpt=model_mmaudio.synchformer_ckpt,
-                 enable_conditions=True,
-                 mode=model_mmaudio.mode,
-                 bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
-                 need_vae_encoder=False
-             ).to(device, mmaudio_dtype).eval()
-
-             return net, feature_utils, seq_cfg
-
-     net_mmaudio, feature_utils, seq_cfg = get_mmaudio_model()
-     MMAUDIO_MODEL_LOADED = True
- except Exception as e:
-     logging.error(f"Failed to load MMAudio models: {str(e)}")
-     MMAUDIO_MODEL_LOADED = False
-     translator = None
+ # Manage model state via global variables
+ MODELS_LOADED = False
+ BIREFNET_MODEL = None
+ BIREFNET_LITE_MODEL = None
+ OUTPAINT_PIPE = None
+ MMAUDIO_NET = None
+ MMAUDIO_FEATURE_UTILS = None
+ MMAUDIO_SEQ_CFG = None
+ TRANSLATOR = None

  # API URLs
  TEXT2IMG_API_URL = "http://211.233.58.201:7896"
  VIDEO_API_URL = "http://211.233.58.201:7875"

- # Logging setup
- logging.basicConfig(level=logging.INFO)
-
- # Image size presets (existing code)
+ # Image size presets
  IMAGE_PRESETS = {
      "커스텀": {"width": 1024, "height": 1024},
      "1:1 정사각형": {"width": 1024, "height": 1024},
@@ -217,6 +111,119 @@ IMAGE_PRESETS = {
      "LinkedIn 배너": {"width": 1584, "height": 396},
  }

+ # Transform for BiRefNet
+ transform_image = transforms.Compose([
+     transforms.Resize((768, 768)),
+     transforms.ToTensor(),
+     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+ ])
+
+ @spaces.GPU(duration=60)
+ def load_models():
+     """Load all models"""
+     global MODELS_LOADED, BIREFNET_MODEL, BIREFNET_LITE_MODEL, OUTPAINT_PIPE
+     global MMAUDIO_NET, MMAUDIO_FEATURE_UTILS, MMAUDIO_SEQ_CFG, TRANSLATOR
+
+     if MODELS_LOADED:
+         return True
+
+     try:
+         # Load BiRefNet models
+         logging.info("Loading BiRefNet models...")
+         BIREFNET_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet", trust_remote_code=True)
+         BIREFNET_MODEL.to(device)
+         BIREFNET_LITE_MODEL = AutoModelForImageSegmentation.from_pretrained("ZhengPeng7/BiRefNet_lite", trust_remote_code=True)
+         BIREFNET_LITE_MODEL.to(device)
+
+         # Load ControlNet and outpainting models
+         logging.info("Loading ControlNet models...")
+         from controlnet_union import ControlNetModel_Union
+         from pipeline_fill_sd_xl import StableDiffusionXLFillPipeline
+
+         config_file = hf_hub_download(
+             "xinsir/controlnet-union-sdxl-1.0",
+             filename="config_promax.json",
+         )
+
+         config = ControlNetModel_Union.load_config(config_file)
+         controlnet_model = ControlNetModel_Union.from_config(config)
+
+         model_file = hf_hub_download(
+             "xinsir/controlnet-union-sdxl-1.0",
+             filename="diffusion_pytorch_model_promax.safetensors",
+         )
+         state_dict = load_state_dict(model_file)
+         loaded_keys = list(state_dict.keys())
+
+         result = ControlNetModel_Union._load_pretrained_model(
+             controlnet_model, state_dict, model_file, "xinsir/controlnet-union-sdxl-1.0", loaded_keys
+         )
+
+         model = result[0]
+         model = model.to(device=device, dtype=torch_dtype)
+
+         # Load VAE
+         vae = AutoencoderKL.from_pretrained(
+             "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch_dtype
+         ).to(device)
+
+         # Load pipeline
+         OUTPAINT_PIPE = StableDiffusionXLFillPipeline.from_pretrained(
+             "SG161222/RealVisXL_V5.0_Lightning",
+             torch_dtype=torch_dtype,
+             vae=vae,
+             controlnet=model,
+             variant="fp16" if device.type == "cuda" else None,
+         ).to(device)
+
+         OUTPAINT_PIPE.scheduler = TCDScheduler.from_config(OUTPAINT_PIPE.scheduler.config)
+
+         # Load MMAudio models
+         logging.info("Loading MMAudio models...")
+         model_mmaudio: ModelConfig = all_model_cfg['large_44k_v2']
+         model_mmaudio.download_if_needed()
+         setup_eval_logging()
+
+         # Translator setup
+         try:
+             TRANSLATOR = pipeline("translation",
+                                   model="Helsinki-NLP/opus-mt-ko-en",
+                                   device="cpu",
+                                   use_fast=True,
+                                   trust_remote_code=False)
+         except Exception as e:
+             logging.warning(f"Failed to load translation model: {e}")
+             TRANSLATOR = None
+
+         # MMAudio model initialization
+         if torch.cuda.is_available():
+             mmaudio_dtype = torch.bfloat16
+         else:
+             mmaudio_dtype = torch.float32
+
+         with torch.cuda.device(device):
+             MMAUDIO_SEQ_CFG = model_mmaudio.seq_cfg
+             MMAUDIO_NET = get_my_mmaudio(model_mmaudio.model_name).to(device, mmaudio_dtype).eval()
+             MMAUDIO_NET.load_weights(torch.load(model_mmaudio.model_path, map_location=device, weights_only=True))
+             logging.info(f'Loaded weights from {model_mmaudio.model_path}')
+
+             MMAUDIO_FEATURE_UTILS = FeaturesUtils(
+                 tod_vae_ckpt=model_mmaudio.vae_path,
+                 synchformer_ckpt=model_mmaudio.synchformer_ckpt,
+                 enable_conditions=True,
+                 mode=model_mmaudio.mode,
+                 bigvgan_vocoder_ckpt=model_mmaudio.bigvgan_16k_path,
+                 need_vae_encoder=False
+             ).to(device, mmaudio_dtype).eval()
+
+         MODELS_LOADED = True
+         logging.info("All models loaded successfully!")
+         return True
+
+     except Exception as e:
+         logging.error(f"Failed to load models: {str(e)}")
+         return False
+
  # Keep all existing functions
  def update_dimensions(preset):
      if preset in IMAGE_PRESETS:
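This `load_models()` is the heart of the commit: importing the module no longer loads any weights. Instead, the globals start as `None` and the first GPU call populates them once. A stripped-down sketch of the same lazy-singleton pattern (all names here are illustrative, and the lock is an extra safeguard this app does not use):

```python
import logging
import threading

_MODEL = None
_LOCK = threading.Lock()

def load_model():
    """Populate the global on first use; subsequent calls return immediately."""
    global _MODEL
    with _LOCK:  # avoid double-loading on concurrent first requests
        if _MODEL is None:
            logging.info("Loading model once...")
            _MODEL = {"weights": "..."}  # stand-in for an expensive from_pretrained()
    return _MODEL

def predict(x):
    model = load_model()  # every entry point ensures the model exists first
    return model, x
```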
@@ -332,9 +339,7 @@ def prepare_image_and_mask(image, width, height, overlap_percentage, alignment):
      mask = Image.new('L', target_size, 255)
      mask_draw = ImageDraw.Draw(mask)

-     # Draw mask regions (matching the English alignment names)
-     white_gaps_patch = 2
-
+     # Draw mask regions
      left_overlap = margin_x + overlap_x if alignment != "왼쪽" else margin_x
      right_overlap = margin_x + new_width - overlap_x if alignment != "오른쪽" else margin_x + new_width
      top_overlap = margin_y + overlap_y if alignment != "위" else margin_y
@@ -374,7 +379,11 @@ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment,
      if image is None:
          return None

-     if not OUTPAINT_MODEL_LOADED:
+     # Check that models are loaded
+     if not MODELS_LOADED:
+         load_models()
+
+     if OUTPAINT_PIPE is None:
          return Image.new('RGB', (width, height), (200, 200, 200))

      try:
@@ -391,16 +400,16 @@ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment,
      final_prompt = f"{prompt}, high quality, 4k" if prompt else "high quality, 4k"

      # Run on GPU
-     with torch.autocast(device_type=device.type, dtype=torch.float16 if device.type == "cuda" else torch.float32):
+     with torch.autocast(device_type=device.type, dtype=torch_dtype):
          (
              prompt_embeds,
              negative_prompt_embeds,
              pooled_prompt_embeds,
              negative_pooled_prompt_embeds,
-         ) = pipe.encode_prompt(final_prompt, str(device), True)
+         ) = OUTPAINT_PIPE.encode_prompt(final_prompt, str(device), True)

          # Generation process
-         for generated_image in pipe(
+         for generated_image in OUTPAINT_PIPE(
              prompt_embeds=prompt_embeds,
              negative_prompt_embeds=negative_prompt_embeds,
              pooled_prompt_embeds=pooled_prompt_embeds,
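The repeated inline dtype expression is collapsed into a single `torch_dtype` name; its definition is not visible among the added lines, so it is presumably set at module level alongside `device`. A hedged sketch of that assumed selection and how it feeds `torch.autocast` (guarding against CPU, where fp32 autocast is not supported):

```python
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Assumed module-level definition mirroring the expression this commit removes.
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

x = torch.randn(4, 4, device=device)
if device.type == "cuda":
    # Mixed precision only where it is actually supported.
    with torch.autocast(device_type="cuda", dtype=torch_dtype):
        y = x @ x
else:
    y = x @ x
print(y.dtype)  # torch.float16 on CUDA, torch.float32 on CPU
```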
@@ -427,12 +436,12 @@ def outpaint_image(image, prompt, width, height, overlap_percentage, alignment,
  # MMAudio-related functions
  def translate_prompt(text):
      try:
-         if translator is None:
+         if TRANSLATOR is None:
              return text

          if text and any(ord(char) >= 0x3131 and ord(char) <= 0xD7A3 for char in text):
              with torch.no_grad():
-                 translation = translator(text)[0]['translation_text']
+                 translation = TRANSLATOR(text)[0]['translation_text']
              return translation
          return text
      except Exception as e:
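The Hangul check above tests each character against the codepoint range U+3131..U+D7A3 (compatibility jamo through precomposed syllables) and only routes Korean text to the ko→en translator. The same test in isolation:

```python
def contains_korean(text: str) -> bool:
    """True if any character falls in the Hangul range used by translate_prompt."""
    return any(0x3131 <= ord(ch) <= 0xD7A3 for ch in text)

print(contains_korean("평화로운 피아노 음악"))  # True  -> gets translated
print(contains_korean("peaceful piano music"))  # False -> passed through as-is
```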
@@ -443,7 +452,11 @@ def translate_prompt(text):
  @torch.inference_mode()
  def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int, num_steps: int,
                     cfg_strength: float, duration: float):
-     if not MMAUDIO_MODEL_LOADED:
+     # Check that models are loaded
+     if not MODELS_LOADED:
+         load_models()
+
+     if MMAUDIO_NET is None:
          return None

      prompt = translate_prompt(prompt)
@@ -456,14 +469,14 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
      clip_frames, sync_frames, duration = load_video(video, duration)
      clip_frames = clip_frames.unsqueeze(0)
      sync_frames = sync_frames.unsqueeze(0)
-     seq_cfg.duration = duration
-     net_mmaudio.update_seq_lengths(seq_cfg.latent_seq_len, seq_cfg.clip_seq_len, seq_cfg.sync_seq_len)
+     MMAUDIO_SEQ_CFG.duration = duration
+     MMAUDIO_NET.update_seq_lengths(MMAUDIO_SEQ_CFG.latent_seq_len, MMAUDIO_SEQ_CFG.clip_seq_len, MMAUDIO_SEQ_CFG.sync_seq_len)

      audios = generate(clip_frames,
                        sync_frames, [prompt],
                        negative_text=[negative_prompt],
-                       feature_utils=feature_utils,
-                       net=net_mmaudio,
+                       feature_utils=MMAUDIO_FEATURE_UTILS,
+                       net=MMAUDIO_NET,
                        fm=fm,
                        rng=rng,
                        cfg_strength=cfg_strength)
@@ -473,19 +486,19 @@ def video_to_audio(video: gr.Video, prompt: str, negative_prompt: str, seed: int
      make_video(video,
                 video_save_path,
                 audio,
-                sampling_rate=seq_cfg.sampling_rate,
-                duration_sec=seq_cfg.duration)
+                sampling_rate=MMAUDIO_SEQ_CFG.sampling_rate,
+                duration_sec=MMAUDIO_SEQ_CFG.duration)
      return video_save_path

  # Video background-removal functions
  def process_bg_image(image, bg, fast_mode=False):
      """Process the background of a single image"""
-     if not BIREFNET_MODEL_LOADED:
+     if BIREFNET_MODEL is None or BIREFNET_LITE_MODEL is None:
          return image

      image_size = image.size
      input_images = transform_image(image).unsqueeze(0).to(device)
-     model = birefnet_lite if fast_mode else birefnet
+     model = BIREFNET_LITE_MODEL if fast_mode else BIREFNET_MODEL

      with torch.no_grad():
          preds = model(input_images)[-1].sigmoid().cpu()
@@ -528,7 +541,11 @@ def process_video_frame(frame, bg_type, bg, fast_mode, bg_frame_index, backgroun
  def process_video_bg(vid, bg_type="색상", bg_image=None, bg_video=None, color="#00FF00",
                       fps=0, video_handling="slow_down", fast_mode=True, max_workers=10):
      """Main video background-processing function"""
-     if not BIREFNET_MODEL_LOADED:
+     # Check that models are loaded
+     if not MODELS_LOADED:
+         load_models()
+
+     if BIREFNET_MODEL is None:
          yield gr.update(visible=False), gr.update(visible=True), "BiRefNet 모델을 로드하지 못했습니다."
          yield None, None, "BiRefNet 모델을 로드하지 못했습니다."
          return
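`process_video_bg` is a generator handler: each `yield` pushes an intermediate UI state to Gradio, which is how the error message above can appear before the function finishes. A minimal sketch of that streaming pattern (component names are made up for illustration):

```python
import time
import gradio as gr

def long_task():
    """Generator handler: each yield updates the bound output component."""
    yield "Working..."   # shown immediately while the heavy step runs
    time.sleep(1)        # stand-in for the actual processing
    yield "Done!"        # final state replaces the interim message

with gr.Blocks() as sketch:
    btn = gr.Button("Run")
    status = gr.Textbox(label="Status")
    btn.click(long_task, outputs=status)  # sketch.launch() would serve it
```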
@@ -697,18 +714,6 @@ def merge_videos_with_audio(video_files, audio_file, audio_volume, output_fps):
          logging.error(f"Video merge error: {str(e)}")
          return None, f"❌ 오류 발생: {str(e)}"

- # Added GPU initialization function
- def dummy_gpu_init():
-     """Dummy function for GPU initialization"""
-     if torch.cuda.is_available():
-         try:
-             # Initialize the GPU with a simple tensor operation
-             dummy_tensor = torch.zeros(1).to(device)
-             del dummy_tensor
-             logging.info("GPU initialized successfully")
-         except Exception as e:
-             logging.warning(f"GPU initialization warning: {e}")
-
  # CSS
  css = """
  :root {
@@ -751,6 +756,10 @@ demo = gr.Blocks(css=css, title="AI 이미지 & 비디오 & 오디오 생성기"

  with demo:
      gr.Markdown("# 🎨 Ginigen 스튜디오")
+     gr.Markdown("처음 사용 시 모델 로딩에 시간이 걸릴 수 있습니다. 잠시만 기다려주세요.")
+
+     # Model load status display
+     model_status = gr.Textbox(label="모델 상태", value="모델 로딩 대기 중...", interactive=False)

      with gr.Tabs() as tabs:
          # First tab: text to image
@@ -896,7 +905,7 @@ with demo:
              gr.Markdown("### 🎵 오디오 생성 설정")

              audio_prompt = gr.Textbox(
-                 label="프롬프트 (한글 지원)" if MMAUDIO_MODEL_LOADED and translator else "프롬프트",
+                 label="프롬프트 (한글 지원)",
                  placeholder="생성하고 싶은 오디오를 설명하세요... (예: 평화로운 피아노 음악)",
                  lines=3
              )
@@ -927,9 +936,6 @@ with demo:
                  label="오디오가 추가된 비디오",
                  interactive=False
              )
-
-             if not MMAUDIO_MODEL_LOADED:
-                 gr.Markdown("⚠️ MMAudio 모델을 로드하지 못했습니다. 이 기능은 사용할 수 없습니다.")

          # Fourth tab: video editing
          with gr.Tab("비디오 편집", elem_classes="tabitem"):
@@ -1077,9 +1083,6 @@ with demo:
                  )

                  bg_remove_btn = gr.Button("🎬 배경 변경", variant="primary", elem_id="bg-remove-btn")
-
-                 if not BIREFNET_MODEL_LOADED:
-                     gr.Markdown("⚠️ BiRefNet 모델을 로드하지 못했습니다. 이 기능은 사용할 수 없습니다.")

              # Output column
              with gr.Column(scale=1):
@@ -1100,6 +1103,17 @@ with demo:
          긴 비디오는 작은 조각으로 나누어 처리하세요.
          """)

+     # Function run on demo load
+     def on_demo_load():
+         try:
+             if IS_SPACES:
+                 # GPU warmup in the Spaces environment
+                 gpu_warmup()
+             # Model loading happens automatically on the first GPU function call
+             return "모델 로딩 준비 완료"
+         except Exception as e:
+             return f"초기화 오류: {str(e)}"
+
      # Event wiring - first tab
      size_preset.change(update_dimensions, [size_preset], [width, height])

@@ -1167,20 +1181,12 @@ with demo:
                      fps_slider, video_handling_radio, fast_mode_checkbox, max_workers_slider],
              outputs=[stream_image, output_bg_video, time_textbox]
          )
-
-     # GPU initialization (needed in the Spaces environment)
-     try:
-         if IS_SPACES and torch.cuda.is_available():
-             # Run GPU warmup in the Spaces environment
-             gpu_warmup()
-             logging.info("GPU warmed up successfully")
-         elif torch.cuda.is_available():
-             dummy_gpu_init()
-     except Exception as e:
-         logging.warning(f"GPU initialization warning: {e}")
+
+     # Run on demo load
+     demo.load(on_demo_load, outputs=model_status)

  if __name__ == "__main__":
-     # Additional GPU check in the Spaces environment
+     # Additional check in the Spaces environment
      if IS_SPACES:
          try:
              gpu_warmup()
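With eager initialization gone, startup work is reduced to wiring `on_demo_load` to the Blocks `load` event, so the status textbox updates as soon as a browser session connects. The shape of that hook in isolation (a hypothetical app, not app.py itself):

```python
import gradio as gr

def on_load():
    # Runs once per browser session when the page connects.
    return "Ready - models will load on first use"

with gr.Blocks() as app:
    status = gr.Textbox(label="Status", value="Waiting...", interactive=False)
    app.load(on_load, outputs=status)  # same demo.load(...) pattern as above
```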