openfree committed on
Commit
7409da1
·
verified ·
1 Parent(s): 3a4d626

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -8
app.py CHANGED
@@ -194,7 +194,7 @@ def generate_text_to_image(prompt, width, height, guidance, inference_steps, see
194
 
195
  @spaces.GPU(duration=60)
196
  @torch.inference_mode()
197
- def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_steps=25, cfg_strength=4.5, duration=8.0):
198
  """비디오에 사운드를 추가하는 함수"""
199
  if not MMAUDIO_LOADED:
200
  logging.error("MMAudio model not loaded")
@@ -205,12 +205,10 @@ def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_step
205
  rng.manual_seed(seed)
206
  fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
207
 
208
- # ๋น„๋””์˜ค ๋กœ๋“œ - ๋ฐ˜ํ™˜๋˜๋Š” duration์„ ๋‹ค๋ฅธ ๋ณ€์ˆ˜๋ช…์œผ๋กœ ๋ฐ›๊ธฐ
209
- clip_frames, sync_frames, actual_duration = load_video(video_path, duration)
210
  clip_frames = clip_frames.unsqueeze(0)
211
  sync_frames = sync_frames.unsqueeze(0)
212
-
213
- # 실제 비디오 길이로 seq_cfg 업데이트
214
  mmaudio_seq_cfg.duration = actual_duration
215
  mmaudio_net.update_seq_lengths(mmaudio_seq_cfg.latent_seq_len, mmaudio_seq_cfg.clip_seq_len, mmaudio_seq_cfg.sync_seq_len)
216
 
@@ -225,12 +223,13 @@ def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_step
225
  cfg_strength=cfg_strength)
226
  audio = audios.float().cpu()[0]
227
 
228
- # 비디오와 오디오 결합 - duration_sec 매개변수 제거
229
  video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
230
  make_video(video_path,
231
  video_save_path,
232
  audio,
233
- sampling_rate=mmaudio_seq_cfg.sampling_rate)
 
234
 
235
  return video_save_path
236
  except Exception as e:
@@ -274,6 +273,7 @@ def generate_video_from_image(image, prompt="", length=4.0, sound_generation="
274
  sound_prompt = prompt if prompt else "ambient sound"
275
 
276
  # 비디오에 사운드 추가 - 모든 매개변수를 명시적으로 전달
 
277
  video_with_sound = video_to_audio(
278
  video_path=video_path,
279
  prompt=sound_prompt,
@@ -281,8 +281,9 @@ def generate_video_from_image(image, prompt="", length=4.0, sound_generation="
281
  seed=random.randint(0, 9999999),
282
  num_steps=25,
283
  cfg_strength=4.5,
284
- duration=length
285
  )
 
286
  return video_with_sound
287
 
288
  return video_path
 
194
 
195
  @spaces.GPU(duration=60)
196
  @torch.inference_mode()
197
+ def video_to_audio(video_path, prompt, negative_prompt="music", seed=0, num_steps=25, cfg_strength=4.5, target_duration=8.0):
198
  """비디오에 사운드를 추가하는 함수"""
199
  if not MMAUDIO_LOADED:
200
  logging.error("MMAudio model not loaded")
 
205
  rng.manual_seed(seed)
206
  fm = FlowMatching(min_sigma=0, inference_mode='euler', num_steps=num_steps)
207
 
208
+ # 비디오 로드 - target_duration 사용
209
+ clip_frames, sync_frames, actual_duration = load_video(video_path, target_duration)
210
  clip_frames = clip_frames.unsqueeze(0)
211
  sync_frames = sync_frames.unsqueeze(0)
 
 
212
  mmaudio_seq_cfg.duration = actual_duration
213
  mmaudio_net.update_seq_lengths(mmaudio_seq_cfg.latent_seq_len, mmaudio_seq_cfg.clip_seq_len, mmaudio_seq_cfg.sync_seq_len)
214
 
 
223
  cfg_strength=cfg_strength)
224
  audio = audios.float().cpu()[0]
225
 
226
+ # 비디오와 오디오 결합
227
  video_save_path = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4').name
228
  make_video(video_path,
229
  video_save_path,
230
  audio,
231
+ sampling_rate=mmaudio_seq_cfg.sampling_rate,
232
+ duration_sec=mmaudio_seq_cfg.duration)
233
 
234
  return video_save_path
235
  except Exception as e:
 
273
  sound_prompt = prompt if prompt else "ambient sound"
274
 
275
  # 비디오에 사운드 추가 - 모든 매개변수를 명시적으로 전달
276
+ # 비디오에 사운드 추가
277
  video_with_sound = video_to_audio(
278
  video_path=video_path,
279
  prompt=sound_prompt,
 
281
  seed=random.randint(0, 9999999),
282
  num_steps=25,
283
  cfg_strength=4.5,
284
+ target_duration=length # duration → target_duration
285
  )
286
+
287
  return video_with_sound
288
 
289
  return video_path