# DeepSound-V1 / app.py
# lym0302
# aa
# 3f6c072
import os
import sys
import time
import gradio as gr
import subprocess
from pathlib import Path
import requests
from moviepy.editor import AudioFileClip, VideoFileClip
project_root = os.path.dirname(os.path.abspath(__file__))
mmaudio_path = os.path.join(project_root, 'third_party', 'MMAudio')
sys.path.append(mmaudio_path)
from pipeline.pipeline import Pipeline
from third_party.MMAudio.mmaudio.eval_utils import setup_eval_logging
# download model
# os.makedirs("pretrained/mllm", exist_ok=True)
from huggingface_hub import snapshot_download

# Fetch the VideoLLaMA2 snapshot from the Hugging Face Hub; the returned local
# path is handed to the pipeline below as its model directory.
repo_local_path = snapshot_download(repo_id="lym0302/VideoLLaMA2.1-7B-AV-CoT")

# Checkpoint and config for the BS-RoFormer model (presumably used by the
# 'bs_roformer' voice-over removal step -- confirm against the pipeline).
remove_vo_model_dir = "pretrained/remove_vo/checkpoints"
os.makedirs(remove_vo_model_dir, exist_ok=True)

urls = [
    "https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/model_bs_roformer_ep_317_sdr_12.9755.ckpt",
    "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml",
]

for url in urls:
    file_name = url.split("/")[-1]  # Extract file name from URL
    file_path = os.path.join(remove_vo_model_dir, file_name)
    # Stream into a sibling ".part" file and rename it only once the download
    # has completed, so an interrupted transfer never leaves a truncated
    # checkpoint at the final path.
    partial_path = file_path + ".part"
    try:
        # (connect, read) timeouts: without them a stalled connection would
        # block application start-up indefinitely.
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            if response.status_code == 200:
                with open(partial_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):  # Use a chunk size of 8 KB
                        f.write(chunk)
                os.replace(partial_path, file_path)
                print(f"File downloaded successfully and saved to {file_path}")
            else:
                print(f"Failed to download the file. Status code: {response.status_code}")
    finally:
        # Only present if the download failed part-way through.
        if os.path.exists(partial_path):
            os.remove(partial_path)
# Make sure the directory for the v2a (MMAudio) weights exists before the
# pipeline is constructed.
Path("pretrained/v2a/mmaudio").mkdir(parents=True, exist_ok=True)

setup_eval_logging()

# Steps 0 and 2 both point at the same downloaded snapshot directory.
_pipeline_options = {
    "step0_model_dir": repo_local_path,
    "step1_mode": 'mmaudio_small_44k',
    "step2_model_dir": repo_local_path,
    "step2_mode": 'cot',
    "step3_mode": 'bs_roformer',
}
pipeline = Pipeline(**_pipeline_options)

# Directory that receives the final audio/video files served by the UI.
output_dir = "output_gradio"
Path(output_dir).mkdir(parents=True, exist_ok=True)

# When True, only the audio is finalised and no output video is produced.
skip_final_video = False
def _mux_audio_into_video(video_path: str, audio_path: str, out_path: str) -> None:
    """Write ``video_path`` to ``out_path`` with ``audio_path`` as its soundtrack.

    Both streams are trimmed to the shorter of the two durations. The clips
    are always closed afterwards so their reader resources are released.
    """
    audio_clip = AudioFileClip(audio_path)
    try:
        video_clip = VideoFileClip(video_path)
        try:
            clip_len = min(audio_clip.duration, video_clip.duration)
            video_clip.audio = audio_clip.subclip(0, clip_len)
            video_clip.subclip(0, clip_len).write_videofile(out_path)
        finally:
            video_clip.close()
    finally:
        audio_clip.close()


def video_to_audio(
    video_input: gr.Video,
    prompt: str = '',
    negative_prompt: str = '',
    mode: str = 's4',
    postp_mode: str = 'neg',
    duration: float = 10,
    seed: int = 42,
):
    """Run the step-by-step video-to-audio pipeline and stream progress to the UI.

    This is a generator. Every item is a ``(video_path_or_None, log_text)``
    pair: intermediate items carry ``None`` plus the accumulated log, and the
    last item carries the path of the final video (or ``None`` when no video
    was produced in this run).

    Args:
        video_input: Path of the uploaded video as supplied by the Gradio
            video component; converted to ``str`` before use.
        prompt: Forwarded to ``pipeline.run_for_gradio`` as ``prompt``.
        negative_prompt: Forwarded as ``negative_prompt``.
        mode: Forwarded as ``mode`` (the UI offers ``"s3"`` and ``"s4"``).
        postp_mode: Forwarded as ``postp_mode`` (the UI offers ``"rm"``,
            ``"rep"`` and ``"neg"``).
        duration: Forwarded as ``duration``.
        seed: Forwarded as ``seed``.

    Yields:
        tuple: ``(final video path or None, newline-joined log history)``.
    """
    # Local import keeps this block self-contained; shutil replaces the
    # non-portable `cp` subprocess call for copying the results.
    import shutil

    log_messages = []  # Accumulated log lines.

    def log_info(msg):
        log_messages.append(msg)
        return "\n".join(log_messages)  # Always return the full log history.

    if not video_input:
        yield None, log_info("Error: No video input provided.")
        return

    yield None, log_info("Generate high-quality audio from video step-by-step...")  # Initial log line.

    st_infer = time.time()
    video_input = str(video_input)

    step_results = None
    for step_results in pipeline.run_for_gradio(
        video_input=video_input,
        output_dir=output_dir,
        mode=mode,
        postp_mode=postp_mode,
        prompt=prompt,
        negative_prompt=negative_prompt,
        duration=duration,
        seed=seed,
    ):
        if step_results['log'] == 'Finish step-by-step v2a.':
            break
        yield None, log_info(step_results['log'])

    if step_results is None:
        # The pipeline yielded nothing at all; report it instead of failing
        # with a NameError further down.
        yield None, log_info("Error: The pipeline produced no results.")
        return

    temp_final_audio_path = step_results["temp_final_audio_path"]
    temp_final_video_path = step_results["temp_final_video_path"]

    video_name_stem = Path(video_input).stem
    final_audio_path = str(Path(output_dir) / f'{video_name_stem}.wav')
    final_video_path = str(Path(output_dir) / f'{video_name_stem}.mp4')

    # Only set when a video is actually written during this call, so a file
    # left over from an earlier run with the same stem is never returned.
    produced_video_path = None

    if temp_final_audio_path is not None:
        shutil.copyfile(str(temp_final_audio_path), final_audio_path)
        step_results["final_audio_path"] = final_audio_path

        if skip_final_video:
            step_results["final_video_path"] = None
        else:
            if temp_final_video_path is not None:
                shutil.copyfile(str(temp_final_video_path), final_video_path)
            else:
                # The pipeline supplied no video: mux the generated audio
                # onto the input video ourselves.
                _mux_audio_into_video(video_input, final_audio_path, final_video_path)
            step_results["final_video_path"] = final_video_path
            produced_video_path = final_video_path

    et_infer = time.time()
    print(f"Inference time: {et_infer - st_infer:.2f} s.")
    print("step_results: ", step_results)

    if produced_video_path is not None and not os.path.exists(produced_video_path):
        produced_video_path = None
    yield produced_video_path, log_info(step_results['log'])
# Single-tab UI definition. The input components are listed in the same order
# as the positional parameters of `video_to_audio`.
video_to_audio_tab = gr.Interface(
    title="DeepSound-V1 — Video-to-Audio Synthesis",
    # Project page: <a href="https://hkchengrex.com/MMAudio/">https://hkchengrex.com/MMAudio/</a><br>
    description="""
Code: <a href="https://github.com/lym0302/DeepSound-V1">https://github.com/lym0302/DeepSound-V1</a><br>
NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side).
Doing so does not improve results.
This is a step-by-step v2a process and may take a long time.
If Post Processing is set to 'rm', the generated video may be None.
""",
    fn=video_to_audio,
    inputs=[
        gr.Video(),
        gr.Text(label="Prompt"),
        gr.Text(label="Negative prompt", value=""),
        gr.Radio(["s3", "s4"], label="Mode", value="s4"),
        gr.Radio(["rm", "rep", "neg"], label="Post Processing", value="neg"),
        gr.Number(label="Duration (sec)", value=10, minimum=1),
        gr.Number(label="Seed (42: random)", value=42, precision=0, minimum=-1),
    ],
    # The generator yields (video path or None, log text) pairs.
    outputs=[
        gr.Video(label="Generated Video"),
        gr.Text(label="Logs"),
    ],
    cache_examples=False,
)
if __name__ == "__main__":
    # Wrap the single interface in a tabbed layout and let Gradio serve files
    # from the output directory.
    demo = gr.TabbedInterface(
        interface_list=[video_to_audio_tab],
        tab_names=['Video-to-Audio'],
    )
    demo.launch(allowed_paths=[output_dir])
# if __name__ == "__main__":
# port = 8000
# gr.TabbedInterface([video_to_audio_tab, ],
# ['Video-to-Audio', ]).launch(
# server_port=port, allowed_paths=[output_dir])