import os import sys import time import gradio as gr import subprocess from pathlib import Path import requests from moviepy.editor import AudioFileClip, VideoFileClip project_root = os.path.dirname(os.path.abspath(__file__)) mmaudio_path = os.path.join(project_root, 'third_party', 'MMAudio') sys.path.append(mmaudio_path) from pipeline.pipeline import Pipeline from third_party.MMAudio.mmaudio.eval_utils import setup_eval_logging # download model # os.makedirs("pretrained/mllm", exist_ok=True) from huggingface_hub import snapshot_download repo_local_path = snapshot_download(repo_id="lym0302/VideoLLaMA2.1-7B-AV-CoT") remove_vo_model_dir = "pretrained/remove_vo/checkpoints" os.makedirs(remove_vo_model_dir, exist_ok=True) urls = ["https://github.com/TRvlvr/model_repo/releases/download/all_public_uvr_models/model_bs_roformer_ep_317_sdr_12.9755.ckpt", "https://raw.githubusercontent.com/ZFTurbo/Music-Source-Separation-Training/main/configs/viperx/model_bs_roformer_ep_317_sdr_12.9755.yaml"] for url in urls: file_name = url.split("/")[-1] # Extract file name from URL file_path = os.path.join(remove_vo_model_dir, file_name) response = requests.get(url, stream=True) if response.status_code == 200: with open(file_path, "wb") as f: for chunk in response.iter_content(chunk_size=8192): # Use a chunk size of 8 KB f.write(chunk) print(f"File downloaded successfully and saved to {file_path}") else: print(f"Failed to download the file. Status code: {response.status_code}") os.makedirs("pretrained/v2a/mmaudio", exist_ok=True) setup_eval_logging() pipeline = Pipeline( step0_model_dir=repo_local_path, step1_mode='mmaudio_small_44k', step2_model_dir=repo_local_path, step2_mode='cot', step3_mode='bs_roformer', ) output_dir = "output_gradio" os.makedirs(output_dir, exist_ok=True) skip_final_video = False def video_to_audio( video_input: gr.Video, prompt: str='', negative_prompt: str='', mode: str='s4', postp_mode: str='neg', duration: float=10, seed: int=42,): log_messages = [] # 用于存储日志 def log_info(msg): log_messages.append(msg) return "\n".join(log_messages) # 每次返回完整的日志历史 if not video_input: yield None, log_info("Error: No video input provided.") return yield None, log_info("Generate high-quality audio from video step-by-step...") # 初始化日志 st_infer = time.time() video_input = str(video_input) for step_results in pipeline.run_for_gradio( video_input=video_input, output_dir=output_dir, mode=mode, postp_mode=postp_mode, prompt=prompt, negative_prompt=negative_prompt, duration=duration, seed=seed ): if step_results['log'] == 'Finish step-by-step v2a.': break else: yield None, log_info(step_results['log']) temp_final_audio_path = step_results["temp_final_audio_path"] temp_final_video_path = step_results["temp_final_video_path"] video_name_stem = Path(video_input).stem final_audio_path = str(Path(output_dir) / f'{video_name_stem}.wav') final_video_path = str(Path(output_dir) / f'{video_name_stem}.mp4') if temp_final_audio_path is not None: subprocess.run(['cp', str(temp_final_audio_path), final_audio_path], check=True) step_results["final_audio_path"] = final_audio_path if skip_final_video: step_results["final_video_path"] = None else: if temp_final_video_path is not None: subprocess.run(['cp', str(temp_final_video_path), final_video_path], check=True) else: audio = AudioFileClip(final_audio_path) video = VideoFileClip(video_input) duration = min(audio.duration, video.duration) audio = audio.subclip(0, duration) video.audio = audio video = video.subclip(0, duration) video.write_videofile(final_video_path) step_results["final_video_path"] = final_video_path et_infer = time.time() print(f"Inference time: {et_infer - st_infer:.2f} s.") print("step_results: ", step_results) yield (final_video_path if os.path.exists(final_video_path) else None), log_info(step_results['log']) video_to_audio_tab = gr.Interface( fn=video_to_audio, # Project page: https://hkchengrex.com/MMAudio/
description=""" Code: https://github.com/lym0302/DeepSound-V1
NOTE: It takes longer to process high-resolution videos (>384 px on the shorter side). Doing so does not improve results. This is a step-by-step v2a process and may take a long time. If Post Processing is set to 'rm', the generated video may be None. """, inputs=[ gr.Video(), gr.Text(label='Prompt'), gr.Text(label='Negative prompt', value=''), gr.Radio(["s3", "s4"], label="Mode", value="s4"), gr.Radio(["rm", "rep", "neg"], label="Post Processing", value="neg"), gr.Number(label='Duration (sec)', value=10, minimum=1), gr.Number(label='Seed (42: random)', value=42, precision=0, minimum=-1), ], outputs=[gr.Video(label="Generated Video"), gr.Text(label="Logs"),], cache_examples=False, title='DeepSound-V1 — Video-to-Audio Synthesis', ) if __name__ == "__main__": gr.TabbedInterface([video_to_audio_tab], ['Video-to-Audio']).launch(allowed_paths=[output_dir]) # if __name__ == "__main__": # port = 8000 # gr.TabbedInterface([video_to_audio_tab, ], # ['Video-to-Audio', ]).launch( # server_port=port, allowed_paths=[output_dir])