import os
import sys
import time

import cuid
import gradio as gr
import numpy as np
import spaces
from huggingface_hub import snapshot_download

# Add necessary paths so the bundled submodules can be imported
ProjectDir = os.path.abspath(os.path.dirname(__file__))
sys.path.insert(0, ProjectDir)
sys.path.insert(0, os.path.join(ProjectDir, "MMCM"))
sys.path.insert(0, os.path.join(ProjectDir, "diffusers/src"))
sys.path.insert(0, os.path.join(ProjectDir, "controlnet_aux/src"))

CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
ignore_video2video = True
max_image_edge = 960


def download_model():
    """Download the MuseV checkpoints from the Hugging Face Hub if they are not already present."""
    if not os.path.exists(CheckpointsDir):
        print("Checkpoints not found, starting download...")
        tic = time.time()
        snapshot_download(
            repo_id="TMElyralab/MuseV",
            local_dir=CheckpointsDir,
            max_workers=8,
            local_dir_use_symlinks=True,
        )
        toc = time.time()
        print(f"Download took {toc - tic:.1f} seconds")
    else:
        print("Model already downloaded.")


# Download the model first
print("Starting model download...")
download_model()

# Import after the model download to ensure all dependencies are ready
from gradio_text2video import online_t2v_inference


@spaces.GPU(duration=180)
def hf_online_t2v_inference(
    prompt,
    image_np,
    seed,
    fps,
    w,
    h,
    video_len,
    img_edge_ratio,
):
    # image_np is None when no reference image has been uploaded
    if not isinstance(image_np, np.ndarray):
        raise gr.Error("A reference image is required.")
    return online_t2v_inference(
        prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
    )


# Create the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# MuseV Demo")
    with gr.Tab("Text to Video"):
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(label="Prompt")
                image = gr.Image(label="VisionCondImage")
                seed = gr.Number(
                    label="Seed (-1 means a different random seed on every run)",
                    value=-1,
                )
                video_length = gr.Number(
                    label="Video Length (must be smaller than 144)",
                    value=12,
                )
                fps = gr.Number(label="Generate Video FPS", value=6)
                with gr.Row():
                    w = gr.Number(label="Width", value=-1)
                    h = gr.Number(label="Height", value=-1)
                    img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
                btn = gr.Button("Generate")
            video_output = gr.Video()
        btn.click(
            fn=hf_online_t2v_inference,
            inputs=[
                prompt,
                image,
                seed,
                fps,
                w,
                h,
                video_length,
                img_edge_ratio,
            ],
            outputs=video_output,
        )
    with gr.Tab("Video to Video"):
        gr.Markdown(
            "Due to GPU limits, this MuseV demo currently supports only Text2Video. "
            "If you want to try Video2Video, please run it locally."
        )

# Launch the app
demo.queue().launch(server_name="0.0.0.0", server_port=7860)