Haoxin Chen committed
Commit 32619a4 · 1 Parent(s): 8cdb359

add variable resolution and frame

Files changed (3)
  1. app.py +7 -5
  2. videocontrol_test.py +34 -14
  3. videocrafter_test.py +4 -0
app.py CHANGED

@@ -15,7 +15,7 @@ t2v_examples = [
 ]
 
 control_examples = [
-    ['input/flamingo.mp4', 'An ostrich walking in the desert, photorealistic, 4k', 0, 50, 15, 1]
+    ['input/flamingo.mp4', 'An ostrich walking in the desert, photorealistic, 4k', 0, 50, 15, 1, 16, 256]
 ]
 
 def videocrafter_demo(result_dir='./tmp/'):
@@ -23,7 +23,7 @@ def videocrafter_demo(result_dir='./tmp/'):
     videocontrol = VideoControl(result_dir)
     with gr.Blocks(analytics_enabled=False) as videocrafter_iface:
         gr.Markdown("<div align='center'> <h2> VideoCrafter: A Toolkit for Text-to-Video Generation and Editing </span> </h2> \
-                     <a style='font-size:18px;color: #efefef' href='https://github.com/VideoCrafter/VideoCrafter'> Github </div>")
+                     <a style='font-size:18px;color: #000000' href='https://github.com/VideoCrafter/VideoCrafter'> Github </div>")
         #######t2v#######
         with gr.Tab(label="Text2Video"):
             with gr.Column():
@@ -70,7 +70,9 @@ def videocrafter_demo(result_dir='./tmp/'):
                         with gr.Row():
                             vc_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id="vc_steps", label="Sampling steps", value=50)
                             frame_stride = gr.Slider(minimum=0, maximum=100, step=1, label='Frame Stride', value=0, elem_id="vc_frame_stride")
-
+                        with gr.Row():
+                            resolution = gr.Slider(minimum=128, maximum=512, step=8, label='Long Side Resolution', value=256, elem_id="vc_resolution")
+                            video_frames = gr.Slider(minimum=8, maximum=64, step=1, label='Video Frame Num', value=16, elem_id="vc_video_frames")
                         vc_end_btn = gr.Button("Send")
                     with gr.Tab(label='Result'):
                         vc_output_info = gr.Text(label='Info')
@@ -79,12 +81,12 @@ def videocrafter_demo(result_dir='./tmp/'):
                         vc_output_video = gr.Video(label="Generated Video").style(width=256)
 
             gr.Examples(examples=control_examples,
-                        inputs=[vc_input_video, vc_input_text, frame_stride, vc_steps, vc_cfg_scale, vc_eta],
+                        inputs=[vc_input_video, vc_input_text, frame_stride, vc_steps, vc_cfg_scale, vc_eta, video_frames, resolution],
                         outputs=[vc_output_info, vc_origin_video, vc_depth_video, vc_output_video],
                        fn = videocontrol.get_video,
                        cache_examples=os.getenv('SYSTEM') == 'spaces',
                        )
-        vc_end_btn.click(inputs=[vc_input_video, vc_input_text, frame_stride, vc_steps, vc_cfg_scale, vc_eta],
+        vc_end_btn.click(inputs=[vc_input_video, vc_input_text, frame_stride, vc_steps, vc_cfg_scale, vc_eta, video_frames, resolution],
                          outputs=[vc_output_info, vc_origin_video, vc_depth_video, vc_output_video],
                          fn = videocontrol.get_video
                         )
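
Gradio passes the inputs list to videocontrol.get_video positionally, so the order of the example row and of the two new sliders has to match the function signature. As a reading aid (not part of the commit), the updated control example maps onto the call like this:

    # How the new example row lines up with VideoControl.get_video (reading aid only)
    videocontrol.get_video(
        input_video='input/flamingo.mp4',
        input_prompt='An ostrich walking in the desert, photorealistic, 4k',
        frame_stride=0,
        vc_steps=50,
        vc_cfg_scale=15,
        vc_eta=1,
        video_frames=16,   # new "Video Frame Num" slider
        resolution=256,    # new "Long Side Resolution" slider
    )
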
videocontrol_test.py CHANGED

@@ -50,7 +50,8 @@ class VideoControl:
         config_path = "models/adapter_t2v_depth/model_config.yaml"
         ckpt_path = "models/base_t2v/model.ckpt"
         adapter_ckpt = "models/adapter_t2v_depth/adapter.pth"
-
+        if os.path.exists('/dev/shm/model.ckpt'):
+            ckpt_path='/dev/shm/model.ckpt'
         config = OmegaConf.load(config_path)
         model_config = config.pop("model", OmegaConf.create())
         model = instantiate_from_config(model_config)
@@ -59,10 +60,18 @@ class VideoControl:
         model = load_model_checkpoint(model, ckpt_path, adapter_ckpt)
         model.eval()
         self.model = model
-        self.resolution=256
-        self.spatial_transform = transforms_video.CenterCropVideo(self.resolution)
 
-    def get_video(self, input_video, input_prompt, frame_stride=0, vc_steps=50, vc_cfg_scale=15.0, vc_eta=1.0):
+    def get_video(self, input_video, input_prompt, frame_stride=0, vc_steps=50, vc_cfg_scale=15.0, vc_eta=1.0, video_frames=16, resolution=256):
+        torch.cuda.empty_cache()
+        if resolution > 512:
+            resolution = 512
+        if resolution < 64:
+            resolution = 64
+        if video_frames > 64:
+            video_frames = 64
+
+        resolution = int(resolution//64)*64
+
         if vc_steps > 60:
             vc_steps = 60
         ## load video
@@ -74,32 +83,43 @@ class VideoControl:
             os.remove(input_video)
             return 'please input video', None, None, None
 
-        if h < w:
-            scale = h / self.resolution
+        if h > w:
+            scale = h / resolution
         else:
-            scale = w / self.resolution
+            scale = w / resolution
         h = math.ceil(h / scale)
         w = math.ceil(w / scale)
         try:
-            video, info_str = load_video(input_video, frame_stride, video_size=(h, w), video_frames=16)
+            video, info_str = load_video(input_video, frame_stride, video_size=(h, w), video_frames=video_frames)
         except:
             os.remove(input_video)
             return 'load video error', None, None, None
-        video = self.spatial_transform(video)
+        if h > w:
+            w = int(w//64)*64
+        else:
+            h = int(h//64)*64
+        spatial_transform = transforms_video.CenterCropVideo((h,w))
+        video = spatial_transform(video)
         print('video shape', video.shape)
 
-        h, w = 32, 32
+        rh, rw = h//8, w//8
         bs = 1
         channels = self.model.channels
-        frames = self.model.temporal_length
-        noise_shape = [bs, channels, frames, h, w]
+        # frames = self.model.temporal_length
+        frames = video_frames
+        noise_shape = [bs, channels, frames, rh, rw]
 
         ## inference
         start = time.time()
         prompt = input_prompt
         video = video.unsqueeze(0).to("cuda")
-        with torch.no_grad():
-            batch_samples, batch_conds = adapter_guided_synthesis(self.model, prompt, video, noise_shape, n_samples=1, ddim_steps=vc_steps, ddim_eta=vc_eta, unconditional_guidance_scale=vc_cfg_scale)
+        try:
+            with torch.no_grad():
+                batch_samples, batch_conds = adapter_guided_synthesis(self.model, prompt, video, noise_shape, n_samples=1, ddim_steps=vc_steps, ddim_eta=vc_eta, unconditional_guidance_scale=vc_cfg_scale)
+        except:
+            torch.cuda.empty_cache()
+            info_str="OOM, please enter a smaller resolution or smaller frame num"
+            return info_str, None, None, None
         batch_samples = batch_samples[0]
         os.makedirs(self.savedir, exist_ok=True)
         filename = prompt
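
The new sizing logic scales the clip so its longer side matches the requested resolution, snaps the shorter side down to a multiple of 64 before the center crop, and derives the latent noise shape by dividing both sides by 8; the try/except around adapter_guided_synthesis then turns an out-of-memory failure into an error message instead of a crashed Space. A minimal standalone sketch of that arithmetic (hypothetical helper name, mirroring the code above):

    import math

    def control_sizes(h, w, resolution=256, video_frames=16):
        # Clamp and snap the requested long-side resolution, as get_video does.
        resolution = max(64, min(512, resolution))
        resolution = (resolution // 64) * 64
        video_frames = min(64, video_frames)
        # Scale so the longer side equals the requested resolution.
        scale = (h if h > w else w) / resolution
        h, w = math.ceil(h / scale), math.ceil(w / scale)
        # Snap the shorter side down to a multiple of 64 for the center crop.
        if h > w:
            w = (w // 64) * 64
        else:
            h = (h // 64) * 64
        return (h, w), (h // 8, w // 8), video_frames  # crop size, latent (rh, rw), frames

    # A 1080x1920 clip at resolution=256 is cropped to 128x256, giving a 16x32 latent,
    # so the noise shape becomes [1, channels, 16, 16, 32] for 16 frames.
    print(control_sizes(1080, 1920, 256, 16))  # ((128, 256), (16, 32), 16)
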
videocrafter_test.py CHANGED

@@ -1,4 +1,5 @@
 import os
+import torch
 from omegaconf import OmegaConf
 
 from lvdm.samplers.ddim import DDIMSampler
@@ -29,6 +30,8 @@ class Text2Video():
         self.download_model()
         config_file = 'models/base_t2v/model_config.yaml'
         ckpt_path = 'models/base_t2v/model.ckpt'
+        if os.path.exists('/dev/shm/model.ckpt'):
+            ckpt_path='/dev/shm/model.ckpt'
         config = OmegaConf.load(config_file)
         self.lora_path_list = ['','models/videolora/lora_001_Loving_Vincent_style.ckpt',
                                'models/videolora/lora_002_frozenmovie_style.ckpt',
@@ -45,6 +48,7 @@ class Text2Video():
         self.origin_weight = None
 
     def get_prompt(self, input_text, steps=50, model_index=0, eta=1.0, cfg_scale=15.0, lora_scale=1.0):
+        torch.cuda.empty_cache()
         if steps > 60:
             steps = 60
         if model_index > 0:
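
Both constructors now prefer /dev/shm/model.ckpt when it exists, so the base checkpoint is read from RAM-backed shared memory rather than from disk, and get_prompt clears the CUDA cache at the start of each request. The commit only checks for the shared-memory copy; how it gets there is not shown, but a hypothetical one-time warm-up (assumed, not part of this commit) could look like:

    import os
    import shutil

    # Hypothetical warm-up: copy the base checkpoint into /dev/shm once so that
    # later VideoControl/Text2Video construction loads it from shared memory.
    if not os.path.exists('/dev/shm/model.ckpt'):
        shutil.copy('models/base_t2v/model.ckpt', '/dev/shm/model.ckpt')
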