Muhammad Taqi Raza committed · Commit 0f464ea · Parent(s): 79ff636

adding camera offsets values

Files changed:
- gradio_app.py (+19 -4)
- inference/v2v_data/demo.py (+27 -13)
- inference/v2v_data/inference.py (+5 -0)
- inference/v2v_data/models/utils.py (+1 -0)
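Taken together, the commit threads three new camera offsets (init_dx, init_dy, init_dz) from the Gradio UI, through the CLI of inference.py, into the initial camera-to-world pose built in demo.py. A minimal sketch of that idea, with only the names and matrix taken from the diff and everything else assumed:

import torch

def build_c2w_init(init_dx=0.0, init_dy=0.0, init_dz=0.0, device="cpu"):
    # Initial camera-to-world pose: the fixed rotation from the original code,
    # with the new offsets written into the translation column.
    return torch.tensor(
        [
            [-1.0, 0.0, 0.0, init_dx],
            [0.0, 1.0, 0.0, init_dy],
            [0.0, 0.0, -1.0, init_dz],
            [0.0, 0.0, 0.0, 1.0],
        ]
    ).to(device).unsqueeze(0)  # shape (1, 4, 4), matching the batched use in demo.py

# All-zero offsets reproduce the previous hard-coded matrix.
print(build_c2w_init().shape)  # torch.Size([1, 4, 4])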
gradio_app.py CHANGED

@@ -43,7 +43,9 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
         prompt, negative_prompt, refine_prompt,
         depth_inference_steps, depth_guidance_scale,
-        window_size, overlap, max_res, sample_size,
+        window_size, overlap, max_res, sample_size,
+        seed_input, height, width, aspect_ratio_inputs,
+        init_dx, init_dy, init_dz):  # ← NEW
 
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
@@ -58,6 +60,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         return f"Invalid target pose format. Use: θ φ r x y", None, None
     logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
     w, h = aspect_ratio_inputs.strip().split(",")
+    h_s, w_s = sample_size.strip().split(",")
 
     command = [
         "python", "/app/inference/v2v_data/inference.py",
@@ -84,11 +87,15 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         "--window_size", str(window_size),
         "--overlap", str(overlap),
         "--max_res", str(max_res),
-        "--sample_size",
+        "--sample_size", h_s.strip(), w_s.strip(),
         "--seed", str(seed_input),
         "--height", str(height),
         "--width", str(width),
-        "--target_aspect_ratio", w.strip(), h.strip()
+        "--target_aspect_ratio", w.strip(), h.strip(),
+        "--init_dx", str(init_dx),
+        "--init_dy", str(init_dy),
+        "--init_dz", str(init_dz),
+
     ]
 
     try:
@@ -169,6 +176,11 @@ with demo:
             pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
             fps_input = gr.Number(value=24, label="FPS")
             aspect_ratio_inputs=gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
+
+            init_dx = gr.Number(value=0.0, label="Start Camera Offset X")
+            init_dy = gr.Number(value=0.0, label="Start Camera Offset Y")
+            init_dz = gr.Number(value=0.0, label="Start Camera Offset Z")
+
             num_frames_input = gr.Number(value=49, label="Number of Frames")
             radius_input = gr.Number(value = 1.0, label="Radius Scale")
             mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
@@ -230,10 +242,13 @@ with demo:
             sampler_input, diff_guidance_input, diff_steps_input,
             prompt_input, neg_prompt_input, refine_prompt_input,
             depth_steps_input, depth_guidance_input,
-            window_input, overlap_input, maxres_input, sample_size,
+            window_input, overlap_input, maxres_input, sample_size,
+            seed_input, height, width, aspect_ratio_inputs,
+            init_dx, init_dy, init_dz  # ← NEW INPUTS
         ],
         outputs=[step1_video, step1_logs]
     )
+
     step2_button.click(
         inference,
         inputs=[
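For reference, this is roughly the subprocess command that get_anchor_video now assembles; the flag names come from the diff, while the concrete values and omitted arguments are placeholders:

# Illustrative command only; paths and numbers are placeholders, not from a real run.
command = [
    "python", "/app/inference/v2v_data/inference.py",
    # ... existing arguments ...
    "--sample_size", "384", "672",        # h_s, w_s split from the sample_size field
    "--seed", "42",
    "--height", "576",
    "--width", "1024",
    "--target_aspect_ratio", "2", "3",    # w, h split from the "2,3" textbox
    "--init_dx", "0.0",                   # new start-camera offsets, 0.0 by default
    "--init_dy", "0.0",
    "--init_dz", "0.0",
]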
inference/v2v_data/demo.py CHANGED

@@ -1,17 +1,17 @@
 import gc
 import os
 import torch
-
+import imageio
 import numpy as np
-
+
 from PIL import Image
 from models.utils import *
-
-import torch
 import torch.nn.functional as F
+from models.infer import DepthCrafterDemo
+
 
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 
 def get_center_crop_resolution(original_resoultion, height = 576, width = 1024,target_aspect_ratio=(2, 3)):
     target_h, target_w = target_aspect_ratio
@@ -68,8 +68,6 @@ def center_crop_to_ratio(tensor: torch.Tensor, resolution=(480, 720)):
 
     return tensor[:, :, top:top + crop_h, left:left + crop_w]
 
-import imageio
-import numpy as np
 
 def save_video_as_mp4(video_tensor, save_path, fps=24):
     """
@@ -589,18 +587,34 @@ class GetAnchorVideos:
                 .repeat(num_frames, 1, 1)
                 .to(opts.device)
             )
+
+            camera_x = getattr(opts, "init_dx", 0.0)
+            camera_y = getattr(opts, "init_dy", 0.0)
+            camera_z = getattr(opts, "init_dz", 0.0)
+
             c2w_init = (
                 torch.tensor(
                     [
-                        [-1.0, 0.0, 0.0, 0.0],
-                        [0.0, 1.0, 0.0, 0.0],
-                        [0.0, 0.0, -1.0, 0.0],
+                        [-1.0, 0.0, 0.0, camera_x],
+                        [0.0, 1.0, 0.0, camera_y],
+                        [0.0, 0.0, -1.0, camera_z],
                         [0.0, 0.0, 0.0, 1.0],
                     ]
-                )
-                .to(opts.device)
-                .unsqueeze(0)
+                ).to(opts.device).unsqueeze(0)
             )
+
+            # c2w_init = (
+            #     torch.tensor(
+            #         [
+            #             [-1.0, 0.0, 0.0, 0.0],
+            #             [0.0, 1.0, 0.0, 0.0],
+            #             [0.0, 0.0, -1.0, 0.0],
+            #             [0.0, 0.0, 0.0, 1.0],
+            #         ]
+            #     )
+            #     .to(opts.device)
+            #     .unsqueeze(0)
+            # )
 
             if opts.camera == 'target':
                 dtheta, dphi, dr, dx, dy = opts.target_pose
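The getattr calls keep demo.py working when the new options are absent, and the offsets land in the fourth column of c2w_init, i.e. they translate the starting camera before the trajectory is generated. A small self-contained check of that translation, with illustrative values:

import torch

camera_x, camera_y, camera_z = 0.1, 0.0, -0.2   # illustrative offsets
c2w_init = torch.tensor(
    [
        [-1.0, 0.0, 0.0, camera_x],
        [0.0, 1.0, 0.0, camera_y],
        [0.0, 0.0, -1.0, camera_z],
        [0.0, 0.0, 0.0, 1.0],
    ]
)

# The camera-space origin maps to the offset position in world space.
origin_world = c2w_init @ torch.tensor([0.0, 0.0, 0.0, 1.0])
print(origin_world[:3])  # tensor([ 0.1000,  0.0000, -0.2000])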
inference/v2v_data/inference.py CHANGED

@@ -189,7 +189,12 @@ def get_parser():
     parser.add_argument(
         '--max_res', type=int, default=1024, help='Maximum resolution for processing'
     )
+
     parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
+
+    parser.add_argument('--init_dx', type=float, default=0.0)
+    parser.add_argument('--init_dy', type=float, default=0.0)
+    parser.add_argument('--init_dz', type=float, default=0.0)
 
     return parser
 
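Because all three flags default to 0.0, existing invocations of inference.py behave exactly as before; only callers that pass the new flags get a shifted start camera. A quick sketch of the parsing behaviour:

import argparse

# Minimal sketch of the new flags added to get_parser().
parser = argparse.ArgumentParser()
parser.add_argument('--init_dx', type=float, default=0.0)
parser.add_argument('--init_dy', type=float, default=0.0)
parser.add_argument('--init_dz', type=float, default=0.0)

# Omitted flags fall back to 0.0, so older command lines are unaffected.
opts = parser.parse_args(['--init_dx', '0.1', '--init_dz', '-0.2'])
print(opts.init_dx, opts.init_dy, opts.init_dz)  # 0.1 0.0 -0.2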
inference/v2v_data/models/utils.py CHANGED

@@ -187,6 +187,7 @@ def generate_traj_specified(c2ws_anchor, theta, phi, d_r, d_x, d_y, frame, device
     rs = np.linspace(0, d_r, frame)
     xs = np.linspace(0, d_x, frame)
     ys = np.linspace(0, d_y, frame)
+
    c2ws_list = []
    for th, ph, r, x, y in zip(thetas, phis, rs, xs, ys):
        c2w_new = sphere2pose(