Muhammad Taqi Raza committed · Commit 0f464ea · Parent(s): 79ff636

adding camera offsets values

Files changed:
- gradio_app.py (+19 -4)
- inference/v2v_data/demo.py (+27 -13)
- inference/v2v_data/inference.py (+5 -0)
- inference/v2v_data/models/utils.py (+1 -0)
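Taken together, the commit threads three new camera offsets (init_dx, init_dy, init_dz) from the Gradio UI, through the CLI of inference.py, into the initial camera-to-world pose built in demo.py. A minimal sketch of that idea, with only the names and matrix taken from the diff and everything else assumed:

import torch

def build_c2w_init(init_dx=0.0, init_dy=0.0, init_dz=0.0, device="cpu"):
    # Initial camera-to-world pose: the fixed rotation from the original code,
    # with the new offsets written into the translation column.
    return torch.tensor(
        [
            [-1.0, 0.0, 0.0, init_dx],
            [0.0, 1.0, 0.0, init_dy],
            [0.0, 0.0, -1.0, init_dz],
            [0.0, 0.0, 0.0, 1.0],
        ]
    ).to(device).unsqueeze(0)  # shape (1, 4, 4), matching the batched use in demo.py

# All-zero offsets reproduce the previous hard-coded matrix.
print(build_c2w_init().shape)  # torch.Size([1, 4, 4])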
gradio_app.py CHANGED

@@ -43,7 +43,9 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         sampler_name, diffusion_guidance_scale, diffusion_inference_steps,
         prompt, negative_prompt, refine_prompt,
         depth_inference_steps, depth_guidance_scale,
-        window_size, overlap, max_res, sample_size,
+        window_size, overlap, max_res, sample_size,
+        seed_input, height, width, aspect_ratio_inputs,
+        init_dx, init_dy, init_dz):  # ← NEW
 
     temp_input_path = "/app/temp_input.mp4"
     output_dir = "/app/output_anchor"
@@ -58,6 +60,7 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         return f"Invalid target pose format. Use: θ φ r x y", None, None
     logs = f"Running inference with target pose: θ={theta}, φ={phi}, r={r}, x={x}, y={y}\n"
     w, h = aspect_ratio_inputs.strip().split(",")
+    h_s, w_s = sample_size.strip().split(",")
 
     command = [
         "python", "/app/inference/v2v_data/inference.py",
@@ -84,11 +87,15 @@ def get_anchor_video(video_path, fps, num_frames, target_pose, mode,
         "--window_size", str(window_size),
         "--overlap", str(overlap),
         "--max_res", str(max_res),
-        "--sample_size",
+        "--sample_size", h_s.strip(), w_s.strip(),
         "--seed", str(seed_input),
         "--height", str(height),
         "--width", str(width),
-        "--target_aspect_ratio", w.strip(), h.strip()
+        "--target_aspect_ratio", w.strip(), h.strip(),
+        "--init_dx", str(init_dx),
+        "--init_dy", str(init_dy),
+        "--init_dz", str(init_dz),
+
     ]
 
     try:
@@ -169,6 +176,11 @@ with demo:
             pose_input = gr.Textbox(label="Target Pose (θ φ r x y)", placeholder="e.g., 0 30 -0.6 0 0")
             fps_input = gr.Number(value=24, label="FPS")
             aspect_ratio_inputs=gr.Textbox(label="Target Aspect Ratio (e.g., 2,3)")
+
+            init_dx = gr.Number(value=0.0, label="Start Camera Offset X")
+            init_dy = gr.Number(value=0.0, label="Start Camera Offset Y")
+            init_dz = gr.Number(value=0.0, label="Start Camera Offset Z")
+
             num_frames_input = gr.Number(value=49, label="Number of Frames")
             radius_input = gr.Number(value = 1.0, label="Radius Scale")
             mode_input = gr.Dropdown(choices=["gradual"], value="gradual", label="Camera Mode")
@@ -230,10 +242,13 @@ with demo:
             sampler_input, diff_guidance_input, diff_steps_input,
             prompt_input, neg_prompt_input, refine_prompt_input,
             depth_steps_input, depth_guidance_input,
-            window_input, overlap_input, maxres_input, sample_size,
+            window_input, overlap_input, maxres_input, sample_size,
+            seed_input, height, width, aspect_ratio_inputs,
+            init_dx, init_dy, init_dz  # ← NEW INPUTS
         ],
         outputs=[step1_video, step1_logs]
     )
+
     step2_button.click(
         inference,
         inputs=[
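For reference, this is roughly the subprocess command that get_anchor_video now assembles; the flag names come from the diff, while the concrete values and omitted arguments are placeholders:

# Illustrative command only; paths and numbers are placeholders, not from a real run.
command = [
    "python", "/app/inference/v2v_data/inference.py",
    # ... existing arguments ...
    "--sample_size", "384", "672",        # h_s, w_s split from the sample_size field
    "--seed", "42",
    "--height", "576",
    "--width", "1024",
    "--target_aspect_ratio", "2", "3",    # w, h split from the "2,3" textbox
    "--init_dx", "0.0",                   # new start-camera offsets, 0.0 by default
    "--init_dy", "0.0",
    "--init_dz", "0.0",
]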
inference/v2v_data/demo.py CHANGED

@@ -1,17 +1,17 @@
 import gc
 import os
 import torch
-
+import imageio
 import numpy as np
-
+
 from PIL import Image
 from models.utils import *
-
-import torch
 import torch.nn.functional as F
+from models.infer import DepthCrafterDemo
+
 
-from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 
 def get_center_crop_resolution(original_resoultion, height = 576, width = 1024,target_aspect_ratio=(2, 3)):
     target_h, target_w = target_aspect_ratio
@@ -68,8 +68,6 @@ def center_crop_to_ratio(tensor: torch.Tensor, resolution=(480, 720)):
 
     return tensor[:, :, top:top + crop_h, left:left + crop_w]
 
-import imageio
-import numpy as np
 
 def save_video_as_mp4(video_tensor, save_path, fps=24):
     """
@@ -589,18 +587,34 @@ class GetAnchorVideos:
                 .repeat(num_frames, 1, 1)
                 .to(opts.device)
             )
+
+            camera_x = getattr(opts, "init_dx", 0.0)
+            camera_y = getattr(opts, "init_dy", 0.0)
+            camera_z = getattr(opts, "init_dz", 0.0)
+
             c2w_init = (
                 torch.tensor(
                     [
-                        [-1.0, 0.0, 0.0, 0.0],
-                        [0.0, 1.0, 0.0, 0.0],
-                        [0.0, 0.0, -1.0, 0.0],
+                        [-1.0, 0.0, 0.0, camera_x],
+                        [0.0, 1.0, 0.0, camera_y],
+                        [0.0, 0.0, -1.0, camera_z],
                         [0.0, 0.0, 0.0, 1.0],
                     ]
-                )
-                .to(opts.device)
-                .unsqueeze(0)
+                ).to(opts.device).unsqueeze(0)
             )
+
+            # c2w_init = (
+            #     torch.tensor(
+            #         [
+            #             [-1.0, 0.0, 0.0, 0.0],
+            #             [0.0, 1.0, 0.0, 0.0],
+            #             [0.0, 0.0, -1.0, 0.0],
+            #             [0.0, 0.0, 0.0, 1.0],
+            #         ]
+            #     )
+            #     .to(opts.device)
+            #     .unsqueeze(0)
+            # )
 
             if opts.camera == 'target':
                 dtheta, dphi, dr, dx, dy = opts.target_pose
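The getattr calls keep demo.py working when the new options are absent, and the offsets land in the fourth column of c2w_init, i.e. they translate the starting camera before the trajectory is generated. A small self-contained check of that translation, with illustrative values:

import torch

camera_x, camera_y, camera_z = 0.1, 0.0, -0.2   # illustrative offsets
c2w_init = torch.tensor(
    [
        [-1.0, 0.0, 0.0, camera_x],
        [0.0, 1.0, 0.0, camera_y],
        [0.0, 0.0, -1.0, camera_z],
        [0.0, 0.0, 0.0, 1.0],
    ]
)

# The camera-space origin maps to the offset position in world space.
origin_world = c2w_init @ torch.tensor([0.0, 0.0, 0.0, 1.0])
print(origin_world[:3])  # tensor([ 0.1000,  0.0000, -0.2000])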
inference/v2v_data/inference.py CHANGED

@@ -189,7 +189,12 @@ def get_parser():
     parser.add_argument(
         '--max_res', type=int, default=1024, help='Maximum resolution for processing'
     )
+
     parser.add_argument("--target_aspect_ratio", type=int, nargs=2, default=None)
+
+    parser.add_argument('--init_dx', type=float, default=0.0)
+    parser.add_argument('--init_dy', type=float, default=0.0)
+    parser.add_argument('--init_dz', type=float, default=0.0)
 
     return parser
 
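Because all three flags default to 0.0, existing invocations of inference.py behave exactly as before; only callers that pass the new flags get a shifted start camera. A quick sketch of the parsing behaviour:

import argparse

# Minimal sketch of the new flags added to get_parser().
parser = argparse.ArgumentParser()
parser.add_argument('--init_dx', type=float, default=0.0)
parser.add_argument('--init_dy', type=float, default=0.0)
parser.add_argument('--init_dz', type=float, default=0.0)

# Omitted flags fall back to 0.0, so older command lines are unaffected.
opts = parser.parse_args(['--init_dx', '0.1', '--init_dz', '-0.2'])
print(opts.init_dx, opts.init_dy, opts.init_dz)  # 0.1 0.0 -0.2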
inference/v2v_data/models/utils.py CHANGED

@@ -187,6 +187,7 @@ def generate_traj_specified(c2ws_anchor, theta, phi, d_r, d_x, d_y, frame, device
     rs = np.linspace(0, d_r, frame)
     xs = np.linspace(0, d_x, frame)
     ys = np.linspace(0, d_y, frame)
+
    c2ws_list = []
    for th, ph, r, x, y in zip(thetas, phis, rs, xs, ys):
        c2w_new = sphere2pose(