Spaces:
Running
on
Zero
Running
on
Zero
import gradio as gr | |
import os | |
import json | |
import numpy as np | |
import cv2 | |
import base64 | |
import time | |
import tempfile | |
import shutil | |
import glob | |
import threading | |
import subprocess | |
import struct | |
import zlib | |
from pathlib import Path | |
from einops import rearrange | |
from typing import List, Tuple, Union | |
try:
    import spaces
except ImportError:
    # Fallback for local development: when the `spaces` package cannot be
    # imported, the name `spaces` is bound to an identity decorator that
    # returns the wrapped function unchanged.
    def spaces(func):
        return func
import torch | |
import logging | |
from concurrent.futures import ThreadPoolExecutor | |
import atexit | |
import uuid | |
# Configure logging: INFO level via basicConfig, plus a module-level logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Import custom modules with error handling: an ImportError is logged and
# then re-raised, so module import still fails when they are missing.
try:
    from app_3rd.sam_utils.inference import SamPredictor, get_sam_predictor, run_inference
    from app_3rd.spatrack_utils.infer_track import get_tracker_predictor, run_tracker, get_points_on_a_grid
except ImportError as e:
    logger.error(f"Failed to import custom modules: {e}")
    raise
# Constants
# Maximum number of frames kept after temporal subsampling in gpu_run_tracker.
MAX_FRAMES = 80
# Marker colors indexed by point label (0 = negative, 1 = positive).
# Channel order is BGR: red for negative, yellow for positive. Code that draws
# on an RGB frame must reverse each tuple to get these colors.
COLORS = [(0, 0, 255), (0, 255, 255)]
# cv2 marker types indexed by point label. Per OpenCV's MarkerTypes enum,
# 1 is MARKER_TILTED_CROSS (negative) and 5 is MARKER_TRIANGLE_UP (positive).
MARKERS = [1, 5]
MARKER_SIZE = 8
# Thread pool for delayed deletion
thread_pool_executor = ThreadPoolExecutor(max_workers=2)
def delete_later(path: Union[str, os.PathLike], delay: int = 600):
    """Delete a file or directory after a delay (default 10 minutes).

    The deletion runs on its own daemon timer thread, so any number of
    pending deletions can wait concurrently and none of them keeps the
    interpreter alive at shutdown. The same deletion is also registered with
    ``atexit`` so that anything still pending is removed when the process
    exits; that registration is dropped again once the timer has fired.

    Args:
        path: File or directory to remove. A path that no longer exists is
            silently ignored.
        delay: Seconds to wait before deleting.
    """
    def _delete():
        # Best-effort cleanup: a failure is logged, never raised.
        try:
            if os.path.isfile(path):
                os.remove(path)
            elif os.path.isdir(path):
                shutil.rmtree(path)
        except Exception as e:
            logger.warning(f"Failed to delete {path}: {e}")

    def _delete_and_unregister():
        _delete()
        # Without this, one handler per call would accumulate for the whole
        # lifetime of the process.
        atexit.unregister(_delete)

    # Register before starting the timer so a very short delay cannot
    # unregister the handler before it has been registered.
    atexit.register(_delete)
    timer = threading.Timer(delay, _delete_and_unregister)
    timer.daemon = True
    timer.start()
def create_user_temp_dir():
    """Create and return a fresh scratch directory for one user session.

    The directory is ``temp_local/session_<id>``, where ``<id>`` is the first
    eight hex characters of a random UUID. It is handed to ``delete_later``
    with a 600 second (10 minute) delay.
    """
    short_id = uuid.uuid4().hex[:8]
    session_dir = os.path.join("temp_local", "session_" + short_id)
    os.makedirs(session_dir, exist_ok=True)
    delete_later(session_dir, delay=600)
    return session_dir
from huggingface_hub import hf_hub_download

# init the model
# Fetch the front-end checkpoint from the Hugging Face Hub and publish its
# local path through the VGGT_DIR environment variable, which
# gpu_run_tracker also consults.
os.environ["VGGT_DIR"] = hf_hub_download("Yuxihenry/SpatialTrackerCkpts", "spatrack_front.pth")  #, force_download=True)
if os.environ.get("VGGT_DIR", None) is not None:
    from models.vggt.vggt.models.vggt_moe import VGGT_MoE
    from models.vggt.vggt.utils.load_fn import preprocess_image
    vggt_model = VGGT_MoE()
    # map_location="cpu": deserialize the weights on the CPU regardless of the
    # device they were saved from. The freshly built model still lives on the
    # CPU here, so restoring tensors onto a GPU first would only add a
    # transient copy (or fail when that device is unavailable).
    vggt_model.load_state_dict(
        torch.load(os.environ.get("VGGT_DIR"), map_location="cpu"),
        strict=False,
    )
    vggt_model.eval()
    vggt_model = vggt_model.to("cuda")
# Global model initialization
print("🚀 Initializing local models...")
tracker_model, _ = get_tracker_predictor(".", vo_points=756)
predictor = get_sam_predictor()
print("✅ Models loaded successfully!")
# Register the _viz directory so Gradio may serve files from it.
gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
def gpu_run_inference(predictor_arg, image, points, boxes):
    """Run SAM inference after a best-effort move of the inputs to CUDA.

    Args:
        predictor_arg: SAM predictor to use. When ``None``, one is obtained
            from ``get_sam_predictor`` using the module-level ``predictor``.
        image: Image to segment; moved to CUDA only if it has a ``cuda``
            method.
        points: Point prompts, forwarded unchanged to ``run_inference``.
        boxes: Box prompts, forwarded unchanged to ``run_inference``.

    Returns:
        Whatever ``run_inference`` returns.
    """
    if predictor_arg is None:
        print("Initializing SAM predictor inside GPU function...")
        predictor_arg = get_sam_predictor(predictor=predictor)

    try:
        # Only the first matching handle is moved: `.model`, then `.sam`,
        # and finally the predictor object itself via `.to`.
        for attr_name in ('model', 'sam'):
            if hasattr(predictor_arg, attr_name):
                moved = getattr(predictor_arg, attr_name).cuda()
                setattr(predictor_arg, attr_name, moved)
                break
        else:
            if hasattr(predictor_arg, 'to'):
                predictor_arg = predictor_arg.to('cuda')

        if hasattr(image, 'cuda'):
            image = image.cuda()
    except Exception as e:
        # A failed device move is reported but does not abort inference.
        print(f"Warning: Could not move predictor to GPU: {e}")

    return run_inference(predictor_arg, image, points, boxes)
def gpu_run_tracker(tracker_model_arg, tracker_viser_arg, temp_dir, video_name, grid_size, vo_points, fps, mode="offline"):
    """Run the tracker on a session's video and save the results to disk.

    Reads ``<temp_dir>/<video_name>.mp4`` (and ``<video_name>.png`` as an
    optional object mask), runs the VGGT front end when the ``VGGT_DIR``
    environment variable is set, tracks a grid of query points, renders the
    2D track video through the visualizer and writes
    ``<temp_dir>/results/result.npz``.

    Args:
        tracker_model_arg: Tracker model; when this or ``tracker_viser_arg``
            is ``None`` both are obtained from ``get_tracker_predictor``.
        tracker_viser_arg: Track visualizer (see above).
        temp_dir: Session directory holding the inputs and the outputs.
        video_name: Base name (no extension) of the video and mask files.
        grid_size: Query grid resolution. Overridden with 10 when no mask
            file exists.
        vo_points: Forwarded to ``get_tracker_predictor``.
        fps: Temporal stride; every ``fps``-th frame is used.
        mode: Accepted for interface compatibility; not used in this body.

    Returns:
        None. Results are communicated through files in ``results/``.
    """
    import torchvision.transforms as T
    import decord

    # Setup paths
    video_path = os.path.join(temp_dir, f"{video_name}.mp4")
    mask_path = os.path.join(temp_dir, f"{video_name}.png")
    out_dir = os.path.join(temp_dir, "results")
    os.makedirs(out_dir, exist_ok=True)

    if tracker_model_arg is None or tracker_viser_arg is None:
        print("Initializing tracker models inside GPU function...")
        tracker_model_arg, tracker_viser_arg = get_tracker_predictor(out_dir, vo_points=vo_points, tracker_model=tracker_model)

    # Load video using decord. Only the frames that survive subsampling
    # (every `fps`-th frame, at most MAX_FRAMES of them) are decoded, which
    # avoids materializing and resizing the whole clip first.
    video_reader = decord.VideoReader(video_path)
    frame_step = max(1, int(fps))
    frame_indices = list(range(0, len(video_reader), frame_step))[:MAX_FRAMES]
    video_tensor = torch.from_numpy(video_reader.get_batch(frame_indices).asnumpy()).permute(0, 3, 1, 2)

    # Downscale so that the shorter side is 224; smaller videos are kept as is.
    h, w = video_tensor.shape[2:]
    scale = max(224 / h, 224 / w)
    if scale < 1:
        new_h, new_w = int(h * scale), int(w * scale)
        video_tensor = T.Resize((new_h, new_w))(video_tensor)
    video_tensor = video_tensor.float()

    # Move to GPU
    video_tensor = video_tensor.cuda()
    print(f"Video tensor shape: {video_tensor.shape}, device: {video_tensor.device}")

    depth_tensor = None
    intrs = None
    extrs = None
    # Bound up front so the tracker call below does not hit an unbound name
    # when the VGGT branch is skipped.
    # NOTE(review): confirm the tracker accepts unc_metric=None.
    unc_metric = None
    data_npz_load = {}

    # run vggt
    if os.environ.get("VGGT_DIR", None) is not None:
        # process the image tensor
        video_tensor = preprocess_image(video_tensor)[None]
        with torch.no_grad(), torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
            # Predict attributes including cameras, depth maps, and point maps.
            predictions = vggt_model(video_tensor.cuda()/255)
            extrinsic, intrinsic = predictions["poses_pred"], predictions["intrs"]
            depth_map, depth_conf = predictions["points_map"][..., 2], predictions["unc_metric"]

        depth_tensor = depth_map.squeeze().cpu().numpy()
        extrs = extrinsic.squeeze().cpu().numpy()
        intrs = intrinsic.squeeze().cpu().numpy()
        video_tensor = video_tensor.squeeze()
        # NOTE: 20% of the depth is not reliable
        # threshold = depth_conf.squeeze()[0].view(-1).quantile(0.6).item()
        unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5

    # Load and process mask
    if os.path.exists(mask_path):
        mask = cv2.imread(mask_path)
        mask = cv2.resize(mask, (video_tensor.shape[3], video_tensor.shape[2]))
        mask = mask.sum(axis=-1)>0
    else:
        mask = np.ones_like(video_tensor[0,0].cpu().numpy())>0
        # Without a mask the whole frame is tracked, on a fixed coarse grid.
        grid_size = 10

    # Get frame dimensions and create grid points
    frame_H, frame_W = video_tensor.shape[2:]
    grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cuda")

    # Sample mask values at grid points and filter
    if os.path.exists(mask_path):
        grid_pts_int = grid_pts[0].long()
        mask_values = mask[grid_pts_int.cpu()[...,1], grid_pts_int.cpu()[...,0]]
        grid_pts = grid_pts[:, mask_values]

    # Prepend a zero column to each grid point before passing them as queries.
    query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].cpu().numpy()
    print(f"Query points shape: {query_xyt.shape}")

    # Run model inference
    with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
        (
            c2w_traj, intrs, point_map, conf_depth,
            track3d_pred, track2d_pred, vis_pred, conf_pred, video
        ) = tracker_model_arg.forward(video_tensor, depth=depth_tensor,
                                      intrs=intrs, extrs=extrs,
                                      queries=query_xyt,
                                      fps=1, full_point=False, iters_track=4,
                                      query_no_BA=True, fixed_cam=False, stage=1, unc_metric=unc_metric,
                                      support_frame=len(video_tensor)-1, replace_ratio=0.2)

    # Resize results to avoid large I/O
    max_size = 224
    h, w = video.shape[2:]
    scale = min(max_size / h, max_size / w)
    if scale < 1:
        new_h, new_w = int(h * scale), int(w * scale)
        video = T.Resize((new_h, new_w))(video)
        video_tensor = T.Resize((new_h, new_w))(video_tensor)
        point_map = T.Resize((new_h, new_w))(point_map)
        # Pixel-space quantities must follow the image rescale.
        track2d_pred[...,:2] = track2d_pred[...,:2] * scale
        intrs[:,:2,:] = intrs[:,:2,:] * scale
        conf_depth = T.Resize((new_h, new_w))(conf_depth)

    # Visualize tracks
    tracker_viser_arg.visualize(video=video[None],
                                tracks=track2d_pred[None][...,:2],
                                visibility=vis_pred[None],filename="test")

    # Save in tapip3d format
    # coords: track points rotated and translated by c2w_traj.
    data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
    data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
    data_npz_load["intrinsics"] = intrs.cpu().numpy()
    data_npz_load["depths"] = point_map[:,2,...].cpu().numpy()
    data_npz_load["video"] = (video_tensor).cpu().numpy()/255
    data_npz_load["visibs"] = vis_pred.cpu().numpy()
    data_npz_load["confs"] = conf_pred.cpu().numpy()
    data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
    np.savez(os.path.join(out_dir, 'result.npz'), **data_npz_load)

    return None
def compress_and_write(filename, header, blob):
    """Write a length-prefixed JSON header followed by a binary payload.

    File layout: a 4-byte little-endian unsigned length, the UTF-8 JSON
    encoding of ``header`` (that many bytes), then ``blob`` verbatim. Despite
    the name, no compression happens here; ``blob`` is written as given.

    Args:
        filename: Destination path; overwritten if it exists.
        header: JSON-serializable object.
        blob: Bytes-like payload.
    """
    encoded_header = json.dumps(header).encode("utf-8")
    length_prefix = struct.pack("<I", len(encoded_header))
    with open(filename, "wb") as out:
        out.write(b"".join((length_prefix, encoded_header, blob)))
def process_point_cloud_data(npz_file, width=256, height=192, fps=4):
    """Convert a tracking result archive into a self-contained viewer page.

    Loads the ``.npz`` written by the tracker, resizes video and depth to
    ``(width, height)``, re-expresses cameras and trajectories relative to the
    first frame, packs everything into one zlib-compressed blob with a JSON
    header, and embeds that blob (base64) into a copy of
    ``./_viz/viz_template.html``.

    Args:
        npz_file: Path to an archive with ``extrinsics``, ``intrinsics``,
            ``coords``, ``video``, ``depths`` and optionally ``confs_depth``.
        width: Output frame width in pixels.
        height: Output frame height in pixels.
        fps: Value stored as ``baseFrameRate`` in the header metadata.

    Returns:
        Path of the generated HTML file inside ``./_viz``.
    """
    fixed_size = (width, height)

    # The context manager closes the archive's file handle once the arrays
    # have been read, so the file can be deleted later without a leaked
    # descriptor.
    with np.load(npz_file) as data:
        extrinsics = data["extrinsics"]
        intrinsics = data["intrinsics"]
        trajs = data["coords"]
        video = data["video"]
        depth_video = data["depths"].astype(np.float32)
        confs_depth = data["confs_depth"] if "confs_depth" in data.files else None

    T, C, H, W = video.shape

    # Field of view and aspect ratio are derived from the first frame's
    # focal lengths.
    fx = intrinsics[0, 0, 0]
    fy = intrinsics[0, 1, 1]
    fov_y = 2 * np.arctan(H / (2 * fy)) * (180 / np.pi)
    fov_x = 2 * np.arctan(W / (2 * fx)) * (180 / np.pi)
    original_aspect_ratio = (W / fx) / (H / fy)

    rgb_video = (rearrange(video, "T C H W -> T H W C") * 255).astype(np.uint8)
    rgb_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_AREA)
                          for frame in rgb_video])

    if confs_depth is not None:
        # Zero out depth wherever the confidence is 0.5 or lower.
        confs = (confs_depth.astype(np.float32) > 0.5).astype(np.float32)
        depth_video = depth_video * confs
    depth_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_NEAREST)
                            for frame in depth_video])

    # Intrinsics must follow the image rescale.
    scale_x = fixed_size[0] / W
    scale_y = fixed_size[1] / H
    intrinsics = intrinsics.copy()
    intrinsics[:, 0, :] *= scale_x
    intrinsics[:, 1, :] *= scale_y

    min_depth = float(depth_video.min()) * 0.8
    max_depth = float(depth_video.max()) * 1.5
    # Guard against an empty range (e.g. depth that is zero everywhere),
    # which would otherwise divide by zero and produce NaNs.
    depth_span = max_depth - min_depth
    if depth_span <= 0:
        depth_span = 1.0
    # Clip so that out-of-range values (possible when depths are negative)
    # cannot wrap around in the unsigned 16-bit cast below.
    depth_normalized = np.clip((depth_video - min_depth) / depth_span, 0.0, 1.0)
    depth_int = (depth_normalized * ((1 << 16) - 1)).astype(np.uint16)

    # 16-bit depth is split across two 8-bit channels: low byte in channel 0,
    # high byte in channel 1; channel 2 stays zero.
    depths_rgb = np.zeros((T, fixed_size[1], fixed_size[0], 3), dtype=np.uint8)
    depths_rgb[:, :, :, 0] = (depth_int & 0xFF).astype(np.uint8)
    depths_rgb[:, :, :, 1] = ((depth_int >> 8) & 0xFF).astype(np.uint8)

    # Left-multiply every extrinsic by the inverse of the first one.
    first_frame_inv = np.linalg.inv(extrinsics[0])
    normalized_extrinsics = first_frame_inv @ extrinsics

    # Apply the same rigid transform to every trajectory point at once:
    # p' = R p + t, computed in float64 like the homogeneous form it replaces.
    rotation = first_frame_inv[:3, :3].astype(np.float64)
    translation = first_frame_inv[:3, 3].astype(np.float64)
    normalized_trajs = (trajs.astype(np.float64) @ rotation.T + translation).astype(trajs.dtype)

    arrays = {
        "rgb_video": rgb_video,
        "depths_rgb": depths_rgb,
        "intrinsics": intrinsics,
        "extrinsics": normalized_extrinsics,
        "inv_extrinsics": np.linalg.inv(normalized_extrinsics),
        "trajectories": normalized_trajs.astype(np.float32),
        "cameraZ": 0.0
    }

    # Concatenate the raw array bytes and record where each one lives.
    header = {}
    blob_parts = []
    offset = 0
    for key, arr in arrays.items():
        arr = np.ascontiguousarray(arr)
        arr_bytes = arr.tobytes()
        header[key] = {
            "dtype": str(arr.dtype),
            "shape": arr.shape,
            "offset": offset,
            "length": len(arr_bytes)
        }
        blob_parts.append(arr_bytes)
        offset += len(arr_bytes)

    raw_blob = b"".join(blob_parts)
    compressed_blob = zlib.compress(raw_blob, level=9)

    header["meta"] = {
        "depthRange": [min_depth, max_depth],
        "totalFrames": int(T),
        "resolution": fixed_size,
        "baseFrameRate": fps,
        "numTrajectoryPoints": int(normalized_trajs.shape[1]),
        "fov": float(fov_y),
        "fov_x": float(fov_x),
        "original_aspect_ratio": float(original_aspect_ratio),
        "fixed_aspect_ratio": float(fixed_size[0]/fixed_size[1])
    }

    # Assemble the container in memory (4-byte little-endian header length,
    # JSON header, compressed blob). A shared on-disk scratch file would be
    # clobbered when two sessions run this function at the same time.
    header_bytes = json.dumps(header).encode("utf-8")
    container = struct.pack("<I", len(header_bytes)) + header_bytes + compressed_blob
    encoded_blob = base64.b64encode(container).decode("ascii")

    random_path = f'./_viz/_{time.time()}.html'
    with open('./_viz/viz_template.html') as f:
        html_template = f.read()
    # The payload is injected as a global right after the opening <head> tag.
    html_out = html_template.replace(
        "<head>",
        f"<head>\n<script>window.embeddedBase64 = `{encoded_blob}`;</script>"
    )
    with open(random_path,'w') as f:
        f.write(html_out)

    return random_path
def numpy_to_base64(arr):
    """Return the raw bytes of ``arr`` encoded as a base64 text string.

    Shape and dtype are not part of the encoding; callers must keep them
    separately to rebuild the array.
    """
    raw_bytes = arr.tobytes()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')
def base64_to_numpy(b64_str, shape, dtype):
    """Rebuild an array from base64 text plus its shape and dtype.

    The result is a view over the decoded bytes (via ``np.frombuffer``), so it
    is read-only; copy it before modifying.
    """
    decoded = base64.b64decode(b64_str)
    flat = np.frombuffer(decoded, dtype=dtype)
    return flat.reshape(shape)
def get_video_name(video_path):
    """Return the file name of ``video_path`` without directory or extension."""
    file_name = os.path.basename(video_path)
    stem, _extension = os.path.splitext(file_name)
    return stem
def extract_first_frame(video_path):
    """Return the first frame of a video as an RGB array, or None on failure.

    Args:
        video_path: Path of the video file to open.

    Returns:
        The first frame converted from OpenCV's BGR order to RGB, or ``None``
        when no frame could be read or any error occurred (the error is
        printed, not raised).
    """
    try:
        cap = cv2.VideoCapture(video_path)
        try:
            ret, frame = cap.read()
        finally:
            # Release the capture handle even if reading raised.
            cap.release()
        if not ret:
            return None
        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    except Exception as e:
        print(f"Error extracting first frame: {e}")
        return None
def handle_video_upload(video):
    """Store an uploaded video in a session directory and prepare its first frame.

    Args:
        video: Either a path string or a file-like object exposing ``name``
            and ``read()``; ``None`` means nothing was uploaded.

    Returns:
        A 6-tuple ``(state_json, frame, points, grid_size_update,
        vo_points_update, fps_update)``. ``state_json`` is a JSON string
        holding the base64 frame plus session metadata, ``frame`` is the
        resized RGB first frame and ``points`` is an empty list. When there is
        no video or no frame can be read, the first two entries are ``None``
        and the sliders are reset to 50 / 756 / 3.
    """
    def _empty_outputs():
        # Shared result for "no video" and "unreadable video".
        return (None, None, [],
                gr.update(value=50),
                gr.update(value=756),
                gr.update(value=3))

    if video is None:
        return _empty_outputs()

    # Create user-specific temporary directory
    user_temp_dir = create_user_temp_dir()

    # Get original video name and copy to temp directory
    given_as_path = isinstance(video, str)
    video_name = get_video_name(video if given_as_path else video.name)
    video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
    if given_as_path:
        shutil.copy(video, video_path)
    else:
        with open(video_path, 'wb') as f:
            f.write(video.read())
    print(f"📁 Video saved to: {video_path}")

    # Extract first frame
    frame = extract_first_frame(video_path)
    if frame is None:
        return _empty_outputs()

    # Resize frame to have minimum side length of 336, rounding both sides
    # down to even numbers.
    src_h, src_w = frame.shape[:2]
    scale = 336 / min(src_h, src_w)
    new_h = int(src_h * scale) // 2 * 2
    new_w = int(src_w * scale) // 2 * 2
    frame = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    # Store frame data with temp directory info
    frame_data = {
        'data': numpy_to_base64(frame),
        'shape': frame.shape,
        'dtype': str(frame.dtype),
        'temp_dir': user_temp_dir,
        'video_name': video_name,
        'video_path': video_path
    }

    # Get video-specific settings
    print(f"🎬 Video path: '{video}' -> Video name: '{video_name}'")
    grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
    print(f"🎬 Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")

    slider_updates = (gr.update(value=grid_size_val),
                      gr.update(value=vo_points_val),
                      gr.update(value=fps_val))
    return (json.dumps(frame_data), frame, [], *slider_updates)
def save_masks(o_masks, video_name, temp_dir):
    """Write binary masks as PNG into the session directory.

    Every mask goes to the same path, ``<temp_dir>/<video_name>.png``, so a
    later mask overwrites an earlier one.

    Args:
        o_masks: Iterable of ``(mask, extra)`` pairs; only ``mask`` is used.
        video_name: Base name for the PNG file.
        temp_dir: Directory to write into.

    Returns:
        A list with one entry (the output path) per mask written.
    """
    mask_path = os.path.join(temp_dir, f"{video_name}.png")
    written = []
    for mask, _unused in o_masks:
        # Scale the mask to 0/255 before saving as 8-bit.
        cv2.imwrite(mask_path, np.uint8(mask.squeeze() * 255))
        written.append(mask_path)
    return written
def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
    """Handle a click on the frame: add a SAM prompt point and redraw.

    Args:
        original_img: JSON state produced by ``handle_video_upload`` (base64
            frame plus session metadata), or ``None``.
        sel_pix: Points selected so far, as ``(point, label)`` pairs.
        point_type: ``'positive_point'`` gives label 1; anything else label 0.
        evt: Gradio select event; ``evt.index`` is the clicked pixel.

    Returns:
        ``(display_img, points)``: the frame with markers and mask overlay,
        and the updated point list. ``(None, [])`` when there is no frame or
        an error occurred (the error is printed).
    """
    if original_img is None:
        return None, []

    try:
        # Convert stored image data back to numpy array
        frame_data = json.loads(original_img)
        original_img_array = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
        temp_dir = frame_data.get('temp_dir', 'temp_local')
        video_name = frame_data.get('video_name', 'video')

        # Create a display image for visualization
        display_img = original_img_array.copy()
        new_sel_pix = sel_pix.copy() if sel_pix else []
        new_sel_pix.append((evt.index, 1 if point_type == 'positive_point' else 0))

        print(f"🎯 Running SAM inference for point: {evt.index}, type: {point_type}")
        # Run SAM inference
        o_masks = gpu_run_inference(None, original_img_array, new_sel_pix, [])

        # Draw points on display image
        for point, label in new_sel_pix:
            # COLORS is declared in BGR order, but the stored frame is RGB
            # (extract_first_frame converts it), so the tuple is reversed to
            # get the documented red / yellow markers.
            rgb_color = COLORS[label][::-1]
            marker_pos = tuple(int(coord) for coord in point)
            cv2.drawMarker(display_img, marker_pos, rgb_color, markerType=MARKERS[label], markerSize=MARKER_SIZE, thickness=2)

        # Draw mask overlay on display image
        if o_masks:
            mask = o_masks[0][0]
            overlay = display_img.copy()
            overlay[mask.squeeze()!=0] = [20, 60, 200]  # Blue tint (RGB)
            display_img = cv2.addWeighted(overlay, 0.6, display_img, 0.4, 0)
            # Save mask for tracking
            save_masks(o_masks, video_name, temp_dir)
            print(f"✅ Mask saved for video: {video_name}")

        return display_img, new_sel_pix
    except Exception as e:
        print(f"❌ Error in select_point: {e}")
        return None, []
def reset_points(original_img: str, sel_pix):
    """Discard all selected points and delete the session's saved masks.

    Args:
        original_img: JSON state produced by ``handle_video_upload``, or
            ``None``.
        sel_pix: Current point list; ignored, the result is always empty.

    Returns:
        ``(display_img, [])`` with a copy of the unannotated frame, or
        ``(None, [])`` when there is no frame or an error occurred (the error
        is printed).
    """
    if original_img is None:
        return None, []

    try:
        # Rebuild the clean frame from the stored state.
        state = json.loads(original_img)
        clean_frame = base64_to_numpy(state['data'], state['shape'], state['dtype'])
        session_dir = state.get('temp_dir', 'temp_local')
        display_img = clean_frame.copy()

        # Remove every PNG in the session directory.
        png_pattern = os.path.join(session_dir, "*.png")
        for mask_file in glob.glob(png_pattern):
            try:
                os.remove(mask_file)
            except Exception as e:
                logger.warning(f"Failed to remove mask file {mask_file}: {e}")

        print("🔄 Points and masks reset")
        return display_img, []
    except Exception as e:
        print(f"❌ Error in reset_points: {e}")
        return None, []
def launch_viz(grid_size, vo_points, fps, original_image_state, mode="offline"):
    """Run tracking for the current session and build the 3D viewer embed.

    Args:
        grid_size: Query grid resolution, forwarded to ``gpu_run_tracker``.
        vo_points: Forwarded to ``gpu_run_tracker``.
        fps: Temporal stride, forwarded to ``gpu_run_tracker``.
        original_image_state: JSON state produced by ``handle_video_upload``,
            or ``None``.
        mode: Forwarded to ``gpu_run_tracker``.

    Returns:
        ``(html, track_video_path, html_path)``. On success ``html`` is an
        iframe snippet pointing at the generated page; ``track_video_path`` is
        ``None`` when no 2D track video was produced. On failure ``html`` is
        an error message and the other two entries are ``None``. With no state
        at all, ``(None, None, None)``.
    """
    if original_image_state is None:
        return None, None, None

    try:
        # Get user's temp directory from stored frame data
        frame_data = json.loads(original_image_state)
        temp_dir = frame_data.get('temp_dir', 'temp_local')
        video_name = frame_data.get('video_name', 'video')

        print(f"🚀 Starting tracking for video: {video_name}")
        print(f"📊 Parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")

        # The tracker locates the video and the optional mask itself; here we
        # only verify that the session directory holds a video at all.
        if not glob.glob(os.path.join(temp_dir, "*.mp4")):
            print("❌ No video file found")
            return "❌ Error: No video file found", None, None

        # Run tracker
        print("🎯 Running tracker...")
        out_dir = os.path.join(temp_dir, "results")
        os.makedirs(out_dir, exist_ok=True)
        gpu_run_tracker(None, None, temp_dir, video_name, grid_size, vo_points, fps, mode=mode)

        # Process results
        npz_path = os.path.join(out_dir, "result.npz")
        track2d_video = os.path.join(out_dir, "test_pred_track.mp4")
        if not os.path.exists(npz_path):
            print("❌ Tracking failed - no results generated")
            return "❌ Error: Tracking failed to generate results", None, None

        print("📊 Processing 3D visualization...")
        html_path = process_point_cloud_data(npz_path)

        # Schedule deletion of generated files
        has_track_video = os.path.exists(track2d_video)
        delete_later(html_path, delay=600)
        if has_track_video:
            delete_later(track2d_video, delay=600)
        delete_later(npz_path, delay=600)

        # Create iframe HTML
        iframe_html = f"""
        <div style='border: 3px solid #667eea; border-radius: 10px;
        background: #f8f9ff; height: 650px; width: 100%;
        box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);
        margin: 0; padding: 0; box-sizing: border-box; overflow: hidden;'>
        <iframe id="viz_iframe" src="/gradio_api/file={html_path}"
        width="100%" height="650" frameborder="0"
        style="border: none; display: block; width: 100%; height: 650px;
        margin: 0; padding: 0; border-radius: 7px;">
        </iframe>
        </div>
        """
        print("✅ Tracking completed successfully!")
        return iframe_html, track2d_video if has_track_video else None, html_path
    except Exception as e:
        print(f"❌ Error in launch_viz: {e}")
        return f"❌ Error: {str(e)}", None, None
def clear_all():
    """Return reset values for the UI outputs.

    Returns:
        ``(None, None, [], grid_update, vo_update, fps_update)`` where the
        three updates set the sliders to 50, 756 and 3. No files are touched.
    """
    slider_defaults = (
        gr.update(value=50),
        gr.update(value=756),
        gr.update(value=3),
    )
    return (None, None, [], *slider_defaults)
def clear_all_with_download():
    """Return reset values for the UI outputs plus both download components.

    Returns:
        An 8-tuple: ``None, None, []``, three slider updates (50, 756, 3),
        then ``None`` for the tracking-video download and ``None`` for the
        HTML download.
    """
    slider_defaults = (
        gr.update(value=50),
        gr.update(value=756),
        gr.update(value=3),
    )
    tracking_video_download = None
    html_download = None
    return (None, None, [], *slider_defaults,
            tracking_video_download, html_download)
def get_video_settings(video_name):
    """Look up preset tracking parameters for a known example video.

    Args:
        video_name: Video base name without extension.

    Returns:
        ``(grid_size, vo_points, fps)`` for a known name, otherwise the
        default ``(50, 756, 3)``.
    """
    # name -> (grid_size, vo_points, fps)
    presets = {
        "running": (50, 512, 2),
        "backpack": (40, 600, 2),
        "kitchen": (60, 800, 3),
        "pillow": (35, 500, 2),
        "handwave": (35, 500, 8),
        "hockey": (45, 700, 2),
        "drifting": (35, 1000, 6),
        "basketball": (45, 1500, 5),
        "ken_block_0": (45, 700, 2),
        "ego_kc1": (45, 500, 4),
        "vertical_place": (45, 500, 3),
        "ego_teaser": (45, 1200, 10),
        "robot_unitree": (45, 500, 4),
        "robot_3": (35, 400, 5),
        "teleop2": (45, 256, 7),
        "pusht": (45, 256, 10),
        "cinema_0": (45, 356, 5),
        "cinema_1": (45, 756, 3),
        "robot1": (45, 600, 2),
        "robot2": (45, 600, 2),
        "protein": (45, 600, 2),
        "kitchen_egocentric": (45, 600, 2),
    }
    if video_name in presets:
        return presets[video_name]
    return (50, 756, 3)
# Create the Gradio interface | |
print("🎨 Creating Gradio interface...") | |
with gr.Blocks( | |
theme=gr.themes.Soft(), | |
title="🎯 [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)", | |
css=""" | |
.gradio-container { | |
max-width: 1200px !important; | |
margin: auto !important; | |
} | |
.gr-button { | |
margin: 5px; | |
} | |
.gr-form { | |
background: white; | |
border-radius: 10px; | |
padding: 20px; | |
box-shadow: 0 2px 10px rgba(0,0,0,0.1); | |
} | |
/* 移除 gr.Group 的默认灰色背景 */ | |
.gr-form { | |
background: transparent !important; | |
border: none !important; | |
box-shadow: none !important; | |
padding: 0 !important; | |
} | |
/* 固定3D可视化器尺寸 */ | |
#viz_container { | |
height: 650px !important; | |
min-height: 650px !important; | |
max-height: 650px !important; | |
width: 100% !important; | |
margin: 0 !important; | |
padding: 0 !important; | |
overflow: hidden !important; | |
} | |
#viz_container > div { | |
height: 650px !important; | |
min-height: 650px !important; | |
max-height: 650px !important; | |
width: 100% !important; | |
margin: 0 !important; | |
padding: 0 !important; | |
box-sizing: border-box !important; | |
} | |
#viz_container iframe { | |
height: 650px !important; | |
min-height: 650px !important; | |
max-height: 650px !important; | |
width: 100% !important; | |
border: none !important; | |
display: block !important; | |
margin: 0 !important; | |
padding: 0 !important; | |
box-sizing: border-box !important; | |
} | |
/* 固定视频上传组件高度 */ | |
.gr-video { | |
height: 300px !important; | |
min-height: 300px !important; | |
max-height: 300px !important; | |
} | |
.gr-video video { | |
height: 260px !important; | |
max-height: 260px !important; | |
object-fit: contain !important; | |
background: #f8f9fa; | |
} | |
.gr-video .gr-video-player { | |
height: 260px !important; | |
max-height: 260px !important; | |
} | |
/* 强力移除examples的灰色背景 - 使用更通用的选择器 */ | |
.horizontal-examples, | |
.horizontal-examples > *, | |
.horizontal-examples * { | |
background: transparent !important; | |
background-color: transparent !important; | |
border: none !important; | |
} | |
/* Examples组件水平滚动样式 */ | |
.horizontal-examples [data-testid="examples"] { | |
background: transparent !important; | |
background-color: transparent !important; | |
} | |
.horizontal-examples [data-testid="examples"] > div { | |
background: transparent !important; | |
background-color: transparent !important; | |
overflow-x: auto !important; | |
overflow-y: hidden !important; | |
scrollbar-width: thin; | |
scrollbar-color: #667eea transparent; | |
padding: 0 !important; | |
margin-top: 10px; | |
border: none !important; | |
} | |
.horizontal-examples [data-testid="examples"] table { | |
display: flex !important; | |
flex-wrap: nowrap !important; | |
min-width: max-content !important; | |
gap: 15px !important; | |
padding: 10px 0; | |
background: transparent !important; | |
border: none !important; | |
} | |
.horizontal-examples [data-testid="examples"] tbody { | |
display: flex !important; | |
flex-direction: row !important; | |
flex-wrap: nowrap !important; | |
gap: 15px !important; | |
background: transparent !important; | |
} | |
.horizontal-examples [data-testid="examples"] tr { | |
display: flex !important; | |
flex-direction: column !important; | |
min-width: 160px !important; | |
max-width: 160px !important; | |
margin: 0 !important; | |
background: white !important; | |
border-radius: 12px; | |
box-shadow: 0 3px 12px rgba(0,0,0,0.12); | |
transition: all 0.3s ease; | |
cursor: pointer; | |
overflow: hidden; | |
border: none !important; | |
} | |
.horizontal-examples [data-testid="examples"] tr:hover { | |
transform: translateY(-4px); | |
box-shadow: 0 8px 20px rgba(102, 126, 234, 0.25); | |
} | |
.horizontal-examples [data-testid="examples"] td { | |
text-align: center !important; | |
padding: 0 !important; | |
border: none !important; | |
background: transparent !important; | |
} | |
.horizontal-examples [data-testid="examples"] td:first-child { | |
padding: 0 !important; | |
background: transparent !important; | |
} | |
.horizontal-examples [data-testid="examples"] video { | |
border-radius: 8px 8px 0 0 !important; | |
width: 100% !important; | |
height: 90px !important; | |
object-fit: cover !important; | |
background: #f8f9fa !important; | |
} | |
.horizontal-examples [data-testid="examples"] td:last-child { | |
font-size: 11px !important; | |
font-weight: 600 !important; | |
color: #333 !important; | |
padding: 8px 12px !important; | |
background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%) !important; | |
border-radius: 0 0 8px 8px; | |
} | |
/* 滚动条样式 */ | |
.horizontal-examples [data-testid="examples"] > div::-webkit-scrollbar { | |
height: 8px; | |
} | |
.horizontal-examples [data-testid="examples"] > div::-webkit-scrollbar-track { | |
background: transparent; | |
border-radius: 4px; | |
} | |
.horizontal-examples [data-testid="examples"] > div::-webkit-scrollbar-thumb { | |
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); | |
border-radius: 4px; | |
} | |
.horizontal-examples [data-testid="examples"] > div::-webkit-scrollbar-thumb:hover { | |
background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%); | |
} | |
""" | |
) as demo: | |
# Add prominent main title | |
gr.Markdown(""" | |
# ✨ SpatialTrackerV2 | |
Welcome to [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)! This interface allows you to track any pixels in 3D using our model. | |
For full information, please refer to the [official website](https://spatialtracker.github.io/), and [ICCV2025 paper](https://github.com/henry123-boy/SpaTrackerV2). | |
Please cite our paper and give us a star 🌟 if you find this project useful! | |
**⚡ Quick Start:** Upload video → Click "Start Tracking Now!" | |
**🔬 Advanced Usage with SAM:** | |
1. Upload a video file or select from examples below | |
2. Expand "Manual Point Selection" to click on specific objects for SAM-guided tracking | |
3. Adjust tracking parameters for optimal performance | |
4. Click "Start Tracking Now!" to begin 3D tracking with SAM guidance | |
""") | |
# Status indicator | |
gr.Markdown("**Status:** 🟢 Local Processing Mode") | |
# Main content area - video upload left, 3D visualization right | |
with gr.Row(): | |
with gr.Column(scale=1): | |
# Video upload section | |
gr.Markdown("### 📂 Select Video") | |
# Define video_input here so it can be referenced in examples | |
video_input = gr.Video( | |
label="Upload Video or Select Example", | |
format="mp4", | |
height=250 # Matched height with 3D viz | |
) | |
# Traditional examples but with horizontal scroll styling | |
gr.Markdown("🎨**Examples:** (scroll horizontally to see all videos)") | |
with gr.Row(elem_classes=["horizontal-examples"]): | |
# Horizontal video examples with slider | |
# gr.HTML("<div style='margin-top: 5px;'></div>") | |
gr.Examples( | |
examples=[ | |
["./examples/robot1.mp4"], | |
["./examples/robot2.mp4"], | |
["./examples/protein.mp4"], | |
["./examples/kitchen_egocentric.mp4"], | |
["./examples/hockey.mp4"], | |
["./examples/running.mp4"], | |
["./examples/robot_3.mp4"], | |
["./examples/backpack.mp4"], | |
["./examples/kitchen.mp4"], | |
["./examples/pillow.mp4"], | |
["./examples/handwave.mp4"], | |
["./examples/drifting.mp4"], | |
["./examples/basketball.mp4"], | |
["./examples/ken_block_0.mp4"], | |
["./examples/ego_kc1.mp4"], | |
["./examples/vertical_place.mp4"], | |
["./examples/ego_teaser.mp4"], | |
["./examples/robot_unitree.mp4"], | |
["./examples/teleop2.mp4"], | |
["./examples/pusht.mp4"], | |
["./examples/cinema_0.mp4"], | |
["./examples/cinema_1.mp4"], | |
], | |
inputs=[video_input], | |
outputs=[video_input], | |
fn=None, | |
cache_examples=False, | |
label="", | |
examples_per_page=6 # Show 6 examples per page so they can wrap to multiple rows | |
) | |
with gr.Column(scale=2): | |
# 3D Visualization - wider and taller to match left side | |
with gr.Group(): | |
gr.Markdown("### 🌐 3D Trajectory Visualization") | |
viz_html = gr.HTML( | |
label="3D Trajectory Visualization", | |
value=""" | |
<div style='border: 3px solid #667eea; border-radius: 10px; | |
background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%); | |
text-align: center; height: 650px; display: flex; | |
flex-direction: column; justify-content: center; align-items: center; | |
box-shadow: 0 4px 16px rgba(102, 126, 234, 0.15); | |
margin: 0; padding: 20px; box-sizing: border-box;'> | |
<div style='font-size: 56px; margin-bottom: 25px;'>🌐</div> | |
<h3 style='color: #667eea; margin-bottom: 18px; font-size: 28px; font-weight: 600;'> | |
3D Trajectory Visualization | |
</h3> | |
<p style='color: #666; font-size: 18px; line-height: 1.6; max-width: 550px; margin-bottom: 30px;'> | |
Track any pixels in 3D space with camera motion | |
</p> | |
<div style='background: rgba(102, 126, 234, 0.1); border-radius: 30px; | |
padding: 15px 30px; border: 1px solid rgba(102, 126, 234, 0.2);'> | |
<span style='color: #667eea; font-weight: 600; font-size: 16px;'> | |
⚡ Powered by SpatialTracker V2 | |
</span> | |
</div> | |
</div> | |
""", | |
elem_id="viz_container" | |
) | |
# Start button section - below video area | |
with gr.Row(): | |
with gr.Column(scale=3): | |
launch_btn = gr.Button("🚀 Start Tracking Now!", variant="primary", size="lg") | |
with gr.Column(scale=1): | |
clear_all_btn = gr.Button("🗑️ Clear All", variant="secondary", size="sm") | |
# Tracking parameters: a heading row, then one row holding the three sliders
# whose values are passed to the tracking launch handler.
with gr.Row():
    gr.Markdown(value="### ⚙️ Tracking Parameters")
with gr.Row():
    grid_size = gr.Slider(
        label="Grid Size",
        info="Tracking detail level",
        minimum=10,
        maximum=100,
        step=10,
        value=50,
    )
    # NOTE(review): the default 756 is not reachable from minimum=100 in
    # steps of 50 -- confirm whether the step or the default is intended.
    vo_points = gr.Slider(
        label="VO Points",
        info="Motion accuracy",
        minimum=100,
        maximum=2000,
        step=50,
        value=756,
    )
    fps = gr.Slider(
        label="FPS",
        info="Processing speed",
        minimum=1,
        maximum=20,
        step=1,
        value=3,
    )
# Advanced Point Selection with SAM - Collapsed by default | |
with gr.Row(): | |
gr.Markdown("### 🎯 Advanced: Manual Point Selection with SAM") | |
with gr.Accordion("🔬 SAM Point Selection Controls", open=False): | |
gr.HTML(""" | |
<div style='margin-bottom: 15px;'> | |
<ul style='color: #4a5568; font-size: 14px; line-height: 1.6; margin: 0; padding-left: 20px;'> | |
<li>Click on target objects in the image for SAM-guided segmentation</li> | |
<li>Positive points: include these areas | Negative points: exclude these areas</li> | |
<li>Get more accurate 3D tracking results with SAM's powerful segmentation</li> | |
</ul> | |
</div> | |
""") | |
with gr.Row(): | |
with gr.Column(): | |
interactive_frame = gr.Image( | |
label="Click to select tracking points with SAM guidance", | |
type="numpy", | |
interactive=True, | |
height=300 | |
) | |
with gr.Row(): | |
point_type = gr.Radio( | |
choices=["positive_point", "negative_point"], | |
value="positive_point", | |
label="Point Type", | |
info="Positive: track these areas | Negative: avoid these areas" | |
) | |
with gr.Row(): | |
reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary", size="sm") | |
# Download widgets. Both the row and the two file components are created
# with visible=False; they are still bound to names because the click
# handlers wired further down list them as outputs.
with gr.Row(visible=False):
    with gr.Column(scale=1):
        tracking_video_download = gr.File(
            visible=False,
            interactive=False,
            label="📹 Download 2D Tracking Video",
        )
    with gr.Column(scale=1):
        html_download = gr.File(
            visible=False,
            interactive=False,
            label="📄 Download 3D Visualization HTML",
        )
# GitHub Star Section | |
gr.HTML(""" | |
<div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%); | |
border-radius: 8px; padding: 20px; margin: 15px 0; | |
box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1); | |
border: 1px solid rgba(102, 126, 234, 0.15);'> | |
<div style='text-align: center;'> | |
<h3 style='color: #4a5568; margin: 0 0 10px 0; font-size: 18px; font-weight: 600;'> | |
⭐ Love SpatialTracker? Give us a Star! ⭐ | |
</h3> | |
<p style='color: #666; margin: 0 0 15px 0; font-size: 14px; line-height: 1.5;'> | |
Help us grow by starring our repository on GitHub! Your support means a lot to the community. 🚀 | |
</p> | |
<a href="https://github.com/henry123-boy/SpaTrackerV2" target="_blank" | |
style='display: inline-flex; align-items: center; gap: 8px; | |
background: rgba(102, 126, 234, 0.1); color: #4a5568; | |
padding: 10px 20px; border-radius: 25px; text-decoration: none; | |
font-weight: bold; font-size: 14px; border: 1px solid rgba(102, 126, 234, 0.2); | |
transition: all 0.3s ease;' | |
onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-2px)'" | |
onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'"> | |
<span style='font-size: 16px;'>⭐</span> | |
Star SpatialTracker V2 on GitHub | |
</a> | |
</div> | |
</div> | |
""") | |
# Acknowledgments Section | |
gr.HTML(""" | |
<div style='background: linear-gradient(135deg, #fff8e1 0%, #fffbf0 100%); | |
border-radius: 8px; padding: 20px; margin: 15px 0; | |
box-shadow: 0 2px 8px rgba(255, 193, 7, 0.1); | |
border: 1px solid rgba(255, 193, 7, 0.2);'> | |
<div style='text-align: center;'> | |
<h3 style='color: #5d4037; margin: 0 0 10px 0; font-size: 18px; font-weight: 600;'> | |
📚 Acknowledgments | |
</h3> | |
<p style='color: #5d4037; margin: 0 0 15px 0; font-size: 14px; line-height: 1.5;'> | |
Our 3D visualizer is adapted from <strong>TAPIP3D</strong>. We thank the authors for their excellent work and contribution to the computer vision community! | |
</p> | |
<a href="https://github.com/zbw001/TAPIP3D" target="_blank" | |
style='display: inline-flex; align-items: center; gap: 8px; | |
background: rgba(255, 193, 7, 0.15); color: #5d4037; | |
padding: 10px 20px; border-radius: 25px; text-decoration: none; | |
font-weight: bold; font-size: 14px; border: 1px solid rgba(255, 193, 7, 0.3); | |
transition: all 0.3s ease;' | |
onmouseover="this.style.background='rgba(255, 193, 7, 0.25)'; this.style.transform='translateY(-2px)'" | |
onmouseout="this.style.background='rgba(255, 193, 7, 0.15)'; this.style.transform='translateY(0)'"> | |
📚 Visit TAPIP3D Repository | |
</a> | |
</div> | |
</div> | |
""") | |
# Footer | |
gr.HTML(""" | |
<div style='text-align: center; margin: 20px 0 10px 0;'> | |
<span style='font-size: 12px; color: #888; font-style: italic;'> | |
Powered by SpatialTracker V2 | Built with ❤️ for the Computer Vision Community | |
</span> | |
</div> | |
""") | |
# State holders (not rendered): the original frame image and the list of
# points selected so far.
original_image_state = gr.State(value=None)
selected_points = gr.State(value=[])

# --- Event wiring -----------------------------------------------------------

# A change of the video input runs handle_video_upload, whose results fill
# the frame state, the interactive frame, the point list and the sliders.
video_input.change(
    handle_video_upload,
    inputs=[video_input],
    outputs=[
        original_image_state,
        interactive_frame,
        selected_points,
        grid_size,
        vo_points,
        fps,
    ],
)

# A select event on the interactive frame runs select_point with the current
# frame, the accumulated points and the chosen point type.
interactive_frame.select(
    select_point,
    inputs=[original_image_state, selected_points, point_type],
    outputs=[interactive_frame, selected_points],
)

# "Reset Points" runs reset_points and writes back the frame and point list.
reset_points_btn.click(
    reset_points,
    inputs=[original_image_state, selected_points],
    outputs=[interactive_frame, selected_points],
)

# "Clear All" takes no inputs; its handler's results go to the video input,
# the frame, the point list, the three sliders and both download slots.
clear_all_btn.click(
    clear_all_with_download,
    outputs=[
        video_input,
        interactive_frame,
        selected_points,
        grid_size,
        vo_points,
        fps,
        tracking_video_download,
        html_download,
    ],
)

# "Start Tracking" runs launch_viz with the slider values and the stored
# frame; its results go to the 3D viewer HTML and both download slots.
launch_btn.click(
    launch_viz,
    inputs=[grid_size, vo_points, fps, original_image_state],
    outputs=[viz_html, tracking_video_download, html_download],
)
# Script entry point: print a short startup banner, then launch the interface.
if __name__ == "__main__":
    for banner_line in (
        "🌟 Launching SpatialTracker V2 Local Version...",
        "🔗 Running in Local Processing Mode",
    ):
        print(banner_line)

    # Listen on all interfaces at port 7860, request a share link, and keep
    # debug mode and error display switched on.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
        "debug": True,
        "show_error": True,
    }
    demo.launch(**launch_options)