xiaoyuxi commited on
Commit
c8d9d42
·
0 Parent(s):

Cleaned history, reset to current state

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +36 -0
  2. .gitignore +69 -0
  3. README.md +14 -0
  4. _viz/viz_template.html +1778 -0
  5. app.py +1118 -0
  6. app_3rd/README.md +12 -0
  7. app_3rd/sam_utils/hf_sam_predictor.py +129 -0
  8. app_3rd/sam_utils/inference.py +123 -0
  9. app_3rd/spatrack_utils/infer_track.py +194 -0
  10. config/__init__.py +0 -0
  11. config/magic_infer_moge.yaml +48 -0
  12. examples/backpack.mp4 +3 -0
  13. examples/ball.mp4 +3 -0
  14. examples/basketball.mp4 +3 -0
  15. examples/biker.mp4 +3 -0
  16. examples/cinema_0.mp4 +3 -0
  17. examples/cinema_1.mp4 +3 -0
  18. examples/drifting.mp4 +3 -0
  19. examples/ego_kc1.mp4 +3 -0
  20. examples/ego_teaser.mp4 +3 -0
  21. examples/handwave.mp4 +3 -0
  22. examples/hockey.mp4 +3 -0
  23. examples/ken_block_0.mp4 +3 -0
  24. examples/kiss.mp4 +3 -0
  25. examples/kitchen.mp4 +3 -0
  26. examples/kitchen_egocentric.mp4 +3 -0
  27. examples/pillow.mp4 +3 -0
  28. examples/protein.mp4 +3 -0
  29. examples/pusht.mp4 +3 -0
  30. examples/robot1.mp4 +3 -0
  31. examples/robot2.mp4 +3 -0
  32. examples/robot_3.mp4 +3 -0
  33. examples/robot_unitree.mp4 +3 -0
  34. examples/running.mp4 +3 -0
  35. examples/teleop2.mp4 +3 -0
  36. examples/vertical_place.mp4 +3 -0
  37. models/SpaTrackV2/models/SpaTrack.py +759 -0
  38. models/SpaTrackV2/models/__init__.py +0 -0
  39. models/SpaTrackV2/models/blocks.py +519 -0
  40. models/SpaTrackV2/models/camera_transform.py +248 -0
  41. models/SpaTrackV2/models/depth_refiner/backbone.py +472 -0
  42. models/SpaTrackV2/models/depth_refiner/decode_head.py +619 -0
  43. models/SpaTrackV2/models/depth_refiner/depth_refiner.py +115 -0
  44. models/SpaTrackV2/models/depth_refiner/network.py +429 -0
  45. models/SpaTrackV2/models/depth_refiner/stablilization_attention.py +1187 -0
  46. models/SpaTrackV2/models/depth_refiner/stablizer.py +342 -0
  47. models/SpaTrackV2/models/predictor.py +153 -0
  48. models/SpaTrackV2/models/tracker3D/TrackRefiner.py +1478 -0
  49. models/SpaTrackV2/models/tracker3D/co_tracker/cotracker_base.py +418 -0
  50. models/SpaTrackV2/models/tracker3D/co_tracker/utils.py +929 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ignore the multi media
2
+ checkpoints
3
+ **/checkpoints/
4
+ **/temp/
5
+ temp
6
+ assets_dev
7
+ assets/example0/results
8
+ assets/example0/snowboard.npz
9
+ assets/example1/results
10
+ assets/davis_eval
11
+ assets/*/results
12
+ *gradio*
13
+ #
14
+ models/monoD/zoeDepth/ckpts/*
15
+ models/monoD/depth_anything/ckpts/*
16
+ vis_results
17
+ dist_encrypted
18
+ # remove the dependencies
19
+ deps
20
+
21
+ # filter the __pycache__ files
22
+ __pycache__/
23
+ /**/**/__pycache__
24
+ /**/__pycache__
25
+
26
+ outputs
27
+ scripts/lauch_exp/config
28
+ scripts/lauch_exp/submit_job.log
29
+ scripts/lauch_exp/hydra_output
30
+ scripts/lauch_wulan
31
+ scripts/custom_video
32
+ # ignore the visualizer
33
+ viser
34
+ viser_result
35
+ benchmark/results
36
+ benchmark
37
+
38
+ ossutil_output
39
+
40
+ prev_version
41
+ spat_ceres
42
+ wandb
43
+ *.log
44
+ seg_target.py
45
+
46
+ eval_davis.py
47
+ eval_multiple_gpu.py
48
+ eval_pose_scan.py
49
+ eval_single_gpu.py
50
+
51
+ infer_cam.py
52
+ infer_stream.py
53
+
54
+ *.egg-info/
55
+ **/*.egg-info
56
+
57
+ eval_kinectics.py
58
+ models/SpaTrackV2/datasets
59
+
60
+ scripts
61
+ config/fix_2d.yaml
62
+
63
+ models/SpaTrackV2/datasets
64
+ scripts/
65
+
66
+ models/**/build
67
+ models/**/dist
68
+
69
+ temp_local
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: SpatialTrackerV2
3
+ emoji: ⚡️
4
+ colorFrom: yellow
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.31.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: Official Space for SpatialTrackerV2
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
_viz/viz_template.html ADDED
@@ -0,0 +1,1778 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>3D Point Cloud Visualizer</title>
7
+ <style>
8
+ :root {
9
+ --primary: #9b59b6; /* Brighter purple for dark mode */
10
+ --primary-light: #3a2e4a;
11
+ --secondary: #a86add;
12
+ --accent: #ff6e6e;
13
+ --bg: #1a1a1a;
14
+ --surface: #2c2c2c;
15
+ --text: #e0e0e0;
16
+ --text-secondary: #a0a0a0;
17
+ --border: #444444;
18
+ --shadow: rgba(0, 0, 0, 0.2);
19
+ --shadow-hover: rgba(0, 0, 0, 0.3);
20
+
21
+ --space-sm: 16px;
22
+ --space-md: 24px;
23
+ --space-lg: 32px;
24
+ }
25
+
26
+ body {
27
+ margin: 0;
28
+ overflow: hidden;
29
+ background: var(--bg);
30
+ color: var(--text);
31
+ font-family: 'Inter', sans-serif;
32
+ -webkit-font-smoothing: antialiased;
33
+ }
34
+
35
+ #canvas-container {
36
+ position: absolute;
37
+ width: 100%;
38
+ height: 100%;
39
+ }
40
+
41
+ #ui-container {
42
+ position: absolute;
43
+ top: 0;
44
+ left: 0;
45
+ width: 100%;
46
+ height: 100%;
47
+ pointer-events: none;
48
+ z-index: 10;
49
+ }
50
+
51
+ #status-bar {
52
+ position: absolute;
53
+ top: 16px;
54
+ left: 16px;
55
+ background: rgba(30, 30, 30, 0.9);
56
+ padding: 8px 16px;
57
+ border-radius: 8px;
58
+ pointer-events: auto;
59
+ box-shadow: 0 4px 6px var(--shadow);
60
+ backdrop-filter: blur(4px);
61
+ border: 1px solid var(--border);
62
+ color: var(--text);
63
+ transition: opacity 0.5s ease, transform 0.5s ease;
64
+ font-weight: 500;
65
+ }
66
+
67
+ #status-bar.hidden {
68
+ opacity: 0;
69
+ transform: translateY(-20px);
70
+ pointer-events: none;
71
+ }
72
+
73
+ #control-panel {
74
+ position: absolute;
75
+ bottom: 16px;
76
+ left: 50%;
77
+ transform: translateX(-50%);
78
+ background: rgba(44, 44, 44, 0.95);
79
+ padding: 6px 8px;
80
+ border-radius: 6px;
81
+ display: flex;
82
+ gap: 8px;
83
+ align-items: center;
84
+ justify-content: space-between;
85
+ pointer-events: auto;
86
+ box-shadow: 0 4px 10px var(--shadow);
87
+ backdrop-filter: blur(4px);
88
+ border: 1px solid var(--border);
89
+ }
90
+
91
+ #timeline {
92
+ width: 150px;
93
+ height: 4px;
94
+ background: rgba(255, 255, 255, 0.1);
95
+ border-radius: 2px;
96
+ position: relative;
97
+ cursor: pointer;
98
+ }
99
+
100
+ #progress {
101
+ position: absolute;
102
+ height: 100%;
103
+ background: var(--primary);
104
+ border-radius: 2px;
105
+ width: 0%;
106
+ }
107
+
108
+ #playback-controls {
109
+ display: flex;
110
+ gap: 4px;
111
+ align-items: center;
112
+ }
113
+
114
+ button {
115
+ background: rgba(255, 255, 255, 0.08);
116
+ border: 1px solid var(--border);
117
+ color: var(--text);
118
+ padding: 4px 6px;
119
+ border-radius: 3px;
120
+ cursor: pointer;
121
+ display: flex;
122
+ align-items: center;
123
+ justify-content: center;
124
+ transition: background 0.2s, transform 0.2s;
125
+ font-family: 'Inter', sans-serif;
126
+ font-weight: 500;
127
+ font-size: 6px;
128
+ }
129
+
130
+ button:hover {
131
+ background: rgba(255, 255, 255, 0.15);
132
+ transform: translateY(-1px);
133
+ }
134
+
135
+ button.active {
136
+ background: var(--primary);
137
+ color: white;
138
+ box-shadow: 0 2px 8px rgba(155, 89, 182, 0.4);
139
+ }
140
+
141
+ select, input {
142
+ background: rgba(255, 255, 255, 0.08);
143
+ border: 1px solid var(--border);
144
+ color: var(--text);
145
+ padding: 4px 6px;
146
+ border-radius: 3px;
147
+ cursor: pointer;
148
+ font-family: 'Inter', sans-serif;
149
+ font-size: 6px;
150
+ }
151
+
152
+ .icon {
153
+ width: 10px;
154
+ height: 10px;
155
+ fill: currentColor;
156
+ }
157
+
158
+ .tooltip {
159
+ position: absolute;
160
+ bottom: 100%;
161
+ left: 50%;
162
+ transform: translateX(-50%);
163
+ background: var(--surface);
164
+ color: var(--text);
165
+ padding: 3px 6px;
166
+ border-radius: 3px;
167
+ font-size: 7px;
168
+ white-space: nowrap;
169
+ margin-bottom: 4px;
170
+ opacity: 0;
171
+ transition: opacity 0.2s;
172
+ pointer-events: none;
173
+ box-shadow: 0 2px 4px var(--shadow);
174
+ border: 1px solid var(--border);
175
+ }
176
+
177
+ button:hover .tooltip {
178
+ opacity: 1;
179
+ }
180
+
181
+ #settings-panel {
182
+ position: absolute;
183
+ top: 16px;
184
+ right: 16px;
185
+ background: rgba(44, 44, 44, 0.98);
186
+ padding: 10px;
187
+ border-radius: 6px;
188
+ width: 195px;
189
+ max-height: calc(100vh - 40px);
190
+ overflow-y: auto;
191
+ pointer-events: auto;
192
+ box-shadow: 0 4px 15px var(--shadow);
193
+ backdrop-filter: blur(4px);
194
+ border: 1px solid var(--border);
195
+ display: block;
196
+ opacity: 1;
197
+ scrollbar-width: thin;
198
+ scrollbar-color: var(--primary-light) transparent;
199
+ transition: transform 0.35s ease-in-out, opacity 0.3s ease-in-out;
200
+ }
201
+
202
+ #settings-panel.is-hidden {
203
+ transform: translateX(calc(100% + 20px));
204
+ opacity: 0;
205
+ pointer-events: none;
206
+ }
207
+
208
+ #settings-panel::-webkit-scrollbar {
209
+ width: 3px;
210
+ }
211
+
212
+ #settings-panel::-webkit-scrollbar-track {
213
+ background: transparent;
214
+ }
215
+
216
+ #settings-panel::-webkit-scrollbar-thumb {
217
+ background-color: var(--primary-light);
218
+ border-radius: 3px;
219
+ }
220
+
221
+ @media (max-height: 700px) {
222
+ #settings-panel {
223
+ max-height: calc(100vh - 40px);
224
+ }
225
+ }
226
+
227
+ @media (max-width: 768px) {
228
+ #control-panel {
229
+ width: 90%;
230
+ flex-wrap: wrap;
231
+ justify-content: center;
232
+ }
233
+
234
+ #timeline {
235
+ width: 100%;
236
+ order: 3;
237
+ margin-top: 10px;
238
+ }
239
+
240
+ #settings-panel {
241
+ width: 140px;
242
+ right: 10px;
243
+ top: 10px;
244
+ max-height: calc(100vh - 20px);
245
+ }
246
+ }
247
+
248
+ .settings-group {
249
+ margin-bottom: 8px;
250
+ }
251
+
252
+ .settings-group h3 {
253
+ margin: 0 0 6px 0;
254
+ font-size: 10px;
255
+ font-weight: 500;
256
+ color: var(--text-secondary);
257
+ }
258
+
259
+ .slider-container {
260
+ display: flex;
261
+ align-items: center;
262
+ gap: 6px;
263
+ width: 100%;
264
+ }
265
+
266
+ .slider-container label {
267
+ min-width: 60px;
268
+ font-size: 10px;
269
+ flex-shrink: 0;
270
+ }
271
+
272
+ input[type="range"] {
273
+ flex: 1;
274
+ height: 2px;
275
+ -webkit-appearance: none;
276
+ background: rgba(255, 255, 255, 0.1);
277
+ border-radius: 1px;
278
+ min-width: 0;
279
+ }
280
+
281
+ input[type="range"]::-webkit-slider-thumb {
282
+ -webkit-appearance: none;
283
+ width: 8px;
284
+ height: 8px;
285
+ border-radius: 50%;
286
+ background: var(--primary);
287
+ cursor: pointer;
288
+ }
289
+
290
+ .toggle-switch {
291
+ position: relative;
292
+ display: inline-block;
293
+ width: 20px;
294
+ height: 10px;
295
+ }
296
+
297
+ .toggle-switch input {
298
+ opacity: 0;
299
+ width: 0;
300
+ height: 0;
301
+ }
302
+
303
+ .toggle-slider {
304
+ position: absolute;
305
+ cursor: pointer;
306
+ top: 0;
307
+ left: 0;
308
+ right: 0;
309
+ bottom: 0;
310
+ background: rgba(255, 255, 255, 0.1);
311
+ transition: .4s;
312
+ border-radius: 10px;
313
+ }
314
+
315
+ .toggle-slider:before {
316
+ position: absolute;
317
+ content: "";
318
+ height: 8px;
319
+ width: 8px;
320
+ left: 1px;
321
+ bottom: 1px;
322
+ background: var(--surface);
323
+ border: 1px solid var(--border);
324
+ transition: .4s;
325
+ border-radius: 50%;
326
+ }
327
+
328
+ input:checked + .toggle-slider {
329
+ background: var(--primary);
330
+ }
331
+
332
+ input:checked + .toggle-slider:before {
333
+ transform: translateX(10px);
334
+ }
335
+
336
+ .checkbox-container {
337
+ display: flex;
338
+ align-items: center;
339
+ gap: 4px;
340
+ margin-bottom: 4px;
341
+ }
342
+
343
+ .checkbox-container label {
344
+ font-size: 10px;
345
+ cursor: pointer;
346
+ }
347
+
348
+ #loading-overlay {
349
+ position: absolute;
350
+ top: 0;
351
+ left: 0;
352
+ width: 100%;
353
+ height: 100%;
354
+ background: var(--bg);
355
+ display: flex;
356
+ flex-direction: column;
357
+ align-items: center;
358
+ justify-content: center;
359
+ z-index: 100;
360
+ transition: opacity 0.5s;
361
+ }
362
+
363
+ #loading-overlay.fade-out {
364
+ opacity: 0;
365
+ pointer-events: none;
366
+ }
367
+
368
+ .spinner {
369
+ width: 50px;
370
+ height: 50px;
371
+ border: 5px solid rgba(155, 89, 182, 0.2);
372
+ border-radius: 50%;
373
+ border-top-color: var(--primary);
374
+ animation: spin 1s ease-in-out infinite;
375
+ margin-bottom: 16px;
376
+ }
377
+
378
+ @keyframes spin {
379
+ to { transform: rotate(360deg); }
380
+ }
381
+
382
+ #loading-text {
383
+ margin-top: 16px;
384
+ font-size: 18px;
385
+ color: var(--text);
386
+ font-weight: 500;
387
+ }
388
+
389
+ #frame-counter {
390
+ color: var(--text-secondary);
391
+ font-size: 7px;
392
+ font-weight: 500;
393
+ min-width: 60px;
394
+ text-align: center;
395
+ padding: 0 4px;
396
+ }
397
+
398
+ .control-btn {
399
+ background: rgba(255, 255, 255, 0.08);
400
+ border: 1px solid var(--border);
401
+ padding: 4px 6px;
402
+ border-radius: 3px;
403
+ cursor: pointer;
404
+ display: flex;
405
+ align-items: center;
406
+ justify-content: center;
407
+ transition: all 0.2s ease;
408
+ font-size: 6px;
409
+ }
410
+
411
+ .control-btn:hover {
412
+ background: rgba(255, 255, 255, 0.15);
413
+ transform: translateY(-1px);
414
+ }
415
+
416
+ .control-btn.active {
417
+ background: var(--primary);
418
+ color: white;
419
+ }
420
+
421
+ .control-btn.active:hover {
422
+ background: var(--primary);
423
+ box-shadow: 0 2px 8px rgba(155, 89, 182, 0.4);
424
+ }
425
+
426
+ #settings-toggle-btn {
427
+ position: relative;
428
+ border-radius: 6px;
429
+ z-index: 20;
430
+ }
431
+
432
+ #settings-toggle-btn.active {
433
+ background: var(--primary);
434
+ color: white;
435
+ }
436
+
437
+ #status-bar,
438
+ #control-panel,
439
+ #settings-panel,
440
+ button,
441
+ input,
442
+ select,
443
+ .toggle-switch {
444
+ pointer-events: auto;
445
+ }
446
+
447
+ h2 {
448
+ font-size: 0.9rem;
449
+ font-weight: 600;
450
+ margin-top: 0;
451
+ margin-bottom: 12px;
452
+ color: var(--primary);
453
+ cursor: move;
454
+ user-select: none;
455
+ display: flex;
456
+ align-items: center;
457
+ }
458
+
459
+ .drag-handle {
460
+ font-size: 10px;
461
+ margin-right: 4px;
462
+ opacity: 0.6;
463
+ }
464
+
465
+ h2:hover .drag-handle {
466
+ opacity: 1;
467
+ }
468
+
469
+ .loading-subtitle {
470
+ font-size: 7px;
471
+ color: var(--text-secondary);
472
+ margin-top: 4px;
473
+ }
474
+
475
+ #reset-view-btn {
476
+ background: var(--primary-light);
477
+ color: var(--primary);
478
+ border: 1px solid rgba(155, 89, 182, 0.2);
479
+ font-weight: 600;
480
+ font-size: 9px;
481
+ padding: 4px 6px;
482
+ transition: all 0.2s;
483
+ }
484
+
485
+ #reset-view-btn:hover {
486
+ background: var(--primary);
487
+ color: white;
488
+ transform: translateY(-2px);
489
+ box-shadow: 0 4px 8px rgba(155, 89, 182, 0.3);
490
+ }
491
+
492
+ #show-settings-btn {
493
+ position: absolute;
494
+ top: 16px;
495
+ right: 16px;
496
+ z-index: 15;
497
+ display: none;
498
+ }
499
+
500
+ #settings-panel.visible {
501
+ display: block;
502
+ opacity: 1;
503
+ animation: slideIn 0.3s ease forwards;
504
+ }
505
+
506
+ @keyframes slideIn {
507
+ from {
508
+ transform: translateY(20px);
509
+ opacity: 0;
510
+ }
511
+ to {
512
+ transform: translateY(0);
513
+ opacity: 1;
514
+ }
515
+ }
516
+
517
+ .dragging {
518
+ opacity: 0.9;
519
+ box-shadow: 0 8px 20px rgba(0, 0, 0, 0.15) !important;
520
+ transition: none !important;
521
+ }
522
+
523
+ /* Tooltip for draggable element */
524
+ .tooltip-drag {
525
+ position: absolute;
526
+ left: 50%;
527
+ transform: translateX(-50%);
528
+ background: var(--primary);
529
+ color: white;
530
+ font-size: 9px;
531
+ padding: 2px 4px;
532
+ border-radius: 2px;
533
+ opacity: 0;
534
+ pointer-events: none;
535
+ transition: opacity 0.3s;
536
+ white-space: nowrap;
537
+ bottom: 100%;
538
+ margin-bottom: 4px;
539
+ }
540
+
541
+ h2:hover .tooltip-drag {
542
+ opacity: 1;
543
+ }
544
+
545
+ .btn-group {
546
+ display: flex;
547
+ margin-top: 8px;
548
+ }
549
+
550
+ #reset-settings-btn {
551
+ background: var(--primary-light);
552
+ color: var(--primary);
553
+ border: 1px solid rgba(155, 89, 182, 0.2);
554
+ font-weight: 600;
555
+ font-size: 9px;
556
+ padding: 4px 6px;
557
+ transition: all 0.2s;
558
+ }
559
+
560
+ #reset-settings-btn:hover {
561
+ background: var(--primary);
562
+ color: white;
563
+ transform: translateY(-2px);
564
+ box-shadow: 0 4px 8px rgba(155, 89, 182, 0.3);
565
+ }
566
+ </style>
567
+ </head>
568
+ <body>
569
+ <link rel="preconnect" href="https://fonts.googleapis.com">
570
+ <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
571
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
572
+
573
+ <div id="canvas-container"></div>
574
+
575
+ <div id="ui-container">
576
+ <div id="status-bar">Initializing...</div>
577
+
578
+ <div id="control-panel">
579
+ <button id="play-pause-btn" class="control-btn">
580
+ <svg class="icon" viewBox="0 0 24 24">
581
+ <path id="play-icon" d="M8 5v14l11-7z"/>
582
+ <path id="pause-icon" d="M6 19h4V5H6v14zm8-14v14h4V5h-4z" style="display: none;"/>
583
+ </svg>
584
+ <span class="tooltip">Play/Pause</span>
585
+ </button>
586
+
587
+ <div id="timeline">
588
+ <div id="progress"></div>
589
+ </div>
590
+
591
+ <div id="frame-counter">Frame 0 / 0</div>
592
+
593
+ <div id="playback-controls">
594
+ <button id="speed-btn" class="control-btn">1x</button>
595
+ </div>
596
+ </div>
597
+
598
+ <div id="settings-panel">
599
+ <h2>
600
+ <span class="drag-handle">☰</span>
601
+ Visualization Settings
602
+ <button id="hide-settings-btn" class="control-btn" style="margin-left: auto; padding: 2px;" title="Hide Panel">
603
+ <svg class="icon" viewBox="0 0 24 24" style="width: 9px; height: 9px;">
604
+ <path d="M14.59 7.41L18.17 11H4v2h14.17l-3.58 3.59L16 18l6-6-6-6-1.41 1.41z"/>
605
+ </svg>
606
+ </button>
607
+ </h2>
608
+
609
+ <div class="settings-group">
610
+ <h3>Point Cloud</h3>
611
+ <div class="slider-container">
612
+ <label for="point-size">Size</label>
613
+ <input type="range" id="point-size" min="0.005" max="0.1" step="0.005" value="0.03">
614
+ </div>
615
+ <div class="slider-container">
616
+ <label for="point-opacity">Opacity</label>
617
+ <input type="range" id="point-opacity" min="0.1" max="1" step="0.05" value="1">
618
+ </div>
619
+ <div class="slider-container">
620
+ <label for="max-depth">Max Depth</label>
621
+ <input type="range" id="max-depth" min="0.1" max="10" step="0.2" value="100">
622
+ </div>
623
+ </div>
624
+
625
+ <div class="settings-group">
626
+ <h3>Trajectory</h3>
627
+ <div class="checkbox-container">
628
+ <label class="toggle-switch">
629
+ <input type="checkbox" id="show-trajectory" checked>
630
+ <span class="toggle-slider"></span>
631
+ </label>
632
+ <label for="show-trajectory">Show Trajectory</label>
633
+ </div>
634
+ <div class="checkbox-container">
635
+ <label class="toggle-switch">
636
+ <input type="checkbox" id="enable-rich-trail">
637
+ <span class="toggle-slider"></span>
638
+ </label>
639
+ <label for="enable-rich-trail">Visual-Rich Trail</label>
640
+ </div>
641
+ <div class="slider-container">
642
+ <label for="trajectory-line-width">Line Width</label>
643
+ <input type="range" id="trajectory-line-width" min="0.5" max="5" step="0.5" value="1.5">
644
+ </div>
645
+ <div class="slider-container">
646
+ <label for="trajectory-ball-size">Ball Size</label>
647
+ <input type="range" id="trajectory-ball-size" min="0.005" max="0.05" step="0.001" value="0.02">
648
+ </div>
649
+ <div class="slider-container">
650
+ <label for="trajectory-history">History Frames</label>
651
+ <input type="range" id="trajectory-history" min="1" max="500" step="1" value="30">
652
+ </div>
653
+ <div class="slider-container" id="tail-opacity-container" style="display: none;">
654
+ <label for="trajectory-fade">Tail Opacity</label>
655
+ <input type="range" id="trajectory-fade" min="0" max="1" step="0.05" value="0.0">
656
+ </div>
657
+ </div>
658
+
659
+ <div class="settings-group">
660
+ <h3>Camera</h3>
661
+ <div class="checkbox-container">
662
+ <label class="toggle-switch">
663
+ <input type="checkbox" id="show-camera-frustum" checked>
664
+ <span class="toggle-slider"></span>
665
+ </label>
666
+ <label for="show-camera-frustum">Show Camera Frustum</label>
667
+ </div>
668
+ <div class="slider-container">
669
+ <label for="frustum-size">Size</label>
670
+ <input type="range" id="frustum-size" min="0.02" max="0.5" step="0.01" value="0.2">
671
+ </div>
672
+ </div>
673
+
674
+ <div class="settings-group">
675
+ <div class="btn-group">
676
+ <button id="reset-view-btn" style="flex: 1; margin-right: 5px;">Reset View</button>
677
+ <button id="reset-settings-btn" style="flex: 1; margin-left: 5px;">Reset Settings</button>
678
+ </div>
679
+ </div>
680
+ </div>
681
+
682
+ <button id="show-settings-btn" class="control-btn" title="Show Settings">
683
+ <svg class="icon" viewBox="0 0 24 24">
684
+ <path d="M19.14,12.94c0.04-0.3,0.06-0.61,0.06-0.94c0-0.32-0.02-0.64-0.07-0.94l2.03-1.58c0.18-0.14,0.23-0.41,0.12-0.61 l-1.92-3.32c-0.12-0.22-0.37-0.29-0.59-0.22l-2.39,0.96c-0.5-0.38-1.03-0.7-1.62-0.94L14.4,2.81c-0.04-0.24-0.24-0.41-0.48-0.41 h-3.84c-0.24,0-0.43,0.17-0.47,0.41L9.25,5.35C8.66,5.59,8.12,5.92,7.63,6.29L5.24,5.33c-0.22-0.08-0.47,0-0.59,0.22L2.74,8.87 C2.62,9.08,2.66,9.34,2.86,9.48l2.03,1.58C4.84,11.36,4.8,11.69,4.8,12s0.02,0.64,0.07,0.94l-2.03,1.58 c-0.18,0.14-0.23,0.41-0.12,0.61l1.92,3.32c0.12,0.22,0.37,0.29,0.59,0.22l2.39-0.96c0.5,0.38,1.03,0.7,1.62,0.94l0.36,2.54 c0.04,0.24,0.24,0.41,0.48,0.41h3.84c0.24,0,0.44-0.17,0.47-0.41l0.36-2.54c0.59-0.24,1.13-0.56,1.62-0.94l2.39,0.96 c0.22,0.08,0.47,0,0.59-0.22l1.92-3.32c0.12-0.22,0.07-0.47-0.12-0.61L19.14,12.94z M12,15.6c-1.98,0-3.6-1.62-3.6-3.6 s1.62-3.6,3.6-3.6s3.6,1.62,3.6,3.6S13.98,15.6,12,15.6z"/>
685
+ </svg>
686
+ </button>
687
+ </div>
688
+
689
+ <div id="loading-overlay">
690
+ <!-- <div class="spinner"></div> -->
691
+ <div id="loading-text"></div>
692
+ <div class="loading-subtitle" style="font-size: medium;">Interactive Viewer of 3D Tracking</div>
693
+ </div>
694
+
695
+ <!-- Libraries -->
696
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/pako/2.1.0/pako.min.js"></script>
697
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/build/three.min.js"></script>
698
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/examples/js/controls/OrbitControls.js"></script>
699
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/build/dat.gui.min.js"></script>
700
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/examples/js/lines/LineSegmentsGeometry.js"></script>
701
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/examples/js/lines/LineGeometry.js"></script>
702
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/examples/js/lines/LineMaterial.js"></script>
703
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/examples/js/lines/LineSegments2.js"></script>
704
+ <script src="https://cdn.jsdelivr.net/npm/[email protected]/examples/js/lines/Line2.js"></script>
705
+
706
+ <script>
707
+ class PointCloudVisualizer {
708
+ constructor() {
709
+ this.data = null;
710
+ this.config = {};
711
+ this.currentFrame = 0;
712
+ this.isPlaying = false;
713
+ this.playbackSpeed = 1;
714
+ this.lastFrameTime = 0;
715
+ this.defaultSettings = null;
716
+
717
+ this.ui = {
718
+ statusBar: document.getElementById('status-bar'),
719
+ playPauseBtn: document.getElementById('play-pause-btn'),
720
+ speedBtn: document.getElementById('speed-btn'),
721
+ timeline: document.getElementById('timeline'),
722
+ progress: document.getElementById('progress'),
723
+ settingsPanel: document.getElementById('settings-panel'),
724
+ loadingOverlay: document.getElementById('loading-overlay'),
725
+ loadingText: document.getElementById('loading-text'),
726
+ settingsToggleBtn: document.getElementById('settings-toggle-btn'),
727
+ frameCounter: document.getElementById('frame-counter'),
728
+ pointSize: document.getElementById('point-size'),
729
+ pointOpacity: document.getElementById('point-opacity'),
730
+ maxDepth: document.getElementById('max-depth'),
731
+ showTrajectory: document.getElementById('show-trajectory'),
732
+ enableRichTrail: document.getElementById('enable-rich-trail'),
733
+ trajectoryLineWidth: document.getElementById('trajectory-line-width'),
734
+ trajectoryBallSize: document.getElementById('trajectory-ball-size'),
735
+ trajectoryHistory: document.getElementById('trajectory-history'),
736
+ trajectoryFade: document.getElementById('trajectory-fade'),
737
+ tailOpacityContainer: document.getElementById('tail-opacity-container'),
738
+ resetViewBtn: document.getElementById('reset-view-btn'),
739
+ showCameraFrustum: document.getElementById('show-camera-frustum'),
740
+ frustumSize: document.getElementById('frustum-size'),
741
+ hideSettingsBtn: document.getElementById('hide-settings-btn'),
742
+ showSettingsBtn: document.getElementById('show-settings-btn')
743
+ };
744
+
745
+ this.scene = null;
746
+ this.camera = null;
747
+ this.renderer = null;
748
+ this.controls = null;
749
+ this.pointCloud = null;
750
+ this.trajectories = [];
751
+ this.cameraFrustum = null;
752
+
753
+ this.initThreeJS();
754
+ this.loadDefaultSettings().then(() => {
755
+ this.initEventListeners();
756
+ this.loadData();
757
+ });
758
+ }
759
+
760
+ async loadDefaultSettings() {
761
+ try {
762
+ const urlParams = new URLSearchParams(window.location.search);
763
+ const dataPath = urlParams.get('data') || '';
764
+
765
+ const defaultSettings = {
766
+ pointSize: 0.03,
767
+ pointOpacity: 1.0,
768
+ showTrajectory: true,
769
+ trajectoryLineWidth: 2.5,
770
+ trajectoryBallSize: 0.015,
771
+ trajectoryHistory: 0,
772
+ showCameraFrustum: true,
773
+ frustumSize: 0.2
774
+ };
775
+
776
+ if (!dataPath) {
777
+ this.defaultSettings = defaultSettings;
778
+ this.applyDefaultSettings();
779
+ return;
780
+ }
781
+
782
+ // Try to extract dataset and videoId from the data path
783
+ // Expected format: demos/datasetname/videoid.bin
784
+ const pathParts = dataPath.split('/');
785
+ if (pathParts.length < 3) {
786
+ this.defaultSettings = defaultSettings;
787
+ this.applyDefaultSettings();
788
+ return;
789
+ }
790
+
791
+ const datasetName = pathParts[pathParts.length - 2];
792
+ let videoId = pathParts[pathParts.length - 1].replace('.bin', '');
793
+
794
+ // Load settings from data.json
795
+ const response = await fetch('./data.json');
796
+ if (!response.ok) {
797
+ this.defaultSettings = defaultSettings;
798
+ this.applyDefaultSettings();
799
+ return;
800
+ }
801
+
802
+ const settingsData = await response.json();
803
+
804
+ // Check if this dataset and video exist
805
+ if (settingsData[datasetName] && settingsData[datasetName][videoId]) {
806
+ this.defaultSettings = settingsData[datasetName][videoId];
807
+ } else {
808
+ this.defaultSettings = defaultSettings;
809
+ }
810
+
811
+ this.applyDefaultSettings();
812
+ } catch (error) {
813
+ console.error("Error loading default settings:", error);
814
+
815
+ this.defaultSettings = {
816
+ pointSize: 0.03,
817
+ pointOpacity: 1.0,
818
+ showTrajectory: true,
819
+ trajectoryLineWidth: 2.5,
820
+ trajectoryBallSize: 0.015,
821
+ trajectoryHistory: 0,
822
+ showCameraFrustum: true,
823
+ frustumSize: 0.2
824
+ };
825
+
826
+ this.applyDefaultSettings();
827
+ }
828
+ }
829
+
830
+ applyDefaultSettings() {
831
+ if (!this.defaultSettings) return;
832
+
833
+ if (this.ui.pointSize) {
834
+ this.ui.pointSize.value = this.defaultSettings.pointSize;
835
+ }
836
+
837
+ if (this.ui.pointOpacity) {
838
+ this.ui.pointOpacity.value = this.defaultSettings.pointOpacity;
839
+ }
840
+
841
+ if (this.ui.maxDepth) {
842
+ this.ui.maxDepth.value = this.defaultSettings.maxDepth || 100.0;
843
+ }
844
+
845
+ if (this.ui.showTrajectory) {
846
+ this.ui.showTrajectory.checked = this.defaultSettings.showTrajectory;
847
+ }
848
+
849
+ if (this.ui.trajectoryLineWidth) {
850
+ this.ui.trajectoryLineWidth.value = this.defaultSettings.trajectoryLineWidth;
851
+ }
852
+
853
+ if (this.ui.trajectoryBallSize) {
854
+ this.ui.trajectoryBallSize.value = this.defaultSettings.trajectoryBallSize;
855
+ }
856
+
857
+ if (this.ui.trajectoryHistory) {
858
+ this.ui.trajectoryHistory.value = this.defaultSettings.trajectoryHistory;
859
+ }
860
+
861
+ if (this.ui.showCameraFrustum) {
862
+ this.ui.showCameraFrustum.checked = this.defaultSettings.showCameraFrustum;
863
+ }
864
+
865
+ if (this.ui.frustumSize) {
866
+ this.ui.frustumSize.value = this.defaultSettings.frustumSize;
867
+ }
868
+ }
869
+
870
+ initThreeJS() {
871
+ this.scene = new THREE.Scene();
872
+ this.scene.background = new THREE.Color(0x1a1a1a);
873
+
874
+ this.camera = new THREE.PerspectiveCamera(60, window.innerWidth / window.innerHeight, 0.1, 10000);
875
+ this.camera.position.set(0, 0, 0);
876
+
877
+ this.renderer = new THREE.WebGLRenderer({ antialias: true });
878
+ this.renderer.setPixelRatio(window.devicePixelRatio);
879
+ this.renderer.setSize(window.innerWidth, window.innerHeight);
880
+ document.getElementById('canvas-container').appendChild(this.renderer.domElement);
881
+
882
+ this.controls = new THREE.OrbitControls(this.camera, this.renderer.domElement);
883
+ this.controls.enableDamping = true;
884
+ this.controls.dampingFactor = 0.05;
885
+ this.controls.target.set(0, 0, 0);
886
+ this.controls.minDistance = 0.1;
887
+ this.controls.maxDistance = 1000;
888
+ this.controls.update();
889
+
890
+ const ambientLight = new THREE.AmbientLight(0xffffff, 0.5);
891
+ this.scene.add(ambientLight);
892
+
893
+ const directionalLight = new THREE.DirectionalLight(0xffffff, 0.8);
894
+ directionalLight.position.set(1, 1, 1);
895
+ this.scene.add(directionalLight);
896
+ }
897
+
898
+ initEventListeners() {
899
+ window.addEventListener('resize', () => this.onWindowResize());
900
+
901
+ this.ui.playPauseBtn.addEventListener('click', () => this.togglePlayback());
902
+
903
+ this.ui.timeline.addEventListener('click', (e) => {
904
+ const rect = this.ui.timeline.getBoundingClientRect();
905
+ const pos = (e.clientX - rect.left) / rect.width;
906
+ this.seekTo(pos);
907
+ });
908
+
909
+ this.ui.speedBtn.addEventListener('click', () => this.cyclePlaybackSpeed());
910
+
911
+ this.ui.pointSize.addEventListener('input', () => this.updatePointCloudSettings());
912
+ this.ui.pointOpacity.addEventListener('input', () => this.updatePointCloudSettings());
913
+ this.ui.maxDepth.addEventListener('input', () => this.updatePointCloudSettings());
914
+ this.ui.showTrajectory.addEventListener('change', () => {
915
+ this.trajectories.forEach(trajectory => {
916
+ trajectory.visible = this.ui.showTrajectory.checked;
917
+ });
918
+ });
919
+
920
+ this.ui.enableRichTrail.addEventListener('change', () => {
921
+ this.ui.tailOpacityContainer.style.display = this.ui.enableRichTrail.checked ? 'flex' : 'none';
922
+ this.updateTrajectories(this.currentFrame);
923
+ });
924
+
925
+ this.ui.trajectoryLineWidth.addEventListener('input', () => this.updateTrajectorySettings());
926
+ this.ui.trajectoryBallSize.addEventListener('input', () => this.updateTrajectorySettings());
927
+ this.ui.trajectoryHistory.addEventListener('input', () => {
928
+ this.updateTrajectories(this.currentFrame);
929
+ });
930
+ this.ui.trajectoryFade.addEventListener('input', () => {
931
+ this.updateTrajectories(this.currentFrame);
932
+ });
933
+
934
+ this.ui.resetViewBtn.addEventListener('click', () => this.resetView());
935
+
936
+ const resetSettingsBtn = document.getElementById('reset-settings-btn');
937
+ if (resetSettingsBtn) {
938
+ resetSettingsBtn.addEventListener('click', () => this.resetSettings());
939
+ }
940
+
941
+ document.addEventListener('keydown', (e) => {
942
+ if (e.key === 'Escape' && this.ui.settingsPanel.classList.contains('visible')) {
943
+ this.ui.settingsPanel.classList.remove('visible');
944
+ this.ui.settingsToggleBtn.classList.remove('active');
945
+ }
946
+ });
947
+
948
+ if (this.ui.settingsToggleBtn) {
949
+ this.ui.settingsToggleBtn.addEventListener('click', () => {
950
+ const isVisible = this.ui.settingsPanel.classList.toggle('visible');
951
+ this.ui.settingsToggleBtn.classList.toggle('active', isVisible);
952
+
953
+ if (isVisible) {
954
+ const panelRect = this.ui.settingsPanel.getBoundingClientRect();
955
+ const viewportHeight = window.innerHeight;
956
+
957
+ if (panelRect.bottom > viewportHeight) {
958
+ this.ui.settingsPanel.style.bottom = 'auto';
959
+ this.ui.settingsPanel.style.top = '80px';
960
+ }
961
+ }
962
+ });
963
+ }
964
+
965
+ if (this.ui.frustumSize) {
966
+ this.ui.frustumSize.addEventListener('input', () => this.updateFrustumDimensions());
967
+ }
968
+
969
+ if (this.ui.hideSettingsBtn && this.ui.showSettingsBtn && this.ui.settingsPanel) {
970
+ this.ui.hideSettingsBtn.addEventListener('click', () => {
971
+ this.ui.settingsPanel.classList.add('is-hidden');
972
+ this.ui.showSettingsBtn.style.display = 'flex';
973
+ });
974
+
975
+ this.ui.showSettingsBtn.addEventListener('click', () => {
976
+ this.ui.settingsPanel.classList.remove('is-hidden');
977
+ this.ui.showSettingsBtn.style.display = 'none';
978
+ });
979
+ }
980
+ }
981
+
982
+ makeElementDraggable(element) {
983
+ let pos1 = 0, pos2 = 0, pos3 = 0, pos4 = 0;
984
+
985
+ const dragHandle = element.querySelector('h2');
986
+
987
+ if (dragHandle) {
988
+ dragHandle.onmousedown = dragMouseDown;
989
+ dragHandle.title = "Drag to move panel";
990
+ } else {
991
+ element.onmousedown = dragMouseDown;
992
+ }
993
+
994
+ function dragMouseDown(e) {
995
+ e = e || window.event;
996
+ e.preventDefault();
997
+ pos3 = e.clientX;
998
+ pos4 = e.clientY;
999
+ document.onmouseup = closeDragElement;
1000
+ document.onmousemove = elementDrag;
1001
+
1002
+ element.classList.add('dragging');
1003
+ }
1004
+
1005
+ function elementDrag(e) {
1006
+ e = e || window.event;
1007
+ e.preventDefault();
1008
+ pos1 = pos3 - e.clientX;
1009
+ pos2 = pos4 - e.clientY;
1010
+ pos3 = e.clientX;
1011
+ pos4 = e.clientY;
1012
+
1013
+ const newTop = element.offsetTop - pos2;
1014
+ const newLeft = element.offsetLeft - pos1;
1015
+
1016
+ const viewportWidth = window.innerWidth;
1017
+ const viewportHeight = window.innerHeight;
1018
+
1019
+ const panelRect = element.getBoundingClientRect();
1020
+
1021
+ const maxTop = viewportHeight - 50;
1022
+ const maxLeft = viewportWidth - 50;
1023
+
1024
+ element.style.top = Math.min(Math.max(newTop, 0), maxTop) + "px";
1025
+ element.style.left = Math.min(Math.max(newLeft, 0), maxLeft) + "px";
1026
+
1027
+ // Remove bottom/right settings when dragging
1028
+ element.style.bottom = 'auto';
1029
+ element.style.right = 'auto';
1030
+ }
1031
+
1032
+ function closeDragElement() {
1033
+ document.onmouseup = null;
1034
+ document.onmousemove = null;
1035
+
1036
+ element.classList.remove('dragging');
1037
+ }
1038
+ }
1039
+
1040
+ async loadData() {
1041
+ try {
1042
+ // this.ui.loadingText.textContent = "Loading binary data...";
1043
+
1044
+ let arrayBuffer;
1045
+
1046
+ if (window.embeddedBase64) {
1047
+ // Base64 embedded path
1048
+ const binaryString = atob(window.embeddedBase64);
1049
+ const len = binaryString.length;
1050
+ const bytes = new Uint8Array(len);
1051
+ for (let i = 0; i < len; i++) {
1052
+ bytes[i] = binaryString.charCodeAt(i);
1053
+ }
1054
+ arrayBuffer = bytes.buffer;
1055
+ } else {
1056
+ // Default fetch path (fallback)
1057
+ const urlParams = new URLSearchParams(window.location.search);
1058
+ const dataPath = urlParams.get('data') || 'data.bin';
1059
+
1060
+ const response = await fetch(dataPath);
1061
+ if (!response.ok) throw new Error(`Failed to load ${dataPath}`);
1062
+ arrayBuffer = await response.arrayBuffer();
1063
+ }
1064
+
1065
+ const dataView = new DataView(arrayBuffer);
1066
+ const headerLen = dataView.getUint32(0, true);
1067
+
1068
+ const headerText = new TextDecoder("utf-8").decode(arrayBuffer.slice(4, 4 + headerLen));
1069
+ const header = JSON.parse(headerText);
1070
+
1071
+ const compressedBlob = new Uint8Array(arrayBuffer, 4 + headerLen);
1072
+ const decompressed = pako.inflate(compressedBlob).buffer;
1073
+
1074
+ const arrays = {};
1075
+ for (const key in header) {
1076
+ if (key === "meta") continue;
1077
+
1078
+ const meta = header[key];
1079
+ const { dtype, shape, offset, length } = meta;
1080
+ const slice = decompressed.slice(offset, offset + length);
1081
+
1082
+ let typedArray;
1083
+ switch (dtype) {
1084
+ case "uint8": typedArray = new Uint8Array(slice); break;
1085
+ case "uint16": typedArray = new Uint16Array(slice); break;
1086
+ case "float32": typedArray = new Float32Array(slice); break;
1087
+ case "float64": typedArray = new Float64Array(slice); break;
1088
+ default: throw new Error(`Unknown dtype: ${dtype}`);
1089
+ }
1090
+
1091
+ arrays[key] = { data: typedArray, shape: shape };
1092
+ }
1093
+
1094
+ this.data = arrays;
1095
+ this.config = header.meta;
1096
+
1097
+ this.initCameraWithCorrectFOV();
1098
+ this.ui.loadingText.textContent = "Creating point cloud...";
1099
+
1100
+ this.initPointCloud();
1101
+ this.initTrajectories();
1102
+
1103
+ setTimeout(() => {
1104
+ this.ui.loadingOverlay.classList.add('fade-out');
1105
+ this.ui.statusBar.classList.add('hidden');
1106
+ this.startAnimation();
1107
+ }, 500);
1108
+ } catch (error) {
1109
+ console.error("Error loading data:", error);
1110
+ this.ui.statusBar.textContent = `Error: ${error.message}`;
1111
+ // this.ui.loadingText.textContent = `Error loading data: ${error.message}`;
1112
+ }
1113
+ }
1114
+
1115
+ initPointCloud() {
1116
+ const numPoints = this.config.resolution[0] * this.config.resolution[1];
1117
+ const positions = new Float32Array(numPoints * 3);
1118
+ const colors = new Float32Array(numPoints * 3);
1119
+
1120
+ const geometry = new THREE.BufferGeometry();
1121
+ geometry.setAttribute('position', new THREE.BufferAttribute(positions, 3).setUsage(THREE.DynamicDrawUsage));
1122
+ geometry.setAttribute('color', new THREE.BufferAttribute(colors, 3).setUsage(THREE.DynamicDrawUsage));
1123
+
1124
+ const pointSize = parseFloat(this.ui.pointSize.value) || this.defaultSettings.pointSize;
1125
+ const pointOpacity = parseFloat(this.ui.pointOpacity.value) || this.defaultSettings.pointOpacity;
1126
+
1127
+ const material = new THREE.PointsMaterial({
1128
+ size: pointSize,
1129
+ vertexColors: true,
1130
+ transparent: true,
1131
+ opacity: pointOpacity,
1132
+ sizeAttenuation: true
1133
+ });
1134
+
1135
+ this.pointCloud = new THREE.Points(geometry, material);
1136
+ this.scene.add(this.pointCloud);
1137
+ }
1138
+
1139
+ initTrajectories() {
1140
+ if (!this.data.trajectories) return;
1141
+
1142
+ this.trajectories.forEach(trajectory => {
1143
+ if (trajectory.userData.lineSegments) {
1144
+ trajectory.userData.lineSegments.forEach(segment => {
1145
+ segment.geometry.dispose();
1146
+ segment.material.dispose();
1147
+ });
1148
+ }
1149
+ this.scene.remove(trajectory);
1150
+ });
1151
+ this.trajectories = [];
1152
+
1153
+ const shape = this.data.trajectories.shape;
1154
+ if (!shape || shape.length < 2) return;
1155
+
1156
+ const [totalFrames, numTrajectories] = shape;
1157
+ const palette = this.createColorPalette(numTrajectories);
1158
+ const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
1159
+ const maxHistory = 500; // Max value of the history slider, for the object pool
1160
+
1161
+ for (let i = 0; i < numTrajectories; i++) {
1162
+ const trajectoryGroup = new THREE.Group();
1163
+
1164
+ const ballSize = parseFloat(this.ui.trajectoryBallSize.value);
1165
+ const sphereGeometry = new THREE.SphereGeometry(ballSize, 16, 16);
1166
+ const sphereMaterial = new THREE.MeshBasicMaterial({ color: palette[i], transparent: true });
1167
+ const positionMarker = new THREE.Mesh(sphereGeometry, sphereMaterial);
1168
+ trajectoryGroup.add(positionMarker);
1169
+
1170
+ // High-Performance Line (default)
1171
+ const simpleLineGeometry = new THREE.BufferGeometry();
1172
+ const simpleLinePositions = new Float32Array(maxHistory * 3);
1173
+ simpleLineGeometry.setAttribute('position', new THREE.BufferAttribute(simpleLinePositions, 3).setUsage(THREE.DynamicDrawUsage));
1174
+ const simpleLine = new THREE.Line(simpleLineGeometry, new THREE.LineBasicMaterial({ color: palette[i] }));
1175
+ simpleLine.frustumCulled = false;
1176
+ trajectoryGroup.add(simpleLine);
1177
+
1178
+ // High-Quality Line Segments (for rich trail)
1179
+ const lineSegments = [];
1180
+ const lineWidth = parseFloat(this.ui.trajectoryLineWidth.value);
1181
+
1182
+ // Create a pool of line segment objects
1183
+ for (let j = 0; j < maxHistory - 1; j++) {
1184
+ const lineGeometry = new THREE.LineGeometry();
1185
+ lineGeometry.setPositions([0, 0, 0, 0, 0, 0]);
1186
+ const lineMaterial = new THREE.LineMaterial({
1187
+ color: palette[i],
1188
+ linewidth: lineWidth,
1189
+ resolution: resolution,
1190
+ transparent: true,
1191
+ depthWrite: false, // Correctly handle transparency
1192
+ opacity: 0
1193
+ });
1194
+ const segment = new THREE.Line2(lineGeometry, lineMaterial);
1195
+ segment.frustumCulled = false;
1196
+ segment.visible = false; // Start with all segments hidden
1197
+ trajectoryGroup.add(segment);
1198
+ lineSegments.push(segment);
1199
+ }
1200
+
1201
+ trajectoryGroup.userData = {
1202
+ marker: positionMarker,
1203
+ simpleLine: simpleLine,
1204
+ lineSegments: lineSegments,
1205
+ color: palette[i]
1206
+ };
1207
+
1208
+ this.scene.add(trajectoryGroup);
1209
+ this.trajectories.push(trajectoryGroup);
1210
+ }
1211
+
1212
+ const showTrajectory = this.ui.showTrajectory.checked;
1213
+ this.trajectories.forEach(trajectory => trajectory.visible = showTrajectory);
1214
+ }
1215
+
1216
+ createColorPalette(count) {
1217
+ const colors = [];
1218
+ const hueStep = 360 / count;
1219
+
1220
+ for (let i = 0; i < count; i++) {
1221
+ const hue = (i * hueStep) % 360;
1222
+ const color = new THREE.Color().setHSL(hue / 360, 0.8, 0.6);
1223
+ colors.push(color);
1224
+ }
1225
+
1226
+ return colors;
1227
+ }
1228
+
1229
+ updatePointCloud(frameIndex) {
1230
+ if (!this.data || !this.pointCloud) return;
1231
+
1232
+ const positions = this.pointCloud.geometry.attributes.position.array;
1233
+ const colors = this.pointCloud.geometry.attributes.color.array;
1234
+
1235
+ const rgbVideo = this.data.rgb_video;
1236
+ const depthsRgb = this.data.depths_rgb;
1237
+ const intrinsics = this.data.intrinsics;
1238
+ const invExtrinsics = this.data.inv_extrinsics;
1239
+
1240
+ const width = this.config.resolution[0];
1241
+ const height = this.config.resolution[1];
1242
+ const numPoints = width * height;
1243
+
1244
+ const K = this.get3x3Matrix(intrinsics.data, intrinsics.shape, frameIndex);
1245
+ const fx = K[0][0], fy = K[1][1], cx = K[0][2], cy = K[1][2];
1246
+
1247
+ const invExtrMat = this.get4x4Matrix(invExtrinsics.data, invExtrinsics.shape, frameIndex);
1248
+ const transform = this.getTransformElements(invExtrMat);
1249
+
1250
+ const rgbFrame = this.getFrame(rgbVideo.data, rgbVideo.shape, frameIndex);
1251
+ const depthFrame = this.getFrame(depthsRgb.data, depthsRgb.shape, frameIndex);
1252
+
1253
+ const maxDepth = parseFloat(this.ui.maxDepth.value) || 10.0;
1254
+
1255
+ let validPointCount = 0;
1256
+
1257
+ for (let i = 0; i < numPoints; i++) {
1258
+ const xPix = i % width;
1259
+ const yPix = Math.floor(i / width);
1260
+
1261
+ const d0 = depthFrame[i * 3];
1262
+ const d1 = depthFrame[i * 3 + 1];
1263
+ const depthEncoded = d0 | (d1 << 8);
1264
+ const depthValue = (depthEncoded / ((1 << 16) - 1)) *
1265
+ (this.config.depthRange[1] - this.config.depthRange[0]) +
1266
+ this.config.depthRange[0];
1267
+
1268
+ if (depthValue === 0 || depthValue > maxDepth) {
1269
+ continue;
1270
+ }
1271
+
1272
+ const X = ((xPix - cx) * depthValue) / fx;
1273
+ const Y = ((yPix - cy) * depthValue) / fy;
1274
+ const Z = depthValue;
1275
+
1276
+ const tx = transform.m11 * X + transform.m12 * Y + transform.m13 * Z + transform.m14;
1277
+ const ty = transform.m21 * X + transform.m22 * Y + transform.m23 * Z + transform.m24;
1278
+ const tz = transform.m31 * X + transform.m32 * Y + transform.m33 * Z + transform.m34;
1279
+
1280
+ const index = validPointCount * 3;
1281
+ positions[index] = tx;
1282
+ positions[index + 1] = -ty;
1283
+ positions[index + 2] = -tz;
1284
+
1285
+ colors[index] = rgbFrame[i * 3] / 255;
1286
+ colors[index + 1] = rgbFrame[i * 3 + 1] / 255;
1287
+ colors[index + 2] = rgbFrame[i * 3 + 2] / 255;
1288
+
1289
+ validPointCount++;
1290
+ }
1291
+
1292
+ this.pointCloud.geometry.setDrawRange(0, validPointCount);
1293
+ this.pointCloud.geometry.attributes.position.needsUpdate = true;
1294
+ this.pointCloud.geometry.attributes.color.needsUpdate = true;
1295
+ this.pointCloud.geometry.computeBoundingSphere(); // Important for camera culling
1296
+
1297
+ this.updateTrajectories(frameIndex);
1298
+
1299
+ const progress = (frameIndex + 1) / this.config.totalFrames;
1300
+ this.ui.progress.style.width = `${progress * 100}%`;
1301
+
1302
+ if (this.ui.frameCounter && this.config.totalFrames) {
1303
+ this.ui.frameCounter.textContent = `Frame ${frameIndex} / ${this.config.totalFrames - 1}`;
1304
+ }
1305
+
1306
+ this.updateCameraFrustum(frameIndex);
1307
+ }
1308
+
1309
+ updateTrajectories(frameIndex) {
1310
+ if (!this.data.trajectories || this.trajectories.length === 0) return;
1311
+
1312
+ const trajectoryData = this.data.trajectories.data;
1313
+ const [totalFrames, numTrajectories] = this.data.trajectories.shape;
1314
+ const historyFrames = parseInt(this.ui.trajectoryHistory.value);
1315
+ const tailOpacity = parseFloat(this.ui.trajectoryFade.value);
1316
+
1317
+ const isRichMode = this.ui.enableRichTrail.checked;
1318
+
1319
+ for (let i = 0; i < numTrajectories; i++) {
1320
+ const trajectoryGroup = this.trajectories[i];
1321
+ const { marker, simpleLine, lineSegments } = trajectoryGroup.userData;
1322
+
1323
+ const currentPos = new THREE.Vector3();
1324
+ const currentOffset = (frameIndex * numTrajectories + i) * 3;
1325
+
1326
+ currentPos.x = trajectoryData[currentOffset];
1327
+ currentPos.y = -trajectoryData[currentOffset + 1];
1328
+ currentPos.z = -trajectoryData[currentOffset + 2];
1329
+
1330
+ marker.position.copy(currentPos);
1331
+ marker.material.opacity = 1.0;
1332
+
1333
+ const historyToShow = Math.min(historyFrames, frameIndex + 1);
1334
+
1335
+ if (isRichMode) {
1336
+ // --- High-Quality Mode ---
1337
+ simpleLine.visible = false;
1338
+
1339
+ for (let j = 0; j < lineSegments.length; j++) {
1340
+ const segment = lineSegments[j];
1341
+ if (j < historyToShow - 1) {
1342
+ const headFrame = frameIndex - j;
1343
+ const tailFrame = frameIndex - j - 1;
1344
+ const headOffset = (headFrame * numTrajectories + i) * 3;
1345
+ const tailOffset = (tailFrame * numTrajectories + i) * 3;
1346
+ const positions = [
1347
+ trajectoryData[headOffset], -trajectoryData[headOffset + 1], -trajectoryData[headOffset + 2],
1348
+ trajectoryData[tailOffset], -trajectoryData[tailOffset + 1], -trajectoryData[tailOffset + 2]
1349
+ ];
1350
+ segment.geometry.setPositions(positions);
1351
+ const headOpacity = 1.0;
1352
+ const normalizedAge = j / Math.max(1, historyToShow - 2);
1353
+ const alpha = headOpacity - (headOpacity - tailOpacity) * normalizedAge;
1354
+ segment.material.opacity = Math.max(0, alpha);
1355
+ segment.visible = true;
1356
+ } else {
1357
+ segment.visible = false;
1358
+ }
1359
+ }
1360
+ } else {
1361
+ // --- Performance Mode ---
1362
+ lineSegments.forEach(s => s.visible = false);
1363
+ simpleLine.visible = true;
1364
+
1365
+ const positions = simpleLine.geometry.attributes.position.array;
1366
+ for (let j = 0; j < historyToShow; j++) {
1367
+ const historyFrame = Math.max(0, frameIndex - j);
1368
+ const offset = (historyFrame * numTrajectories + i) * 3;
1369
+ positions[j * 3] = trajectoryData[offset];
1370
+ positions[j * 3 + 1] = -trajectoryData[offset + 1];
1371
+ positions[j * 3 + 2] = -trajectoryData[offset + 2];
1372
+ }
1373
+ simpleLine.geometry.setDrawRange(0, historyToShow);
1374
+ simpleLine.geometry.attributes.position.needsUpdate = true;
1375
+ }
1376
+ }
1377
+ }
1378
+
1379
+ updateTrajectorySettings() {
1380
+ if (!this.trajectories || this.trajectories.length === 0) return;
1381
+
1382
+ const ballSize = parseFloat(this.ui.trajectoryBallSize.value);
1383
+ const lineWidth = parseFloat(this.ui.trajectoryLineWidth.value);
1384
+
1385
+ this.trajectories.forEach(trajectoryGroup => {
1386
+ const { marker, lineSegments } = trajectoryGroup.userData;
1387
+
1388
+ marker.geometry.dispose();
1389
+ marker.geometry = new THREE.SphereGeometry(ballSize, 16, 16);
1390
+
1391
+ // Line width only affects rich mode
1392
+ lineSegments.forEach(segment => {
1393
+ if (segment.material) {
1394
+ segment.material.linewidth = lineWidth;
1395
+ }
1396
+ });
1397
+ });
1398
+
1399
+ this.updateTrajectories(this.currentFrame);
1400
+ }
1401
+
1402
+ getDepthColor(normalizedDepth) {
1403
+ const hue = (1 - normalizedDepth) * 240 / 360;
1404
+ const color = new THREE.Color().setHSL(hue, 1.0, 0.5);
1405
+ return color;
1406
+ }
1407
+
1408
+ getFrame(typedArray, shape, frameIndex) {
1409
+ const [T, H, W, C] = shape;
1410
+ const frameSize = H * W * C;
1411
+ const offset = frameIndex * frameSize;
1412
+ return typedArray.subarray(offset, offset + frameSize);
1413
+ }
1414
+
1415
+ get3x3Matrix(typedArray, shape, frameIndex) {
1416
+ const frameSize = 9;
1417
+ const offset = frameIndex * frameSize;
1418
+ const K = [];
1419
+ for (let i = 0; i < 3; i++) {
1420
+ const row = [];
1421
+ for (let j = 0; j < 3; j++) {
1422
+ row.push(typedArray[offset + i * 3 + j]);
1423
+ }
1424
+ K.push(row);
1425
+ }
1426
+ return K;
1427
+ }
1428
+
1429
+ get4x4Matrix(typedArray, shape, frameIndex) {
1430
+ const frameSize = 16;
1431
+ const offset = frameIndex * frameSize;
1432
+ const M = [];
1433
+ for (let i = 0; i < 4; i++) {
1434
+ const row = [];
1435
+ for (let j = 0; j < 4; j++) {
1436
+ row.push(typedArray[offset + i * 4 + j]);
1437
+ }
1438
+ M.push(row);
1439
+ }
1440
+ return M;
1441
+ }
1442
+
1443
+ getTransformElements(matrix) {
1444
+ return {
1445
+ m11: matrix[0][0], m12: matrix[0][1], m13: matrix[0][2], m14: matrix[0][3],
1446
+ m21: matrix[1][0], m22: matrix[1][1], m23: matrix[1][2], m24: matrix[1][3],
1447
+ m31: matrix[2][0], m32: matrix[2][1], m33: matrix[2][2], m34: matrix[2][3]
1448
+ };
1449
+ }
1450
+
1451
+ togglePlayback() {
1452
+ this.isPlaying = !this.isPlaying;
1453
+
1454
+ const playIcon = document.getElementById('play-icon');
1455
+ const pauseIcon = document.getElementById('pause-icon');
1456
+
1457
+ if (this.isPlaying) {
1458
+ playIcon.style.display = 'none';
1459
+ pauseIcon.style.display = 'block';
1460
+ this.lastFrameTime = performance.now();
1461
+ } else {
1462
+ playIcon.style.display = 'block';
1463
+ pauseIcon.style.display = 'none';
1464
+ }
1465
+ }
1466
+
1467
+ cyclePlaybackSpeed() {
1468
+ const speeds = [0.5, 1, 2, 4, 8];
1469
+ const speedRates = speeds.map(s => s * this.config.baseFrameRate);
1470
+
1471
+ let currentIndex = 0;
1472
+ const normalizedSpeed = this.playbackSpeed / this.config.baseFrameRate;
1473
+
1474
+ for (let i = 0; i < speeds.length; i++) {
1475
+ if (Math.abs(normalizedSpeed - speeds[i]) < Math.abs(normalizedSpeed - speeds[currentIndex])) {
1476
+ currentIndex = i;
1477
+ }
1478
+ }
1479
+
1480
+ const nextIndex = (currentIndex + 1) % speeds.length;
1481
+ this.playbackSpeed = speedRates[nextIndex];
1482
+ this.ui.speedBtn.textContent = `${speeds[nextIndex]}x`;
1483
+
1484
+ if (speeds[nextIndex] === 1) {
1485
+ this.ui.speedBtn.classList.remove('active');
1486
+ } else {
1487
+ this.ui.speedBtn.classList.add('active');
1488
+ }
1489
+ }
1490
+
1491
+ seekTo(position) {
1492
+ const frameIndex = Math.floor(position * this.config.totalFrames);
1493
+ this.currentFrame = Math.max(0, Math.min(frameIndex, this.config.totalFrames - 1));
1494
+ this.updatePointCloud(this.currentFrame);
1495
+ }
1496
+
1497
+ updatePointCloudSettings() {
1498
+ if (!this.pointCloud) return;
1499
+
1500
+ const size = parseFloat(this.ui.pointSize.value);
1501
+ const opacity = parseFloat(this.ui.pointOpacity.value);
1502
+
1503
+ this.pointCloud.material.size = size;
1504
+ this.pointCloud.material.opacity = opacity;
1505
+ this.pointCloud.material.needsUpdate = true;
1506
+
1507
+ this.updatePointCloud(this.currentFrame);
1508
+ }
1509
+
1510
+ updateControls() {
1511
+ if (!this.controls) return;
1512
+ this.controls.update();
1513
+ }
1514
+
1515
+ resetView() {
1516
+ if (!this.camera || !this.controls) return;
1517
+
1518
+ // Reset camera position
1519
+ this.camera.position.set(0, 0, this.config.cameraZ || 0);
1520
+
1521
+ // Reset controls
1522
+ this.controls.reset();
1523
+
1524
+ // Set target slightly in front of camera
1525
+ this.controls.target.set(0, 0, -1);
1526
+ this.controls.update();
1527
+
1528
+ // Show status message
1529
+ this.ui.statusBar.textContent = "View reset";
1530
+ this.ui.statusBar.classList.remove('hidden');
1531
+
1532
+ // Hide status message after a few seconds
1533
+ setTimeout(() => {
1534
+ this.ui.statusBar.classList.add('hidden');
1535
+ }, 3000);
1536
+ }
1537
+
1538
+ onWindowResize() {
1539
+ if (!this.camera || !this.renderer) return;
1540
+
1541
+ const windowAspect = window.innerWidth / window.innerHeight;
1542
+ this.camera.aspect = windowAspect;
1543
+ this.camera.updateProjectionMatrix();
1544
+ this.renderer.setSize(window.innerWidth, window.innerHeight);
1545
+
1546
+ if (this.trajectories && this.trajectories.length > 0) {
1547
+ const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
1548
+ this.trajectories.forEach(trajectory => {
1549
+ const { lineSegments } = trajectory.userData;
1550
+ if (lineSegments && lineSegments.length > 0) {
1551
+ lineSegments.forEach(segment => {
1552
+ if (segment.material && segment.material.resolution) {
1553
+ segment.material.resolution.copy(resolution);
1554
+ }
1555
+ });
1556
+ }
1557
+ });
1558
+ }
1559
+
1560
+ if (this.cameraFrustum) {
1561
+ const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
1562
+ this.cameraFrustum.children.forEach(line => {
1563
+ if (line.material && line.material.resolution) {
1564
+ line.material.resolution.copy(resolution);
1565
+ }
1566
+ });
1567
+ }
1568
+ }
1569
+
1570
+ startAnimation() {
1571
+ this.isPlaying = true;
1572
+ this.lastFrameTime = performance.now();
1573
+
1574
+ this.camera.position.set(0, 0, this.config.cameraZ || 0);
1575
+ this.controls.target.set(0, 0, -1);
1576
+ this.controls.update();
1577
+
1578
+ this.playbackSpeed = this.config.baseFrameRate;
1579
+
1580
+ document.getElementById('play-icon').style.display = 'none';
1581
+ document.getElementById('pause-icon').style.display = 'block';
1582
+
1583
+ this.animate();
1584
+ }
1585
+
1586
+ animate() {
1587
+ requestAnimationFrame(() => this.animate());
1588
+
1589
+ if (this.controls) {
1590
+ this.controls.update();
1591
+ }
1592
+
1593
+ if (this.isPlaying && this.data) {
1594
+ const now = performance.now();
1595
+ const delta = (now - this.lastFrameTime) / 1000;
1596
+
1597
+ const framesToAdvance = Math.floor(delta * this.config.baseFrameRate * this.playbackSpeed);
1598
+ if (framesToAdvance > 0) {
1599
+ this.currentFrame = (this.currentFrame + framesToAdvance) % this.config.totalFrames;
1600
+ this.lastFrameTime = now;
1601
+ this.updatePointCloud(this.currentFrame);
1602
+ }
1603
+ }
1604
+
1605
+ if (this.renderer && this.scene && this.camera) {
1606
+ this.renderer.render(this.scene, this.camera);
1607
+ }
1608
+ }
1609
+
1610
+ initCameraWithCorrectFOV() {
1611
+ const fov = this.config.fov || 60;
1612
+
1613
+ const windowAspect = window.innerWidth / window.innerHeight;
1614
+
1615
+ this.camera = new THREE.PerspectiveCamera(
1616
+ fov,
1617
+ windowAspect,
1618
+ 0.1,
1619
+ 10000
1620
+ );
1621
+
1622
+ this.controls.object = this.camera;
1623
+ this.controls.update();
1624
+
1625
+ this.initCameraFrustum();
1626
+ }
1627
+
1628
+ initCameraFrustum() {
1629
+ this.cameraFrustum = new THREE.Group();
1630
+
1631
+ this.scene.add(this.cameraFrustum);
1632
+
1633
+ this.initCameraFrustumGeometry();
1634
+
1635
+ const showCameraFrustum = this.ui.showCameraFrustum ? this.ui.showCameraFrustum.checked : (this.defaultSettings ? this.defaultSettings.showCameraFrustum : false);
1636
+
1637
+ this.cameraFrustum.visible = showCameraFrustum;
1638
+ }
1639
+
1640
+ initCameraFrustumGeometry() {
1641
+ const fov = this.config.fov || 60;
1642
+ const originalAspect = this.config.original_aspect_ratio || 1.33;
1643
+
1644
+ const size = parseFloat(this.ui.frustumSize.value) || this.defaultSettings.frustumSize;
1645
+
1646
+ const halfHeight = Math.tan(THREE.MathUtils.degToRad(fov / 2)) * size;
1647
+ const halfWidth = halfHeight * originalAspect;
1648
+
1649
+ const vertices = [
1650
+ new THREE.Vector3(0, 0, 0),
1651
+ new THREE.Vector3(-halfWidth, -halfHeight, size),
1652
+ new THREE.Vector3(halfWidth, -halfHeight, size),
1653
+ new THREE.Vector3(halfWidth, halfHeight, size),
1654
+ new THREE.Vector3(-halfWidth, halfHeight, size)
1655
+ ];
1656
+
1657
+ const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
1658
+
1659
+ const linePairs = [
1660
+ [1, 2], [2, 3], [3, 4], [4, 1],
1661
+ [0, 1], [0, 2], [0, 3], [0, 4]
1662
+ ];
1663
+
1664
+ const colors = {
1665
+ edge: new THREE.Color(0x3366ff),
1666
+ ray: new THREE.Color(0x33cc66)
1667
+ };
1668
+
1669
+ linePairs.forEach((pair, index) => {
1670
+ const positions = [
1671
+ vertices[pair[0]].x, vertices[pair[0]].y, vertices[pair[0]].z,
1672
+ vertices[pair[1]].x, vertices[pair[1]].y, vertices[pair[1]].z
1673
+ ];
1674
+
1675
+ const lineGeometry = new THREE.LineGeometry();
1676
+ lineGeometry.setPositions(positions);
1677
+
1678
+ let color = index < 4 ? colors.edge : colors.ray;
1679
+
1680
+ const lineMaterial = new THREE.LineMaterial({
1681
+ color: color,
1682
+ linewidth: 2,
1683
+ resolution: resolution,
1684
+ dashed: false
1685
+ });
1686
+
1687
+ const line = new THREE.Line2(lineGeometry, lineMaterial);
1688
+ this.cameraFrustum.add(line);
1689
+ });
1690
+ }
1691
+
1692
+ updateCameraFrustum(frameIndex) {
1693
+ if (!this.cameraFrustum || !this.data) return;
1694
+
1695
+ const invExtrinsics = this.data.inv_extrinsics;
1696
+ if (!invExtrinsics) return;
1697
+
1698
+ const invExtrMat = this.get4x4Matrix(invExtrinsics.data, invExtrinsics.shape, frameIndex);
1699
+
1700
+ const matrix = new THREE.Matrix4();
1701
+ matrix.set(
1702
+ invExtrMat[0][0], invExtrMat[0][1], invExtrMat[0][2], invExtrMat[0][3],
1703
+ invExtrMat[1][0], invExtrMat[1][1], invExtrMat[1][2], invExtrMat[1][3],
1704
+ invExtrMat[2][0], invExtrMat[2][1], invExtrMat[2][2], invExtrMat[2][3],
1705
+ invExtrMat[3][0], invExtrMat[3][1], invExtrMat[3][2], invExtrMat[3][3]
1706
+ );
1707
+
1708
+ const position = new THREE.Vector3();
1709
+ position.setFromMatrixPosition(matrix);
1710
+
1711
+ const rotMatrix = new THREE.Matrix4().extractRotation(matrix);
1712
+
1713
+ const coordinateCorrection = new THREE.Matrix4().makeRotationX(Math.PI);
1714
+
1715
+ const finalRotation = new THREE.Matrix4().multiplyMatrices(coordinateCorrection, rotMatrix);
1716
+
1717
+ const quaternion = new THREE.Quaternion();
1718
+ quaternion.setFromRotationMatrix(finalRotation);
1719
+
1720
+ position.y = -position.y;
1721
+ position.z = -position.z;
1722
+
1723
+ this.cameraFrustum.position.copy(position);
1724
+ this.cameraFrustum.quaternion.copy(quaternion);
1725
+
1726
+ const showCameraFrustum = this.ui.showCameraFrustum ? this.ui.showCameraFrustum.checked : this.defaultSettings.showCameraFrustum;
1727
+
1728
+ if (this.cameraFrustum.visible !== showCameraFrustum) {
1729
+ this.cameraFrustum.visible = showCameraFrustum;
1730
+ }
1731
+
1732
+ const resolution = new THREE.Vector2(window.innerWidth, window.innerHeight);
1733
+ this.cameraFrustum.children.forEach(line => {
1734
+ if (line.material && line.material.resolution) {
1735
+ line.material.resolution.copy(resolution);
1736
+ }
1737
+ });
1738
+ }
1739
+
1740
+ updateFrustumDimensions() {
1741
+ if (!this.cameraFrustum) return;
1742
+
1743
+ while(this.cameraFrustum.children.length > 0) {
1744
+ const child = this.cameraFrustum.children[0];
1745
+ if (child.geometry) child.geometry.dispose();
1746
+ if (child.material) child.material.dispose();
1747
+ this.cameraFrustum.remove(child);
1748
+ }
1749
+
1750
+ this.initCameraFrustumGeometry();
1751
+
1752
+ this.updateCameraFrustum(this.currentFrame);
1753
+ }
1754
+
1755
+ resetSettings() {
1756
+ if (!this.defaultSettings) return;
1757
+
1758
+ this.applyDefaultSettings();
1759
+
1760
+ this.updatePointCloudSettings();
1761
+ this.updateTrajectorySettings();
1762
+ this.updateFrustumDimensions();
1763
+
1764
+ this.ui.statusBar.textContent = "Settings reset to defaults";
1765
+ this.ui.statusBar.classList.remove('hidden');
1766
+
1767
+ setTimeout(() => {
1768
+ this.ui.statusBar.classList.add('hidden');
1769
+ }, 3000);
1770
+ }
1771
+ }
1772
+
1773
+ window.addEventListener('DOMContentLoaded', () => {
1774
+ new PointCloudVisualizer();
1775
+ });
1776
+ </script>
1777
+ </body>
1778
+ </html>
app.py ADDED
@@ -0,0 +1,1118 @@
1
+ import gradio as gr
2
+ import os
3
+ import json
4
+ import numpy as np
5
+ import cv2
6
+ import base64
7
+ import time
8
+ import tempfile
9
+ import shutil
10
+ import glob
11
+ import threading
12
+ import subprocess
13
+ import struct
14
+ import zlib
15
+ from pathlib import Path
16
+ from einops import rearrange
17
+ from typing import List, Tuple, Union
18
+ try:
19
+ import spaces
20
+ except ImportError:
21
+ # Fallback for local development: provide a stub so the @spaces.GPU decorator below still works
22
+ class spaces:
23
+ GPU = staticmethod(lambda func: func)
24
+ import torch
25
+ import logging
26
+ from concurrent.futures import ThreadPoolExecutor
27
+ import atexit
28
+ import uuid
29
+
30
+ # Configure logging
31
+ logging.basicConfig(level=logging.INFO)
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Import custom modules with error handling
35
+ try:
36
+ from app_3rd.sam_utils.inference import SamPredictor, get_sam_predictor, run_inference
37
+ from app_3rd.spatrack_utils.infer_track import get_tracker_predictor, run_tracker, get_points_on_a_grid
38
+ except ImportError as e:
39
+ logger.error(f"Failed to import custom modules: {e}")
40
+ raise
41
+
42
+ # Constants
43
+ MAX_FRAMES = 80
44
+ COLORS = [(0, 0, 255), (0, 255, 255)] # BGR: Red for negative, Yellow for positive
45
+ MARKERS = [1, 5] # Cross for negative, Star for positive
46
+ MARKER_SIZE = 8
47
+
48
+ # Thread pool for delayed deletion
49
+ thread_pool_executor = ThreadPoolExecutor(max_workers=2)
50
+
51
+ def delete_later(path: Union[str, os.PathLike], delay: int = 600):
52
+ """Delete file or directory after specified delay (default 10 minutes)"""
53
+ def _delete():
54
+ try:
55
+ if os.path.isfile(path):
56
+ os.remove(path)
57
+ elif os.path.isdir(path):
58
+ shutil.rmtree(path)
59
+ except Exception as e:
60
+ logger.warning(f"Failed to delete {path}: {e}")
61
+
62
+ def _wait_and_delete():
63
+ time.sleep(delay)
64
+ _delete()
65
+
66
+ thread_pool_executor.submit(_wait_and_delete)
67
+ atexit.register(_delete)
68
+
69
+ def create_user_temp_dir():
70
+ """Create a unique temporary directory for each user session"""
71
+ session_id = str(uuid.uuid4())[:8] # Short unique ID
72
+ temp_dir = os.path.join("temp_local", f"session_{session_id}")
73
+ os.makedirs(temp_dir, exist_ok=True)
74
+
75
+ # Schedule deletion after 10 minutes
76
+ delete_later(temp_dir, delay=600)
77
+
78
+ return temp_dir
79
+
80
+ from huggingface_hub import hf_hub_download
81
+ # init the model
82
+ os.environ["VGGT_DIR"] = hf_hub_download("Yuxihenry/SpatialTrackerCkpts", "spatrack_front.pth") #, force_download=True)
83
+
84
+ if os.environ.get("VGGT_DIR", None) is not None:
85
+ from models.vggt.vggt.models.vggt_moe import VGGT_MoE
86
+ from models.vggt.vggt.utils.load_fn import preprocess_image
87
+ vggt_model = VGGT_MoE()
88
+ vggt_model.load_state_dict(torch.load(os.environ.get("VGGT_DIR")), strict=False)
89
+ vggt_model.eval()
90
+ vggt_model = vggt_model.to("cuda")
91
+
92
+ # Global model initialization
93
+ print("🚀 Initializing local models...")
94
+ tracker_model, _ = get_tracker_predictor(".", vo_points=756)
95
+ predictor = get_sam_predictor()
96
+ print("✅ Models loaded successfully!")
97
+
98
+ gr.set_static_paths(paths=[Path.cwd().absolute()/"_viz"])
99
+
100
+ @spaces.GPU
101
+ def gpu_run_inference(predictor_arg, image, points, boxes):
102
+ """GPU-accelerated SAM inference"""
103
+ if predictor_arg is None:
104
+ print("Initializing SAM predictor inside GPU function...")
105
+ predictor_arg = get_sam_predictor(predictor=predictor)
106
+
107
+ # Ensure predictor is on GPU
108
+ try:
109
+ if hasattr(predictor_arg, 'model'):
110
+ predictor_arg.model = predictor_arg.model.cuda()
111
+ elif hasattr(predictor_arg, 'sam'):
112
+ predictor_arg.sam = predictor_arg.sam.cuda()
113
+ elif hasattr(predictor_arg, 'to'):
114
+ predictor_arg = predictor_arg.to('cuda')
115
+
116
+ if hasattr(image, 'cuda'):
117
+ image = image.cuda()
118
+
119
+ except Exception as e:
120
+ print(f"Warning: Could not move predictor to GPU: {e}")
121
+
122
+ return run_inference(predictor_arg, image, points, boxes)
123
+
124
+ @spaces.GPU
125
+ def gpu_run_tracker(tracker_model_arg, tracker_viser_arg, temp_dir, video_name, grid_size, vo_points, fps, mode="offline"):
126
+ """GPU-accelerated tracking"""
127
+ import torchvision.transforms as T
128
+ import decord
129
+
130
+ if tracker_model_arg is None or tracker_viser_arg is None:
131
+ print("Initializing tracker models inside GPU function...")
132
+ out_dir = os.path.join(temp_dir, "results")
133
+ os.makedirs(out_dir, exist_ok=True)
134
+ tracker_model_arg, tracker_viser_arg = get_tracker_predictor(out_dir, vo_points=vo_points, tracker_model=tracker_model)
135
+
136
+ # Setup paths
137
+ video_path = os.path.join(temp_dir, f"{video_name}.mp4")
138
+ mask_path = os.path.join(temp_dir, f"{video_name}.png")
139
+ out_dir = os.path.join(temp_dir, "results")
140
+ os.makedirs(out_dir, exist_ok=True)
141
+
142
+ # Load video using decord
143
+ video_reader = decord.VideoReader(video_path)
144
+ video_tensor = torch.from_numpy(video_reader.get_batch(range(len(video_reader))).asnumpy()).permute(0, 3, 1, 2)
145
+
146
+ # Downscale so the shorter side is 224 (skipped if it is already smaller)
147
+ h, w = video_tensor.shape[2:]
148
+ scale = max(224 / h, 224 / w)
149
+ if scale < 1:
150
+ new_h, new_w = int(h * scale), int(w * scale)
151
+ video_tensor = T.Resize((new_h, new_w))(video_tensor)
152
+ video_tensor = video_tensor[::fps].float()[:MAX_FRAMES]
153
+
154
+ # Move to GPU
155
+ video_tensor = video_tensor.cuda()
156
+ print(f"Video tensor shape: {video_tensor.shape}, device: {video_tensor.device}")
157
+
158
+ depth_tensor = None
159
+ intrs = None
160
+ extrs = None
161
+ data_npz_load = {}
162
+
163
+ # run vggt
164
+ if os.environ.get("VGGT_DIR", None) is not None:
165
+ # process the image tensor
166
+ video_tensor = preprocess_image(video_tensor)[None]
167
+ with torch.no_grad():
168
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
169
+ # Predict attributes including cameras, depth maps, and point maps.
170
+ predictions = vggt_model(video_tensor.cuda()/255)
171
+ extrinsic, intrinsic = predictions["poses_pred"], predictions["intrs"]
172
+ depth_map, depth_conf = predictions["points_map"][..., 2], predictions["unc_metric"]
173
+
174
+ depth_tensor = depth_map.squeeze().cpu().numpy()
175
+ extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
176
+ extrs = extrinsic.squeeze().cpu().numpy()
177
+ intrs = intrinsic.squeeze().cpu().numpy()
178
+ video_tensor = video_tensor.squeeze()
179
+ # NOTE: mask out low-confidence depth (confidence <= 0.5 is treated as unreliable)
180
+ # threshold = depth_conf.squeeze()[0].view(-1).quantile(0.6).item()
181
+ unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5
182
+
183
+ # Load and process mask
184
+ if os.path.exists(mask_path):
185
+ mask = cv2.imread(mask_path)
186
+ mask = cv2.resize(mask, (video_tensor.shape[3], video_tensor.shape[2]))
187
+ mask = mask.sum(axis=-1)>0
188
+ else:
189
+ mask = np.ones_like(video_tensor[0,0].cpu().numpy())>0
190
+ grid_size = 10
191
+
192
+ # Get frame dimensions and create grid points
193
+ frame_H, frame_W = video_tensor.shape[2:]
194
+ grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cuda")
195
+
196
+ # Sample mask values at grid points and filter
197
+ if os.path.exists(mask_path):
198
+ grid_pts_int = grid_pts[0].long()
199
+ mask_values = mask[grid_pts_int.cpu()[...,1], grid_pts_int.cpu()[...,0]]
200
+ grid_pts = grid_pts[:, mask_values]
201
+
202
+ query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].cpu().numpy()
203
+ print(f"Query points shape: {query_xyt.shape}")
204
+
205
+ # Run model inference
206
+ with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
207
+ (
208
+ c2w_traj, intrs, point_map, conf_depth,
209
+ track3d_pred, track2d_pred, vis_pred, conf_pred, video
210
+ ) = tracker_model_arg.forward(video_tensor, depth=depth_tensor,
211
+ intrs=intrs, extrs=extrs,
212
+ queries=query_xyt,
213
+ fps=1, full_point=False, iters_track=4,
214
+ query_no_BA=True, fixed_cam=False, stage=1, unc_metric=unc_metric,
215
+ support_frame=len(video_tensor)-1, replace_ratio=0.2)
216
+
217
+ # Resize results to avoid large I/O
218
+ max_size = 224
219
+ h, w = video.shape[2:]
220
+ scale = min(max_size / h, max_size / w)
221
+ if scale < 1:
222
+ new_h, new_w = int(h * scale), int(w * scale)
223
+ video = T.Resize((new_h, new_w))(video)
224
+ video_tensor = T.Resize((new_h, new_w))(video_tensor)
225
+ point_map = T.Resize((new_h, new_w))(point_map)
226
+ track2d_pred[...,:2] = track2d_pred[...,:2] * scale
227
+ intrs[:,:2,:] = intrs[:,:2,:] * scale
228
+ conf_depth = T.Resize((new_h, new_w))(conf_depth)
229
+
230
+ # Visualize tracks
231
+ tracker_viser_arg.visualize(video=video[None],
232
+ tracks=track2d_pred[None][...,:2],
233
+ visibility=vis_pred[None],filename="test")
234
+
235
+ # Save in tapip3d format
236
+ data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
237
+ data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
238
+ data_npz_load["intrinsics"] = intrs.cpu().numpy()
239
+ data_npz_load["depths"] = point_map[:,2,...].cpu().numpy()
240
+ data_npz_load["video"] = (video_tensor).cpu().numpy()/255
241
+ data_npz_load["visibs"] = vis_pred.cpu().numpy()
242
+ data_npz_load["confs"] = conf_pred.cpu().numpy()
243
+ data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
244
+ np.savez(os.path.join(out_dir, f'result.npz'), **data_npz_load)
245
+
246
+ return None
247
+
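For readers unpacking the `data_npz_load["coords"]` line above: the einsum applies each frame's camera-to-world rotation and adds the translation, moving the predicted 3D tracks from camera coordinates into world coordinates. A small self-contained sketch of the same math (dummy shapes, not the project's tensors):

```python
import numpy as np

T, N = 4, 5                                  # frames and tracked points (dummy sizes)
c2w = np.tile(np.eye(4), (T, 1, 1))          # camera-to-world poses, shape (T, 4, 4)
tracks_cam = np.random.rand(T, N, 3)         # per-frame 3D tracks in camera coordinates

# world_point = R @ cam_point + t for every frame; same math as the torch.einsum above
world = np.einsum("tij,tnj->tni", c2w[:, :3, :3], tracks_cam) + c2w[:, :3, 3][:, None, :]
print(world.shape)                           # (4, 5, 3)
```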
248
+ def compress_and_write(filename, header, blob):
249
+ header_bytes = json.dumps(header).encode("utf-8")
250
+ header_len = struct.pack("<I", len(header_bytes))
251
+ with open(filename, "wb") as f:
252
+ f.write(header_len)
253
+ f.write(header_bytes)
254
+ f.write(blob)
255
+
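`compress_and_write` produces the `data.bin` payload consumed by the HTML viewer: a 4-byte little-endian header length, a UTF-8 JSON header, then the blob (which `process_point_cloud_data` below passes in already zlib-compressed). A minimal reader sketch under those assumptions; the function name is illustrative:

```python
import json, struct, zlib
import numpy as np

def read_viz_bin(path):
    """Inverse of compress_and_write: returns (header, dict of numpy arrays)."""
    with open(path, "rb") as f:
        header_len = struct.unpack("<I", f.read(4))[0]        # 4-byte little-endian length prefix
        header = json.loads(f.read(header_len).decode("utf-8"))
        blob = zlib.decompress(f.read())                       # remaining bytes: zlib-compressed array blob
    arrays = {}
    for key, meta in header.items():
        if key == "meta":                                      # "meta" carries playback info, not array data
            continue
        raw = blob[meta["offset"]: meta["offset"] + meta["length"]]
        arrays[key] = np.frombuffer(raw, dtype=meta["dtype"]).reshape(meta["shape"])
    return header, arrays
```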
256
+ def process_point_cloud_data(npz_file, width=256, height=192, fps=4):
257
+ fixed_size = (width, height)
258
+
259
+ data = np.load(npz_file)
260
+ extrinsics = data["extrinsics"]
261
+ intrinsics = data["intrinsics"]
262
+ trajs = data["coords"]
263
+ T, C, H, W = data["video"].shape
264
+
265
+ fx = intrinsics[0, 0, 0]
266
+ fy = intrinsics[0, 1, 1]
267
+ fov_y = 2 * np.arctan(H / (2 * fy)) * (180 / np.pi)
268
+ fov_x = 2 * np.arctan(W / (2 * fx)) * (180 / np.pi)
269
+ original_aspect_ratio = (W / fx) / (H / fy)
270
+
271
+ rgb_video = (rearrange(data["video"], "T C H W -> T H W C") * 255).astype(np.uint8)
272
+ rgb_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_AREA)
273
+ for frame in rgb_video])
274
+
275
+ depth_video = data["depths"].astype(np.float32)
276
+ if "confs_depth" in data.keys():
277
+ confs = (data["confs_depth"].astype(np.float32) > 0.5).astype(np.float32)
278
+ depth_video = depth_video * confs
279
+ depth_video = np.stack([cv2.resize(frame, fixed_size, interpolation=cv2.INTER_NEAREST)
280
+ for frame in depth_video])
281
+
282
+ scale_x = fixed_size[0] / W
283
+ scale_y = fixed_size[1] / H
284
+ intrinsics = intrinsics.copy()
285
+ intrinsics[:, 0, :] *= scale_x
286
+ intrinsics[:, 1, :] *= scale_y
287
+
288
+ min_depth = float(depth_video.min()) * 0.8
289
+ max_depth = float(depth_video.max()) * 1.5
290
+
291
+ depth_normalized = (depth_video - min_depth) / (max_depth - min_depth)
292
+ depth_int = (depth_normalized * ((1 << 16) - 1)).astype(np.uint16)
293
+
294
+ depths_rgb = np.zeros((T, fixed_size[1], fixed_size[0], 3), dtype=np.uint8)
295
+ depths_rgb[:, :, :, 0] = (depth_int & 0xFF).astype(np.uint8)
296
+ depths_rgb[:, :, :, 1] = ((depth_int >> 8) & 0xFF).astype(np.uint8)
297
+
298
+ first_frame_inv = np.linalg.inv(extrinsics[0])
299
+ normalized_extrinsics = np.array([first_frame_inv @ ext for ext in extrinsics])
300
+
301
+ normalized_trajs = np.zeros_like(trajs)
302
+ for t in range(T):
303
+ homogeneous_trajs = np.concatenate([trajs[t], np.ones((trajs.shape[1], 1))], axis=1)
304
+ transformed_trajs = (first_frame_inv @ homogeneous_trajs.T).T
305
+ normalized_trajs[t] = transformed_trajs[:, :3]
306
+
307
+ arrays = {
308
+ "rgb_video": rgb_video,
309
+ "depths_rgb": depths_rgb,
310
+ "intrinsics": intrinsics,
311
+ "extrinsics": normalized_extrinsics,
312
+ "inv_extrinsics": np.linalg.inv(normalized_extrinsics),
313
+ "trajectories": normalized_trajs.astype(np.float32),
314
+ "cameraZ": 0.0
315
+ }
316
+
317
+ header = {}
318
+ blob_parts = []
319
+ offset = 0
320
+ for key, arr in arrays.items():
321
+ arr = np.ascontiguousarray(arr)
322
+ arr_bytes = arr.tobytes()
323
+ header[key] = {
324
+ "dtype": str(arr.dtype),
325
+ "shape": arr.shape,
326
+ "offset": offset,
327
+ "length": len(arr_bytes)
328
+ }
329
+ blob_parts.append(arr_bytes)
330
+ offset += len(arr_bytes)
331
+
332
+ raw_blob = b"".join(blob_parts)
333
+ compressed_blob = zlib.compress(raw_blob, level=9)
334
+
335
+ header["meta"] = {
336
+ "depthRange": [min_depth, max_depth],
337
+ "totalFrames": int(T),
338
+ "resolution": fixed_size,
339
+ "baseFrameRate": fps,
340
+ "numTrajectoryPoints": normalized_trajs.shape[1],
341
+ "fov": float(fov_y),
342
+ "fov_x": float(fov_x),
343
+ "original_aspect_ratio": float(original_aspect_ratio),
344
+ "fixed_aspect_ratio": float(fixed_size[0]/fixed_size[1])
345
+ }
346
+
347
+ compress_and_write('./_viz/data.bin', header, compressed_blob)
348
+ with open('./_viz/data.bin', "rb") as f:
349
+ encoded_blob = base64.b64encode(f.read()).decode("ascii")
350
+ os.unlink('./_viz/data.bin')
351
+
352
+ random_path = f'./_viz/_{time.time()}.html'
353
+ with open('./_viz/viz_template.html') as f:
354
+ html_template = f.read()
355
+ html_out = html_template.replace(
356
+ "<head>",
357
+ f"<head>\n<script>window.embeddedBase64 = `{encoded_blob}`;</script>"
358
+ )
359
+ with open(random_path,'w') as f:
360
+ f.write(html_out)
361
+
362
+ return random_path
363
+
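The viewer's depth stream packs each normalized 16-bit depth value into the red (low byte) and green (high byte) channels of `depths_rgb`, with the normalization range stored in `header["meta"]["depthRange"]`. A sketch of the inverse mapping, assuming arrays recovered as in the reader above:

```python
import numpy as np

def unpack_depth(depths_rgb, depth_range):
    """Recover metric depth from the 16-bit value split across R (low byte) and G (high byte)."""
    min_depth, max_depth = depth_range  # header["meta"]["depthRange"]
    depth_int = depths_rgb[..., 0].astype(np.uint16) | (depths_rgb[..., 1].astype(np.uint16) << 8)
    return depth_int.astype(np.float32) / ((1 << 16) - 1) * (max_depth - min_depth) + min_depth
```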
364
+ def numpy_to_base64(arr):
365
+ """Convert numpy array to base64 string"""
366
+ return base64.b64encode(arr.tobytes()).decode('utf-8')
367
+
368
+ def base64_to_numpy(b64_str, shape, dtype):
369
+ """Convert base64 string back to numpy array"""
370
+ return np.frombuffer(base64.b64decode(b64_str), dtype=dtype).reshape(shape)
371
+
372
+ def get_video_name(video_path):
373
+ """Extract video name without extension"""
374
+ return os.path.splitext(os.path.basename(video_path))[0]
375
+
376
+ def extract_first_frame(video_path):
377
+ """Extract first frame from video file"""
378
+ try:
379
+ cap = cv2.VideoCapture(video_path)
380
+ ret, frame = cap.read()
381
+ cap.release()
382
+
383
+ if ret:
384
+ frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
385
+ return frame_rgb
386
+ else:
387
+ return None
388
+ except Exception as e:
389
+ print(f"Error extracting first frame: {e}")
390
+ return None
391
+
392
+ def handle_video_upload(video):
393
+ """Handle video upload and extract first frame"""
394
+ if video is None:
395
+ return (None, None, [],
396
+ gr.update(value=50),
397
+ gr.update(value=756),
398
+ gr.update(value=3))
399
+
400
+ # Create user-specific temporary directory
401
+ user_temp_dir = create_user_temp_dir()
402
+
403
+ # Get original video name and copy to temp directory
404
+ if isinstance(video, str):
405
+ video_name = get_video_name(video)
406
+ video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
407
+ shutil.copy(video, video_path)
408
+ else:
409
+ video_name = get_video_name(video.name)
410
+ video_path = os.path.join(user_temp_dir, f"{video_name}.mp4")
411
+ with open(video_path, 'wb') as f:
412
+ f.write(video.read())
413
+
414
+ print(f"📁 Video saved to: {video_path}")
415
+
416
+ # Extract first frame
417
+ frame = extract_first_frame(video_path)
418
+ if frame is None:
419
+ return (None, None, [],
420
+ gr.update(value=50),
421
+ gr.update(value=756),
422
+ gr.update(value=3))
423
+
424
+ # Resize frame to have minimum side length of 336
425
+ h, w = frame.shape[:2]
426
+ scale = 336 / min(h, w)
427
+ new_h, new_w = int(h * scale)//2*2, int(w * scale)//2*2
428
+ frame = cv2.resize(frame, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
429
+
430
+ # Store frame data with temp directory info
431
+ frame_data = {
432
+ 'data': numpy_to_base64(frame),
433
+ 'shape': frame.shape,
434
+ 'dtype': str(frame.dtype),
435
+ 'temp_dir': user_temp_dir,
436
+ 'video_name': video_name,
437
+ 'video_path': video_path
438
+ }
439
+
440
+ # Get video-specific settings
441
+ print(f"🎬 Video path: '{video}' -> Video name: '{video_name}'")
442
+ grid_size_val, vo_points_val, fps_val = get_video_settings(video_name)
443
+ print(f"🎬 Video settings for '{video_name}': grid_size={grid_size_val}, vo_points={vo_points_val}, fps={fps_val}")
444
+
445
+ return (json.dumps(frame_data), frame, [],
446
+ gr.update(value=grid_size_val),
447
+ gr.update(value=vo_points_val),
448
+ gr.update(value=fps_val))
449
+
450
+ def save_masks(o_masks, video_name, temp_dir):
451
+ """Save binary masks to files in user-specific temp directory"""
452
+ o_files = []
453
+ for mask, _ in o_masks:
454
+ o_mask = np.uint8(mask.squeeze() * 255)
455
+ o_file = os.path.join(temp_dir, f"{video_name}.png")
456
+ cv2.imwrite(o_file, o_mask)
457
+ o_files.append(o_file)
458
+ return o_files
459
+
460
+ def select_point(original_img: str, sel_pix: list, point_type: str, evt: gr.SelectData):
461
+ """Handle point selection for SAM"""
462
+ if original_img is None:
463
+ return None, []
464
+
465
+ try:
466
+ # Convert stored image data back to numpy array
467
+ frame_data = json.loads(original_img)
468
+ original_img_array = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
469
+ temp_dir = frame_data.get('temp_dir', 'temp_local')
470
+ video_name = frame_data.get('video_name', 'video')
471
+
472
+ # Create a display image for visualization
473
+ display_img = original_img_array.copy()
474
+ new_sel_pix = sel_pix.copy() if sel_pix else []
475
+ new_sel_pix.append((evt.index, 1 if point_type == 'positive_point' else 0))
476
+
477
+ print(f"🎯 Running SAM inference for point: {evt.index}, type: {point_type}")
478
+ # Run SAM inference
479
+ o_masks = gpu_run_inference(None, original_img_array, new_sel_pix, [])
480
+
481
+ # Draw points on display image
482
+ for point, label in new_sel_pix:
483
+ cv2.drawMarker(display_img, point, COLORS[label], markerType=MARKERS[label], markerSize=MARKER_SIZE, thickness=2)
484
+
485
+ # Draw mask overlay on display image
486
+ if o_masks:
487
+ mask = o_masks[0][0]
488
+ overlay = display_img.copy()
489
+ overlay[mask.squeeze()!=0] = [20, 60, 200] # Light blue
490
+ display_img = cv2.addWeighted(overlay, 0.6, display_img, 0.4, 0)
491
+
492
+ # Save mask for tracking
493
+ save_masks(o_masks, video_name, temp_dir)
494
+ print(f"✅ Mask saved for video: {video_name}")
495
+
496
+ return display_img, new_sel_pix
497
+
498
+ except Exception as e:
499
+ print(f"❌ Error in select_point: {e}")
500
+ return None, []
501
+
502
+ def reset_points(original_img: str, sel_pix):
503
+ """Reset all points and clear the mask"""
504
+ if original_img is None:
505
+ return None, []
506
+
507
+ try:
508
+ # Convert stored image data back to numpy array
509
+ frame_data = json.loads(original_img)
510
+ original_img_array = base64_to_numpy(frame_data['data'], frame_data['shape'], frame_data['dtype'])
511
+ temp_dir = frame_data.get('temp_dir', 'temp_local')
512
+
513
+ # Create a display image (just the original image)
514
+ display_img = original_img_array.copy()
515
+
516
+ # Clear all points
517
+ new_sel_pix = []
518
+
519
+ # Clear any existing masks
520
+ for mask_file in glob.glob(os.path.join(temp_dir, "*.png")):
521
+ try:
522
+ os.remove(mask_file)
523
+ except Exception as e:
524
+ logger.warning(f"Failed to remove mask file {mask_file}: {e}")
525
+
526
+ print("🔄 Points and masks reset")
527
+ return display_img, new_sel_pix
528
+
529
+ except Exception as e:
530
+ print(f"❌ Error in reset_points: {e}")
531
+ return None, []
532
+
533
+ def launch_viz(grid_size, vo_points, fps, original_image_state, mode="offline"):
534
+ """Launch visualization with user-specific temp directory"""
535
+ if original_image_state is None:
536
+ return None, None, None
537
+
538
+ try:
539
+ # Get user's temp directory from stored frame data
540
+ frame_data = json.loads(original_image_state)
541
+ temp_dir = frame_data.get('temp_dir', 'temp_local')
542
+ video_name = frame_data.get('video_name', 'video')
543
+
544
+ print(f"🚀 Starting tracking for video: {video_name}")
545
+ print(f"📊 Parameters: grid_size={grid_size}, vo_points={vo_points}, fps={fps}")
546
+
547
+ # Check for mask files
548
+ mask_files = glob.glob(os.path.join(temp_dir, "*.png"))
549
+ video_files = glob.glob(os.path.join(temp_dir, "*.mp4"))
550
+
551
+ if not video_files:
552
+ print("❌ No video file found")
553
+ return "❌ Error: No video file found", None, None
554
+
555
+ video_path = video_files[0]
556
+ mask_path = mask_files[0] if mask_files else None
557
+
558
+ # Run tracker
559
+ print("🎯 Running tracker...")
560
+ out_dir = os.path.join(temp_dir, "results")
561
+ os.makedirs(out_dir, exist_ok=True)
562
+
563
+ gpu_run_tracker(None, None, temp_dir, video_name, grid_size, vo_points, fps, mode=mode)
564
+
565
+ # Process results
566
+ npz_path = os.path.join(out_dir, "result.npz")
567
+ track2d_video = os.path.join(out_dir, "test_pred_track.mp4")
568
+
569
+ if os.path.exists(npz_path):
570
+ print("📊 Processing 3D visualization...")
571
+ html_path = process_point_cloud_data(npz_path)
572
+
573
+ # Schedule deletion of generated files
574
+ delete_later(html_path, delay=600)
575
+ if os.path.exists(track2d_video):
576
+ delete_later(track2d_video, delay=600)
577
+ delete_later(npz_path, delay=600)
578
+
579
+ # Create iframe HTML
580
+ iframe_html = f"""
581
+ <div style='border: 3px solid #667eea; border-radius: 10px;
582
+ background: #f8f9ff; height: 650px; width: 100%;
583
+ box-shadow: 0 8px 32px rgba(102, 126, 234, 0.3);
584
+ margin: 0; padding: 0; box-sizing: border-box; overflow: hidden;'>
585
+ <iframe id="viz_iframe" src="/gradio_api/file={html_path}"
586
+ width="100%" height="650" frameborder="0"
587
+ style="border: none; display: block; width: 100%; height: 650px;
588
+ margin: 0; padding: 0; border-radius: 7px;">
589
+ </iframe>
590
+ </div>
591
+ """
592
+
593
+ print("✅ Tracking completed successfully!")
594
+ return iframe_html, track2d_video if os.path.exists(track2d_video) else None, html_path
595
+ else:
596
+ print("❌ Tracking failed - no results generated")
597
+ return "❌ Error: Tracking failed to generate results", None, None
598
+
599
+ except Exception as e:
600
+ print(f"❌ Error in launch_viz: {e}")
601
+ return f"❌ Error: {str(e)}", None, None
602
+
603
+ def clear_all():
604
+ """Clear all buffers and temporary files"""
605
+ return (None, None, [],
606
+ gr.update(value=50),
607
+ gr.update(value=756),
608
+ gr.update(value=3))
609
+
610
+ def clear_all_with_download():
611
+ """Clear all buffers including both download components"""
612
+ return (None, None, [],
613
+ gr.update(value=50),
614
+ gr.update(value=756),
615
+ gr.update(value=3),
616
+ None, # tracking_video_download
617
+ None) # HTML download component
618
+
619
+ def get_video_settings(video_name):
620
+ """Get video-specific settings based on video name"""
621
+ video_settings = {
622
+ "running": (50, 512, 2),
623
+ "backpack": (40, 600, 2),
624
+ "kitchen": (60, 800, 3),
625
+ "pillow": (35, 500, 2),
626
+ "handwave": (35, 500, 8),
627
+ "hockey": (45, 700, 2),
628
+ "drifting": (35, 1000, 6),
629
+ "basketball": (45, 1500, 5),
630
+ "ken_block_0": (45, 700, 2),
631
+ "ego_kc1": (45, 500, 4),
632
+ "vertical_place": (45, 500, 3),
633
+ "ego_teaser": (45, 1200, 10),
634
+ "robot_unitree": (45, 500, 4),
635
+ "robot_3": (35, 400, 5),
636
+ "teleop2": (45, 256, 7),
637
+ "pusht": (45, 256, 10),
638
+ "cinema_0": (45, 356, 5),
639
+ "cinema_1": (45, 756, 3),
640
+ "robot1": (45, 600, 2),
641
+ "robot2": (45, 600, 2),
642
+ "protein": (45, 600, 2),
643
+ "kitchen_egocentric": (45, 600, 2),
644
+ }
645
+
646
+ return video_settings.get(video_name, (50, 756, 3))
647
+
648
+ # Create the Gradio interface
649
+ print("🎨 Creating Gradio interface...")
650
+
651
+ with gr.Blocks(
652
+ theme=gr.themes.Soft(),
653
+ title="🎯 [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)",
654
+ css="""
655
+ .gradio-container {
656
+ max-width: 1200px !important;
657
+ margin: auto !important;
658
+ }
659
+ .gr-button {
660
+ margin: 5px;
661
+ }
662
+ .gr-form {
663
+ background: white;
664
+ border-radius: 10px;
665
+ padding: 20px;
666
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
667
+ }
668
+ /* Remove the default gray background of gr.Group */
669
+ .gr-form {
670
+ background: transparent !important;
671
+ border: none !important;
672
+ box-shadow: none !important;
673
+ padding: 0 !important;
674
+ }
675
+ /* Fix the size of the 3D visualizer */
676
+ #viz_container {
677
+ height: 650px !important;
678
+ min-height: 650px !important;
679
+ max-height: 650px !important;
680
+ width: 100% !important;
681
+ margin: 0 !important;
682
+ padding: 0 !important;
683
+ overflow: hidden !important;
684
+ }
685
+ #viz_container > div {
686
+ height: 650px !important;
687
+ min-height: 650px !important;
688
+ max-height: 650px !important;
689
+ width: 100% !important;
690
+ margin: 0 !important;
691
+ padding: 0 !important;
692
+ box-sizing: border-box !important;
693
+ }
694
+ #viz_container iframe {
695
+ height: 650px !important;
696
+ min-height: 650px !important;
697
+ max-height: 650px !important;
698
+ width: 100% !important;
699
+ border: none !important;
700
+ display: block !important;
701
+ margin: 0 !important;
702
+ padding: 0 !important;
703
+ box-sizing: border-box !important;
704
+ }
705
+ /* Fix the height of the video upload component */
706
+ .gr-video {
707
+ height: 300px !important;
708
+ min-height: 300px !important;
709
+ max-height: 300px !important;
710
+ }
711
+ .gr-video video {
712
+ height: 260px !important;
713
+ max-height: 260px !important;
714
+ object-fit: contain !important;
715
+ background: #f8f9fa;
716
+ }
717
+ .gr-video .gr-video-player {
718
+ height: 260px !important;
719
+ max-height: 260px !important;
720
+ }
721
+ /* Forcefully remove the gray background of the examples area - use more generic selectors */
722
+ .horizontal-examples,
723
+ .horizontal-examples > *,
724
+ .horizontal-examples * {
725
+ background: transparent !important;
726
+ background-color: transparent !important;
727
+ border: none !important;
728
+ }
729
+
730
+ /* Horizontal scroll styles for the Examples component */
731
+ .horizontal-examples [data-testid="examples"] {
732
+ background: transparent !important;
733
+ background-color: transparent !important;
734
+ }
735
+
736
+ .horizontal-examples [data-testid="examples"] > div {
737
+ background: transparent !important;
738
+ background-color: transparent !important;
739
+ overflow-x: auto !important;
740
+ overflow-y: hidden !important;
741
+ scrollbar-width: thin;
742
+ scrollbar-color: #667eea transparent;
743
+ padding: 0 !important;
744
+ margin-top: 10px;
745
+ border: none !important;
746
+ }
747
+
748
+ .horizontal-examples [data-testid="examples"] table {
749
+ display: flex !important;
750
+ flex-wrap: nowrap !important;
751
+ min-width: max-content !important;
752
+ gap: 15px !important;
753
+ padding: 10px 0;
754
+ background: transparent !important;
755
+ border: none !important;
756
+ }
757
+
758
+ .horizontal-examples [data-testid="examples"] tbody {
759
+ display: flex !important;
760
+ flex-direction: row !important;
761
+ flex-wrap: nowrap !important;
762
+ gap: 15px !important;
763
+ background: transparent !important;
764
+ }
765
+
766
+ .horizontal-examples [data-testid="examples"] tr {
767
+ display: flex !important;
768
+ flex-direction: column !important;
769
+ min-width: 160px !important;
770
+ max-width: 160px !important;
771
+ margin: 0 !important;
772
+ background: white !important;
773
+ border-radius: 12px;
774
+ box-shadow: 0 3px 12px rgba(0,0,0,0.12);
775
+ transition: all 0.3s ease;
776
+ cursor: pointer;
777
+ overflow: hidden;
778
+ border: none !important;
779
+ }
780
+
781
+ .horizontal-examples [data-testid="examples"] tr:hover {
782
+ transform: translateY(-4px);
783
+ box-shadow: 0 8px 20px rgba(102, 126, 234, 0.25);
784
+ }
785
+
786
+ .horizontal-examples [data-testid="examples"] td {
787
+ text-align: center !important;
788
+ padding: 0 !important;
789
+ border: none !important;
790
+ background: transparent !important;
791
+ }
792
+
793
+ .horizontal-examples [data-testid="examples"] td:first-child {
794
+ padding: 0 !important;
795
+ background: transparent !important;
796
+ }
797
+
798
+ .horizontal-examples [data-testid="examples"] video {
799
+ border-radius: 8px 8px 0 0 !important;
800
+ width: 100% !important;
801
+ height: 90px !important;
802
+ object-fit: cover !important;
803
+ background: #f8f9fa !important;
804
+ }
805
+
806
+ .horizontal-examples [data-testid="examples"] td:last-child {
807
+ font-size: 11px !important;
808
+ font-weight: 600 !important;
809
+ color: #333 !important;
810
+ padding: 8px 12px !important;
811
+ background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%) !important;
812
+ border-radius: 0 0 8px 8px;
813
+ }
814
+
815
+ /* Scrollbar styles */
816
+ .horizontal-examples [data-testid="examples"] > div::-webkit-scrollbar {
817
+ height: 8px;
818
+ }
819
+ .horizontal-examples [data-testid="examples"] > div::-webkit-scrollbar-track {
820
+ background: transparent;
821
+ border-radius: 4px;
822
+ }
823
+ .horizontal-examples [data-testid="examples"] > div::-webkit-scrollbar-thumb {
824
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
825
+ border-radius: 4px;
826
+ }
827
+ .horizontal-examples [data-testid="examples"] > div::-webkit-scrollbar-thumb:hover {
828
+ background: linear-gradient(135deg, #5a6fd8 0%, #6a4190 100%);
829
+ }
830
+ """
831
+ ) as demo:
832
+
833
+ # Add prominent main title
834
+
835
+ gr.Markdown("""
836
+ # ✨ SpatialTrackerV2
837
+
838
+ Welcome to [SpatialTracker V2](https://github.com/henry123-boy/SpaTrackerV2)! This interface allows you to track any pixels in 3D using our model.
839
+
840
+ **⚡ Quick Start:** Upload video → Click "Start Tracking Now!"
841
+
842
+ **🔬 Advanced Usage with SAM:**
843
+ 1. Upload a video file or select from examples below
844
+ 2. Expand "Manual Point Selection" to click on specific objects for SAM-guided tracking
845
+ 3. Adjust tracking parameters for optimal performance
846
+ 4. Click "Start Tracking Now!" to begin 3D tracking with SAM guidance
847
+
848
+ """)
849
+
850
+ # Status indicator
851
+ gr.Markdown("**Status:** 🟢 Local Processing Mode")
852
+
853
+ # Main content area - video upload left, 3D visualization right
854
+ with gr.Row():
855
+ with gr.Column(scale=1):
856
+ # Video upload section
857
+ gr.Markdown("### 📂 Select Video")
858
+
859
+ # Define video_input here so it can be referenced in examples
860
+ video_input = gr.Video(
861
+ label="Upload Video or Select Example",
862
+ format="mp4",
863
+ height=250 # Matched height with 3D viz
864
+ )
865
+
866
+
867
+ # Traditional examples but with horizontal scroll styling
868
+ gr.Markdown("🎨**Examples:** (scroll horizontally to see all videos)")
869
+ with gr.Row(elem_classes=["horizontal-examples"]):
870
+ # Horizontal video examples with slider
871
+ # gr.HTML("<div style='margin-top: 5px;'></div>")
872
+ gr.Examples(
873
+ examples=[
874
+ ["./examples/robot1.mp4"],
875
+ ["./examples/robot2.mp4"],
876
+ ["./examples/protein.mp4"],
877
+ ["./examples/kitchen_egocentric.mp4"],
878
+ ["./examples/hockey.mp4"],
879
+ ["./examples/running.mp4"],
880
+ ["./examples/robot_3.mp4"],
881
+ ["./examples/backpack.mp4"],
882
+ ["./examples/kitchen.mp4"],
883
+ ["./examples/pillow.mp4"],
884
+ ["./examples/handwave.mp4"],
885
+ ["./examples/drifting.mp4"],
886
+ ["./examples/basketball.mp4"],
887
+ ["./examples/ken_block_0.mp4"],
888
+ ["./examples/ego_kc1.mp4"],
889
+ ["./examples/vertical_place.mp4"],
890
+ ["./examples/ego_teaser.mp4"],
891
+ ["./examples/robot_unitree.mp4"],
892
+ ["./examples/teleop2.mp4"],
893
+ ["./examples/pusht.mp4"],
894
+ ["./examples/cinema_0.mp4"],
895
+ ["./examples/cinema_1.mp4"],
896
+ ],
897
+ inputs=[video_input],
898
+ outputs=[video_input],
899
+ fn=None,
900
+ cache_examples=False,
901
+ label="",
902
+ examples_per_page=6 # Show 6 examples per page so they can wrap to multiple rows
903
+ )
904
+
905
+ with gr.Column(scale=2):
906
+ # 3D Visualization - wider and taller to match left side
907
+ with gr.Group():
908
+ gr.Markdown("### 🌐 3D Trajectory Visualization")
909
+ viz_html = gr.HTML(
910
+ label="3D Trajectory Visualization",
911
+ value="""
912
+ <div style='border: 3px solid #667eea; border-radius: 10px;
913
+ background: linear-gradient(135deg, #f8f9ff 0%, #e6f3ff 100%);
914
+ text-align: center; height: 650px; display: flex;
915
+ flex-direction: column; justify-content: center; align-items: center;
916
+ box-shadow: 0 4px 16px rgba(102, 126, 234, 0.15);
917
+ margin: 0; padding: 20px; box-sizing: border-box;'>
918
+ <div style='font-size: 56px; margin-bottom: 25px;'>🌐</div>
919
+ <h3 style='color: #667eea; margin-bottom: 18px; font-size: 28px; font-weight: 600;'>
920
+ 3D Trajectory Visualization
921
+ </h3>
922
+ <p style='color: #666; font-size: 18px; line-height: 1.6; max-width: 550px; margin-bottom: 30px;'>
923
+ Track any pixels in 3D space with camera motion
924
+ </p>
925
+ <div style='background: rgba(102, 126, 234, 0.1); border-radius: 30px;
926
+ padding: 15px 30px; border: 1px solid rgba(102, 126, 234, 0.2);'>
927
+ <span style='color: #667eea; font-weight: 600; font-size: 16px;'>
928
+ ⚡ Powered by SpatialTracker V2
929
+ </span>
930
+ </div>
931
+ </div>
932
+ """,
933
+ elem_id="viz_container"
934
+ )
935
+
936
+ # Start button section - below video area
937
+ with gr.Row():
938
+ with gr.Column(scale=3):
939
+ launch_btn = gr.Button("🚀 Start Tracking Now!", variant="primary", size="lg")
940
+ with gr.Column(scale=1):
941
+ clear_all_btn = gr.Button("🗑️ Clear All", variant="secondary", size="sm")
942
+
943
+ # Tracking parameters section
944
+ with gr.Row():
945
+ gr.Markdown("### ⚙️ Tracking Parameters")
946
+ with gr.Row():
947
+ grid_size = gr.Slider(
948
+ minimum=10, maximum=100, step=10, value=50,
949
+ label="Grid Size", info="Tracking detail level"
950
+ )
951
+ vo_points = gr.Slider(
952
+ minimum=100, maximum=2000, step=50, value=756,
953
+ label="VO Points", info="Motion accuracy"
954
+ )
955
+ fps = gr.Slider(
956
+ minimum=1, maximum=20, step=1, value=3,
957
+ label="FPS", info="Processing speed"
958
+ )
959
+
960
+ # Advanced Point Selection with SAM - Collapsed by default
961
+ with gr.Row():
962
+ gr.Markdown("### 🎯 Advanced: Manual Point Selection with SAM")
963
+ with gr.Accordion("🔬 SAM Point Selection Controls", open=False):
964
+ gr.HTML("""
965
+ <div style='margin-bottom: 15px;'>
966
+ <ul style='color: #4a5568; font-size: 14px; line-height: 1.6; margin: 0; padding-left: 20px;'>
967
+ <li>Click on target objects in the image for SAM-guided segmentation</li>
968
+ <li>Positive points: include these areas | Negative points: exclude these areas</li>
969
+ <li>Get more accurate 3D tracking results with SAM's powerful segmentation</li>
970
+ </ul>
971
+ </div>
972
+ """)
973
+
974
+ with gr.Row():
975
+ with gr.Column():
976
+ interactive_frame = gr.Image(
977
+ label="Click to select tracking points with SAM guidance",
978
+ type="numpy",
979
+ interactive=True,
980
+ height=300
981
+ )
982
+
983
+ with gr.Row():
984
+ point_type = gr.Radio(
985
+ choices=["positive_point", "negative_point"],
986
+ value="positive_point",
987
+ label="Point Type",
988
+ info="Positive: track these areas | Negative: avoid these areas"
989
+ )
990
+
991
+ with gr.Row():
992
+ reset_points_btn = gr.Button("🔄 Reset Points", variant="secondary", size="sm")
993
+
994
+ # Downloads section - hidden but still functional for local processing
995
+ with gr.Row(visible=False):
996
+ with gr.Column(scale=1):
997
+ tracking_video_download = gr.File(
998
+ label="📹 Download 2D Tracking Video",
999
+ interactive=False,
1000
+ visible=False
1001
+ )
1002
+ with gr.Column(scale=1):
1003
+ html_download = gr.File(
1004
+ label="📄 Download 3D Visualization HTML",
1005
+ interactive=False,
1006
+ visible=False
1007
+ )
1008
+
1009
+ # GitHub Star Section
1010
+ gr.HTML("""
1011
+ <div style='background: linear-gradient(135deg, #e8eaff 0%, #f0f2ff 100%);
1012
+ border-radius: 8px; padding: 20px; margin: 15px 0;
1013
+ box-shadow: 0 2px 8px rgba(102, 126, 234, 0.1);
1014
+ border: 1px solid rgba(102, 126, 234, 0.15);'>
1015
+ <div style='text-align: center;'>
1016
+ <h3 style='color: #4a5568; margin: 0 0 10px 0; font-size: 18px; font-weight: 600;'>
1017
+ ⭐ Love SpatialTracker? Give us a Star! ⭐
1018
+ </h3>
1019
+ <p style='color: #666; margin: 0 0 15px 0; font-size: 14px; line-height: 1.5;'>
1020
+ Help us grow by starring our repository on GitHub! Your support means a lot to the community. 🚀
1021
+ </p>
1022
+ <a href="https://github.com/henry123-boy/SpaTrackerV2" target="_blank"
1023
+ style='display: inline-flex; align-items: center; gap: 8px;
1024
+ background: rgba(102, 126, 234, 0.1); color: #4a5568;
1025
+ padding: 10px 20px; border-radius: 25px; text-decoration: none;
1026
+ font-weight: bold; font-size: 14px; border: 1px solid rgba(102, 126, 234, 0.2);
1027
+ transition: all 0.3s ease;'
1028
+ onmouseover="this.style.background='rgba(102, 126, 234, 0.15)'; this.style.transform='translateY(-2px)'"
1029
+ onmouseout="this.style.background='rgba(102, 126, 234, 0.1)'; this.style.transform='translateY(0)'">
1030
+ <span style='font-size: 16px;'>⭐</span>
1031
+ Star SpatialTracker V2 on GitHub
1032
+ </a>
1033
+ </div>
1034
+ </div>
1035
+ """)
1036
+
1037
+ # Acknowledgments Section
1038
+ gr.HTML("""
1039
+ <div style='background: linear-gradient(135deg, #fff8e1 0%, #fffbf0 100%);
1040
+ border-radius: 8px; padding: 20px; margin: 15px 0;
1041
+ box-shadow: 0 2px 8px rgba(255, 193, 7, 0.1);
1042
+ border: 1px solid rgba(255, 193, 7, 0.2);'>
1043
+ <div style='text-align: center;'>
1044
+ <h3 style='color: #5d4037; margin: 0 0 10px 0; font-size: 18px; font-weight: 600;'>
1045
+ 📚 Acknowledgments
1046
+ </h3>
1047
+ <p style='color: #5d4037; margin: 0 0 15px 0; font-size: 14px; line-height: 1.5;'>
1048
+ Our 3D visualizer is adapted from <strong>TAPIP3D</strong>. We thank the authors for their excellent work and contribution to the computer vision community!
1049
+ </p>
1050
+ <a href="https://github.com/zbw001/TAPIP3D" target="_blank"
1051
+ style='display: inline-flex; align-items: center; gap: 8px;
1052
+ background: rgba(255, 193, 7, 0.15); color: #5d4037;
1053
+ padding: 10px 20px; border-radius: 25px; text-decoration: none;
1054
+ font-weight: bold; font-size: 14px; border: 1px solid rgba(255, 193, 7, 0.3);
1055
+ transition: all 0.3s ease;'
1056
+ onmouseover="this.style.background='rgba(255, 193, 7, 0.25)'; this.style.transform='translateY(-2px)'"
1057
+ onmouseout="this.style.background='rgba(255, 193, 7, 0.15)'; this.style.transform='translateY(0)'">
1058
+ 📚 Visit TAPIP3D Repository
1059
+ </a>
1060
+ </div>
1061
+ </div>
1062
+ """)
1063
+
1064
+ # Footer
1065
+ gr.HTML("""
1066
+ <div style='text-align: center; margin: 20px 0 10px 0;'>
1067
+ <span style='font-size: 12px; color: #888; font-style: italic;'>
1068
+ Powered by SpatialTracker V2 | Built with ❤️ for the Computer Vision Community
1069
+ </span>
1070
+ </div>
1071
+ """)
1072
+
1073
+ # Hidden state variables
1074
+ original_image_state = gr.State(None)
1075
+ selected_points = gr.State([])
1076
+
1077
+ # Event handlers
1078
+ video_input.change(
1079
+ fn=handle_video_upload,
1080
+ inputs=[video_input],
1081
+ outputs=[original_image_state, interactive_frame, selected_points, grid_size, vo_points, fps]
1082
+ )
1083
+
1084
+ interactive_frame.select(
1085
+ fn=select_point,
1086
+ inputs=[original_image_state, selected_points, point_type],
1087
+ outputs=[interactive_frame, selected_points]
1088
+ )
1089
+
1090
+ reset_points_btn.click(
1091
+ fn=reset_points,
1092
+ inputs=[original_image_state, selected_points],
1093
+ outputs=[interactive_frame, selected_points]
1094
+ )
1095
+
1096
+ clear_all_btn.click(
1097
+ fn=clear_all_with_download,
1098
+ outputs=[video_input, interactive_frame, selected_points, grid_size, vo_points, fps, tracking_video_download, html_download]
1099
+ )
1100
+
1101
+ launch_btn.click(
1102
+ fn=launch_viz,
1103
+ inputs=[grid_size, vo_points, fps, original_image_state],
1104
+ outputs=[viz_html, tracking_video_download, html_download]
1105
+ )
1106
+
1107
+ # Launch the interface
1108
+ if __name__ == "__main__":
1109
+ print("🌟 Launching SpatialTracker V2 Local Version...")
1110
+ print("🔗 Running in Local Processing Mode")
1111
+
1112
+ demo.launch(
1113
+ server_name="0.0.0.0",
1114
+ server_port=7860,
1115
+ share=True,
1116
+ debug=True,
1117
+ show_error=True
1118
+ )
app_3rd/README.md ADDED
@@ -0,0 +1,12 @@
1
+ # 🌟 SpatialTrackerV2 Integrated with SAM 🌟
2
+ SAM receives a point prompt and generates a mask for the target object, facilitating easy interaction to obtain the object's 3D trajectories with SpaTrack2.
3
+
4
+ ## Installation
5
+ ```
6
+
7
+ python -m pip install git+https://github.com/facebookresearch/segment-anything.git
8
+ cd app_3rd/sam_utils
9
+ mkdir checkpoints
10
+ cd checkpoints
11
+ wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
12
+ ```
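
A minimal usage sketch (assuming the checkpoint above is downloaded; the file path and click coordinates are illustrative). The point format mirrors how `app.py` builds its selection list: `((x, y), label)` with label 1 for positive and 0 for negative clicks:

```python
import cv2
from app_3rd.sam_utils.inference import get_sam_predictor, run_inference

image = cv2.cvtColor(cv2.imread("frame.png"), cv2.COLOR_BGR2RGB)  # illustrative RGB frame
predictor = get_sam_predictor(model_type='vit_h', use_hf=False)   # use_hf=True switches to the HF weights
points = [((320, 240), 1)]                                        # one positive click at (x, y)
masks = run_inference(predictor, image, points)                   # app.py treats this as a list of (mask, _) pairs
```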
app_3rd/sam_utils/hf_sam_predictor.py ADDED
@@ -0,0 +1,129 @@
1
+ import gc
2
+ import numpy as np
3
+ import torch
4
+ from typing import Optional, Tuple, List, Union
5
+ import warnings
6
+ import cv2
7
+ try:
8
+ from transformers import SamModel, SamProcessor
9
+ from huggingface_hub import hf_hub_download
10
+ HF_AVAILABLE = True
11
+ except ImportError:
12
+ HF_AVAILABLE = False
13
+ warnings.warn("transformers or huggingface_hub not available. HF SAM models will not work.")
14
+
15
+ # Hugging Face model mapping
16
+ HF_MODELS = {
17
+ 'vit_b': 'facebook/sam-vit-base',
18
+ 'vit_l': 'facebook/sam-vit-large',
19
+ 'vit_h': 'facebook/sam-vit-huge'
20
+ }
21
+
22
+ class HFSamPredictor:
23
+ """
24
+ Hugging Face version of SamPredictor that wraps the transformers SAM models.
25
+ This class provides the same interface as the original SamPredictor for seamless integration.
26
+ """
27
+
28
+ def __init__(self, model: SamModel, processor: SamProcessor, device: Optional[str] = None):
29
+ """
30
+ Initialize the HF SAM predictor.
31
+
32
+ Args:
33
+ model: The SAM model from transformers
34
+ processor: The SAM processor from transformers
35
+ device: Device to run the model on ('cuda', 'cpu', etc.)
36
+ """
37
+ self.model = model
38
+ self.processor = processor
39
+ self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
40
+ self.model.to(self.device)
41
+ self.model.eval()
42
+
43
+ # Store the current image and its features
44
+ self.original_size = None
45
+ self.input_size = None
46
+ self.features = None
47
+ self.image = None
48
+
49
+ @classmethod
50
+ def from_pretrained(cls, model_name: str, device: Optional[str] = None) -> 'HFSamPredictor':
51
+ """
52
+ Load a SAM model from Hugging Face Hub.
53
+
54
+ Args:
55
+ model_name: Model name from HF_MODELS or direct HF model path
56
+ device: Device to load the model on
57
+
58
+ Returns:
59
+ HFSamPredictor instance
60
+ """
61
+ if not HF_AVAILABLE:
62
+ raise ImportError("transformers and huggingface_hub are required for HF SAM models")
63
+
64
+ # Map model type to HF model name if needed
65
+ if model_name in HF_MODELS:
66
+ model_name = HF_MODELS[model_name]
67
+
68
+ print(f"Loading SAM model from Hugging Face: {model_name}")
69
+
70
+ # Load model and processor
71
+ model = SamModel.from_pretrained(model_name)
72
+ processor = SamProcessor.from_pretrained(model_name)
73
+ return cls(model, processor, device)
74
+
75
+ def preprocess(self, image: np.ndarray,
76
+ input_points: List[List[float]], input_labels: List[int]) -> dict:
77
+ """
78
+ Set the image for prediction. This preprocesses the image and extracts features.
79
+
80
+ Args:
81
+ image: Input image as numpy array (H, W, C) in RGB format
82
+ """
83
+ if image.dtype != np.uint8:
84
+ image = (image * 255).astype(np.uint8)
85
+
86
+ self.image = image
87
+ self.original_size = image.shape[:2]
88
+
89
+ # Run the processor on the point prompts; it also returns original_sizes & reshaped_input_sizes needed for post-processing
90
+ inputs = self.processor(
91
+ images=image,
92
+ input_points=input_points,
93
+ input_labels=input_labels,
94
+ return_tensors="pt"
95
+ )
96
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
97
+
98
+ self.input_size = inputs['pixel_values'].shape[-2:]
99
+ self.features = inputs
100
+ return inputs
101
+
102
+
103
+ def get_hf_sam_predictor(model_type: str = 'vit_h', device: Optional[str] = None,
104
+ image: Optional[np.ndarray] = None) -> HFSamPredictor:
105
+ """
106
+ Get a Hugging Face SAM predictor with the same interface as the original get_sam_predictor.
107
+
108
+ Args:
109
+ model_type: Model type ('vit_b', 'vit_l', 'vit_h')
110
+ device: Device to run the model on
111
+ image: Optional image to set immediately
112
+
113
+ Returns:
114
+ HFSamPredictor instance
115
+ """
116
+ if not HF_AVAILABLE:
117
+ raise ImportError("transformers and huggingface_hub are required for HF SAM models")
118
+
119
+ if device is None:
120
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
121
+
122
+ # Load the predictor
123
+ predictor = HFSamPredictor.from_pretrained(model_type, device)
124
+
125
+ # Cache the image if provided; HFSamPredictor extracts features later, inside preprocess()
126
+ if image is not None:
127
+ predictor.image = image
128
+
129
+ return predictor
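
A small sketch of how this wrapper is driven; the frame here is a synthetic stand-in and the click coordinates are placeholders (the real app feeds video frames and user clicks):

```python
# Sketch: load the HF-hosted SAM weights and encode one frame plus a single
# positive click. The frame and the click coordinates are stand-ins.
import numpy as np
from app_3rd.sam_utils.hf_sam_predictor import HFSamPredictor

predictor = HFSamPredictor.from_pretrained('vit_h')   # resolves to facebook/sam-vit-huge
frame = np.zeros((480, 640, 3), dtype=np.uint8)       # stand-in RGB frame
inputs = predictor.preprocess(frame, [[[320.0, 240.0]]], [[1]])
# `inputs` now holds pixel_values plus original_sizes / reshaped_input_sizes,
# ready for predictor.model(**inputs) and post_process_masks() as in inference.py.
```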
app_3rd/sam_utils/inference.py ADDED
@@ -0,0 +1,123 @@
1
+ import gc
2
+
3
+ import numpy as np
4
+ import torch
5
+ from segment_anything import SamPredictor, sam_model_registry
6
+
7
+ # Try to import HF SAM support
8
+ try:
9
+ from app_3rd.sam_utils.hf_sam_predictor import get_hf_sam_predictor, HFSamPredictor
10
+ HF_AVAILABLE = True
11
+ except ImportError:
12
+ HF_AVAILABLE = False
13
+
14
+ models = {
15
+ 'vit_b': 'app_3rd/sam_utils/checkpoints/sam_vit_b_01ec64.pth',
16
+ 'vit_l': 'app_3rd/sam_utils/checkpoints/sam_vit_l_0b3195.pth',
17
+ 'vit_h': 'app_3rd/sam_utils/checkpoints/sam_vit_h_4b8939.pth'
18
+ }
19
+
20
+
21
+ def get_sam_predictor(model_type='vit_b', device=None, image=None, use_hf=True, predictor=None):
22
+ """
23
+ Get SAM predictor with option to use HuggingFace version
24
+
25
+ Args:
26
+ model_type: Model type ('vit_b', 'vit_l', 'vit_h')
27
+ device: Device to run on
28
+ image: Optional image to set immediately
29
+ use_hf: Whether to use HuggingFace SAM instead of original SAM
30
+ """
31
+ if predictor is not None:
32
+ return predictor
33
+ if use_hf:
34
+ if not HF_AVAILABLE:
35
+ raise ImportError("HuggingFace SAM not available. Install transformers and huggingface_hub.")
36
+ return get_hf_sam_predictor(model_type, device, image)
37
+
38
+ # Original SAM logic
39
+ if device is None and torch.cuda.is_available():
40
+ device = 'cuda'
41
+ elif device is None:
42
+ device = 'cpu'
43
+ # sam model
44
+ sam = sam_model_registry[model_type](checkpoint=models[model_type])
45
+ sam = sam.to(device)
46
+
47
+ predictor = SamPredictor(sam)
48
+ if image is not None:
49
+ predictor.set_image(image)
50
+ return predictor
51
+
52
+
53
+ def run_inference(predictor, input_x, selected_points, multi_object: bool = False):
54
+ """
55
+ Run inference with either original SAM or HF SAM predictor
56
+
57
+ Args:
58
+ predictor: SamPredictor or HFSamPredictor instance
59
+ input_x: Input image
60
+ selected_points: List of (point, label) tuples
61
+ multi_object: Whether to handle multiple objects
62
+ """
63
+ if len(selected_points) == 0:
64
+ return []
65
+
66
+ # Check if using HF SAM
67
+ if HF_AVAILABLE and isinstance(predictor, HFSamPredictor):
68
+ return _run_hf_inference(predictor, input_x, selected_points, multi_object)
69
+ else:
70
+ return _run_original_inference(predictor, input_x, selected_points, multi_object)
71
+
72
+
73
+ def _run_original_inference(predictor: SamPredictor, input_x, selected_points, multi_object: bool = False):
74
+ """Run inference with original SAM"""
75
+ points = torch.Tensor(
76
+ [p for p, _ in selected_points]
77
+ ).to(predictor.device).unsqueeze(1)
78
+
79
+ labels = torch.Tensor(
80
+ [int(l) for _, l in selected_points]
81
+ ).to(predictor.device).unsqueeze(1)
82
+
83
+ transformed_points = predictor.transform.apply_coords_torch(
84
+ points, input_x.shape[:2])
85
+
86
+ masks, scores, logits = predictor.predict_torch(
87
+ point_coords=transformed_points[:,0][None],
88
+ point_labels=labels[:,0][None],
89
+ multimask_output=False,
90
+ )
91
+ masks = masks[0].cpu().numpy() # (N, 1, H, W), where N is the number of prompt points
92
+
93
+ gc.collect()
94
+ torch.cuda.empty_cache()
95
+
96
+ return [(masks, 'final_mask')]
97
+
98
+
99
+ def _run_hf_inference(predictor: HFSamPredictor, input_x, selected_points, multi_object: bool = False):
100
+ """Run inference with HF SAM"""
101
+ # Prepare points and labels for HF SAM
102
+ select_pts = [[list(p) for p, _ in selected_points]]
103
+ select_lbls = [[int(l) for _, l in selected_points]]
104
+
105
+ # Preprocess inputs
106
+ inputs = predictor.preprocess(input_x, select_pts, select_lbls)
107
+
108
+ # Run inference
109
+ with torch.no_grad():
110
+ outputs = predictor.model(**inputs)
111
+
112
+ # Post-process masks
113
+ masks = predictor.processor.image_processor.post_process_masks(
114
+ outputs.pred_masks.cpu(),
115
+ inputs["original_sizes"].cpu(),
116
+ inputs["reshaped_input_sizes"].cpu(),
117
+ )
118
+ masks = masks[0][:,:1,...].cpu().numpy()
119
+
120
+ gc.collect()
121
+ torch.cuda.empty_cache()
122
+
123
+ return [(masks, 'final_mask')]
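
As a usage sketch, the HF code path above can be driven end-to-end and the resulting mask written out in the layout that `run_tracker()` in the next file expects, i.e. a PNG named after the video; the paths and the click below are placeholders:

```python
# Sketch: HF SAM on one frame, then save the union mask as temp/demo.png so it
# matches the clip temp/demo.mp4. Paths and the click location are placeholders.
import cv2
import numpy as np
from app_3rd.sam_utils.inference import get_sam_predictor, run_inference

frame = cv2.cvtColor(cv2.imread("temp/demo_frame.jpg"), cv2.COLOR_BGR2RGB)
predictor = get_sam_predictor(model_type='vit_h', use_hf=True)
masks = run_inference(predictor, frame, [((320, 240), 1)])[0][0]   # (N, 1, H, W) boolean
mask_png = (masks.any(axis=(0, 1)) * 255).astype(np.uint8)         # union over prompts -> (H, W)
cv2.imwrite("temp/demo.png", mask_png)   # read by run_tracker(..., temp_dir="temp", video_name="demo", ...)
```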
app_3rd/spatrack_utils/infer_track.py ADDED
@@ -0,0 +1,194 @@
1
+ from models.SpaTrackV2.models.predictor import Predictor
2
+ import yaml
3
+ import easydict
4
+ import os
5
+ import numpy as np
6
+ import cv2
7
+ import torch
8
+ import torchvision.transforms as T
9
+ from PIL import Image
10
+ import io
11
+ import moviepy.editor as mp
12
+ from models.SpaTrackV2.utils.visualizer import Visualizer
13
+ import tqdm
14
+ from models.SpaTrackV2.models.utils import get_points_on_a_grid
15
+ import glob
16
+ from rich import print
17
+ import argparse
18
+ import decord
19
+ from huggingface_hub import hf_hub_download
20
+
21
+ config = {
22
+ "ckpt_dir": "Yuxihenry/SpatialTrackerCkpts", # HuggingFace repo ID
23
+ "cfg_dir": "config/magic_infer_moge.yaml",
24
+ }
25
+
26
+ def get_tracker_predictor(output_dir: str, vo_points: int = 756, tracker_model=None):
27
+ """
28
+ Initialize and return the tracker predictor and visualizer
29
+ Args:
30
+ output_dir: Directory to save visualization results
31
+ vo_points: Number of points for visual odometry
32
+ Returns:
33
+ Tuple of (tracker_predictor, visualizer)
34
+ """
35
+ viz = True
36
+ os.makedirs(output_dir, exist_ok=True)
37
+
38
+ with open(config["cfg_dir"], "r") as f:
39
+ cfg = yaml.load(f, Loader=yaml.FullLoader)
40
+ cfg = easydict.EasyDict(cfg)
41
+ cfg.out_dir = output_dir
42
+ cfg.model.track_num = vo_points
43
+
44
+ # Check if it's a local path or HuggingFace repo
45
+ if tracker_model is not None:
46
+ model = tracker_model
47
+ model.spatrack.track_num = vo_points
48
+ else:
49
+ if os.path.exists(config["ckpt_dir"]):
50
+ # Local file
51
+ model = Predictor.from_pretrained(config["ckpt_dir"], model_cfg=cfg["model"])
52
+ else:
53
+ # HuggingFace repo - download the model
54
+ print(f"Downloading model from HuggingFace: {config['ckpt_dir']}")
55
+ checkpoint_path = hf_hub_download(
56
+ repo_id=config["ckpt_dir"],
57
+ repo_type="model",
58
+ filename="SpaTrack3_offline.pth"
59
+ )
60
+ model = Predictor.from_pretrained(checkpoint_path, model_cfg=cfg["model"])
61
+ model.eval()
62
+ model.to("cuda")
63
+
64
+ viser = Visualizer(save_dir=cfg.out_dir, grayscale=True,
65
+ fps=10, pad_value=0, tracks_leave_trace=5)
66
+
67
+ return model, viser
68
+
69
+ def run_tracker(model, viser, temp_dir, video_name, grid_size, vo_points, fps=3):
70
+ """
71
+ Run tracking on a video sequence
72
+ Args:
73
+ model: Tracker predictor instance
74
+ viser: Visualizer instance
75
+ temp_dir: Directory containing temporary files
76
+ video_name: Name of the video file (without extension)
77
+ grid_size: Size of the tracking grid
78
+ vo_points: Number of points for visual odometry
79
+ fps: Frames per second for visualization
80
+ """
81
+ # Setup paths
82
+ video_path = os.path.join(temp_dir, f"{video_name}.mp4")
83
+ mask_path = os.path.join(temp_dir, f"{video_name}.png")
84
+ out_dir = os.path.join(temp_dir, "results")
85
+ os.makedirs(out_dir, exist_ok=True)
86
+
87
+ # Load video using decord
88
+ video_reader = decord.VideoReader(video_path)
89
+ video_tensor = torch.from_numpy(video_reader.get_batch(range(len(video_reader))).asnumpy()).permute(0, 3, 1, 2) # Convert to tensor and permute to (N, C, H, W)
90
+
91
+ # resize make sure the shortest side is 336
92
+ h, w = video_tensor.shape[2:]
93
+ scale = max(336 / h, 336 / w)
94
+ if scale < 1:
95
+ new_h, new_w = int(h * scale), int(w * scale)
96
+ video_tensor = T.Resize((new_h, new_w))(video_tensor)
97
+ video_tensor = video_tensor[::fps].float()
98
+ depth_tensor = None
99
+ intrs = None
100
+ extrs = None
101
+ data_npz_load = {}
102
+
103
+ # Load and process mask
104
+ if os.path.exists(mask_path):
105
+ mask = cv2.imread(mask_path)
106
+ mask = cv2.resize(mask, (video_tensor.shape[3], video_tensor.shape[2]))
107
+ mask = mask.sum(axis=-1)>0
108
+ else:
109
+ mask = np.ones_like(video_tensor[0,0].numpy())>0
110
+
111
+ # Get frame dimensions and create grid points
112
+ frame_H, frame_W = video_tensor.shape[2:]
113
+ grid_pts = get_points_on_a_grid(grid_size, (frame_H, frame_W), device="cpu")
114
+
115
+ # Sample mask values at grid points and filter out points where mask=0
116
+ if os.path.exists(mask_path):
117
+ grid_pts_int = grid_pts[0].long()
118
+ mask_values = mask[grid_pts_int[...,1], grid_pts_int[...,0]]
119
+ grid_pts = grid_pts[:, mask_values]
120
+
121
+ query_xyt = torch.cat([torch.zeros_like(grid_pts[:, :, :1]), grid_pts], dim=2)[0].numpy()
122
+
123
+ # optional: bootstrap depth/pose with VGGT (this branch assumes VGGT, preprocess_image and pose_encoding_to_extri_intri are importable from the VGGT package)
124
+ if os.environ.get("VGGT_DIR", None) is not None:
125
+ vggt_model = VGGT()
126
+ vggt_model.load_state_dict(torch.load(os.environ["VGGT_DIR"]))
127
+ vggt_model.eval()
128
+ vggt_model = vggt_model.to("cuda")
129
+ # process the image tensor
130
+ video_tensor = preprocess_image(video_tensor)[None]
131
+ with torch.cuda.amp.autocast(dtype=torch.bfloat16):
132
+ # Predict attributes including cameras, depth maps, and point maps.
133
+ aggregated_tokens_list, ps_idx = vggt_model.aggregator(video_tensor.cuda()/255)
134
+ pose_enc = vggt_model.camera_head(aggregated_tokens_list)[-1]
135
+ # Extrinsic and intrinsic matrices, following OpenCV convention (camera from world)
136
+ extrinsic, intrinsic = pose_encoding_to_extri_intri(pose_enc, video_tensor.shape[-2:])
137
+ # Predict Depth Maps
138
+ depth_map, depth_conf = vggt_model.depth_head(aggregated_tokens_list, video_tensor.cuda()/255, ps_idx)
139
+ # clear the cache
140
+ del vggt_model, aggregated_tokens_list, ps_idx, pose_enc
141
+ torch.cuda.empty_cache()
142
+ depth_tensor = depth_map.squeeze().cpu().numpy()
143
+ extrs = np.eye(4)[None].repeat(len(depth_tensor), axis=0)
144
+ extrs[:, :3, :4] = extrinsic.squeeze().cpu().numpy()
145
+ intrs = intrinsic.squeeze().cpu().numpy()
146
+ video_tensor = video_tensor.squeeze()
147
+ # NOTE: treat depth with confidence below 0.5 as unreliable
148
+ # threshold = depth_conf.squeeze().view(-1).quantile(0.5)
149
+ unc_metric = depth_conf.squeeze().cpu().numpy() > 0.5
150
+
151
+ # Run model inference
152
+ with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
153
+ (
154
+ c2w_traj, intrs, point_map, conf_depth,
155
+ track3d_pred, track2d_pred, vis_pred, conf_pred, video
156
+ ) = model.forward(video_tensor, depth=depth_tensor,
157
+ intrs=intrs, extrs=extrs,
158
+ queries=query_xyt,
159
+ fps=1, full_point=False, iters_track=4,
160
+ query_no_BA=True, fixed_cam=False, stage=1,
161
+ support_frame=len(video_tensor)-1, replace_ratio=0.2)
162
+
163
+ # Resize results to avoid too large I/O Burden
164
+ max_size = 336
165
+ h, w = video.shape[2:]
166
+ scale = min(max_size / h, max_size / w)
167
+ if scale < 1:
168
+ new_h, new_w = int(h * scale), int(w * scale)
169
+ video = T.Resize((new_h, new_w))(video)
170
+ video_tensor = T.Resize((new_h, new_w))(video_tensor)
171
+ point_map = T.Resize((new_h, new_w))(point_map)
172
+ track2d_pred[...,:2] = track2d_pred[...,:2] * scale
173
+ intrs[:,:2,:] = intrs[:,:2,:] * scale
174
+ if depth_tensor is not None:
175
+ depth_tensor = T.Resize((new_h, new_w))(depth_tensor)
176
+ conf_depth = T.Resize((new_h, new_w))(conf_depth)
177
+
178
+ # Visualize tracks
179
+ viser.visualize(video=video[None],
180
+ tracks=track2d_pred[None][...,:2],
181
+ visibility=vis_pred[None],filename="test")
182
+
183
+ # Save in tapip3d format
184
+ data_npz_load["coords"] = (torch.einsum("tij,tnj->tni", c2w_traj[:,:3,:3], track3d_pred[:,:,:3].cpu()) + c2w_traj[:,:3,3][:,None,:]).numpy()
185
+ data_npz_load["extrinsics"] = torch.inverse(c2w_traj).cpu().numpy()
186
+ data_npz_load["intrinsics"] = intrs.cpu().numpy()
187
+ data_npz_load["depths"] = point_map[:,2,...].cpu().numpy()
188
+ data_npz_load["video"] = (video_tensor).cpu().numpy()/255
189
+ data_npz_load["visibs"] = vis_pred.cpu().numpy()
190
+ data_npz_load["confs"] = conf_pred.cpu().numpy()
191
+ data_npz_load["confs_depth"] = conf_depth.cpu().numpy()
192
+ np.savez(os.path.join(out_dir, f'result.npz'), **data_npz_load)
193
+
194
+ print(f"Results saved to {out_dir}.\nTo visualize them with tapip3d, run: [bold yellow]python tapip3d_viz.py {out_dir}/result.npz[/bold yellow]")
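
For context, a minimal driver for the two helpers above; it assumes a working directory `temp/` that already contains `demo.mp4` (and optionally the SAM mask `demo.png`), and the grid/point counts are just illustrative:

```python
# Sketch: run the SpaTrack2 pipeline on one clip. Directory layout and the
# grid_size / vo_points values below are illustrative, not prescribed.
from app_3rd.spatrack_utils.infer_track import get_tracker_predictor, run_tracker

model, viser = get_tracker_predictor(output_dir="temp/results", vo_points=756)
run_tracker(model, viser, temp_dir="temp", video_name="demo",
            grid_size=10, vo_points=756, fps=3)
# -> temp/results/result.npz plus a 2D track visualization written by the Visualizer
```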
config/__init__.py ADDED
File without changes
config/magic_infer_moge.yaml ADDED
@@ -0,0 +1,48 @@
1
+ seed: 0
2
+ # config the hydra logger, only in hydra `$` can be decoded as cite
3
+ data: ./assets/room
4
+ vis_track: false
5
+ hydra:
6
+ run:
7
+ dir: .
8
+ output_subdir: null
9
+ job_logging: {}
10
+ hydra_logging: {}
11
+ mixed_precision: bf16
12
+ visdom:
13
+ viz_ip: "localhost"
14
+ port: 6666
15
+ relax_load: false
16
+ res_all: 336
17
+ # config the ckpt path
18
+ # ckpts: "/mnt/bn/xyxdata/home/codes/my_projs/SpaTrack2/checkpoints/new_base.pth"
19
+ ckpts: "Yuxihenry/SpatialTracker_Files"
20
+ batch_size: 1
21
+ input:
22
+ type: image
23
+ fps: 1
24
+ model_wind_size: 32
25
+ model:
26
+ backbone_cfg:
27
+ ckpt_dir: "checkpoints/model.pt"
28
+ chunk_size: 24 # downsample factor for patchified features
29
+ ckpt_fwd: true
30
+ ft_cfg:
31
+ mode: "fix"
32
+ paras_name: []
33
+ resolution: 336
34
+ max_len: 512
35
+ Track_cfg:
36
+ base_ckpt: "checkpoints/scaled_offline.pth"
37
+ base:
38
+ stride: 4
39
+ corr_radius: 3
40
+ window_len: 60
41
+ stablizer: True
42
+ mode: "online"
43
+ s_wind: 200
44
+ overlap: 4
45
+ track_num: 0
46
+
47
+ dist_train:
48
+ num_nodes: 1
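
The tracker loads this file with PyYAML into an EasyDict and overrides a couple of fields at runtime; the sketch below mirrors what `get_tracker_predictor` in `app_3rd/spatrack_utils/infer_track.py` does (the output directory is a placeholder):

```python
# Sketch: load the config above and apply the runtime overrides used by the app.
import yaml
import easydict

with open("config/magic_infer_moge.yaml", "r") as f:
    cfg = easydict.EasyDict(yaml.load(f, Loader=yaml.FullLoader))

cfg.out_dir = "temp/results"           # visualization output directory (placeholder)
cfg.model.track_num = 756              # number of points used for visual odometry
print(cfg.model.Track_cfg.base_ckpt)   # -> checkpoints/scaled_offline.pth
```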
examples/backpack.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b5ac6b2285ffb48e3a740e419e38c781df9c963589a5fd894e5b4e13dd6a8b8
3
+ size 1208738
examples/ball.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31f6e3bf875a85284b376c05170b4c08b546b7d5e95106848b1e3818a9d0db91
3
+ size 3030268
examples/basketball.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0df3b429d5fd64c298f2d79b2d818a4044e7341a71d70b957f60b24e313c3760
3
+ size 2487837
examples/biker.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fba880c24bdb8fa3b84b1b491d52f2c1f426fb09e34c3013603e5a549cf3b22b
3
+ size 249196
examples/cinema_0.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a68a5643c14f61c05d48e25a98ddf5cf0344d3ffcda08ad4a0adc989d49d7a9c
3
+ size 1774022
examples/cinema_1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99624e2d0fb2e9f994e46aefb904e884de37a6d78e7f6b6670e286eaa397e515
3
+ size 2370749
examples/drifting.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f3937871117d3cc5d7da3ef31d1edf5626fc8372126b73590f75f05713fe97c
3
+ size 4695804
examples/ego_kc1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22fe64e458e329e8b3c3e20b3725ffd85c3a2e725fd03909cf883d3fd02c80b3
3
+ size 1365980
examples/ego_teaser.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8780b291b48046b1c7dea90712c1c3f59d60c03216df1c489f6f03e8d61fae5c
3
+ size 7365665
examples/handwave.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6dde7cf4ffa7c66b6861bb5abdedc49dfc4b5b4dd9dd46ee8415dd4953935b6
3
+ size 2099369
examples/hockey.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c3be095777b442dc401e7d1f489b749611ffade3563a01e4e3d1e511311bd86
3
+ size 1795810
examples/ken_block_0.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b788faeb4d3206fa604d622a05268f1321ad6a229178fe12319d20c9438deb1
3
+ size 196343
examples/kiss.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f78fffc5108d95d4e5837d7607226f3dd9796615ea3481f2629c69ccd2ccb12f
3
+ size 1073570
examples/kitchen.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3120e942a9b3d7b300928e43113b000fb5ccc209012a2c560ec26b8a04c2d5f9
3
+ size 543970
examples/kitchen_egocentric.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5468ab10d8d39b68b51fa616adc3d099dab7543e38dd221a0a7a20a2401824a2
3
+ size 2176685
examples/pillow.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8f05818f586d7b0796fcd4714ea4be489c93701598cadc86ce7973fc24655fee
3
+ size 1407147
examples/protein.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2dc9cfceb0984b61ebc62fda4c826135ebe916c8c966a8123dcc3315d43b73f
3
+ size 2002300
examples/pusht.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:996d1923e36811a1069e4d6b5e8c0338d9068c0870ea09c4c04e13e9fbcd207a
3
+ size 5256495
examples/robot1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a3b9e4449572129fdd96a751938e211241cdd86bcc56ffd33bfd23fc4d6e9c0
3
+ size 1178671
examples/robot2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:188b2d8824ce345c86a603bff210639a6158d72cf6119cc1d3f79d409ac68bb3
3
+ size 867261
examples/robot_3.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:784a0f9c36a316d0da5745075dbc8cefd9ce60c25b067d3d80a1d52830df8a37
3
+ size 1153015
examples/robot_unitree.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:99bc274f7613a665c6135085fe01691ebfaa9033101319071f37c550ab21d1ea
3
+ size 1964268
examples/running.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ceb96b287fefb1c090dcd2f5db7634f808d2079413500beeb7b33023dfae51b
3
+ size 7307897
examples/teleop2.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59ea006a18227da8cf5db1fa50cd48e71ec7eb66fef48ea2158c325088bd9fee
3
+ size 1077267
examples/vertical_place.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c8061ae449f986113c2ecb17aefc2c13f737aecbcd41d6c057c88e6d41ac3ee
3
+ size 719810
models/SpaTrackV2/models/SpaTrack.py ADDED
@@ -0,0 +1,759 @@
1
+ #python
2
+ """
3
+ SpaTrackerV2: a unified model that estimates 'intrinsics',
4
+ 'video depth', 'extrinsics' and '3D tracking' from casual video frames.
5
+
6
+ Contact: DM [email protected]
7
+ """
8
+
9
+ import os
10
+ import numpy as np
11
+ from typing import Literal, Union, List, Tuple, Dict
12
+ import cv2
13
+ import torch
14
+ import torch.nn as nn
15
+ import torch.nn.functional as F
16
+ # from depth anything v2
17
+ from huggingface_hub import PyTorchModelHubMixin # used for model hub
18
+ from einops import rearrange
19
+ from models.monoD.depth_anything_v2.dpt import DepthAnythingV2
20
+ from models.moge.model.v1 import MoGeModel
21
+ import copy
22
+ from functools import partial
23
+ from models.SpaTrackV2.models.tracker3D.TrackRefiner import TrackRefiner3D
24
+ import kornia
25
+ from models.SpaTrackV2.utils.model_utils import sample_features5d
26
+ import utils3d
27
+ from models.SpaTrackV2.models.tracker3D.spatrack_modules.utils import depth_to_points_colmap, get_nth_visible_time_index
28
+ from models.SpaTrackV2.models.utils import pose_enc2mat, matrix_to_quaternion, get_track_points, normalize_rgb
29
+ import random
30
+
31
+ class SpaTrack2(nn.Module, PyTorchModelHubMixin):
32
+ def __init__(
33
+ self,
34
+ loggers: list, # include [ viz, logger_tf, logger]
35
+ backbone_cfg,
36
+ Track_cfg=None,
37
+ chunk_size=24,
38
+ ckpt_fwd: bool = False,
39
+ ft_cfg=None,
40
+ resolution=518,
41
+ max_len=600, # the maximum video length we can preprocess,
42
+ track_num=768,
43
+ ):
44
+
45
+ self.chunk_size = chunk_size
46
+ self.max_len = max_len
47
+ self.resolution = resolution
48
+ # config the T-Lora Dinov2
49
+ #NOTE: initial the base model
50
+ base_cfg = copy.deepcopy(backbone_cfg)
51
+ backbone_ckpt_dir = base_cfg.pop('ckpt_dir', None)
52
+
53
+ super(SpaTrack2, self).__init__()
54
+ if os.path.exists(backbone_ckpt_dir)==False:
55
+ base_model = MoGeModel.from_pretrained('Ruicheng/moge-vitl')
56
+ else:
57
+ checkpoint = torch.load(backbone_ckpt_dir, map_location='cpu', weights_only=True)
58
+ base_model = MoGeModel(**checkpoint["model_config"])
59
+ base_model.load_state_dict(checkpoint['model'])
60
+ # avoid the base_model is a member of SpaTrack2
61
+ object.__setattr__(self, 'base_model', base_model)
62
+
63
+ # Tracker model
64
+ self.Track3D = TrackRefiner3D(Track_cfg)
65
+ track_base_ckpt_dir = Track_cfg.base_ckpt
66
+ if os.path.exists(track_base_ckpt_dir):
67
+ track_pretrain = torch.load(track_base_ckpt_dir)
68
+ self.Track3D.load_state_dict(track_pretrain, strict=False)
69
+
70
+ # wrap the function of make lora trainable
71
+ self.make_paras_trainable = partial(self.make_paras_trainable,
72
+ mode=ft_cfg.mode,
73
+ paras_name=ft_cfg.paras_name)
74
+ self.track_num = track_num
75
+
76
+ def make_paras_trainable(self, mode: str = 'fix', paras_name: List[str] = []):
77
+ # gradient required for the lora_experts and gate
78
+ for name, param in self.named_parameters():
79
+ if any(x in name for x in paras_name):
80
+ if mode == 'fix':
81
+ param.requires_grad = False
82
+ else:
83
+ param.requires_grad = True
84
+ else:
85
+ if mode == 'fix':
86
+ param.requires_grad = True
87
+ else:
88
+ param.requires_grad = False
89
+ total_params = sum(p.numel() for p in self.parameters())
90
+ trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
91
+ print(f"Total parameters: {total_params}")
92
+ print(f"Trainable parameters: {trainable_params/total_params*100:.2f}%")
93
+
94
+ def ProcVid(self,
95
+ x: torch.Tensor):
96
+ """
97
+ split the video into several overlapped windows.
98
+
99
+ args:
100
+ x: the input video frames. [B, T, C, H, W]
101
+ outputs:
102
+ patch_size: the patch size of the video features
103
+ raises:
104
+ ValueError: if the input video is longer than `max_len`.
105
+
106
+ """
107
+ # normalize the input images
108
+ num_types = x.dtype
109
+ x = normalize_rgb(x, input_size=self.resolution)
110
+ x = x.to(num_types)
111
+ # get the video features
112
+ B, T, C, H, W = x.size()
113
+ if T > self.max_len:
114
+ raise ValueError(f"the video length should no more than {self.max_len}.")
115
+ # get the video features
116
+ patch_h, patch_w = H // 14, W // 14
117
+ patch_size = (patch_h, patch_w)
118
+ # resize and get the video features
119
+ x = x.view(B * T, C, H, W)
120
+ # operate the temporal encoding
121
+ return patch_size, x
122
+
123
+ def forward_stream(
124
+ self,
125
+ video: torch.Tensor,
126
+ queries: torch.Tensor = None,
127
+ T_org: int = None,
128
+ depth: torch.Tensor|np.ndarray|str=None,
129
+ unc_metric_in: torch.Tensor|np.ndarray|str=None,
130
+ intrs: torch.Tensor|np.ndarray|str=None,
131
+ extrs: torch.Tensor|np.ndarray|str=None,
132
+ queries_3d: torch.Tensor = None,
133
+ window_len: int = 16,
134
+ overlap_len: int = 4,
135
+ full_point: bool = False,
136
+ track2d_gt: torch.Tensor = None,
137
+ fixed_cam: bool = False,
138
+ query_no_BA: bool = False,
139
+ stage: int = 0,
140
+ support_frame: int = 0,
141
+ replace_ratio: float = 0.6,
142
+ annots_train: Dict = None,
143
+ iters_track=4,
144
+ **kwargs,
145
+ ):
146
+ # step 1 allocate the query points on the grid
147
+ T, C, H, W = video.shape
148
+
149
+ if annots_train is not None:
150
+ vis_gt = annots_train["vis"]
151
+ _, _, N = vis_gt.shape
152
+ number_visible = vis_gt.sum(dim=1)
153
+ ratio_rand = torch.rand(1, N, device=vis_gt.device)
154
+ first_positive_inds = get_nth_visible_time_index(vis_gt, (number_visible*ratio_rand).long().clamp(min=1, max=T))
155
+ assert (torch.gather(vis_gt, 1, first_positive_inds[:, None, :].repeat(1, T, 1)) < 0).sum() == 0
156
+
157
+ first_positive_inds = first_positive_inds.long()
158
+ gather = torch.gather(
159
+ annots_train["traj_3d"][...,:2], 1, first_positive_inds[:, :, None, None].repeat(1, 1, N, 2)
160
+ )
161
+ xys = torch.diagonal(gather, dim1=1, dim2=2).permute(0, 2, 1)
162
+ queries = torch.cat([first_positive_inds[:, :, None], xys], dim=-1)[0].cpu().numpy()
163
+
164
+
165
+ # Unfold video into segments of window_len with overlap_len
166
+ step_slide = window_len - overlap_len
167
+ if T < window_len:
168
+ video_unf = video.unsqueeze(0)
169
+ if depth is not None:
170
+ depth_unf = depth.unsqueeze(0)
171
+ else:
172
+ depth_unf = None
173
+ if unc_metric_in is not None:
174
+ unc_metric_unf = unc_metric_in.unsqueeze(0)
175
+ else:
176
+ unc_metric_unf = None
177
+ if intrs is not None:
178
+ intrs_unf = intrs.unsqueeze(0)
179
+ else:
180
+ intrs_unf = None
181
+ if extrs is not None:
182
+ extrs_unf = extrs.unsqueeze(0)
183
+ else:
184
+ extrs_unf = None
185
+ else:
186
+ video_unf = video.unfold(0, window_len, step_slide).permute(0, 4, 1, 2, 3) # [B, S, C, H, W]
187
+ if depth is not None:
188
+ depth_unf = depth.unfold(0, window_len, step_slide).permute(0, 3, 1, 2)
189
+ intrs_unf = intrs.unfold(0, window_len, step_slide).permute(0, 3, 1, 2)
190
+ else:
191
+ depth_unf = None
192
+ intrs_unf = None
193
+ if extrs is not None:
194
+ extrs_unf = extrs.unfold(0, window_len, step_slide).permute(0, 3, 1, 2)
195
+ else:
196
+ extrs_unf = None
197
+ if unc_metric_in is not None:
198
+ unc_metric_unf = unc_metric_in.unfold(0, window_len, step_slide).permute(0, 3, 1, 2)
199
+ else:
200
+ unc_metric_unf = None
201
+
202
+ # parallel
203
+ # Get number of segments
204
+ B = video_unf.shape[0]
205
+ #TODO: Process each segment in parallel using torch.nn.DataParallel
206
+ c2w_traj = torch.eye(4, 4)[None].repeat(T, 1, 1)
207
+ intrs_out = torch.eye(3, 3)[None].repeat(T, 1, 1)
208
+ point_map = torch.zeros(T, 3, H, W).cuda()
209
+ unc_metric = torch.zeros(T, H, W).cuda()
210
+ # set the queries
211
+ N, _ = queries.shape
212
+ track3d_pred = torch.zeros(T, N, 6).cuda()
213
+ track2d_pred = torch.zeros(T, N, 3).cuda()
214
+ vis_pred = torch.zeros(T, N, 1).cuda()
215
+ conf_pred = torch.zeros(T, N, 1).cuda()
216
+ dyn_preds = torch.zeros(T, N, 1).cuda()
217
+ # sort the queries by time
218
+ sorted_indices = np.argsort(queries[...,0])
219
+ sorted_inv_indices = np.argsort(sorted_indices)
220
+ sort_query = queries[sorted_indices]
221
+ sort_query = torch.from_numpy(sort_query).cuda()
222
+ if queries_3d is not None:
223
+ sort_query_3d = queries_3d[sorted_indices]
224
+ sort_query_3d = torch.from_numpy(sort_query_3d).cuda()
225
+
226
+ queries_len = 0
227
+ overlap_d = None
228
+ cache = None
229
+ loss = 0.0
230
+
231
+ for i in range(B):
232
+ segment = video_unf[i:i+1].cuda()
233
+ # Forward pass through model
234
+ # detect the key points for each frames
235
+
236
+ queries_new_mask = (sort_query[...,0] < i * step_slide + window_len) * (sort_query[...,0] >= (i * step_slide + overlap_len if i > 0 else 0))
237
+ if queries_3d is not None:
238
+ queries_new_3d = sort_query_3d[queries_new_mask]
239
+ queries_new_3d = queries_new_3d.float()
240
+ else:
241
+ queries_new_3d = None
242
+ queries_new = sort_query[queries_new_mask.bool()]
243
+ queries_new = queries_new.float()
244
+ if i > 0:
245
+ overlap2d = track2d_pred[i*step_slide:(i+1)*step_slide, :queries_len, :]
246
+ overlapvis = vis_pred[i*step_slide:(i+1)*step_slide, :queries_len, :]
247
+ overlapconf = conf_pred[i*step_slide:(i+1)*step_slide, :queries_len, :]
248
+ overlap_query = (overlapvis * overlapconf).max(dim=0)[1][None, ...]
249
+ overlap_xy = torch.gather(overlap2d, 0, overlap_query.repeat(1,1,2))
250
+ overlap_d = torch.gather(overlap2d, 0, overlap_query.repeat(1,1,3))[...,2].detach()
251
+ overlap_query = torch.cat([overlap_query[...,:1], overlap_xy], dim=-1)[0]
252
+ queries_new[...,0] -= i*step_slide
253
+ queries_new = torch.cat([overlap_query, queries_new], dim=0).detach()
254
+
255
+ if annots_train is None:
256
+ annots = {}
257
+ else:
258
+ annots = copy.deepcopy(annots_train)
259
+ annots["traj_3d"] = annots["traj_3d"][:, i*step_slide:i*step_slide+window_len, sorted_indices,:][...,:len(queries_new),:]
260
+ annots["vis"] = annots["vis"][:, i*step_slide:i*step_slide+window_len, sorted_indices][...,:len(queries_new)]
261
+ annots["poses_gt"] = annots["poses_gt"][:, i*step_slide:i*step_slide+window_len]
262
+ annots["depth_gt"] = annots["depth_gt"][:, i*step_slide:i*step_slide+window_len]
263
+ annots["intrs"] = annots["intrs"][:, i*step_slide:i*step_slide+window_len]
264
+ annots["traj_mat"] = annots["traj_mat"][:,i*step_slide:i*step_slide+window_len]
265
+
266
+ if depth is not None:
267
+ annots["depth_gt"] = depth_unf[i:i+1].to(segment.device).to(segment.dtype)
268
+ if unc_metric_in is not None:
269
+ annots["unc_metric"] = unc_metric_unf[i:i+1].to(segment.device).to(segment.dtype)
270
+ if intrs is not None:
271
+ intr_seg = intrs_unf[i:i+1].to(segment.device).to(segment.dtype)[0].clone()
272
+ focal = (intr_seg[:,0,0] / segment.shape[-1] + intr_seg[:,1,1]/segment.shape[-2]) / 2
273
+ pose_fake = torch.zeros(1, 8).to(depth.device).to(depth.dtype).repeat(segment.shape[1], 1)
274
+ pose_fake[:, -1] = focal
275
+ pose_fake[:,3]=1
276
+ annots["intrs_gt"] = intr_seg
277
+ if extrs is not None:
278
+ extrs_unf_norm = extrs_unf[i:i+1][0].clone()
279
+ extrs_unf_norm = torch.inverse(extrs_unf_norm[:1,...]) @ extrs_unf[i:i+1][0]
280
+ rot_vec = matrix_to_quaternion(extrs_unf_norm[:,:3,:3])
281
+ annots["poses_gt"] = torch.zeros(1, rot_vec.shape[0], 7).to(segment.device).to(segment.dtype)
282
+ annots["poses_gt"][:, :, 3:7] = rot_vec.to(segment.device).to(segment.dtype)[None]
283
+ annots["poses_gt"][:, :, :3] = extrs_unf_norm[:,:3,3].to(segment.device).to(segment.dtype)[None]
284
+ annots["use_extr"] = True
285
+
286
+ kwargs.update({"stage": stage})
287
+
288
+ #TODO: DEBUG
289
+ out = self.forward(segment, pts_q=queries_new,
290
+ pts_q_3d=queries_new_3d, overlap_d=overlap_d,
291
+ full_point=full_point,
292
+ fixed_cam=fixed_cam, query_no_BA=query_no_BA,
293
+ support_frame=segment.shape[1]-1,
294
+ cache=cache, replace_ratio=replace_ratio,
295
+ iters_track=iters_track,
296
+ **kwargs, annots=annots)
297
+ if self.training:
298
+ loss += out["loss"].squeeze()
299
+ # from models.SpaTrackV2.utils.visualizer import Visualizer
300
+ # vis_track = Visualizer(grayscale=False,
301
+ # fps=10, pad_value=50, tracks_leave_trace=0)
302
+ # vis_track.visualize(video=segment,
303
+ # tracks=out["traj_est"][...,:2],
304
+ # visibility=out["vis_est"],
305
+ # save_video=True)
306
+ # # visualize 4d
307
+ # import os, json
308
+ # import os.path as osp
309
+ # viser4d_dir = os.path.join("viser_4d_results")
310
+ # os.makedirs(viser4d_dir, exist_ok=True)
311
+ # depth_est = annots["depth_gt"][0]
312
+ # unc_metric = out["unc_metric"]
313
+ # mask = (unc_metric > 0.5).squeeze(1)
314
+ # # pose_est = out["poses_pred"].squeeze(0)
315
+ # pose_est = annots["traj_mat"][0]
316
+ # rgb_tracks = out["rgb_tracks"].squeeze(0)
317
+ # intrinsics = out["intrs"].squeeze(0)
318
+ # for i_k in range(out["depth"].shape[0]):
319
+ # img_i = out["imgs_raw"][0][i_k].permute(1, 2, 0).cpu().numpy()
320
+ # img_i = cv2.cvtColor(img_i, cv2.COLOR_BGR2RGB)
321
+ # cv2.imwrite(osp.join(viser4d_dir, f'frame_{i_k:04d}.png'), img_i)
322
+ # if stage == 1:
323
+ # depth = depth_est[i_k].squeeze().cpu().numpy()
324
+ # np.save(osp.join(viser4d_dir, f'frame_{i_k:04d}.npy'), depth)
325
+ # else:
326
+ # point_map_vis = out["points_map"][i_k].cpu().numpy()
327
+ # np.save(osp.join(viser4d_dir, f'point_{i_k:04d}.npy'), point_map_vis)
328
+ # np.save(os.path.join(viser4d_dir, f'intrinsics.npy'), intrinsics.cpu().numpy())
329
+ # np.save(os.path.join(viser4d_dir, f'extrinsics.npy'), pose_est.cpu().numpy())
330
+ # np.save(os.path.join(viser4d_dir, f'conf.npy'), mask.float().cpu().numpy())
331
+ # np.save(os.path.join(viser4d_dir, f'colored_track3d.npy'), rgb_tracks.cpu().numpy())
332
+
333
+ queries_len = len(queries_new)
334
+ # update the track3d and track2d
335
+ left_len = len(track3d_pred[i*step_slide:i*step_slide+window_len, :queries_len, :])
336
+ track3d_pred[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["rgb_tracks"][0,:left_len,:queries_len,:]
337
+ track2d_pred[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["traj_est"][0,:left_len,:queries_len,:3]
338
+ vis_pred[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["vis_est"][0,:left_len,:queries_len,None]
339
+ conf_pred[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["conf_pred"][0,:left_len,:queries_len,None]
340
+ dyn_preds[i*step_slide:i*step_slide+window_len, :queries_len, :] = out["dyn_preds"][0,:left_len,:queries_len,None]
341
+
342
+ # process the output for each segment
343
+ seg_c2w = out["poses_pred"][0]
344
+ seg_intrs = out["intrs"][0]
345
+ seg_point_map = out["points_map"]
346
+ seg_conf_depth = out["unc_metric"]
347
+
348
+ # cache management
349
+ cache = out["cache"]
350
+ for k in cache.keys():
351
+ if "_pyramid" in k:
352
+ for j in range(len(cache[k])):
353
+ if len(cache[k][j].shape) == 5:
354
+ cache[k][j] = cache[k][j][:,:,:,:queries_len,:]
355
+ elif len(cache[k][j].shape) == 4:
356
+ cache[k][j] = cache[k][j][:,:1,:queries_len,:]
357
+ elif "_pred_cache" in k:
358
+ cache[k] = cache[k][-overlap_len:,:queries_len,:]
359
+ else:
360
+ cache[k] = cache[k][-overlap_len:]
361
+
362
+ # update the results
363
+ idx_glob = i * step_slide
364
+ # refine part
365
+ # mask_update = sort_query[..., 0] < i * step_slide + window_len
366
+ # sort_query_pick = sort_query[mask_update]
367
+ intrs_out[idx_glob:idx_glob+window_len] = seg_intrs
368
+ point_map[idx_glob:idx_glob+window_len] = seg_point_map
369
+ unc_metric[idx_glob:idx_glob+window_len] = seg_conf_depth
370
+ # update the camera poses
371
+
372
+ # if using the ground truth pose
373
+ # if extrs_unf is not None:
374
+ # c2w_traj[idx_glob:idx_glob+window_len] = extrs_unf[i:i+1][0].to(c2w_traj.device).to(c2w_traj.dtype)
375
+ # else:
376
+ prev_c2w = c2w_traj[idx_glob:idx_glob+window_len][:1]
377
+ c2w_traj[idx_glob:idx_glob+window_len] = prev_c2w@seg_c2w.to(c2w_traj.device).to(c2w_traj.dtype)
378
+
379
+ track2d_pred = track2d_pred[:T_org,sorted_inv_indices,:]
380
+ track3d_pred = track3d_pred[:T_org,sorted_inv_indices,:]
381
+ vis_pred = vis_pred[:T_org,sorted_inv_indices,:]
382
+ conf_pred = conf_pred[:T_org,sorted_inv_indices,:]
383
+ dyn_preds = dyn_preds[:T_org,sorted_inv_indices,:]
384
+ unc_metric = unc_metric[:T_org,:]
385
+ point_map = point_map[:T_org,:]
386
+ intrs_out = intrs_out[:T_org,:]
387
+ c2w_traj = c2w_traj[:T_org,:]
388
+ if self.training:
389
+ ret = {
390
+ "loss": loss,
391
+ "depth_loss": 0.0,
392
+ "ab_loss": 0.0,
393
+ "vis_loss": out["vis_loss"],
394
+ "track_loss": out["track_loss"],
395
+ "conf_loss": out["conf_loss"],
396
+ "dyn_loss": out["dyn_loss"],
397
+ "sync_loss": out["sync_loss"],
398
+ "poses_pred": c2w_traj[None],
399
+ "intrs": intrs_out[None],
400
+ "points_map": point_map,
401
+ "track3d_pred": track3d_pred[None],
402
+ "rgb_tracks": track3d_pred[None],
403
+ "track2d_pred": track2d_pred[None],
404
+ "traj_est": track2d_pred[None],
405
+ "vis_est": vis_pred[None], "conf_pred": conf_pred[None],
406
+ "dyn_preds": dyn_preds[None],
407
+ "imgs_raw": video[None],
408
+ "unc_metric": unc_metric,
409
+ }
410
+
411
+ return ret
412
+ else:
413
+ return c2w_traj, intrs_out, point_map, unc_metric, track3d_pred, track2d_pred, vis_pred, conf_pred
414
+ def forward(self,
415
+ x: torch.Tensor,
416
+ annots: Dict = {},
417
+ pts_q: torch.Tensor = None,
418
+ pts_q_3d: torch.Tensor = None,
419
+ overlap_d: torch.Tensor = None,
420
+ full_point = False,
421
+ fixed_cam = False,
422
+ support_frame = 0,
423
+ query_no_BA = False,
424
+ cache = None,
425
+ replace_ratio = 0.6,
426
+ iters_track=4,
427
+ **kwargs):
428
+ """
429
+ forward the video camera model, which predict (
430
+ `intr` `camera poses` `video depth`
431
+ )
432
+
433
+ args:
434
+ x: the input video frames. [B, T, C, H, W]
435
+ annots: the annotations for video frames.
436
+ {
437
+ "poses_gt": the pose encoding for the video frames. [B, T, 7]
438
+ "depth_gt": the ground truth depth for the video frames. [B, T, 1, H, W],
439
+ "metric": bool, whether to calculate the metric for the video frames.
440
+ }
441
+ """
442
+ self.support_frame = support_frame
443
+
444
+ #TODO: to adjust a little bit
445
+ track_loss = ab_loss = vis_loss = conf_loss = dyn_loss = 0.0
446
+ B, T, _, H, W = x.shape
447
+ imgs_raw = x.clone()
448
+ # get the video split and features for each segment
449
+ patch_size, x_resize = self.ProcVid(x)
450
+ x_resize = rearrange(x_resize, "(b t) c h w -> b t c h w", b=B)
451
+ H_resize, W_resize = x_resize.shape[-2:]
452
+
453
+ prec_fx = W / W_resize
454
+ prec_fy = H / H_resize
455
+ # get patch size
456
+ P_H, P_W = patch_size
457
+
458
+ # get the depth, pointmap and mask
459
+ #TODO: Release DepthAnything Version
460
+ points_map_gt = None
461
+ with torch.no_grad():
462
+ if_gt_depth = (("depth_gt" in annots.keys())) and (kwargs.get('stage', 0)==1 or kwargs.get('stage', 0)==3)
463
+ if if_gt_depth==False:
464
+ if cache is not None:
465
+ T_cache = cache["points_map"].shape[0]
466
+ T_new = T - T_cache
467
+ x_resize_new = x_resize[:, T_cache:]
468
+ else:
469
+ T_new = T
470
+ x_resize_new = x_resize
471
+ # infer with chunk
472
+ chunk_size = self.chunk_size
473
+ metric_depth = []
474
+ intrs = []
475
+ unc_metric = []
476
+ mask = []
477
+ points_map = []
478
+ normals = []
479
+ normals_mask = []
480
+ for i in range(0, B*T_new, chunk_size):
481
+ output = self.base_model.infer(x_resize_new.view(B*T_new, -1, H_resize, W_resize)[i:i+chunk_size])
482
+ metric_depth.append(output['depth'])
483
+ intrs.append(output['intrinsics'])
484
+ unc_metric.append(output['mask_prob'])
485
+ mask.append(output['mask'])
486
+ points_map.append(output['points'])
487
+ normals_i, normals_mask_i = utils3d.torch.points_to_normals(output['points'], mask=output['mask'])
488
+ normals.append(normals_i)
489
+ normals_mask.append(normals_mask_i)
490
+
491
+ metric_depth = torch.cat(metric_depth, dim=0).view(B*T_new, 1, H_resize, W_resize).to(x.dtype)
492
+ intrs = torch.cat(intrs, dim=0).view(B, T_new, 3, 3).to(x.dtype)
493
+ intrs[:,:,0,:] *= W_resize
494
+ intrs[:,:,1,:] *= H_resize
495
+ # points_map = torch.cat(points_map, dim=0)
496
+ mask = torch.cat(mask, dim=0).view(B*T_new, 1, H_resize, W_resize).to(x.dtype)
497
+ # cat the normals
498
+ normals = torch.cat(normals, dim=0)
499
+ normals_mask = torch.cat(normals_mask, dim=0)
500
+
501
+ metric_depth = metric_depth.clone()
502
+ metric_depth[metric_depth == torch.inf] = 0
503
+ _depths = metric_depth[metric_depth > 0].reshape(-1)
504
+ q25 = torch.kthvalue(_depths, int(0.25 * len(_depths))).values
505
+ q75 = torch.kthvalue(_depths, int(0.75 * len(_depths))).values
506
+ iqr = q75 - q25
507
+ upper_bound = (q75 + 0.8*iqr).clamp(min=1e-6, max=10*q25)
508
+ _depth_roi = torch.tensor(
509
+ [1e-1, upper_bound.item()],
510
+ dtype=metric_depth.dtype,
511
+ device=metric_depth.device
512
+ )
513
+ mask_roi = (metric_depth > _depth_roi[0]) & (metric_depth < _depth_roi[1])
514
+ mask = mask * mask_roi
515
+ mask = mask * (~(utils3d.torch.depth_edge(metric_depth, rtol=0.03, mask=mask.bool()))) * normals_mask[:,None,...]
516
+ points_map = depth_to_points_colmap(metric_depth.squeeze(1), intrs.view(B*T_new, 3, 3))
517
+ unc_metric = torch.cat(unc_metric, dim=0).view(B*T_new, 1, H_resize, W_resize).to(x.dtype)
518
+ unc_metric *= mask
519
+ if full_point:
520
+ unc_metric = (~(utils3d.torch.depth_edge(metric_depth, rtol=0.1, mask=torch.ones_like(metric_depth).bool()))).float() * (metric_depth != 0)
521
+ if cache is not None:
522
+ assert B==1, "only support batch size 1 right now."
523
+ unc_metric = torch.cat([cache["unc_metric"], unc_metric], dim=0)
524
+ intrs = torch.cat([cache["intrs"][None], intrs], dim=1)
525
+ points_map = torch.cat([cache["points_map"].permute(0,2,3,1), points_map], dim=0)
526
+ metric_depth = torch.cat([cache["metric_depth"], metric_depth], dim=0)
527
+
528
+ if "poses_gt" in annots.keys():
529
+ intrs, c2w_traj_gt = pose_enc2mat(annots["poses_gt"],
530
+ H_resize, W_resize, self.resolution)
531
+ else:
532
+ c2w_traj_gt = None
533
+
534
+ if "intrs_gt" in annots.keys():
535
+ intrs = annots["intrs_gt"].view(B, T, 3, 3)
536
+ fx_factor = W_resize / W
537
+ fy_factor = H_resize / H
538
+ intrs[:,:,0,:] *= fx_factor
539
+ intrs[:,:,1,:] *= fy_factor
540
+
541
+ if "depth_gt" in annots.keys():
542
+
543
+ metric_depth_gt = annots['depth_gt'].view(B*T, 1, H, W)
544
+ metric_depth_gt = F.interpolate(metric_depth_gt,
545
+ size=(H_resize, W_resize), mode='nearest')
546
+
547
+ _depths = metric_depth_gt[metric_depth_gt > 0].reshape(-1)
548
+ q25 = torch.kthvalue(_depths, int(0.25 * len(_depths))).values
549
+ q75 = torch.kthvalue(_depths, int(0.75 * len(_depths))).values
550
+ iqr = q75 - q25
551
+ upper_bound = (q75 + 0.8*iqr).clamp(min=1e-6, max=10*q25)
552
+ _depth_roi = torch.tensor(
553
+ [1e-1, upper_bound.item()],
554
+ dtype=metric_depth_gt.dtype,
555
+ device=metric_depth_gt.device
556
+ )
557
+ mask_roi = (metric_depth_gt > _depth_roi[0]) & (metric_depth_gt < _depth_roi[1])
558
+ # if (upper_bound > 200).any():
559
+ # import pdb; pdb.set_trace()
560
+ if (kwargs.get('stage', 0) == 2):
561
+ unc_metric = ((metric_depth_gt > 0)*(mask_roi) * (unc_metric > 0.5)).float()
562
+ metric_depth_gt[metric_depth_gt > 10*q25] = 0
563
+ else:
564
+ unc_metric = ((metric_depth_gt > 0)*(mask_roi)).float()
565
+ unc_metric *= (~(utils3d.torch.depth_edge(metric_depth_gt, rtol=0.03, mask=mask_roi.bool()))).float()
566
+ # filter the sky
567
+ metric_depth_gt[metric_depth_gt > 10*q25] = 0
568
+ if "unc_metric" in annots.keys():
569
+ unc_metric_ = F.interpolate(annots["unc_metric"].permute(1,0,2,3),
570
+ size=(H_resize, W_resize), mode='nearest')
571
+ unc_metric = unc_metric * unc_metric_
572
+ if if_gt_depth:
573
+ points_map = depth_to_points_colmap(metric_depth_gt.squeeze(1), intrs.view(B*T, 3, 3))
574
+ metric_depth = metric_depth_gt
575
+ points_map_gt = points_map
576
+ else:
577
+ points_map_gt = depth_to_points_colmap(metric_depth_gt.squeeze(1), intrs.view(B*T, 3, 3))
578
+
579
+ # track the 3d points
580
+ ret_track = None
581
+ regular_track = True
582
+ dyn_preds, final_tracks = None, None
583
+
584
+ if "use_extr" in annots.keys():
585
+ init_pose = True
586
+ else:
587
+ init_pose = False
588
+ # set the custom vid and valid only
589
+ custom_vid = annots.get("custom_vid", False)
590
+ valid_only = annots.get("data_dir", [""])[0] == "stereo4d"
591
+ if self.training:
592
+ if (annots["vis"].sum() > 0) and (kwargs.get('stage', 0)==1 or kwargs.get('stage', 0)==3):
593
+ traj3d = annots['traj_3d']
594
+ if (kwargs.get('stage', 0)==1) and (annots.get("custom_vid", False)==False):
595
+ support_pts_q = get_track_points(H_resize, W_resize,
596
+ T, x.device, query_size=self.track_num // 2,
597
+ support_frame=self.support_frame, unc_metric=unc_metric, mode="incremental")[None]
598
+ else:
599
+ support_pts_q = get_track_points(H_resize, W_resize,
600
+ T, x.device, query_size=random.randint(32, 256),
601
+ support_frame=self.support_frame, unc_metric=unc_metric, mode="incremental")[None]
602
+ if pts_q is not None:
603
+ pts_q = pts_q[None,None]
604
+ ret_track, dyn_preds, final_tracks, rgb_tracks, intrs_org, point_map_org_refined, cache = self.Track3D(imgs_raw,
605
+ metric_depth,
606
+ unc_metric.detach(), points_map, pts_q,
607
+ intrs=intrs.clone(), cache=cache,
608
+ prec_fx=prec_fx, prec_fy=prec_fy, overlap_d=overlap_d,
609
+ vis_gt=annots['vis'], traj3d_gt=traj3d, iters=iters_track,
610
+ cam_gt=c2w_traj_gt, support_pts_q=support_pts_q, custom_vid=custom_vid,
611
+ init_pose=init_pose, fixed_cam=fixed_cam, stage=kwargs.get('stage', 0),
612
+ points_map_gt=points_map_gt, valid_only=valid_only, replace_ratio=replace_ratio)
613
+ else:
614
+ ret_track, dyn_preds, final_tracks, rgb_tracks, intrs_org, point_map_org_refined, cache = self.Track3D(imgs_raw,
615
+ metric_depth,
616
+ unc_metric.detach(), points_map, traj3d[..., :2],
617
+ intrs=intrs.clone(), cache=cache,
618
+ prec_fx=prec_fx, prec_fy=prec_fy, overlap_d=overlap_d,
619
+ vis_gt=annots['vis'], traj3d_gt=traj3d, iters=iters_track,
620
+ cam_gt=c2w_traj_gt, support_pts_q=support_pts_q, custom_vid=custom_vid,
621
+ init_pose=init_pose, fixed_cam=fixed_cam, stage=kwargs.get('stage', 0),
622
+ points_map_gt=points_map_gt, valid_only=valid_only, replace_ratio=replace_ratio)
623
+ regular_track = False
624
+
625
+
626
+ if regular_track:
627
+ if pts_q is None:
628
+ pts_q = get_track_points(H_resize, W_resize,
629
+ T, x.device, query_size=self.track_num,
630
+ support_frame=self.support_frame, unc_metric=unc_metric, mode="incremental" if self.training else "incremental")[None]
631
+ support_pts_q = None
632
+ else:
633
+ pts_q = pts_q[None,None]
634
+ # resize the query points
635
+ pts_q[...,1] *= W_resize / W
636
+ pts_q[...,2] *= H_resize / H
637
+
638
+ if pts_q_3d is not None:
639
+ pts_q_3d = pts_q_3d[None,None]
640
+ # resize the query points
641
+ pts_q_3d[...,1] *= W_resize / W
642
+ pts_q_3d[...,2] *= H_resize / H
643
+ else:
644
+ # adjust the query with uncertainty
645
+ if (full_point==False) and (overlap_d is None):
646
+ pts_q_unc = sample_features5d(unc_metric[None], pts_q).squeeze()
647
+ pts_q = pts_q[:,:,pts_q_unc>0.5,:]
648
+ if (pts_q_unc<0.5).sum() > 0:
649
+ # pad the query points
650
+ pad_num = pts_q_unc.shape[0] - pts_q.shape[2]
651
+ # pick the random indices
652
+ indices = torch.randint(0, pts_q.shape[2], (pad_num,), device=pts_q.device)
653
+ pad_pts = indices
654
+ pts_q = torch.cat([pts_q, pts_q[:,:,pad_pts,:]], dim=-2)
655
+
656
+ support_pts_q = get_track_points(H_resize, W_resize,
657
+ T, x.device, query_size=self.track_num,
658
+ support_frame=self.support_frame,
659
+ unc_metric=unc_metric, mode="mixed")[None]
660
+
661
+ points_map[points_map>1e3] = 0
662
+ points_map = depth_to_points_colmap(metric_depth.squeeze(1), intrs.view(B*T, 3, 3))
663
+ ret_track, dyn_preds, final_tracks, rgb_tracks, intrs_org, point_map_org_refined, cache = self.Track3D(imgs_raw,
664
+ metric_depth,
665
+ unc_metric.detach(), points_map, pts_q,
666
+ pts_q_3d=pts_q_3d, intrs=intrs.clone(),cache=cache,
667
+ overlap_d=overlap_d, cam_gt=c2w_traj_gt if kwargs.get('stage', 0)==1 else None,
668
+ prec_fx=prec_fx, prec_fy=prec_fy, support_pts_q=support_pts_q, custom_vid=custom_vid, valid_only=valid_only,
669
+ fixed_cam=fixed_cam, query_no_BA=query_no_BA, init_pose=init_pose, iters=iters_track,
670
+ stage=kwargs.get('stage', 0), points_map_gt=points_map_gt, replace_ratio=replace_ratio)
671
+ intrs = intrs_org
672
+ points_map = point_map_org_refined
673
+ c2w_traj = ret_track["cam_pred"]
674
+
675
+ if ret_track is not None:
676
+ if ret_track["loss"] is not None:
677
+ track_loss, conf_loss, dyn_loss, vis_loss, point_map_loss, scale_loss, shift_loss, sync_loss= ret_track["loss"]
678
+
679
+ # update the cache
680
+ cache.update({"metric_depth": metric_depth, "unc_metric": unc_metric, "points_map": points_map, "intrs": intrs[0]})
681
+ # output
682
+ depth = F.interpolate(metric_depth,
683
+ size=(H, W), mode='bilinear', align_corners=True).squeeze(1)
684
+ points_map = F.interpolate(points_map,
685
+ size=(H, W), mode='bilinear', align_corners=True).squeeze(1)
686
+ unc_metric = F.interpolate(unc_metric,
687
+ size=(H, W), mode='bilinear', align_corners=True).squeeze(1)
688
+
689
+ if self.training:
690
+
691
+ loss = track_loss + conf_loss + dyn_loss + sync_loss + vis_loss + point_map_loss + (scale_loss + shift_loss)*50
692
+ ret = {"loss": loss,
693
+ "depth_loss": point_map_loss,
694
+ "ab_loss": (scale_loss + shift_loss)*50,
695
+ "vis_loss": vis_loss, "track_loss": track_loss,
696
+ "poses_pred": c2w_traj, "dyn_preds": dyn_preds, "traj_est": final_tracks, "conf_loss": conf_loss,
697
+ "imgs_raw": imgs_raw, "rgb_tracks": rgb_tracks, "vis_est": ret_track['vis_pred'],
698
+ "depth": depth, "points_map": points_map, "unc_metric": unc_metric, "intrs": intrs, "dyn_loss": dyn_loss,
699
+ "sync_loss": sync_loss, "conf_pred": ret_track['conf_pred'], "cache": cache,
700
+ }
701
+
702
+ else:
703
+
704
+ if ret_track is not None:
705
+ traj_est = ret_track['preds']
706
+ traj_est[..., 0] *= W / W_resize
707
+ traj_est[..., 1] *= H / H_resize
708
+ vis_est = ret_track['vis_pred']
709
+ else:
710
+ traj_est = torch.zeros(B, self.track_num // 2, 3).to(x.device)
711
+ vis_est = torch.zeros(B, self.track_num // 2).to(x.device)
712
+
713
+ if intrs is not None:
714
+ intrs[..., 0, :] *= W / W_resize
715
+ intrs[..., 1, :] *= H / H_resize
716
+ ret = {"poses_pred": c2w_traj, "dyn_preds": dyn_preds,
717
+ "depth": depth, "traj_est": traj_est, "vis_est": vis_est, "imgs_raw": imgs_raw,
718
+ "rgb_tracks": rgb_tracks, "intrs": intrs, "unc_metric": unc_metric, "points_map": points_map,
719
+ "conf_pred": ret_track['conf_pred'], "cache": cache,
720
+ }
721
+
722
+ return ret
723
+
724
+
725
+
726
+
727
+ # three stages of training
728
+
729
+ # stage 1:
730
+ # gt depth and intrinsics, synthetic (includes Dynamic Replica, Kubric, PointOdyssey, VKITTI, TartanAir and Indoor()); motion pattern (tapvid3d)
731
+ # Tracking and Pose as well -> based on gt depth and intrinsics
732
+ # (Finished) -> (megasam + base model) vs. tapip3d. (use depth from megasam or pose, which keep the same setting as tapip3d.)
733
+
734
+ # stage 2: fixed 3D tracking
735
+ # Joint depth refiner
736
+ # input depth from whatever + rgb -> temporal module + scale and shift token -> coarse alignment -> scale and shift
737
+ # estimate the 3D tracks -> 3D tracks combine with pointmap -> update for pointmap (iteratively) -> residual map B T 3 H W
738
+ # ongoing two days
739
+
740
+ # stage 3: train multi windows by propagation
741
+ # 4 overlapping frames -> train on 64 -> frozen image encoder, fine-tune the transformer (learnable parameter count is small)
742
+
743
+ # types of scenarios:
744
+ # 1. auto driving (waymo open dataset)
745
+ # 2. robot
746
+ # 3. internet ego video
747
+
748
+
749
+
750
+ # Iterative Transformer -- Solver -- General Neural MegaSAM + Tracks
751
+ # Update Variables:
752
+ # 1. 3D tracks B T N 3 xyz.
753
+ # 2. 2D tracks B T N 2 x y.
754
+ # 3. Dynamic Mask B T H W.
755
+ # 4. Camera Pose B T 4 4.
756
+ # 5. Video Depth.
757
+
758
+ # (RGB, RGBD, RGBD+Pose) x (Static, Dynamic)
759
+ # Compatibility as a by-product.
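
A small sketch of the sliding-window split that `forward_stream` performs before per-window inference; tensor sizes here are toy values, purely illustrative:

```python
# Sketch: forward_stream() unfolds the video into overlapping windows; each new
# window advances by window_len - overlap_len frames.
import torch

T, C, H, W = 28, 3, 8, 8
video = torch.randn(T, C, H, W)
window_len, overlap_len = 16, 4
step_slide = window_len - overlap_len
windows = video.unfold(0, window_len, step_slide).permute(0, 4, 1, 2, 3)  # [B, S, C, H, W]
print(windows.shape)   # torch.Size([2, 16, 3, 8, 8]) -> two 16-frame windows sharing 4 frames
```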
models/SpaTrackV2/models/__init__.py ADDED
File without changes
models/SpaTrackV2/models/blocks.py ADDED
@@ -0,0 +1,519 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch.cuda.amp import autocast
11
+ from einops import rearrange
12
+ import collections
13
+ from functools import partial
14
+ from itertools import repeat
15
+ import torchvision.models as tvm
16
+ from torch.utils.checkpoint import checkpoint
17
+ from models.monoD.depth_anything.dpt import DPTHeadEnc, DPTHead
18
+ from typing import Union, Tuple
19
+ from torch import Tensor
20
+
21
+ # From PyTorch internals
22
+ def _ntuple(n):
23
+ def parse(x):
24
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
25
+ return tuple(x)
26
+ return tuple(repeat(x, n))
27
+
28
+ return parse
29
+
30
+
31
+ def exists(val):
32
+ return val is not None
33
+
34
+
35
+ def default(val, d):
36
+ return val if exists(val) else d
37
+
38
+
39
+ to_2tuple = _ntuple(2)
40
+
41
+ class LayerScale(nn.Module):
42
+ def __init__(
43
+ self,
44
+ dim: int,
45
+ init_values: Union[float, Tensor] = 1e-5,
46
+ inplace: bool = False,
47
+ ) -> None:
48
+ super().__init__()
49
+ self.inplace = inplace
50
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
51
+
52
+ def forward(self, x: Tensor) -> Tensor:
53
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
54
+
55
+ class Mlp(nn.Module):
56
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
57
+
58
+ def __init__(
59
+ self,
60
+ in_features,
61
+ hidden_features=None,
62
+ out_features=None,
63
+ act_layer=nn.GELU,
64
+ norm_layer=None,
65
+ bias=True,
66
+ drop=0.0,
67
+ use_conv=False,
68
+ ):
69
+ super().__init__()
70
+ out_features = out_features or in_features
71
+ hidden_features = hidden_features or in_features
72
+ bias = to_2tuple(bias)
73
+ drop_probs = to_2tuple(drop)
74
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
75
+
76
+ self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
77
+ self.act = act_layer()
78
+ self.drop1 = nn.Dropout(drop_probs[0])
79
+ self.norm = norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
80
+ self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
81
+ self.drop2 = nn.Dropout(drop_probs[1])
82
+
83
+ def forward(self, x):
84
+ x = self.fc1(x)
85
+ x = self.act(x)
86
+ x = self.drop1(x)
87
+ x = self.fc2(x)
88
+ x = self.drop2(x)
89
+ return x
90
+
91
+ class Attention(nn.Module):
92
+ def __init__(self, query_dim, context_dim=None,
93
+ num_heads=8, dim_head=48, qkv_bias=False, flash=False):
94
+ super().__init__()
95
+ inner_dim = self.inner_dim = dim_head * num_heads
96
+ context_dim = default(context_dim, query_dim)
97
+ self.scale = dim_head**-0.5
98
+ self.heads = num_heads
99
+ self.flash = flash
100
+
101
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=qkv_bias)
102
+ self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=qkv_bias)
103
+ self.to_out = nn.Linear(inner_dim, query_dim)
104
+
105
+ def forward(self, x, context=None, attn_bias=None):
106
+ B, N1, _ = x.shape
107
+ C = self.inner_dim
108
+ h = self.heads
109
+ q = self.to_q(x).reshape(B, N1, h, C // h).permute(0, 2, 1, 3)
110
+ context = default(context, x)
111
+ k, v = self.to_kv(context).chunk(2, dim=-1)
112
+
113
+ N2 = context.shape[1]
114
+ k = k.reshape(B, N2, h, C // h).permute(0, 2, 1, 3)
115
+ v = v.reshape(B, N2, h, C // h).permute(0, 2, 1, 3)
116
+
117
+ with torch.autocast("cuda", enabled=True, dtype=torch.bfloat16):
118
+ if self.flash==False:
119
+ sim = (q @ k.transpose(-2, -1)) * self.scale
120
+ if attn_bias is not None:
121
+ sim = sim + attn_bias
122
+ if sim.abs().max()>1e2:
123
+ import pdb; pdb.set_trace()
124
+ attn = sim.softmax(dim=-1)
125
+ x = (attn @ v).transpose(1, 2).reshape(B, N1, C)
126
+ else:
127
+ input_args = [x.contiguous() for x in [q, k, v]]
128
+ x = F.scaled_dot_product_attention(*input_args).permute(0,2,1,3).reshape(B,N1,-1) # type: ignore
129
+
130
+ if self.to_out.bias.dtype != x.dtype:
131
+ x = x.to(self.to_out.bias.dtype)
132
+
133
+ return self.to_out(x)
134
+
135
+
136
+ class VGG19(nn.Module):
137
+ def __init__(self, pretrained=False, amp = False, amp_dtype = torch.float16) -> None:
138
+ super().__init__()
139
+ self.layers = nn.ModuleList(tvm.vgg19_bn(pretrained=pretrained).features[:40])
140
+ self.amp = amp
141
+ self.amp_dtype = amp_dtype
142
+
143
+ def forward(self, x, **kwargs):
144
+ with torch.autocast("cuda", enabled=self.amp, dtype = self.amp_dtype):
145
+ feats = {}
146
+ scale = 1
147
+ for layer in self.layers:
148
+ if isinstance(layer, nn.MaxPool2d):
149
+ feats[scale] = x
150
+ scale = scale*2
151
+ x = layer(x)
152
+ return feats
153
+
154
+ class CNNandDinov2(nn.Module):
155
+ def __init__(self, cnn_kwargs = None, amp = True, amp_dtype = torch.float16):
156
+ super().__init__()
157
+ # in case the Internet connection is not stable, please load the DINOv2 locally
158
+ self.dinov2_vitl14 = torch.hub.load('models/torchhub/facebookresearch_dinov2_main',
159
+ 'dinov2_{:}14'.format("vitl"), source='local', pretrained=False)
160
+
161
+ state_dict = torch.load("models/monoD/zoeDepth/ckpts/dinov2_vitl14_pretrain.pth")
162
+ self.dinov2_vitl14.load_state_dict(state_dict, strict=True)
163
+
164
+
165
+ cnn_kwargs = cnn_kwargs if cnn_kwargs is not None else {}
166
+ self.cnn = VGG19(**cnn_kwargs)
167
+ self.amp = amp
168
+ self.amp_dtype = amp_dtype
169
+ if self.amp:
170
+ dinov2_vitl14 = self.dinov2_vitl14.to(self.amp_dtype)  # fix: was an undefined local name
171
+ self.dinov2_vitl14 = [dinov2_vitl14] # ugly hack to not show parameters to DDP
172
+
173
+
174
+ def train(self, mode: bool = True):
175
+ return self.cnn.train(mode)
176
+
177
+ def forward(self, x, upsample = False):
178
+ B,C,H,W = x.shape
179
+ feature_pyramid = self.cnn(x)
180
+
181
+ if not upsample:
182
+ with torch.no_grad():
183
+ if self.dinov2_vitl14[0].device != x.device:
184
+ self.dinov2_vitl14[0] = self.dinov2_vitl14[0].to(x.device).to(self.amp_dtype)
185
+ dinov2_features_16 = self.dinov2_vitl14[0].forward_features(x.to(self.amp_dtype))
186
+ features_16 = dinov2_features_16['x_norm_patchtokens'].permute(0,2,1).reshape(B,1024,H//14, W//14)
187
+ del dinov2_features_16
188
+ feature_pyramid[16] = features_16
189
+ return feature_pyramid
190
+
191
+ class Dinov2(nn.Module):
192
+ def __init__(self, amp = True, amp_dtype = torch.float16):
193
+ super().__init__()
194
+ # in case the Internet connection is not stable, please load the DINOv2 locally
195
+ self.dinov2_vitl14 = torch.hub.load('models/torchhub/facebookresearch_dinov2_main',
196
+ 'dinov2_{:}14'.format("vitl"), source='local', pretrained=False)
197
+
198
+ state_dict = torch.load("models/monoD/zoeDepth/ckpts/dinov2_vitl14_pretrain.pth")
199
+ self.dinov2_vitl14.load_state_dict(state_dict, strict=True)
200
+
201
+ self.amp = amp
202
+ self.amp_dtype = amp_dtype
203
+ if self.amp:
204
+ self.dinov2_vitl14 = self.dinov2_vitl14.to(self.amp_dtype)
205
+
206
+ def forward(self, x, upsample = False):
207
+ B,C,H,W = x.shape
208
+ mean_ = torch.tensor([0.485, 0.456, 0.406],
209
+ device=x.device).view(1, 3, 1, 1)
210
+ std_ = torch.tensor([0.229, 0.224, 0.225],
211
+ device=x.device).view(1, 3, 1, 1)
212
+ x = (x+1)/2
213
+ x = (x - mean_)/std_
214
+ h_re, w_re = 560, 560
215
+ x_resize = F.interpolate(x, size=(h_re, w_re),
216
+ mode='bilinear', align_corners=True)
217
+ if not upsample:
218
+ with torch.no_grad():
219
+ dinov2_features_16 = self.dinov2_vitl14.forward_features(x_resize.to(self.amp_dtype))
220
+ features_16 = dinov2_features_16['x_norm_patchtokens'].permute(0,2,1).reshape(B,1024,h_re//14, w_re//14)
221
+ del dinov2_features_16
222
+ features_16 = F.interpolate(features_16, size=(H//8, W//8), mode="bilinear", align_corners=True)
223
+ return features_16
224
+
225
+ class AttnBlock(nn.Module):
226
+ """
227
+ A pre-norm self-attention block (DiT-style) with LayerScale applied to the MLP branch.
228
+ """
229
+
230
+ def __init__(self, hidden_size, num_heads, mlp_ratio=4.0,
231
+ flash=False, ckpt_fwd=False, debug=False, **block_kwargs):
232
+ super().__init__()
233
+ self.debug=debug
234
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
235
+ self.flash=flash
236
+
237
+ self.attn = Attention(
238
+ hidden_size, num_heads=num_heads, qkv_bias=True, flash=flash,
239
+ **block_kwargs
240
+ )
241
+ self.ls = LayerScale(hidden_size, init_values=0.005)
242
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
243
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
244
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
245
+ self.mlp = Mlp(
246
+ in_features=hidden_size,
247
+ hidden_features=mlp_hidden_dim,
248
+ act_layer=approx_gelu,
249
+ )
250
+ self.ckpt_fwd = ckpt_fwd
251
+ def forward(self, x):
252
+ if self.debug:
253
+ print(x.max(), x.min(), x.mean())
254
+ if self.ckpt_fwd:
255
+ x = x + checkpoint(self.attn, self.norm1(x), use_reentrant=False)
256
+ else:
257
+ x = x + self.attn(self.norm1(x))
258
+
259
+ x = x + self.ls(self.mlp(self.norm2(x)))
260
+ return x
261
+
262
+ class CrossAttnBlock(nn.Module):
263
+ def __init__(self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0, head_dim=48,
264
+ flash=False, ckpt_fwd=False, **block_kwargs):
265
+ super().__init__()
266
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
267
+ self.norm_context = nn.LayerNorm(hidden_size)
268
+
269
+ self.cross_attn = Attention(
270
+ hidden_size, context_dim=context_dim, dim_head=head_dim,
271
+ num_heads=num_heads, qkv_bias=True, **block_kwargs, flash=flash,
272
+ )
273
+
274
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
275
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
276
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
277
+ self.mlp = Mlp(
278
+ in_features=hidden_size,
279
+ hidden_features=mlp_hidden_dim,
280
+ act_layer=approx_gelu,
281
+ drop=0,
282
+ )
283
+ self.ckpt_fwd = ckpt_fwd
284
+ def forward(self, x, context):
285
+ if self.ckpt_fwd:
286
+ with autocast():
287
+ x = x + checkpoint(self.cross_attn,
288
+ self.norm1(x), self.norm_context(context), use_reentrant=False)
289
+ else:
290
+ with autocast():
291
+ x = x + self.cross_attn(
292
+ self.norm1(x), self.norm_context(context)
293
+ )
294
+ x = x + self.mlp(self.norm2(x))
295
+ return x
296
+
297
+
298
+ def bilinear_sampler(img, coords, mode="bilinear", mask=False):
299
+ """Wrapper for grid_sample, uses pixel coordinates"""
300
+ H, W = img.shape[-2:]
301
+ xgrid, ygrid = coords.split([1, 1], dim=-1)
302
+ # go to 0,1 then 0,2 then -1,1
303
+ xgrid = 2 * xgrid / (W - 1) - 1
304
+ ygrid = 2 * ygrid / (H - 1) - 1
305
+
306
+ grid = torch.cat([xgrid, ygrid], dim=-1)
307
+ img = F.grid_sample(img, grid, align_corners=True, mode=mode)
308
+
309
+ if mask:
310
+ mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
311
+ return img, mask.float()
312
+
313
+ return img
314
+
315
+
316
+ class CorrBlock:
317
+ def __init__(self, fmaps, num_levels=4, radius=4, depths_dnG=None):
318
+ B, S, C, H_prev, W_prev = fmaps.shape
319
+ self.S, self.C, self.H, self.W = S, C, H_prev, W_prev
320
+
321
+ self.num_levels = num_levels
322
+ self.radius = radius
323
+ self.fmaps_pyramid = []
324
+ self.depth_pyramid = []
325
+ self.fmaps_pyramid.append(fmaps)
326
+ if depths_dnG is not None:
327
+ self.depth_pyramid.append(depths_dnG)
328
+ for i in range(self.num_levels - 1):
329
+ if depths_dnG is not None:
330
+ depths_dnG_ = depths_dnG.reshape(B * S, 1, H_prev, W_prev)
331
+ depths_dnG_ = F.avg_pool2d(depths_dnG_, 2, stride=2)
332
+ _, _, H, W = depths_dnG_.shape
333
+ depths_dnG = depths_dnG_.reshape(B, S, 1, H, W)
334
+ self.depth_pyramid.append(depths_dnG)
335
+ fmaps_ = fmaps.reshape(B * S, C, H_prev, W_prev)
336
+ fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
337
+ _, _, H, W = fmaps_.shape
338
+ fmaps = fmaps_.reshape(B, S, C, H, W)
339
+ H_prev = H
340
+ W_prev = W
341
+ self.fmaps_pyramid.append(fmaps)
342
+
343
+ def sample(self, coords):
344
+ r = self.radius
345
+ B, S, N, D = coords.shape
346
+ assert D == 2
347
+
348
+ H, W = self.H, self.W
349
+ out_pyramid = []
350
+ for i in range(self.num_levels):
351
+ corrs = self.corrs_pyramid[i] # B, S, N, H, W
352
+ _, _, _, H, W = corrs.shape
353
+
354
+ dx = torch.linspace(-r, r, 2 * r + 1)
355
+ dy = torch.linspace(-r, r, 2 * r + 1)
356
+ delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(
357
+ coords.device
358
+ )
359
+ centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2 ** i
360
+ delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
361
+ coords_lvl = centroid_lvl + delta_lvl
362
+ corrs = bilinear_sampler(corrs.reshape(B * S * N, 1, H, W), coords_lvl)
363
+ corrs = corrs.view(B, S, N, -1)
364
+ out_pyramid.append(corrs)
365
+
366
+ out = torch.cat(out_pyramid, dim=-1) # B, S, N, LRR*2
367
+ return out.contiguous().float()
368
+
369
+ def corr(self, targets):
370
+ B, S, N, C = targets.shape
371
+ assert C == self.C
372
+ assert S == self.S
373
+
374
+ fmap1 = targets
375
+
376
+ self.corrs_pyramid = []
377
+ for fmaps in self.fmaps_pyramid:
378
+ _, _, _, H, W = fmaps.shape
379
+ fmap2s = fmaps.view(B, S, C, H * W)
380
+ corrs = torch.matmul(fmap1, fmap2s)
381
+ corrs = corrs.view(B, S, N, H, W)
382
+ corrs = corrs / torch.sqrt(torch.tensor(C).float())
383
+ self.corrs_pyramid.append(corrs)
384
+
385
+ def corr_sample(self, targets, coords, coords_dp=None):
386
+ B, S, N, C = targets.shape
387
+ r = self.radius
388
+ Dim_c = (2*r+1)**2
389
+ assert C == self.C
390
+ assert S == self.S
391
+
392
+ out_pyramid = []
393
+ out_pyramid_dp = []
394
+ for i in range(self.num_levels):
395
+ dx = torch.linspace(-r, r, 2 * r + 1)
396
+ dy = torch.linspace(-r, r, 2 * r + 1)
397
+ delta = torch.stack(torch.meshgrid(dy, dx, indexing="ij"), axis=-1).to(
398
+ coords.device
399
+ )
400
+ centroid_lvl = coords.reshape(B * S * N, 1, 1, 2) / 2 ** i
401
+ delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2)
402
+ coords_lvl = centroid_lvl + delta_lvl
403
+ fmaps = self.fmaps_pyramid[i]
404
+ _, _, _, H, W = fmaps.shape
405
+ fmap2s = fmaps.view(B*S, C, H, W)
406
+ if len(self.depth_pyramid)>0:
407
+ depths_dnG_i = self.depth_pyramid[i]
408
+ depths_dnG_i = depths_dnG_i.view(B*S, 1, H, W)
409
+ dnG_sample = bilinear_sampler(depths_dnG_i, coords_lvl.view(B*S,1,N*Dim_c,2))
410
+ dp_corrs = (dnG_sample.view(B*S,N,-1) - coords_dp[0]).abs()/coords_dp[0]
411
+ out_pyramid_dp.append(dp_corrs)
412
+ fmap2s_sample = bilinear_sampler(fmap2s, coords_lvl.view(B*S,1,N*Dim_c,2))
413
+ fmap2s_sample = fmap2s_sample.permute(0, 3, 1, 2) # B*S, N*Dim_c, C, -1
414
+ corrs = torch.matmul(targets.reshape(B*S*N, 1, -1), fmap2s_sample.reshape(B*S*N, Dim_c, -1).permute(0, 2, 1))
415
+ corrs = corrs / torch.sqrt(torch.tensor(C).float())
416
+ corrs = corrs.view(B, S, N, -1)
417
+ out_pyramid.append(corrs)
418
+
419
+ out = torch.cat(out_pyramid, dim=-1) # B, S, N, LRR*2
420
+ if len(self.depth_pyramid)>0:
421
+ out_dp = torch.cat(out_pyramid_dp, dim=-1)
422
+ self.fcorrD = out_dp.contiguous().float()
423
+ else:
424
+ self.fcorrD = torch.zeros_like(out).contiguous().float()
425
+ return out.contiguous().float()
426
+
427
+
428
+ class EUpdateFormer(nn.Module):
429
+ """
430
+ Sequence model (bidirectional xLSTM stack) that updates track estimates.
431
+ """
432
+
433
+ def __init__(
434
+ self,
435
+ space_depth=12,
436
+ time_depth=12,
437
+ input_dim=320,
438
+ hidden_size=384,
439
+ num_heads=8,
440
+ output_dim=130,
441
+ mlp_ratio=4.0,
442
+ vq_depth=3,
443
+ add_space_attn=True,
444
+ add_time_attn=True,
445
+ flash=True
446
+ ):
447
+ super().__init__()
448
+ self.out_channels = 2
449
+ self.num_heads = num_heads
450
+ self.hidden_size = hidden_size
451
+ self.add_space_attn = add_space_attn
452
+ self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
453
+ self.flash = flash
454
+ self.flow_head = nn.Sequential(
455
+ nn.Linear(hidden_size, output_dim, bias=True),
456
+ nn.ReLU(inplace=True),
457
+ nn.Linear(output_dim, output_dim, bias=True),
458
+ nn.ReLU(inplace=True),
459
+ nn.Linear(output_dim, output_dim, bias=True)
460
+ )
461
+ self.norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
462
+ cfg = xLSTMBlockStackConfig(
463
+ mlstm_block=mLSTMBlockConfig(
464
+ mlstm=mLSTMLayerConfig(
465
+ conv1d_kernel_size=4, qkv_proj_blocksize=4, num_heads=4
466
+ )
467
+ ),
468
+ slstm_block=sLSTMBlockConfig(
469
+ slstm=sLSTMLayerConfig(
470
+ backend="cuda",
471
+ num_heads=4,
472
+ conv1d_kernel_size=4,
473
+ bias_init="powerlaw_blockdependent",
474
+ ),
475
+ feedforward=FeedForwardConfig(proj_factor=1.3, act_fn="gelu"),
476
+ ),
477
+ context_length=50,
478
+ num_blocks=7,
479
+ embedding_dim=384,
480
+ slstm_at=[1],
481
+
482
+ )
483
+ self.xlstm_fwd = xLSTMBlockStack(cfg)
484
+ self.xlstm_bwd = xLSTMBlockStack(cfg)
485
+
486
+ self.initialize_weights()
487
+
488
+ def initialize_weights(self):
489
+ def _basic_init(module):
490
+ if isinstance(module, nn.Linear):
491
+ torch.nn.init.xavier_uniform_(module.weight)
492
+ if module.bias is not None:
493
+ nn.init.constant_(module.bias, 0)
494
+
495
+ self.apply(_basic_init)
496
+
497
+ def forward(self,
498
+ input_tensor,
499
+ track_mask=None):
500
+ """ Updating with Transformer
501
+
502
+ Args:
503
+ input_tensor: B, N, T, C
504
+ track_mask: B, T, N, 1 (per-frame validity mask)
505
+ """
506
+ B, N, T, C = input_tensor.shape
507
+ x = self.input_transform(input_tensor)
508
+
509
+ track_mask = track_mask.permute(0,2,1,3).float()
510
+ fwd_x = x*track_mask
511
+ bwd_x = x.flip(2)*track_mask.flip(2)
512
+ feat_fwd = self.xlstm_fwd(self.norm(fwd_x.view(B*N, T, -1)))
513
+ feat_bwd = self.xlstm_bwd(self.norm(bwd_x.view(B*N, T, -1)))
514
+ feat = (feat_bwd.flip(1) + feat_fwd).view(B, N, T, -1)
515
+
516
+ flow = self.flow_head(feat)
517
+
518
+ return flow[..., :2], flow[..., 2:]
519
+
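A minimal usage sketch (not part of the commit) for the CorrBlock defined above: build a correlation pyramid from per-frame feature maps and sample local correlation patches at track coordinates. The import path is assumed from this repo layout, running it requires the repository (including its models.monoD and xlstm dependencies) to be importable, and all sizes are illustrative.

import torch
from models.SpaTrackV2.models.blocks import CorrBlock  # assumed path from this diff

B, S, N, C, H, W = 1, 8, 64, 128, 48, 64
fmaps = torch.randn(B, S, C, H, W)                       # per-frame feature maps
targets = torch.randn(B, S, N, C)                        # per-track template features
coords = torch.rand(B, S, N, 2) * torch.tensor([W - 1.0, H - 1.0])  # pixel (x, y)

corr_block = CorrBlock(fmaps, num_levels=4, radius=3)
corr_feat = corr_block.corr_sample(targets, coords)      # (B, S, N, num_levels * (2r+1)^2)
print(corr_feat.shape)                                   # torch.Size([1, 8, 64, 196])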
models/SpaTrackV2/models/camera_transform.py ADDED
@@ -0,0 +1,248 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+
8
+ # Adapted from https://github.com/amyxlase/relpose-plus-plus
9
+
10
+ import torch
11
+ import numpy as np
12
+ import math
+ # NOTE: import added for completeness; Translate/Rotate used below come from PyTorch3D.
+ from pytorch3d.transforms import Rotate, Translate
13
+
14
+
15
+
16
+
17
+ def bbox_xyxy_to_xywh(xyxy):
18
+ wh = xyxy[2:] - xyxy[:2]
19
+ xywh = np.concatenate([xyxy[:2], wh])
20
+ return xywh
21
+
22
+
23
+ def adjust_camera_to_bbox_crop_(fl, pp, image_size_wh: torch.Tensor, clamp_bbox_xywh: torch.Tensor):
24
+ focal_length_px, principal_point_px = _convert_ndc_to_pixels(fl, pp, image_size_wh)
25
+
26
+ principal_point_px_cropped = principal_point_px - clamp_bbox_xywh[:2]
27
+
28
+ focal_length, principal_point_cropped = _convert_pixels_to_ndc(
29
+ focal_length_px, principal_point_px_cropped, clamp_bbox_xywh[2:]
30
+ )
31
+
32
+ return focal_length, principal_point_cropped
33
+
34
+
35
+ def adjust_camera_to_image_scale_(fl, pp, original_size_wh: torch.Tensor, new_size_wh: torch.LongTensor):
36
+ focal_length_px, principal_point_px = _convert_ndc_to_pixels(fl, pp, original_size_wh)
37
+
38
+ # now scale and convert from pixels to NDC
39
+ image_size_wh_output = new_size_wh.float()
40
+ scale = (image_size_wh_output / original_size_wh).min(dim=-1, keepdim=True).values
41
+ focal_length_px_scaled = focal_length_px * scale
42
+ principal_point_px_scaled = principal_point_px * scale
43
+
44
+ focal_length_scaled, principal_point_scaled = _convert_pixels_to_ndc(
45
+ focal_length_px_scaled, principal_point_px_scaled, image_size_wh_output
46
+ )
47
+ return focal_length_scaled, principal_point_scaled
48
+
49
+
50
+ def _convert_ndc_to_pixels(focal_length: torch.Tensor, principal_point: torch.Tensor, image_size_wh: torch.Tensor):
51
+ half_image_size = image_size_wh / 2
52
+ rescale = half_image_size.min()
53
+ principal_point_px = half_image_size - principal_point * rescale
54
+ focal_length_px = focal_length * rescale
55
+ return focal_length_px, principal_point_px
56
+
57
+
58
+ def _convert_pixels_to_ndc(
59
+ focal_length_px: torch.Tensor, principal_point_px: torch.Tensor, image_size_wh: torch.Tensor
60
+ ):
61
+ half_image_size = image_size_wh / 2
62
+ rescale = half_image_size.min()
63
+ principal_point = (half_image_size - principal_point_px) / rescale
64
+ focal_length = focal_length_px / rescale
65
+ return focal_length, principal_point
66
+
67
+
68
+ def normalize_cameras(
69
+ cameras, compute_optical=True, first_camera=True, normalize_trans=True, scale=1.0, points=None, max_norm=False,
70
+ pose_mode="C2W"
71
+ ):
72
+ """
73
+ Normalizes cameras such that
74
+ (1) the optical axes point to the origin and the average distance to the origin is 1
75
+ (2) the first camera is the origin
76
+ (3) the translation vector is normalized
77
+
78
+ TODO: some transforms overlap with others. no need to do so many transforms
79
+ Args:
80
+ cameras (List[camera]).
81
+ """
82
+ # Let distance from first camera to origin be unit
83
+ new_cameras = cameras.clone()
84
+ scale = 1.0
85
+
86
+ if compute_optical:
87
+ new_cameras, points = compute_optical_transform(new_cameras, points=points)
88
+ if first_camera:
89
+ new_cameras, points = first_camera_transform(new_cameras, points=points, pose_mode=pose_mode)
90
+ if normalize_trans:
91
+ new_cameras, points, scale = normalize_translation(new_cameras,
92
+ points=points, max_norm=max_norm)
93
+ return new_cameras, points, scale
94
+
95
+
96
+ def compute_optical_transform(new_cameras, points=None):
97
+ """
98
+ adapted from https://github.com/amyxlase/relpose-plus-plus
99
+ """
100
+
101
+ new_transform = new_cameras.get_world_to_view_transform()
102
+ p_intersect, dist, p_line_intersect, pp, r = compute_optical_axis_intersection(new_cameras)
103
+ t = Translate(p_intersect)
104
+ scale = dist.squeeze()[0]
105
+
106
+ if points is not None:
107
+ points = t.inverse().transform_points(points)
108
+ points = points / scale
109
+
110
+ # Degenerate case
111
+ if scale == 0:
112
+ scale = torch.norm(new_cameras.T, dim=(0, 1))
113
+ scale = torch.sqrt(scale)
114
+ new_cameras.T = new_cameras.T / scale
115
+ else:
116
+ new_matrix = t.compose(new_transform).get_matrix()
117
+ new_cameras.R = new_matrix[:, :3, :3]
118
+ new_cameras.T = new_matrix[:, 3, :3] / scale
119
+
120
+ return new_cameras, points
121
+
122
+
123
+ def compute_optical_axis_intersection(cameras):
124
+ centers = cameras.get_camera_center()
125
+ principal_points = cameras.principal_point
126
+
127
+ one_vec = torch.ones((len(cameras), 1))
128
+ optical_axis = torch.cat((principal_points, one_vec), -1)
129
+
130
+ pp = cameras.unproject_points(optical_axis, from_ndc=True, world_coordinates=True)
131
+
132
+ pp2 = pp[torch.arange(pp.shape[0]), torch.arange(pp.shape[0])]
133
+
134
+ directions = pp2 - centers
135
+ centers = centers.unsqueeze(0).unsqueeze(0)
136
+ directions = directions.unsqueeze(0).unsqueeze(0)
137
+
138
+ p_intersect, p_line_intersect, _, r = intersect_skew_line_groups(p=centers, r=directions, mask=None)
139
+
140
+ p_intersect = p_intersect.squeeze().unsqueeze(0)
141
+ dist = (p_intersect - centers).norm(dim=-1)
142
+
143
+ return p_intersect, dist, p_line_intersect, pp2, r
144
+
145
+
146
+ def intersect_skew_line_groups(p, r, mask):
147
+ # p, r both of shape (B, N, n_intersected_lines, 3)
148
+ # mask of shape (B, N, n_intersected_lines)
149
+ p_intersect, r = intersect_skew_lines_high_dim(p, r, mask=mask)
150
+ _, p_line_intersect = _point_line_distance(p, r, p_intersect[..., None, :].expand_as(p))
151
+ intersect_dist_squared = ((p_line_intersect - p_intersect[..., None, :]) ** 2).sum(dim=-1)
152
+ return p_intersect, p_line_intersect, intersect_dist_squared, r
153
+
154
+
155
+ def intersect_skew_lines_high_dim(p, r, mask=None):
156
+ # Implements https://en.wikipedia.org/wiki/Skew_lines In more than two dimensions
157
+ dim = p.shape[-1]
158
+ # make sure the heading vectors are l2-normed
159
+ if mask is None:
160
+ mask = torch.ones_like(p[..., 0])
161
+ r = torch.nn.functional.normalize(r, dim=-1)
162
+
163
+ eye = torch.eye(dim, device=p.device, dtype=p.dtype)[None, None]
164
+ I_min_cov = (eye - (r[..., None] * r[..., None, :])) * mask[..., None, None]
165
+ sum_proj = I_min_cov.matmul(p[..., None]).sum(dim=-3)
166
+ p_intersect = torch.linalg.lstsq(I_min_cov.sum(dim=-3), sum_proj).solution[..., 0]
167
+
168
+ if torch.any(torch.isnan(p_intersect)):
169
+ print(p_intersect)
170
+ raise ValueError(f"p_intersect is NaN")
171
+
172
+ return p_intersect, r
173
+
174
+
175
+ def _point_line_distance(p1, r1, p2):
176
+ df = p2 - p1
177
+ proj_vector = df - ((df * r1).sum(dim=-1, keepdim=True) * r1)
178
+ line_pt_nearest = p2 - proj_vector
179
+ d = (proj_vector).norm(dim=-1)
180
+ return d, line_pt_nearest
181
+
182
+
183
+ def first_camera_transform(cameras, rotation_only=False,
184
+ points=None, pose_mode="C2W"):
185
+ """
186
+ Transform so that the first camera is the origin
187
+ """
188
+
189
+ new_cameras = cameras.clone()
190
+ # new_transform = new_cameras.get_world_to_view_transform()
191
+
192
+ R = cameras.R
193
+ T = cameras.T
194
+ Tran_M = torch.cat([R, T.unsqueeze(-1)], dim=-1) # [B, 3, 4]
195
+ Tran_M = torch.cat([Tran_M,
196
+ torch.tensor([[[0, 0, 0, 1]]], device=Tran_M.device).expand(Tran_M.shape[0], -1, -1)], dim=1)
197
+ if pose_mode == "C2W":
198
+ Tran_M_new = (Tran_M[:1,...].inverse())@Tran_M
199
+ elif pose_mode == "W2C":
200
+ Tran_M_new = Tran_M@(Tran_M[:1,...].inverse())
201
+
202
+ if False:
203
+ tR = Rotate(new_cameras.R[0].unsqueeze(0))
204
+ if rotation_only:
205
+ t = tR.inverse()
206
+ else:
207
+ tT = Translate(new_cameras.T[0].unsqueeze(0))
208
+ t = tR.compose(tT).inverse()
209
+
210
+ if points is not None:
211
+ points = t.inverse().transform_points(points)
212
+
213
+ if pose_mode == "C2W":
214
+ new_matrix = new_transform.compose(t).get_matrix()
215
+ else:
216
+ import ipdb; ipdb.set_trace()
217
+ new_matrix = t.compose(new_transform).get_matrix()
218
+
219
+ new_cameras.R = Tran_M_new[:, :3, :3]
220
+ new_cameras.T = Tran_M_new[:, :3, 3]
221
+
222
+ return new_cameras, points
223
+
224
+
225
+ def normalize_translation(new_cameras, points=None, max_norm=False):
226
+ t_gt = new_cameras.T.clone()
227
+ t_gt = t_gt[1:, :]
228
+
229
+ if max_norm:
230
+ t_gt_norm = torch.norm(t_gt, dim=(-1))
231
+ t_gt_scale = t_gt_norm.max()
232
+ if t_gt_norm.max() < 0.001:
233
+ t_gt_scale = torch.ones_like(t_gt_scale)
234
+ t_gt_scale = t_gt_scale.clamp(min=0.01, max=1e5)
235
+ else:
236
+ t_gt_norm = torch.norm(t_gt, dim=(0, 1))
237
+ t_gt_scale = t_gt_norm / math.sqrt(len(t_gt))
238
+ t_gt_scale = t_gt_scale / 2
239
+ if t_gt_norm.max() < 0.001:
240
+ t_gt_scale = torch.ones_like(t_gt_scale)
241
+ t_gt_scale = t_gt_scale.clamp(min=0.01, max=1e5)
242
+
243
+ new_cameras.T = new_cameras.T / t_gt_scale
244
+
245
+ if points is not None:
246
+ points = points / t_gt_scale
247
+
248
+ return new_cameras, points, t_gt_scale
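A small round-trip check (not part of the commit) of the NDC/pixel convention used by _convert_ndc_to_pixels and _convert_pixels_to_ndc above, where the rescale factor is half of the smaller image side. The import path is assumed from this repo layout and importing the module needs PyTorch3D available; the numbers are illustrative.

import torch
from models.SpaTrackV2.models.camera_transform import (  # assumed path from this diff
    _convert_ndc_to_pixels, _convert_pixels_to_ndc)

image_size_wh = torch.tensor([640.0, 480.0])
fl_ndc = torch.tensor([2.0, 2.0])   # focal length in NDC units
pp_ndc = torch.tensor([0.0, 0.0])   # principal point at the image centre

fl_px, pp_px = _convert_ndc_to_pixels(fl_ndc, pp_ndc, image_size_wh)
# rescale = min(320, 240) = 240  ->  fl_px = (480, 480), pp_px = (320, 240)
fl_back, pp_back = _convert_pixels_to_ndc(fl_px, pp_px, image_size_wh)
assert torch.allclose(fl_back, fl_ndc) and torch.allclose(pp_back, pp_ndc)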
models/SpaTrackV2/models/depth_refiner/backbone.py ADDED
@@ -0,0 +1,472 @@
1
+ # ---------------------------------------------------------------
2
+ # Copyright (c) 2021, NVIDIA Corporation. All rights reserved.
3
+ #
4
+ # This work is licensed under the NVIDIA Source Code License
5
+ # ---------------------------------------------------------------
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.nn.functional as F
9
+ from functools import partial
10
+
11
+ from timm.layers import DropPath, to_2tuple, trunc_normal_
12
+ from timm.models import register_model
13
+ from timm.models.vision_transformer import _cfg
14
+ import math
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
19
+ super().__init__()
20
+ out_features = out_features or in_features
21
+ hidden_features = hidden_features or in_features
22
+ self.fc1 = nn.Linear(in_features, hidden_features)
23
+ self.dwconv = DWConv(hidden_features)
24
+ self.act = act_layer()
25
+ self.fc2 = nn.Linear(hidden_features, out_features)
26
+ self.drop = nn.Dropout(drop)
27
+
28
+ self.apply(self._init_weights)
29
+
30
+ def _init_weights(self, m):
31
+ if isinstance(m, nn.Linear):
32
+ trunc_normal_(m.weight, std=.02)
33
+ if isinstance(m, nn.Linear) and m.bias is not None:
34
+ nn.init.constant_(m.bias, 0)
35
+ elif isinstance(m, nn.LayerNorm):
36
+ nn.init.constant_(m.bias, 0)
37
+ nn.init.constant_(m.weight, 1.0)
38
+ elif isinstance(m, nn.Conv2d):
39
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
40
+ fan_out //= m.groups
41
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
42
+ if m.bias is not None:
43
+ m.bias.data.zero_()
44
+
45
+ def forward(self, x, H, W):
46
+ x = self.fc1(x)
47
+ x = self.dwconv(x, H, W)
48
+ x = self.act(x)
49
+ x = self.drop(x)
50
+ x = self.fc2(x)
51
+ x = self.drop(x)
52
+ return x
53
+
54
+
55
+ class Attention(nn.Module):
56
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
57
+ super().__init__()
58
+ assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
59
+
60
+ self.dim = dim
61
+ self.num_heads = num_heads
62
+ head_dim = dim // num_heads
63
+ self.scale = qk_scale or head_dim ** -0.5
64
+
65
+ self.q = nn.Linear(dim, dim, bias=qkv_bias)
66
+ self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
67
+ self.attn_drop = nn.Dropout(attn_drop)
68
+ self.proj = nn.Linear(dim, dim)
69
+ self.proj_drop = nn.Dropout(proj_drop)
70
+
71
+ self.sr_ratio = sr_ratio
72
+ if sr_ratio > 1:
73
+ self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
74
+ self.norm = nn.LayerNorm(dim)
75
+
76
+ self.apply(self._init_weights)
77
+
78
+ def _init_weights(self, m):
79
+ if isinstance(m, nn.Linear):
80
+ trunc_normal_(m.weight, std=.02)
81
+ if isinstance(m, nn.Linear) and m.bias is not None:
82
+ nn.init.constant_(m.bias, 0)
83
+ elif isinstance(m, nn.LayerNorm):
84
+ nn.init.constant_(m.bias, 0)
85
+ nn.init.constant_(m.weight, 1.0)
86
+ elif isinstance(m, nn.Conv2d):
87
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
88
+ fan_out //= m.groups
89
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
90
+ if m.bias is not None:
91
+ m.bias.data.zero_()
92
+
93
+ def forward(self, x, H, W):
94
+ B, N, C = x.shape
95
+ q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
96
+
97
+ if self.sr_ratio > 1:
98
+ x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
99
+ x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
100
+ x_ = self.norm(x_)
101
+ kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
102
+ else:
103
+ kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
104
+ k, v = kv[0], kv[1]
105
+
106
+ attn = (q @ k.transpose(-2, -1)) * self.scale
107
+ attn = attn.softmax(dim=-1)
108
+ attn = self.attn_drop(attn)
109
+
110
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
111
+ x = self.proj(x)
112
+ x = self.proj_drop(x)
113
+
114
+ return x
115
+
116
+
117
+ class Block(nn.Module):
118
+
119
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
120
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
121
+ super().__init__()
122
+ self.norm1 = norm_layer(dim)
123
+ self.attn = Attention(
124
+ dim,
125
+ num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
126
+ attn_drop=attn_drop, proj_drop=drop, sr_ratio=sr_ratio)
127
+ # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
128
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
129
+ self.norm2 = norm_layer(dim)
130
+ mlp_hidden_dim = int(dim * mlp_ratio)
131
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
132
+
133
+ self.apply(self._init_weights)
134
+
135
+ def _init_weights(self, m):
136
+ if isinstance(m, nn.Linear):
137
+ trunc_normal_(m.weight, std=.02)
138
+ if isinstance(m, nn.Linear) and m.bias is not None:
139
+ nn.init.constant_(m.bias, 0)
140
+ elif isinstance(m, nn.LayerNorm):
141
+ nn.init.constant_(m.bias, 0)
142
+ nn.init.constant_(m.weight, 1.0)
143
+ elif isinstance(m, nn.Conv2d):
144
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
145
+ fan_out //= m.groups
146
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
147
+ if m.bias is not None:
148
+ m.bias.data.zero_()
149
+
150
+ def forward(self, x, H, W):
151
+ x = x + self.drop_path(self.attn(self.norm1(x), H, W))
152
+ x = x + self.drop_path(self.mlp(self.norm2(x), H, W))
153
+
154
+ return x
155
+
156
+
157
+ class OverlapPatchEmbed(nn.Module):
158
+ """ Image to Patch Embedding
159
+ """
160
+
161
+ def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
162
+ super().__init__()
163
+ img_size = to_2tuple(img_size)
164
+ patch_size = to_2tuple(patch_size)
165
+
166
+ self.img_size = img_size
167
+ self.patch_size = patch_size
168
+ self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
169
+ self.num_patches = self.H * self.W
170
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
171
+ padding=(patch_size[0] // 2, patch_size[1] // 2))
172
+ self.norm = nn.LayerNorm(embed_dim)
173
+
174
+ self.apply(self._init_weights)
175
+
176
+ def _init_weights(self, m):
177
+ if isinstance(m, nn.Linear):
178
+ trunc_normal_(m.weight, std=.02)
179
+ if isinstance(m, nn.Linear) and m.bias is not None:
180
+ nn.init.constant_(m.bias, 0)
181
+ elif isinstance(m, nn.LayerNorm):
182
+ nn.init.constant_(m.bias, 0)
183
+ nn.init.constant_(m.weight, 1.0)
184
+ elif isinstance(m, nn.Conv2d):
185
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
186
+ fan_out //= m.groups
187
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
188
+ if m.bias is not None:
189
+ m.bias.data.zero_()
190
+
191
+ def forward(self, x):
192
+ x = self.proj(x)
193
+ _, _, H, W = x.shape
194
+ x = x.flatten(2).transpose(1, 2)
195
+ x = self.norm(x)
196
+
197
+ return x, H, W
198
+
199
+
200
+
201
+
202
+ class OverlapPatchEmbed43(nn.Module):
203
+ """ Image to Patch Embedding
204
+ """
205
+
206
+ def __init__(self, img_size=224, patch_size=7, stride=4, in_chans=3, embed_dim=768):
207
+ super().__init__()
208
+ img_size = to_2tuple(img_size)
209
+ patch_size = to_2tuple(patch_size)
210
+
211
+ self.img_size = img_size
212
+ self.patch_size = patch_size
213
+ self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1]
214
+ self.num_patches = self.H * self.W
215
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=stride,
216
+ padding=(patch_size[0] // 2, patch_size[1] // 2))
217
+ self.norm = nn.LayerNorm(embed_dim)
218
+
219
+ self.apply(self._init_weights)
220
+
221
+ def _init_weights(self, m):
222
+ if isinstance(m, nn.Linear):
223
+ trunc_normal_(m.weight, std=.02)
224
+ if isinstance(m, nn.Linear) and m.bias is not None:
225
+ nn.init.constant_(m.bias, 0)
226
+ elif isinstance(m, nn.LayerNorm):
227
+ nn.init.constant_(m.bias, 0)
228
+ nn.init.constant_(m.weight, 1.0)
229
+ elif isinstance(m, nn.Conv2d):
230
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
231
+ fan_out //= m.groups
232
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
233
+ if m.bias is not None:
234
+ m.bias.data.zero_()
235
+
236
+ def forward(self, x):
237
+ if x.shape[1]==4:
238
+ x = self.proj_4c(x)  # NOTE: proj_4c is never defined in __init__, so a 4-channel input will raise AttributeError here
239
+ else:
240
+ x = self.proj(x)
241
+ _, _, H, W = x.shape
242
+ x = x.flatten(2).transpose(1, 2)
243
+ x = self.norm(x)
244
+
245
+ return x, H, W
246
+
247
+ class MixVisionTransformer(nn.Module):
248
+ def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
249
+ num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
250
+ attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
251
+ depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]):
252
+ super().__init__()
253
+ self.num_classes = num_classes
254
+ self.depths = depths
255
+
256
+ # patch_embed 43
257
+ self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_chans=in_chans,
258
+ embed_dim=embed_dims[0])
259
+ self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_chans=embed_dims[0],
260
+ embed_dim=embed_dims[1])
261
+ self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_chans=embed_dims[1],
262
+ embed_dim=embed_dims[2])
263
+ self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_chans=embed_dims[2],
264
+ embed_dim=embed_dims[3])
265
+
266
+ # transformer encoder
267
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule
268
+ cur = 0
269
+ self.block1 = nn.ModuleList([Block(
270
+ dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale,
271
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
272
+ sr_ratio=sr_ratios[0])
273
+ for i in range(depths[0])])
274
+ self.norm1 = norm_layer(embed_dims[0])
275
+
276
+ cur += depths[0]
277
+ self.block2 = nn.ModuleList([Block(
278
+ dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale,
279
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
280
+ sr_ratio=sr_ratios[1])
281
+ for i in range(depths[1])])
282
+ self.norm2 = norm_layer(embed_dims[1])
283
+
284
+ cur += depths[1]
285
+ self.block3 = nn.ModuleList([Block(
286
+ dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale,
287
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
288
+ sr_ratio=sr_ratios[2])
289
+ for i in range(depths[2])])
290
+ self.norm3 = norm_layer(embed_dims[2])
291
+
292
+ cur += depths[2]
293
+ self.block4 = nn.ModuleList([Block(
294
+ dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale,
295
+ drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
296
+ sr_ratio=sr_ratios[3])
297
+ for i in range(depths[3])])
298
+ self.norm4 = norm_layer(embed_dims[3])
299
+
300
+ # classification head
301
+ # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()
302
+
303
+ self.apply(self._init_weights)
304
+
305
+ def _init_weights(self, m):
306
+ if isinstance(m, nn.Linear):
307
+ trunc_normal_(m.weight, std=.02)
308
+ if isinstance(m, nn.Linear) and m.bias is not None:
309
+ nn.init.constant_(m.bias, 0)
310
+ elif isinstance(m, nn.LayerNorm):
311
+ nn.init.constant_(m.bias, 0)
312
+ nn.init.constant_(m.weight, 1.0)
313
+ elif isinstance(m, nn.Conv2d):
314
+ fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
315
+ fan_out //= m.groups
316
+ m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
317
+ if m.bias is not None:
318
+ m.bias.data.zero_()
319
+
320
+ def init_weights(self, pretrained=None):
321
+ if isinstance(pretrained, str):
322
+ logger = get_root_logger()
323
+ load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
324
+
325
+ def reset_drop_path(self, drop_path_rate):
326
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
327
+ cur = 0
328
+ for i in range(self.depths[0]):
329
+ self.block1[i].drop_path.drop_prob = dpr[cur + i]
330
+
331
+ cur += self.depths[0]
332
+ for i in range(self.depths[1]):
333
+ self.block2[i].drop_path.drop_prob = dpr[cur + i]
334
+
335
+ cur += self.depths[1]
336
+ for i in range(self.depths[2]):
337
+ self.block3[i].drop_path.drop_prob = dpr[cur + i]
338
+
339
+ cur += self.depths[2]
340
+ for i in range(self.depths[3]):
341
+ self.block4[i].drop_path.drop_prob = dpr[cur + i]
342
+
343
+ def freeze_patch_emb(self):
344
+ self.patch_embed1.requires_grad = False
345
+
346
+ @torch.jit.ignore
347
+ def no_weight_decay(self):
348
+ return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'} # has pos_embed may be better
349
+
350
+ def get_classifier(self):
351
+ return self.head
352
+
353
+ def reset_classifier(self, num_classes, global_pool=''):
354
+ self.num_classes = num_classes
355
+ self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
356
+
357
+ def forward_features(self, x):
358
+ B = x.shape[0]
359
+ outs = []
360
+
361
+ # stage 1
362
+ x, H, W = self.patch_embed1(x)
363
+ for i, blk in enumerate(self.block1):
364
+ x = blk(x, H, W)
365
+ x = self.norm1(x)
366
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
367
+ outs.append(x)
368
+
369
+ # stage 2
370
+ x, H, W = self.patch_embed2(x)
371
+ for i, blk in enumerate(self.block2):
372
+ x = blk(x, H, W)
373
+ x = self.norm2(x)
374
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
375
+ outs.append(x)
376
+
377
+ # stage 3
378
+ x, H, W = self.patch_embed3(x)
379
+ for i, blk in enumerate(self.block3):
380
+ x = blk(x, H, W)
381
+ x = self.norm3(x)
382
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
383
+ outs.append(x)
384
+
385
+ # stage 4
386
+ x, H, W = self.patch_embed4(x)
387
+ for i, blk in enumerate(self.block4):
388
+ x = blk(x, H, W)
389
+ x = self.norm4(x)
390
+ x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
391
+ outs.append(x)
392
+
393
+ return outs
394
+
395
+ def forward(self, x):
396
+ if x.dim() == 5:
397
+ x = x.reshape(x.shape[0]*x.shape[1],x.shape[2],x.shape[3],x.shape[4])
398
+ x = self.forward_features(x)
399
+ # x = self.head(x)
400
+
401
+ return x
402
+
403
+
404
+ class DWConv(nn.Module):
405
+ def __init__(self, dim=768):
406
+ super(DWConv, self).__init__()
407
+ self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)
408
+
409
+ def forward(self, x, H, W):
410
+ B, N, C = x.shape
411
+ x = x.transpose(1, 2).view(B, C, H, W)
412
+ x = self.dwconv(x)
413
+ x = x.flatten(2).transpose(1, 2)
414
+
415
+ return x
416
+
417
+
418
+
419
+ #@BACKBONES.register_module()
420
+ class mit_b0(MixVisionTransformer):
421
+ def __init__(self, **kwargs):
422
+ super(mit_b0, self).__init__(
423
+ patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
424
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
425
+ drop_rate=0.0, drop_path_rate=0.1)
426
+
427
+
428
+ #@BACKBONES.register_module()
429
+ class mit_b1(MixVisionTransformer):
430
+ def __init__(self, **kwargs):
431
+ super(mit_b1, self).__init__(
432
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
433
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
434
+ drop_rate=0.0, drop_path_rate=0.1)
435
+
436
+
437
+ #@BACKBONES.register_module()
438
+ class mit_b2(MixVisionTransformer):
439
+ def __init__(self, **kwargs):
440
+ super(mit_b2, self).__init__(
441
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
442
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1],
443
+ drop_rate=0.0, drop_path_rate=0.1)
444
+
445
+
446
+ #@BACKBONES.register_module()
447
+ class mit_b3(MixVisionTransformer):
448
+ def __init__(self, **kwargs):
449
+ super(mit_b3, self).__init__(
450
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
451
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
452
+ drop_rate=0.0, drop_path_rate=0.1)
453
+
454
+
455
+ #@BACKBONES.register_module()
456
+ class mit_b4(MixVisionTransformer):
457
+ def __init__(self, **kwargs):
458
+ super(mit_b4, self).__init__(
459
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
460
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1],
461
+ drop_rate=0.0, drop_path_rate=0.1)
462
+
463
+
464
+ #@BACKBONES.register_module()
465
+ class mit_b5(MixVisionTransformer):
466
+ def __init__(self, **kwargs):
467
+ super(mit_b5, self).__init__(
468
+ patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
469
+ qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
470
+ drop_rate=0.0, drop_path_rate=0.1)
471
+
472
+
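A quick shape check (not part of the commit) for the SegFormer-style backbone above: each mit_b* variant returns a 4-level feature pyramid at strides 4/8/16/32. The import path is assumed from this repo layout, timm must be installed, and the input size is illustrative.

import torch
from models.SpaTrackV2.models.depth_refiner.backbone import mit_b2  # assumed path

backbone = mit_b2().eval()
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    feats = backbone(x)   # list of 4 feature maps
for f in feats:
    print(f.shape)
# Expected: (1, 64, 56, 56), (1, 128, 28, 28), (1, 320, 14, 14), (1, 512, 7, 7)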
models/SpaTrackV2/models/depth_refiner/decode_head.py ADDED
@@ -0,0 +1,619 @@
1
+ from abc import ABCMeta, abstractmethod
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ # from mmcv.cnn import normal_init
7
+ # from mmcv.runner import auto_fp16, force_fp32
8
+
9
+ # from mmseg.core import build_pixel_sampler
10
+ # from mmseg.ops import resize
11
+
12
+
13
+ class BaseDecodeHead(nn.Module, metaclass=ABCMeta):
14
+ """Base class for BaseDecodeHead.
15
+
16
+ Args:
17
+ in_channels (int|Sequence[int]): Input channels.
18
+ channels (int): Channels after modules, before conv_seg.
19
+ num_classes (int): Number of classes.
20
+ dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
21
+ conv_cfg (dict|None): Config of conv layers. Default: None.
22
+ norm_cfg (dict|None): Config of norm layers. Default: None.
23
+ act_cfg (dict): Config of activation layers.
24
+ Default: dict(type='ReLU')
25
+ in_index (int|Sequence[int]): Input feature index. Default: -1
26
+ input_transform (str|None): Transformation type of input features.
27
+ Options: 'resize_concat', 'multiple_select', None.
28
+ 'resize_concat': Multiple feature maps will be resized to the
29
+ same size as the first one and then concatenated together.
30
+ Usually used in FCN head of HRNet.
31
+ 'multiple_select': Multiple feature maps will be bundled into
32
+ a list and passed into decode head.
33
+ None: Only one select feature map is allowed.
34
+ Default: None.
35
+ loss_decode (dict): Config of decode loss.
36
+ Default: dict(type='CrossEntropyLoss').
37
+ ignore_index (int | None): The label index to be ignored. When using
38
+ masked BCE loss, ignore_index should be set to None. Default: 255
39
+ sampler (dict|None): The config of segmentation map sampler.
40
+ Default: None.
41
+ align_corners (bool): align_corners argument of F.interpolate.
42
+ Default: False.
43
+ """
44
+
45
+ def __init__(self,
46
+ in_channels,
47
+ channels,
48
+ *,
49
+ num_classes,
50
+ dropout_ratio=0.1,
51
+ conv_cfg=None,
52
+ norm_cfg=None,
53
+ act_cfg=dict(type='ReLU'),
54
+ in_index=-1,
55
+ input_transform=None,
56
+ loss_decode=dict(
57
+ type='CrossEntropyLoss',
58
+ use_sigmoid=False,
59
+ loss_weight=1.0),
60
+ decoder_params=None,
61
+ ignore_index=255,
62
+ sampler=None,
63
+ align_corners=False):
64
+ super(BaseDecodeHead, self).__init__()
65
+ self._init_inputs(in_channels, in_index, input_transform)
66
+ self.channels = channels
67
+ self.num_classes = num_classes
68
+ self.dropout_ratio = dropout_ratio
69
+ self.conv_cfg = conv_cfg
70
+ self.norm_cfg = norm_cfg
71
+ self.act_cfg = act_cfg
72
+ self.in_index = in_index
73
+ self.ignore_index = ignore_index
74
+ self.align_corners = align_corners
75
+
76
+ if sampler is not None:
77
+ self.sampler = build_pixel_sampler(sampler, context=self)
78
+ else:
79
+ self.sampler = None
80
+
81
+ self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
82
+ if dropout_ratio > 0:
83
+ self.dropout = nn.Dropout2d(dropout_ratio)
84
+ else:
85
+ self.dropout = None
86
+ self.fp16_enabled = False
87
+
88
+ def extra_repr(self):
89
+ """Extra repr."""
90
+ s = f'input_transform={self.input_transform}, ' \
91
+ f'ignore_index={self.ignore_index}, ' \
92
+ f'align_corners={self.align_corners}'
93
+ return s
94
+
95
+ def _init_inputs(self, in_channels, in_index, input_transform):
96
+ """Check and initialize input transforms.
97
+
98
+ The in_channels, in_index and input_transform must match.
99
+ Specifically, when input_transform is None, only single feature map
100
+ will be selected. So in_channels and in_index must be of type int.
101
+ When input_transform is not None, in_channels and in_index must be list or tuple, with the same length.
102
+
103
+ Args:
104
+ in_channels (int|Sequence[int]): Input channels.
105
+ in_index (int|Sequence[int]): Input feature index.
106
+ input_transform (str|None): Transformation type of input features.
107
+ Options: 'resize_concat', 'multiple_select', None.
108
+ 'resize_concat': Multiple feature maps will be resized to the
109
+ same size as the first one and then concatenated together.
110
+ Usually used in FCN head of HRNet.
111
+ 'multiple_select': Multiple feature maps will be bundled into
112
+ a list and passed into decode head.
113
+ None: Only one select feature map is allowed.
114
+ """
115
+
116
+ if input_transform is not None:
117
+ assert input_transform in ['resize_concat', 'multiple_select']
118
+ self.input_transform = input_transform
119
+ self.in_index = in_index
120
+ if input_transform is not None:
121
+ assert isinstance(in_channels, (list, tuple))
122
+ assert isinstance(in_index, (list, tuple))
123
+ assert len(in_channels) == len(in_index)
124
+ if input_transform == 'resize_concat':
125
+ self.in_channels = sum(in_channels)
126
+ else:
127
+ self.in_channels = in_channels
128
+ else:
129
+ assert isinstance(in_channels, int)
130
+ assert isinstance(in_index, int)
131
+ self.in_channels = in_channels
132
+
133
+ def init_weights(self):
134
+ """Initialize weights of classification layer."""
135
+ normal_init(self.conv_seg, mean=0, std=0.01)
136
+
137
+ def _transform_inputs(self, inputs):
138
+ """Transform inputs for decoder.
139
+
140
+ Args:
141
+ inputs (list[Tensor]): List of multi-level img features.
142
+
143
+ Returns:
144
+ Tensor: The transformed inputs
145
+ """
146
+
147
+ if self.input_transform == 'resize_concat':
148
+ inputs = [inputs[i] for i in self.in_index]
149
+ upsampled_inputs = [
150
+ resize(
151
+ input=x,
152
+ size=inputs[0].shape[2:],
153
+ mode='bilinear',
154
+ align_corners=self.align_corners) for x in inputs
155
+ ]
156
+ inputs = torch.cat(upsampled_inputs, dim=1)
157
+ elif self.input_transform == 'multiple_select':
158
+ inputs = [inputs[i] for i in self.in_index]
159
+ else:
160
+ inputs = inputs[self.in_index]
161
+
162
+ return inputs
163
+
164
+ # @auto_fp16()
165
+ @abstractmethod
166
+ def forward(self, inputs):
167
+ """Placeholder of forward function."""
168
+ pass
169
+
170
+ def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg):
171
+ """Forward function for training.
172
+ Args:
173
+ inputs (list[Tensor]): List of multi-level img features.
174
+ img_metas (list[dict]): List of image info dict where each dict
175
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
176
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
177
+ For details on the values of these keys see
178
+ `mmseg/datasets/pipelines/formatting.py:Collect`.
179
+ gt_semantic_seg (Tensor): Semantic segmentation masks
180
+ used if the architecture supports semantic segmentation task.
181
+ train_cfg (dict): The training config.
182
+
183
+ Returns:
184
+ dict[str, Tensor]: a dictionary of loss components
185
+ """
186
+ seg_logits = self.forward(inputs)
187
+ losses = self.losses(seg_logits, gt_semantic_seg)
188
+ return losses
189
+
190
+ def forward_test(self, inputs, img_metas, test_cfg):
191
+ """Forward function for testing.
192
+
193
+ Args:
194
+ inputs (list[Tensor]): List of multi-level img features.
195
+ img_metas (list[dict]): List of image info dict where each dict
196
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
197
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
198
+ For details on the values of these keys see
199
+ `mmseg/datasets/pipelines/formatting.py:Collect`.
200
+ test_cfg (dict): The testing config.
201
+
202
+ Returns:
203
+ Tensor: Output segmentation map.
204
+ """
205
+ return self.forward(inputs)
206
+
207
+ def cls_seg(self, feat):
208
+ """Classify each pixel."""
209
+ if self.dropout is not None:
210
+ feat = self.dropout(feat)
211
+ output = self.conv_seg(feat)
212
+ return output
213
+
214
+
215
+ class BaseDecodeHead_clips(nn.Module, metaclass=ABCMeta):
216
+ """Base class for BaseDecodeHead_clips.
217
+
218
+ Args:
219
+ in_channels (int|Sequence[int]): Input channels.
220
+ channels (int): Channels after modules, before conv_seg.
221
+ num_classes (int): Number of classes.
222
+ dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
223
+ conv_cfg (dict|None): Config of conv layers. Default: None.
224
+ norm_cfg (dict|None): Config of norm layers. Default: None.
225
+ act_cfg (dict): Config of activation layers.
226
+ Default: dict(type='ReLU')
227
+ in_index (int|Sequence[int]): Input feature index. Default: -1
228
+ input_transform (str|None): Transformation type of input features.
229
+ Options: 'resize_concat', 'multiple_select', None.
230
+ 'resize_concat': Multiple feature maps will be resized to the
231
+ same size as the first one and then concatenated together.
232
+ Usually used in FCN head of HRNet.
233
+ 'multiple_select': Multiple feature maps will be bundled into
234
+ a list and passed into decode head.
235
+ None: Only one select feature map is allowed.
236
+ Default: None.
237
+ loss_decode (dict): Config of decode loss.
238
+ Default: dict(type='CrossEntropyLoss').
239
+ ignore_index (int | None): The label index to be ignored. When using
240
+ masked BCE loss, ignore_index should be set to None. Default: 255
241
+ sampler (dict|None): The config of segmentation map sampler.
242
+ Default: None.
243
+ align_corners (bool): align_corners argument of F.interpolate.
244
+ Default: False.
245
+ """
246
+
247
+ def __init__(self,
248
+ in_channels,
249
+ channels,
250
+ *,
251
+ num_classes,
252
+ dropout_ratio=0.1,
253
+ conv_cfg=None,
254
+ norm_cfg=None,
255
+ act_cfg=dict(type='ReLU'),
256
+ in_index=-1,
257
+ input_transform=None,
258
+ loss_decode=dict(
259
+ type='CrossEntropyLoss',
260
+ use_sigmoid=False,
261
+ loss_weight=1.0),
262
+ decoder_params=None,
263
+ ignore_index=255,
264
+ sampler=None,
265
+ align_corners=False,
266
+ num_clips=5):
267
+ super(BaseDecodeHead_clips, self).__init__()
268
+ self._init_inputs(in_channels, in_index, input_transform)
269
+ self.channels = channels
270
+ self.num_classes = num_classes
271
+ self.dropout_ratio = dropout_ratio
272
+ self.conv_cfg = conv_cfg
273
+ self.norm_cfg = norm_cfg
274
+ self.act_cfg = act_cfg
275
+ self.in_index = in_index
276
+ self.ignore_index = ignore_index
277
+ self.align_corners = align_corners
278
+ self.num_clips=num_clips
279
+
280
+ if sampler is not None:
281
+ self.sampler = build_pixel_sampler(sampler, context=self)
282
+ else:
283
+ self.sampler = None
284
+
285
+ self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
286
+ if dropout_ratio > 0:
287
+ self.dropout = nn.Dropout2d(dropout_ratio)
288
+ else:
289
+ self.dropout = None
290
+ self.fp16_enabled = False
291
+
292
+ def extra_repr(self):
293
+ """Extra repr."""
294
+ s = f'input_transform={self.input_transform}, ' \
295
+ f'ignore_index={self.ignore_index}, ' \
296
+ f'align_corners={self.align_corners}'
297
+ return s
298
+
299
+ def _init_inputs(self, in_channels, in_index, input_transform):
300
+ """Check and initialize input transforms.
301
+
302
+ The in_channels, in_index and input_transform must match.
303
+ Specifically, when input_transform is None, only a single feature map
304
+ will be selected. So in_channels and in_index must be of type int.
305
+ When input_transform is not None, in_channels and in_index must be list or tuple, with the same length.
306
+
307
+ Args:
308
+ in_channels (int|Sequence[int]): Input channels.
309
+ in_index (int|Sequence[int]): Input feature index.
310
+ input_transform (str|None): Transformation type of input features.
311
+ Options: 'resize_concat', 'multiple_select', None.
312
+ 'resize_concat': Multiple feature maps will be resized to the
313
+ same size as the first one and then concatenated together.
314
+ Usually used in the FCN head of HRNet.
315
+ 'multiple_select': Multiple feature maps will be bundled into
316
+ a list and passed into the decode head.
317
+ None: Only one selected feature map is allowed.
318
+ """
319
+
320
+ if input_transform is not None:
321
+ assert input_transform in ['resize_concat', 'multiple_select']
322
+ self.input_transform = input_transform
323
+ self.in_index = in_index
324
+ if input_transform is not None:
325
+ assert isinstance(in_channels, (list, tuple))
326
+ assert isinstance(in_index, (list, tuple))
327
+ assert len(in_channels) == len(in_index)
328
+ if input_transform == 'resize_concat':
329
+ self.in_channels = sum(in_channels)
330
+ else:
331
+ self.in_channels = in_channels
332
+ else:
333
+ assert isinstance(in_channels, int)
334
+ assert isinstance(in_index, int)
335
+ self.in_channels = in_channels
336
+
337
+ def init_weights(self):
338
+ """Initialize weights of classification layer."""
339
+ normal_init(self.conv_seg, mean=0, std=0.01)
340
+
341
+ def _transform_inputs(self, inputs):
342
+ """Transform inputs for decoder.
343
+
344
+ Args:
345
+ inputs (list[Tensor]): List of multi-level img features.
346
+
347
+ Returns:
348
+ Tensor: The transformed inputs
349
+ """
350
+
351
+ if self.input_transform == 'resize_concat':
352
+ inputs = [inputs[i] for i in self.in_index]
353
+ upsampled_inputs = [
354
+ resize(
355
+ input=x,
356
+ size=inputs[0].shape[2:],
357
+ mode='bilinear',
358
+ align_corners=self.align_corners) for x in inputs
359
+ ]
360
+ inputs = torch.cat(upsampled_inputs, dim=1)
361
+ elif self.input_transform == 'multiple_select':
362
+ inputs = [inputs[i] for i in self.in_index]
363
+ else:
364
+ inputs = inputs[self.in_index]
365
+
366
+ return inputs
367
+
368
+ # @auto_fp16()
369
+ @abstractmethod
370
+ def forward(self, inputs):
371
+ """Placeholder of forward function."""
372
+ pass
373
+
374
+ def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg, batch_size, num_clips):
375
+ """Forward function for training.
376
+ Args:
377
+ inputs (list[Tensor]): List of multi-level img features.
378
+ img_metas (list[dict]): List of image info dict where each dict
379
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
380
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
381
+ For details on the values of these keys see
382
+ `mmseg/datasets/pipelines/formatting.py:Collect`.
383
+ gt_semantic_seg (Tensor): Semantic segmentation masks
384
+ used if the architecture supports semantic segmentation task.
385
+ train_cfg (dict): The training config.
386
+
387
+ Returns:
388
+ dict[str, Tensor]: a dictionary of loss components
389
+ """
390
+ seg_logits = self.forward(inputs, batch_size, num_clips)
391
+ losses = self.losses(seg_logits, gt_semantic_seg)
392
+ return losses
393
+
394
+ def forward_test(self, inputs, img_metas, test_cfg, batch_size, num_clips):
395
+ """Forward function for testing.
396
+
397
+ Args:
398
+ inputs (list[Tensor]): List of multi-level img features.
399
+ img_metas (list[dict]): List of image info dict where each dict
400
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
401
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
402
+ For details on the values of these keys see
403
+ `mmseg/datasets/pipelines/formatting.py:Collect`.
404
+ test_cfg (dict): The testing config.
405
+
406
+ Returns:
407
+ Tensor: Output segmentation map.
408
+ """
409
+ return self.forward(inputs, batch_size, num_clips)
410
+
411
+ def cls_seg(self, feat):
412
+ """Classify each pixel."""
413
+ if self.dropout is not None:
414
+ feat = self.dropout(feat)
415
+ output = self.conv_seg(feat)
416
+ return output
417
+
418
+ class BaseDecodeHead_clips_flow(nn.Module, metaclass=ABCMeta):
419
+ """Base class for BaseDecodeHead_clips_flow.
420
+
421
+ Args:
422
+ in_channels (int|Sequence[int]): Input channels.
423
+ channels (int): Channels after modules, before conv_seg.
424
+ num_classes (int): Number of classes.
425
+ dropout_ratio (float): Ratio of dropout layer. Default: 0.1.
426
+ conv_cfg (dict|None): Config of conv layers. Default: None.
427
+ norm_cfg (dict|None): Config of norm layers. Default: None.
428
+ act_cfg (dict): Config of activation layers.
429
+ Default: dict(type='ReLU')
430
+ in_index (int|Sequence[int]): Input feature index. Default: -1
431
+ input_transform (str|None): Transformation type of input features.
432
+ Options: 'resize_concat', 'multiple_select', None.
433
+ 'resize_concat': Multiple feature maps will be resized to the
434
+ same size as the first one and then concatenated together.
435
+ Usually used in the FCN head of HRNet.
436
+ 'multiple_select': Multiple feature maps will be bundled into
437
+ a list and passed into the decode head.
438
+ None: Only one selected feature map is allowed.
439
+ Default: None.
440
+ loss_decode (dict): Config of decode loss.
441
+ Default: dict(type='CrossEntropyLoss').
442
+ ignore_index (int | None): The label index to be ignored. When using
443
+ masked BCE loss, ignore_index should be set to None. Default: 255
444
+ sampler (dict|None): The config of segmentation map sampler.
445
+ Default: None.
446
+ align_corners (bool): align_corners argument of F.interpolate.
447
+ Default: False.
448
+ """
449
+
450
+ def __init__(self,
451
+ in_channels,
452
+ channels,
453
+ *,
454
+ num_classes,
455
+ dropout_ratio=0.1,
456
+ conv_cfg=None,
457
+ norm_cfg=None,
458
+ act_cfg=dict(type='ReLU'),
459
+ in_index=-1,
460
+ input_transform=None,
461
+ loss_decode=dict(
462
+ type='CrossEntropyLoss',
463
+ use_sigmoid=False,
464
+ loss_weight=1.0),
465
+ decoder_params=None,
466
+ ignore_index=255,
467
+ sampler=None,
468
+ align_corners=False,
469
+ num_clips=5):
470
+ super(BaseDecodeHead_clips_flow, self).__init__()
471
+ self._init_inputs(in_channels, in_index, input_transform)
472
+ self.channels = channels
473
+ self.num_classes = num_classes
474
+ self.dropout_ratio = dropout_ratio
475
+ self.conv_cfg = conv_cfg
476
+ self.norm_cfg = norm_cfg
477
+ self.act_cfg = act_cfg
478
+ self.in_index = in_index
479
+ self.ignore_index = ignore_index
480
+ self.align_corners = align_corners
481
+ self.num_clips=num_clips
482
+
483
+ if sampler is not None:
484
+ self.sampler = build_pixel_sampler(sampler, context=self)
485
+ else:
486
+ self.sampler = None
487
+
488
+ self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1)
489
+ if dropout_ratio > 0:
490
+ self.dropout = nn.Dropout2d(dropout_ratio)
491
+ else:
492
+ self.dropout = None
493
+ self.fp16_enabled = False
494
+
495
+ def extra_repr(self):
496
+ """Extra repr."""
497
+ s = f'input_transform={self.input_transform}, ' \
498
+ f'ignore_index={self.ignore_index}, ' \
499
+ f'align_corners={self.align_corners}'
500
+ return s
501
+
502
+ def _init_inputs(self, in_channels, in_index, input_transform):
503
+ """Check and initialize input transforms.
504
+
505
+ The in_channels, in_index and input_transform must match.
506
+ Specifically, when input_transform is None, only a single feature map
507
+ will be selected. So in_channels and in_index must be of type int.
508
+ When input_transform is not None, in_channels and in_index must be list or tuple, with the same length.
509
+
510
+ Args:
511
+ in_channels (int|Sequence[int]): Input channels.
512
+ in_index (int|Sequence[int]): Input feature index.
513
+ input_transform (str|None): Transformation type of input features.
514
+ Options: 'resize_concat', 'multiple_select', None.
515
+ 'resize_concat': Multiple feature maps will be resized to the
516
+ same size as the first one and then concatenated together.
517
+ Usually used in the FCN head of HRNet.
518
+ 'multiple_select': Multiple feature maps will be bundled into
519
+ a list and passed into the decode head.
520
+ None: Only one selected feature map is allowed.
521
+ """
522
+
523
+ if input_transform is not None:
524
+ assert input_transform in ['resize_concat', 'multiple_select']
525
+ self.input_transform = input_transform
526
+ self.in_index = in_index
527
+ if input_transform is not None:
528
+ assert isinstance(in_channels, (list, tuple))
529
+ assert isinstance(in_index, (list, tuple))
530
+ assert len(in_channels) == len(in_index)
531
+ if input_transform == 'resize_concat':
532
+ self.in_channels = sum(in_channels)
533
+ else:
534
+ self.in_channels = in_channels
535
+ else:
536
+ assert isinstance(in_channels, int)
537
+ assert isinstance(in_index, int)
538
+ self.in_channels = in_channels
539
+
540
+ def init_weights(self):
541
+ """Initialize weights of classification layer."""
542
+ normal_init(self.conv_seg, mean=0, std=0.01)
543
+
544
+ def _transform_inputs(self, inputs):
545
+ """Transform inputs for decoder.
546
+
547
+ Args:
548
+ inputs (list[Tensor]): List of multi-level img features.
549
+
550
+ Returns:
551
+ Tensor: The transformed inputs
552
+ """
553
+
554
+ if self.input_transform == 'resize_concat':
555
+ inputs = [inputs[i] for i in self.in_index]
556
+ upsampled_inputs = [
557
+ resize(
558
+ input=x,
559
+ size=inputs[0].shape[2:],
560
+ mode='bilinear',
561
+ align_corners=self.align_corners) for x in inputs
562
+ ]
563
+ inputs = torch.cat(upsampled_inputs, dim=1)
564
+ elif self.input_transform == 'multiple_select':
565
+ inputs = [inputs[i] for i in self.in_index]
566
+ else:
567
+ inputs = inputs[self.in_index]
568
+
569
+ return inputs
570
+
571
+ # @auto_fp16()
572
+ @abstractmethod
573
+ def forward(self, inputs):
574
+ """Placeholder of forward function."""
575
+ pass
576
+
577
+ def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg, batch_size, num_clips, img=None):
578
+ """Forward function for training.
579
+ Args:
580
+ inputs (list[Tensor]): List of multi-level img features.
581
+ img_metas (list[dict]): List of image info dict where each dict
582
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
583
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
584
+ For details on the values of these keys see
585
+ `mmseg/datasets/pipelines/formatting.py:Collect`.
586
+ gt_semantic_seg (Tensor): Semantic segmentation masks
587
+ used if the architecture supports semantic segmentation task.
588
+ train_cfg (dict): The training config.
589
+
590
+ Returns:
591
+ dict[str, Tensor]: a dictionary of loss components
592
+ """
593
+ seg_logits = self.forward(inputs, batch_size, num_clips, img)
594
+ losses = self.losses(seg_logits, gt_semantic_seg)
595
+ return losses
596
+
597
+ def forward_test(self, inputs, img_metas, test_cfg, batch_size=None, num_clips=None, img=None):
598
+ """Forward function for testing.
599
+
600
+ Args:
601
+ inputs (list[Tensor]): List of multi-level img features.
602
+ img_metas (list[dict]): List of image info dict where each dict
603
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
604
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
605
+ For details on the values of these keys see
606
+ `mmseg/datasets/pipelines/formatting.py:Collect`.
607
+ test_cfg (dict): The testing config.
608
+
609
+ Returns:
610
+ Tensor: Output segmentation map.
611
+ """
612
+ return self.forward(inputs, batch_size, num_clips, img)
613
+
614
+ def cls_seg(self, feat):
615
+ """Classify each pixel."""
616
+ if self.dropout is not None:
617
+ feat = self.dropout(feat)
618
+ output = self.conv_seg(feat)
619
+ return output
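
A minimal standalone sketch of the three `input_transform` modes documented above ('resize_concat', 'multiple_select', None), mirroring `_transform_inputs` in plain PyTorch with `F.interpolate` standing in for mmseg's `resize` helper; the feature shapes are illustrative only.

import torch
import torch.nn.functional as F

def transform_inputs(inputs, in_index, input_transform, align_corners=False):
    # Select and combine multi-level features the way _transform_inputs does.
    if input_transform == 'resize_concat':
        selected = [inputs[i] for i in in_index]
        upsampled = [F.interpolate(x, size=selected[0].shape[2:], mode='bilinear',
                                   align_corners=align_corners) for x in selected]
        return torch.cat(upsampled, dim=1)       # effective in_channels = sum(in_channels)
    elif input_transform == 'multiple_select':
        return [inputs[i] for i in in_index]     # kept as a list of feature maps
    else:
        return inputs[in_index]                  # a single selected feature map

feats = [torch.randn(1, c, s, s) for c, s in [(64, 64), (128, 32), (320, 16), (512, 8)]]
print(transform_inputs(feats, [0, 1, 2, 3], 'resize_concat').shape)  # torch.Size([1, 1024, 64, 64])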
models/SpaTrackV2/models/depth_refiner/depth_refiner.py ADDED
@@ -0,0 +1,115 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from models.monoD.depth_anything_v2.dinov2_layers.patch_embed import PatchEmbed
5
+ from models.SpaTrackV2.models.depth_refiner.backbone import mit_b3
6
+ from models.SpaTrackV2.models.depth_refiner.stablizer import Stabilization_Network_Cross_Attention
7
+ from einops import rearrange
8
+ class TrackStablizer(nn.Module):
9
+ def __init__(self):
10
+ super().__init__()
11
+
12
+ self.backbone = mit_b3()
13
+
14
+ old_conv = self.backbone.patch_embed1.proj
15
+ new_conv = nn.Conv2d(old_conv.in_channels + 4, old_conv.out_channels, kernel_size=old_conv.kernel_size, stride=old_conv.stride, padding=old_conv.padding)
16
+
17
+ new_conv.weight[:, :3, :, :].data.copy_(old_conv.weight.clone())
18
+ self.backbone.patch_embed1.proj = new_conv
19
+
20
+ self.Track_Stabilizer = Stabilization_Network_Cross_Attention(in_channels=[64, 128, 320, 512],
21
+ in_index=[0, 1, 2, 3],
22
+ feature_strides=[4, 8, 16, 32],
23
+ channels=128,
24
+ dropout_ratio=0.1,
25
+ num_classes=1,
26
+ align_corners=False,
27
+ decoder_params=dict(embed_dim=256, depths=4),
28
+ num_clips=16,
29
+ norm_cfg = dict(type='SyncBN', requires_grad=True))
30
+
31
+ self.edge_conv = nn.Sequential(nn.Conv2d(in_channels=4, out_channels=64, kernel_size=3, padding=1, stride=1, bias=True),\
32
+ nn.ReLU(inplace=True))
33
+ self.edge_conv1 = nn.Sequential(nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1, stride=2, bias=True),\
34
+ nn.ReLU(inplace=True))
35
+ self.success = False
36
+ self.x = None
37
+
38
+ def buffer_forward(self, inputs, num_clips=16):
39
+ """
40
+ Run the backbone once on the clip, cache its features in self.x, and return the scale/shift estimated by the stabilizer.
41
+ """
42
+ B, T, C, H, W = inputs.shape
43
+ self.x = self.backbone(inputs)
44
+ scale, shift = self.Track_Stabilizer.buffer_forward(self.x, num_clips=num_clips)
45
+ self.success = True
46
+ return scale, shift
47
+
48
+ def forward(self, inputs, tracks, tracks_uvd, num_clips=16, imgs=None, vis_track=None):
49
+
50
+ """
51
+ Args:
52
+ inputs: [B, T, C, H, W], RGB + PointMap + Mask
53
+ tracks: [B, T, N, 4], 3D tracks in camera coordinate + visibility
54
+ num_clips: int, number of clips to use
55
+ """
56
+ B, T, C, H, W = inputs.shape
57
+ edge_feat = self.edge_conv(inputs.view(B*T,4,H,W))
58
+ edge_feat1 = self.edge_conv1(edge_feat)
59
+
60
+ if not self.success:
61
+ scale, shift = self.Track_Stabilizer.buffer_forward(self.x,num_clips=num_clips)
62
+ self.success = True
63
+ update = self.Track_Stabilizer(self.x,edge_feat,edge_feat1,tracks,tracks_uvd,num_clips=num_clips, imgs=imgs, vis_track=vis_track)
64
+ else:
65
+ update = self.Track_Stabilizer(self.x,edge_feat,edge_feat1,tracks,tracks_uvd,num_clips=num_clips, imgs=imgs, vis_track=vis_track)
66
+
67
+ return update
68
+
69
+ def reset_success(self):
70
+ self.success = False
71
+ self.x = None
72
+ self.Track_Stabilizer.reset_success()
73
+
74
+
75
+ if __name__ == "__main__":
76
+ # Create test input tensors
77
+ batch_size = 1
78
+ seq_len = 16
79
+ channels = 7 # 3 for RGB + 3 for PointMap + 1 for Mask
80
+ height = 384
81
+ width = 512
82
+
83
+ # Create random input tensor with shape [B, T, C, H, W]
84
+ inputs = torch.randn(batch_size, seq_len, channels, height, width)
85
+
86
+ # Create random tracks
87
+ tracks = torch.randn(batch_size, seq_len, 1024, 4)
88
+
89
+ # Create random test images
90
+ test_imgs = torch.randn(batch_size, seq_len, 3, height, width)
91
+
92
+ # Initialize model and move to GPU
93
+ model = TrackStablizer().cuda()
94
+
95
+ # Move inputs to GPU and run forward pass
96
+ inputs = inputs.cuda()
97
+ tracks = tracks.cuda()
98
+ outputs = model.buffer_forward(inputs, num_clips=seq_len)
99
+ import time
100
+ start_time = time.time()
101
+ outputs = model(inputs, tracks, tracks[..., :3], num_clips=seq_len)  # forward() also requires tracks_uvd; the first three track channels are used as a stand-in here (shape assumed for this smoke test)
102
+ end_time = time.time()
103
+ print(f"Time taken: {end_time - start_time} seconds")
104
+ import pdb; pdb.set_trace()
105
+ # # Print shapes for verification
106
+ # print(f"Input shape: {inputs.shape}")
107
+ # print(f"Output shape: {outputs.shape}")
108
+
109
+ # # Basic tests
110
+ # assert outputs.shape[0] == batch_size, "Batch size mismatch"
111
+ # assert len(outputs.shape) == 4, "Output should be 4D: [B,C,H,W]"
112
+ # assert torch.all(outputs >= 0), "Output should be non-negative after ReLU"
113
+
114
+ # print("All tests passed!")
115
+
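
A toy sketch of the caching pattern that `TrackStablizer` above uses: `buffer_forward` runs the backbone once per clip and stores the features, `forward` reuses them while `self.success` is set, and `reset_success` clears the cache between videos. `TinyRefiner` and its two layers are stand-ins for illustration, not the real backbone or stabilization network.

import torch
import torch.nn as nn

class TinyRefiner(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = nn.Conv2d(7, 8, 3, padding=1)   # stand-in for mit_b3
        self.head = nn.Conv2d(8, 1, 1)                  # stand-in for the stabilization network
        self.success = False
        self.x = None

    def buffer_forward(self, inputs):
        B, T, C, H, W = inputs.shape
        self.x = self.backbone(inputs.view(B * T, C, H, W))  # run the backbone once, cache features
        self.success = True
        return self.x.mean(), self.x.std()                   # stand-in for the predicted scale/shift

    def forward(self, inputs):
        if not self.success:              # build the cache lazily if buffer_forward was never called
            self.buffer_forward(inputs)
        return self.head(self.x)          # reuse the cached backbone features on every later call

    def reset_success(self):              # call between videos, as TrackStablizer.reset_success does
        self.success = False
        self.x = None

model = TinyRefiner()
clip = torch.randn(1, 4, 7, 32, 32)       # B, T, C (RGB + PointMap + Mask), H, W
model.buffer_forward(clip)
out = model(clip)                          # no second backbone pass
model.reset_success()                      # the next clip recomputes the cache
print(out.shape)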
models/SpaTrackV2/models/depth_refiner/network.py ADDED
@@ -0,0 +1,429 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ '''
4
+ Author: Ke Xian
5
6
+ Date: 2020/07/20
7
+ '''
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.init as init
12
+
13
+ # ==============================================================================================================
14
+
15
+ class FTB(nn.Module):
16
+ def __init__(self, inchannels, midchannels=512):
17
+ super(FTB, self).__init__()
18
+ self.in1 = inchannels
19
+ self.mid = midchannels
20
+
21
+ self.conv1 = nn.Conv2d(in_channels=self.in1, out_channels=self.mid, kernel_size=3, padding=1, stride=1, bias=True)
22
+ self.conv_branch = nn.Sequential(nn.ReLU(inplace=True),\
23
+ nn.Conv2d(in_channels=self.mid, out_channels=self.mid, kernel_size=3, padding=1, stride=1, bias=True),\
24
+ #nn.BatchNorm2d(num_features=self.mid),\
25
+ nn.ReLU(inplace=True),\
26
+ nn.Conv2d(in_channels=self.mid, out_channels= self.mid, kernel_size=3, padding=1, stride=1, bias=True))
27
+ self.relu = nn.ReLU(inplace=True)
28
+
29
+ self.init_params()
30
+
31
+ def forward(self, x):
32
+ x = self.conv1(x)
33
+ x = x + self.conv_branch(x)
34
+ x = self.relu(x)
35
+
36
+ return x
37
+
38
+ def init_params(self):
39
+ for m in self.modules():
40
+ if isinstance(m, nn.Conv2d):
41
+ #init.kaiming_normal_(m.weight, mode='fan_out')
42
+ init.normal_(m.weight, std=0.01)
43
+ # init.xavier_normal_(m.weight)
44
+ if m.bias is not None:
45
+ init.constant_(m.bias, 0)
46
+ elif isinstance(m, nn.ConvTranspose2d):
47
+ #init.kaiming_normal_(m.weight, mode='fan_out')
48
+ init.normal_(m.weight, std=0.01)
49
+ # init.xavier_normal_(m.weight)
50
+ if m.bias is not None:
51
+ init.constant_(m.bias, 0)
52
+ elif isinstance(m, nn.BatchNorm2d): #nn.BatchNorm2d
53
+ init.constant_(m.weight, 1)
54
+ init.constant_(m.bias, 0)
55
+ elif isinstance(m, nn.Linear):
56
+ init.normal_(m.weight, std=0.01)
57
+ if m.bias is not None:
58
+ init.constant_(m.bias, 0)
59
+
60
+ class ATA(nn.Module):
61
+ def __init__(self, inchannels, reduction = 8):
62
+ super(ATA, self).__init__()
63
+ self.inchannels = inchannels
64
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
65
+ self.fc = nn.Sequential(nn.Linear(self.inchannels*2, self.inchannels // reduction),
66
+ nn.ReLU(inplace=True),
67
+ nn.Linear(self.inchannels // reduction, self.inchannels),
68
+ nn.Sigmoid())
69
+ self.init_params()
70
+
71
+ def forward(self, low_x, high_x):
72
+ n, c, _, _ = low_x.size()
73
+ x = torch.cat([low_x, high_x], 1)
74
+ x = self.avg_pool(x)
75
+ x = x.view(n, -1)
76
+ x = self.fc(x).view(n,c,1,1)
77
+ x = low_x * x + high_x
78
+
79
+ return x
80
+
81
+ def init_params(self):
82
+ for m in self.modules():
83
+ if isinstance(m, nn.Conv2d):
84
+ #init.kaiming_normal_(m.weight, mode='fan_out')
85
+ #init.normal(m.weight, std=0.01)
86
+ init.xavier_normal_(m.weight)
87
+ if m.bias is not None:
88
+ init.constant_(m.bias, 0)
89
+ elif isinstance(m, nn.ConvTranspose2d):
90
+ #init.kaiming_normal_(m.weight, mode='fan_out')
91
+ #init.normal_(m.weight, std=0.01)
92
+ init.xavier_normal_(m.weight)
93
+ if m.bias is not None:
94
+ init.constant_(m.bias, 0)
95
+ elif isinstance(m, nn.BatchNorm2d): #nn.BatchNorm2d
96
+ init.constant_(m.weight, 1)
97
+ init.constant_(m.bias, 0)
98
+ elif isinstance(m, nn.Linear):
99
+ init.normal_(m.weight, std=0.01)
100
+ if m.bias is not None:
101
+ init.constant_(m.bias, 0)
102
+
103
+
104
+ class FFM(nn.Module):
105
+ def __init__(self, inchannels, midchannels, outchannels, upfactor=2):
106
+ super(FFM, self).__init__()
107
+ self.inchannels = inchannels
108
+ self.midchannels = midchannels
109
+ self.outchannels = outchannels
110
+ self.upfactor = upfactor
111
+
112
+ self.ftb1 = FTB(inchannels=self.inchannels, midchannels=self.midchannels)
113
+ self.ftb2 = FTB(inchannels=self.midchannels, midchannels=self.outchannels)
114
+
115
+ self.upsample = nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True)
116
+
117
+ self.init_params()
118
+ #self.p1 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
119
+ #self.p2 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
120
+ #self.p3 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
121
+
122
+ def forward(self, low_x, high_x):
123
+ x = self.ftb1(low_x)
124
+
125
+ '''
126
+ x = torch.cat((x,high_x),1)
127
+ if x.shape[2] == 12:
128
+ x = self.p1(x)
129
+ elif x.shape[2] == 24:
130
+ x = self.p2(x)
131
+ elif x.shape[2] == 48:
132
+ x = self.p3(x)
133
+ '''
134
+ x = x + high_x ###high_x
135
+ x = self.ftb2(x)
136
+ x = self.upsample(x)
137
+
138
+ return x
139
+
140
+ def init_params(self):
141
+ for m in self.modules():
142
+ if isinstance(m, nn.Conv2d):
143
+ #init.kaiming_normal_(m.weight, mode='fan_out')
144
+ init.normal_(m.weight, std=0.01)
145
+ #init.xavier_normal_(m.weight)
146
+ if m.bias is not None:
147
+ init.constant_(m.bias, 0)
148
+ elif isinstance(m, nn.ConvTranspose2d):
149
+ #init.kaiming_normal_(m.weight, mode='fan_out')
150
+ init.normal_(m.weight, std=0.01)
151
+ #init.xavier_normal_(m.weight)
152
+ if m.bias is not None:
153
+ init.constant_(m.bias, 0)
154
+ elif isinstance(m, nn.BatchNorm2d): #nn.Batchnorm2d
155
+ init.constant_(m.weight, 1)
156
+ init.constant_(m.bias, 0)
157
+ elif isinstance(m, nn.Linear):
158
+ init.normal_(m.weight, std=0.01)
159
+ if m.bias is not None:
160
+ init.constant_(m.bias, 0)
161
+
162
+
163
+
164
+ class noFFM(nn.Module):
165
+ def __init__(self, inchannels, midchannels, outchannels, upfactor=2):
166
+ super(noFFM, self).__init__()
167
+ self.inchannels = inchannels
168
+ self.midchannels = midchannels
169
+ self.outchannels = outchannels
170
+ self.upfactor = upfactor
171
+
172
+ self.ftb2 = FTB(inchannels=self.midchannels, midchannels=self.outchannels)
173
+
174
+ self.upsample = nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True)
175
+
176
+ self.init_params()
177
+ #self.p1 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
178
+ #self.p2 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
179
+ #self.p3 = nn.Conv2d(512, 256, kernel_size=1, padding=0, bias=False)
180
+
181
+ def forward(self, low_x, high_x):
182
+
183
+ #x = self.ftb1(low_x)
184
+ x = high_x ###high_x
185
+ x = self.ftb2(x)
186
+ x = self.upsample(x)
187
+
188
+ return x
189
+
190
+ def init_params(self):
191
+ for m in self.modules():
192
+ if isinstance(m, nn.Conv2d):
193
+ #init.kaiming_normal_(m.weight, mode='fan_out')
194
+ init.normal_(m.weight, std=0.01)
195
+ #init.xavier_normal_(m.weight)
196
+ if m.bias is not None:
197
+ init.constant_(m.bias, 0)
198
+ elif isinstance(m, nn.ConvTranspose2d):
199
+ #init.kaiming_normal_(m.weight, mode='fan_out')
200
+ init.normal_(m.weight, std=0.01)
201
+ #init.xavier_normal_(m.weight)
202
+ if m.bias is not None:
203
+ init.constant_(m.bias, 0)
204
+ elif isinstance(m, nn.BatchNorm2d): #nn.Batchnorm2d
205
+ init.constant_(m.weight, 1)
206
+ init.constant_(m.bias, 0)
207
+ elif isinstance(m, nn.Linear):
208
+ init.normal_(m.weight, std=0.01)
209
+ if m.bias is not None:
210
+ init.constant_(m.bias, 0)
211
+
212
+
213
+
214
+
215
+ class AO(nn.Module):
216
+ # Adaptive output module
217
+ def __init__(self, inchannels, outchannels, upfactor=2):
218
+ super(AO, self).__init__()
219
+ self.inchannels = inchannels
220
+ self.outchannels = outchannels
221
+ self.upfactor = upfactor
222
+
223
+ """
224
+ self.adapt_conv = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.inchannels//2, kernel_size=3, padding=1, stride=1, bias=True),\
225
+ nn.BatchNorm2d(num_features=self.inchannels//2),\
226
+ nn.ReLU(inplace=True),\
227
+ nn.Conv2d(in_channels=self.inchannels//2, out_channels=self.outchannels, kernel_size=3, padding=1, stride=1, bias=True),\
228
+ nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True) )#,\
229
+ #nn.ReLU(inplace=True)) ## get positive values
230
+ """
231
+ self.adapt_conv = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.inchannels//2, kernel_size=3, padding=1, stride=1, bias=True),\
232
+ #nn.BatchNorm2d(num_features=self.inchannels//2),\
233
+ nn.ReLU(inplace=True),\
234
+ nn.Upsample(scale_factor=self.upfactor, mode='bilinear', align_corners=True), \
235
+ nn.Conv2d(in_channels=self.inchannels//2, out_channels=self.outchannels, kernel_size=1, padding=0, stride=1))
236
+
237
+ #nn.ReLU(inplace=True)) ## get positive values
238
+
239
+ self.init_params()
240
+
241
+ def forward(self, x):
242
+ x = self.adapt_conv(x)
243
+ return x
244
+
245
+ def init_params(self):
246
+ for m in self.modules():
247
+ if isinstance(m, nn.Conv2d):
248
+ #init.kaiming_normal_(m.weight, mode='fan_out')
249
+ init.normal_(m.weight, std=0.01)
250
+ #init.xavier_normal_(m.weight)
251
+ if m.bias is not None:
252
+ init.constant_(m.bias, 0)
253
+ elif isinstance(m, nn.ConvTranspose2d):
254
+ #init.kaiming_normal_(m.weight, mode='fan_out')
255
+ init.normal_(m.weight, std=0.01)
256
+ #init.xavier_normal_(m.weight)
257
+ if m.bias is not None:
258
+ init.constant_(m.bias, 0)
259
+ elif isinstance(m, nn.BatchNorm2d): #nn.Batchnorm2d
260
+ init.constant_(m.weight, 1)
261
+ init.constant_(m.bias, 0)
262
+ elif isinstance(m, nn.Linear):
263
+ init.normal_(m.weight, std=0.01)
264
+ if m.bias is not None:
265
+ init.constant_(m.bias, 0)
266
+
267
+ class ASPP(nn.Module):
268
+ def __init__(self, inchannels=256, planes=128, rates = [1, 6, 12, 18]):
269
+ super(ASPP, self).__init__()
270
+ self.inchannels = inchannels
271
+ self.planes = planes
272
+ self.rates = rates
273
+ self.kernel_sizes = []
274
+ self.paddings = []
275
+ for rate in self.rates:
276
+ if rate == 1:
277
+ self.kernel_sizes.append(1)
278
+ self.paddings.append(0)
279
+ else:
280
+ self.kernel_sizes.append(3)
281
+ self.paddings.append(rate)
282
+ self.atrous_0 = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.planes, kernel_size=self.kernel_sizes[0],
283
+ stride=1, padding=self.paddings[0], dilation=self.rates[0], bias=True),
284
+ nn.ReLU(inplace=True),
285
+ nn.BatchNorm2d(num_features=self.planes)
286
+ )
287
+ self.atrous_1 = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.planes, kernel_size=self.kernel_sizes[1],
288
+ stride=1, padding=self.paddings[1], dilation=self.rates[1], bias=True),
289
+ nn.ReLU(inplace=True),
290
+ nn.BatchNorm2d(num_features=self.planes),
291
+ )
292
+ self.atrous_2 = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.planes, kernel_size=self.kernel_sizes[2],
293
+ stride=1, padding=self.paddings[2], dilation=self.rates[2], bias=True),
294
+ nn.ReLU(inplace=True),
295
+ nn.BatchNorm2d(num_features=self.planes),
296
+ )
297
+ self.atrous_3 = nn.Sequential(nn.Conv2d(in_channels=self.inchannels, out_channels=self.planes, kernel_size=self.kernel_sizes[3],
298
+ stride=1, padding=self.paddings[3], dilation=self.rates[3], bias=True),
299
+ nn.ReLU(inplace=True),
300
+ nn.BatchNorm2d(num_features=self.planes),
301
+ )
302
+
303
+ #self.conv = nn.Conv2d(in_channels=self.planes * 4, out_channels=self.inchannels, kernel_size=3, padding=1, stride=1, bias=True)
304
+ def forward(self, x):
305
+ x = torch.cat([self.atrous_0(x), self.atrous_1(x), self.atrous_2(x), self.atrous_3(x)],1)
306
+ #x = self.conv(x)
307
+
308
+ return x
309
+
310
+ # ==============================================================================================================
311
+
312
+
313
+ class ResidualConv(nn.Module):
314
+ def __init__(self, inchannels):
315
+ super(ResidualConv, self).__init__()
316
+ #nn.BatchNorm2d
317
+ self.conv = nn.Sequential(
318
+ #nn.BatchNorm2d(num_features=inchannels),
319
+ nn.ReLU(inplace=False),
320
+ #nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=3, padding=1, stride=1, groups=inchannels,bias=True),
321
+ #nn.Conv2d(in_channels=inchannels, out_channels=inchannels, kernel_size=1, padding=0, stride=1, groups=1,bias=True)
322
+ nn.Conv2d(in_channels=inchannels, out_channels=inchannels//2, kernel_size=3, padding=1, stride=1, bias=False),
323
+ nn.BatchNorm2d(num_features=inchannels//2),
324
+ nn.ReLU(inplace=False),
325
+ nn.Conv2d(in_channels=inchannels//2, out_channels=inchannels, kernel_size=3, padding=1, stride=1, bias=False)
326
+ )
327
+ self.init_params()
328
+
329
+ def forward(self, x):
330
+ x = self.conv(x)+x
331
+ return x
332
+
333
+ def init_params(self):
334
+ for m in self.modules():
335
+ if isinstance(m, nn.Conv2d):
336
+ #init.kaiming_normal_(m.weight, mode='fan_out')
337
+ init.normal_(m.weight, std=0.01)
338
+ #init.xavier_normal_(m.weight)
339
+ if m.bias is not None:
340
+ init.constant_(m.bias, 0)
341
+ elif isinstance(m, nn.ConvTranspose2d):
342
+ #init.kaiming_normal_(m.weight, mode='fan_out')
343
+ init.normal_(m.weight, std=0.01)
344
+ #init.xavier_normal_(m.weight)
345
+ if m.bias is not None:
346
+ init.constant_(m.bias, 0)
347
+ elif isinstance(m, nn.BatchNorm2d): #nn.BatchNorm2d
348
+ init.constant_(m.weight, 1)
349
+ init.constant_(m.bias, 0)
350
+ elif isinstance(m, nn.Linear):
351
+ init.normal_(m.weight, std=0.01)
352
+ if m.bias is not None:
353
+ init.constant_(m.bias, 0)
354
+
355
+
356
+ class FeatureFusion(nn.Module):
357
+ def __init__(self, inchannels, outchannels):
358
+ super(FeatureFusion, self).__init__()
359
+ self.conv = ResidualConv(inchannels=inchannels)
360
+ #nn.BatchNorm2d
361
+ self.up = nn.Sequential(ResidualConv(inchannels=inchannels),
362
+ nn.ConvTranspose2d(in_channels=inchannels, out_channels=outchannels, kernel_size=3,stride=2, padding=1, output_padding=1),
363
+ nn.BatchNorm2d(num_features=outchannels),
364
+ nn.ReLU(inplace=True))
365
+
366
+ def forward(self, lowfeat, highfeat):
367
+ return self.up(highfeat + self.conv(lowfeat))
368
+
369
+ def init_params(self):
370
+ for m in self.modules():
371
+ if isinstance(m, nn.Conv2d):
372
+ #init.kaiming_normal_(m.weight, mode='fan_out')
373
+ init.normal_(m.weight, std=0.01)
374
+ #init.xavier_normal_(m.weight)
375
+ if m.bias is not None:
376
+ init.constant_(m.bias, 0)
377
+ elif isinstance(m, nn.ConvTranspose2d):
378
+ #init.kaiming_normal_(m.weight, mode='fan_out')
379
+ init.normal_(m.weight, std=0.01)
380
+ #init.xavier_normal_(m.weight)
381
+ if m.bias is not None:
382
+ init.constant_(m.bias, 0)
383
+ elif isinstance(m, nn.BatchNorm2d): #nn.BatchNorm2d
384
+ init.constant_(m.weight, 1)
385
+ init.constant_(m.bias, 0)
386
+ elif isinstance(m, nn.Linear):
387
+ init.normal_(m.weight, std=0.01)
388
+ if m.bias is not None:
389
+ init.constant_(m.bias, 0)
390
+
391
+
392
+ class SenceUnderstand(nn.Module):
393
+ def __init__(self, channels):
394
+ super(SenceUnderstand, self).__init__()
395
+ self.channels = channels
396
+ self.conv1 = nn.Sequential(nn.Conv2d(in_channels=512, out_channels=512, kernel_size=3, padding=1),
397
+ nn.ReLU(inplace = True))
398
+ self.pool = nn.AdaptiveAvgPool2d(8)
399
+ self.fc = nn.Sequential(nn.Linear(512*8*8, self.channels),
400
+ nn.ReLU(inplace = True))
401
+ self.conv2 = nn.Sequential(nn.Conv2d(in_channels=self.channels, out_channels=self.channels, kernel_size=1, padding=0),
402
+ nn.ReLU(inplace=True))
403
+ self.initial_params()
404
+
405
+ def forward(self, x):
406
+ n,c,h,w = x.size()
407
+ x = self.conv1(x)
408
+ x = self.pool(x)
409
+ x = x.view(n,-1)
410
+ x = self.fc(x)
411
+ x = x.view(n, self.channels, 1, 1)
412
+ x = self.conv2(x)
413
+ x = x.repeat(1,1,h,w)
414
+ return x
415
+
416
+ def initial_params(self, dev=0.01):
417
+ for m in self.modules():
418
+ if isinstance(m, nn.Conv2d):
419
+ #print torch.sum(m.weight)
420
+ m.weight.data.normal_(0, dev)
421
+ if m.bias is not None:
422
+ m.bias.data.fill_(0)
423
+ elif isinstance(m, nn.ConvTranspose2d):
424
+ #print torch.sum(m.weight)
425
+ m.weight.data.normal_(0, dev)
426
+ if m.bias is not None:
427
+ m.bias.data.fill_(0)
428
+ elif isinstance(m, nn.Linear):
429
+ m.weight.data.normal_(0, dev)
models/SpaTrackV2/models/depth_refiner/stablilization_attention.py ADDED
@@ -0,0 +1,1187 @@
1
+ import math
2
+ import time
3
+ import torch
4
+ import torch.nn as nn
5
+ import torch.nn.functional as F
6
+ import torch.utils.checkpoint as checkpoint
7
+ from timm.layers import DropPath, to_2tuple, trunc_normal_
8
+ from einops import rearrange
9
+
10
+ class Mlp(nn.Module):
11
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
12
+ super().__init__()
13
+ out_features = out_features or in_features
14
+ hidden_features = hidden_features or in_features
15
+ self.fc1 = nn.Linear(in_features, hidden_features)
16
+ self.act = act_layer()
17
+ self.fc2 = nn.Linear(hidden_features, out_features)
18
+ self.drop = nn.Dropout(drop)
19
+
20
+ def forward(self, x):
21
+ x = self.fc1(x)
22
+ x = self.act(x)
23
+ x = self.drop(x)
24
+ x = self.fc2(x)
25
+ x = self.drop(x)
26
+ return x
27
+
28
+
29
+ def window_partition(x, window_size):
30
+ """
31
+ Args:
32
+ x: (B, H, W, C)
33
+ window_size (int): window size
34
+
35
+ Returns:
36
+ windows: (num_windows*B, window_size, window_size, C)
37
+ """
38
+ B, H, W, C = x.shape
39
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
40
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
41
+ return windows
42
+
43
+ def window_partition_noreshape(x, window_size):
44
+ """
45
+ Args:
46
+ x: (B, H, W, C)
47
+ window_size (int): window size
48
+
49
+ Returns:
50
+ windows: (B, num_windows_h, num_windows_w, window_size, window_size, C)
51
+ """
52
+ B, H, W, C = x.shape
53
+ x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
54
+ windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
55
+ return windows
56
+
57
+ def window_reverse(windows, window_size, H, W):
58
+ """
59
+ Args:
60
+ windows: (num_windows*B, window_size, window_size, C)
61
+ window_size (int): Window size
62
+ H (int): Height of image
63
+ W (int): Width of image
64
+
65
+ Returns:
66
+ x: (B, H, W, C)
67
+ """
68
+ B = int(windows.shape[0] / (H * W / window_size / window_size))
69
+ x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
70
+ x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
71
+ return x
72
+
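
A minimal round-trip check of `window_partition` / `window_reverse` (the two functions are repeated verbatim so the snippet runs on its own): when H and W are divisible by `window_size`, `window_reverse` exactly inverts `window_partition`, which is what the window attention below relies on.

import torch

def window_partition(x, window_size):
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)

def window_reverse(windows, window_size, H, W):
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)

x = torch.randn(2, 8, 8, 16)     # B, H, W, C with H, W multiples of window_size
w = window_partition(x, 4)       # -> (2 * 2 * 2, 4, 4, 16): four windows per image
y = window_reverse(w, 4, 8, 8)   # -> (2, 8, 8, 16)
assert w.shape == (8, 4, 4, 16) and torch.equal(x, y)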
73
+ def get_roll_masks(H, W, window_size, shift_size):
74
+ #####################################
75
+ # move to top-left
76
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
77
+ h_slices = (slice(0, H-window_size),
78
+ slice(H-window_size, H-shift_size),
79
+ slice(H-shift_size, H))
80
+ w_slices = (slice(0, W-window_size),
81
+ slice(W-window_size, W-shift_size),
82
+ slice(W-shift_size, W))
83
+ cnt = 0
84
+ for h in h_slices:
85
+ for w in w_slices:
86
+ img_mask[:, h, w, :] = cnt
87
+ cnt += 1
88
+
89
+ mask_windows = window_partition(img_mask, window_size) # nW, window_size, window_size, 1
90
+ mask_windows = mask_windows.view(-1, window_size * window_size)
91
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
92
+ attn_mask_tl = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
93
+
94
+ ####################################
95
+ # move to top right
96
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
97
+ h_slices = (slice(0, H-window_size),
98
+ slice(H-window_size, H-shift_size),
99
+ slice(H-shift_size, H))
100
+ w_slices = (slice(0, shift_size),
101
+ slice(shift_size, window_size),
102
+ slice(window_size, W))
103
+ cnt = 0
104
+ for h in h_slices:
105
+ for w in w_slices:
106
+ img_mask[:, h, w, :] = cnt
107
+ cnt += 1
108
+
109
+ mask_windows = window_partition(img_mask, window_size) # nW, window_size, window_size, 1
110
+ mask_windows = mask_windows.view(-1, window_size * window_size)
111
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
112
+ attn_mask_tr = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
113
+
114
+ ####################################
115
+ # move to bottom left
116
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
117
+ h_slices = (slice(0, shift_size),
118
+ slice(shift_size, window_size),
119
+ slice(window_size, H))
120
+ w_slices = (slice(0, W-window_size),
121
+ slice(W-window_size, W-shift_size),
122
+ slice(W-shift_size, W))
123
+ cnt = 0
124
+ for h in h_slices:
125
+ for w in w_slices:
126
+ img_mask[:, h, w, :] = cnt
127
+ cnt += 1
128
+
129
+ mask_windows = window_partition(img_mask, window_size) # nW, window_size, window_size, 1
130
+ mask_windows = mask_windows.view(-1, window_size * window_size)
131
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
132
+ attn_mask_bl = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
133
+
134
+ ####################################
135
+ # move to bottom right
136
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
137
+ h_slices = (slice(0, shift_size),
138
+ slice(shift_size, window_size),
139
+ slice(window_size, H))
140
+ w_slices = (slice(0, shift_size),
141
+ slice(shift_size, window_size),
142
+ slice(window_size, W))
143
+ cnt = 0
144
+ for h in h_slices:
145
+ for w in w_slices:
146
+ img_mask[:, h, w, :] = cnt
147
+ cnt += 1
148
+
149
+ mask_windows = window_partition(img_mask, window_size) # nW, window_size, window_size, 1
150
+ mask_windows = mask_windows.view(-1, window_size * window_size)
151
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
152
+ attn_mask_br = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
153
+
154
+ # append all
155
+ attn_mask_all = torch.cat((attn_mask_tl, attn_mask_tr, attn_mask_bl, attn_mask_br), -1)
156
+ return attn_mask_all
157
+
158
+ def get_relative_position_index(q_windows, k_windows):
159
+ """
160
+ Args:
161
+ q_windows: tuple (query_window_height, query_window_width)
162
+ k_windows: tuple (key_window_height, key_window_width)
163
+
164
+ Returns:
165
+ relative_position_index: query_window_height*query_window_width, key_window_height*key_window_width
166
+ """
167
+ # get pair-wise relative position index for each token inside the window
168
+ coords_h_q = torch.arange(q_windows[0])
169
+ coords_w_q = torch.arange(q_windows[1])
170
+ coords_q = torch.stack(torch.meshgrid([coords_h_q, coords_w_q])) # 2, Wh_q, Ww_q
171
+
172
+ coords_h_k = torch.arange(k_windows[0])
173
+ coords_w_k = torch.arange(k_windows[1])
174
+ coords_k = torch.stack(torch.meshgrid([coords_h_k, coords_w_k])) # 2, Wh, Ww
175
+
176
+ coords_flatten_q = torch.flatten(coords_q, 1) # 2, Wh_q*Ww_q
177
+ coords_flatten_k = torch.flatten(coords_k, 1) # 2, Wh_k*Ww_k
178
+
179
+ relative_coords = coords_flatten_q[:, :, None] - coords_flatten_k[:, None, :] # 2, Wh_q*Ww_q, Wh_k*Ww_k
180
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh_q*Ww_q, Wh_k*Ww_k, 2
181
+ relative_coords[:, :, 0] += k_windows[0] - 1 # shift to start from 0
182
+ relative_coords[:, :, 1] += k_windows[1] - 1
183
+ relative_coords[:, :, 0] *= (q_windows[1] + k_windows[1]) - 1
184
+ relative_position_index = relative_coords.sum(-1) # Wh_q*Ww_q, Wh_k*Ww_k
185
+ return relative_position_index
186
+
187
+ def get_relative_position_index3d(q_windows, k_windows, num_clips):
188
+ """
189
+ Args:
190
+ q_windows: tuple (query_window_height, query_window_width)
191
+ k_windows: tuple (key_window_height, key_window_width)
192
+
193
+ Returns:
194
+ relative_position_index: query_window_height*query_window_width, key_window_height*key_window_width
195
+ """
196
+ # get pair-wise relative position index for each token inside the window
197
+ coords_d_q = torch.arange(num_clips)
198
+ coords_h_q = torch.arange(q_windows[0])
199
+ coords_w_q = torch.arange(q_windows[1])
200
+ coords_q = torch.stack(torch.meshgrid([coords_d_q, coords_h_q, coords_w_q])) # 3, num_clips, Wh_q, Ww_q
201
+
202
+ coords_d_k = torch.arange(num_clips)
203
+ coords_h_k = torch.arange(k_windows[0])
204
+ coords_w_k = torch.arange(k_windows[1])
205
+ coords_k = torch.stack(torch.meshgrid([coords_d_k, coords_h_k, coords_w_k])) # 3, num_clips, Wh_k, Ww_k
206
+
207
+ coords_flatten_q = torch.flatten(coords_q, 1) # 3, num_clips*Wh_q*Ww_q
208
+ coords_flatten_k = torch.flatten(coords_k, 1) # 3, num_clips*Wh_k*Ww_k
209
+
210
+ relative_coords = coords_flatten_q[:, :, None] - coords_flatten_k[:, None, :] # 3, N_q, N_k
211
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # N_q, N_k, 3
212
+ relative_coords[:, :, 0] += num_clips - 1 # shift to start from 0
213
+ relative_coords[:, :, 1] += k_windows[0] - 1
214
+ relative_coords[:, :, 2] += k_windows[1] - 1
215
+ relative_coords[:, :, 0] *= (q_windows[0] + k_windows[0] - 1)*(q_windows[1] + k_windows[1] - 1)
216
+ relative_coords[:, :, 1] *= (q_windows[1] + k_windows[1] - 1)
217
+ relative_position_index = relative_coords.sum(-1) # Wh_q*Ww_q, Wh_k*Ww_k
218
+ return relative_position_index
219
+
220
+
221
+ class WindowAttention3d3(nn.Module):
222
+ r""" Window based multi-head self attention (W-MSA) module with relative position bias.
223
+
224
+ Args:
225
+ dim (int): Number of input channels.
226
+ expand_size (int): The expand size at focal level 1.
227
+ window_size (tuple[int]): The height and width of the window.
228
+ focal_window (int): Focal region size.
229
+ focal_level (int): Focal attention level.
230
+ num_heads (int): Number of attention heads.
231
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
232
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
233
+ attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
234
+ proj_drop (float, optional): Dropout ratio of output. Default: 0.0
235
+ pool_method (str): window pooling method. Default: none
236
+ """
237
+
238
+ def __init__(self, dim, expand_size, window_size, focal_window, focal_level, num_heads,
239
+ qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0., pool_method="none", focal_l_clips=[7,1,2], focal_kernel_clips=[7,5,3]):
240
+
241
+ super().__init__()
242
+ self.dim = dim
243
+ self.expand_size = expand_size
244
+ self.window_size = window_size # Wh, Ww
245
+ self.pool_method = pool_method
246
+ self.num_heads = num_heads
247
+ head_dim = dim // num_heads
248
+ self.scale = qk_scale or head_dim ** -0.5
249
+ self.focal_level = focal_level
250
+ self.focal_window = focal_window
251
+
252
+ # define a parameter table of relative position bias for each window
253
+ self.relative_position_bias_table = nn.Parameter(
254
+ torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH
255
+
256
+ # get pair-wise relative position index for each token inside the window
257
+ coords_h = torch.arange(self.window_size[0])
258
+ coords_w = torch.arange(self.window_size[1])
259
+ coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
260
+ coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww
261
+ relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww
262
+ relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2
263
+ relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0
264
+ relative_coords[:, :, 1] += self.window_size[1] - 1
265
+ relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
266
+ relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww
267
+ self.register_buffer("relative_position_index", relative_position_index)
268
+
269
+ num_clips=4
270
+ # # define a parameter table of relative position bias
271
+ # self.relative_position_bias_table = nn.Parameter(
272
+ # torch.zeros((2 * num_clips - 1) * (2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
273
+
274
+ # # get pair-wise relative position index for each token inside the window
275
+ # coords_d = torch.arange(num_clips)
276
+ # coords_h = torch.arange(self.window_size[0])
277
+ # coords_w = torch.arange(self.window_size[1])
278
+ # coords = torch.stack(torch.meshgrid(coords_d, coords_h, coords_w)) # 3, Wd, Wh, Ww
279
+ # coords_flatten = torch.flatten(coords, 1) # 3, Wd*Wh*Ww
280
+ # relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 3, Wd*Wh*Ww, Wd*Wh*Ww
281
+ # relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wd*Wh*Ww, Wd*Wh*Ww, 3
282
+ # relative_coords[:, :, 0] += num_clips - 1 # shift to start from 0
283
+ # relative_coords[:, :, 1] += self.window_size[0] - 1
284
+ # relative_coords[:, :, 2] += self.window_size[1] - 1
285
+
286
+ # relative_coords[:, :, 0] *= (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1)
287
+ # relative_coords[:, :, 1] *= (2 * self.window_size[1] - 1)
288
+ # relative_position_index = relative_coords.sum(-1) # Wd*Wh*Ww, Wd*Wh*Ww
289
+ # self.register_buffer("relative_position_index", relative_position_index)
290
+
291
+
292
+ if self.expand_size > 0 and focal_level > 0:
293
+ # define a parameter table of position bias between window and its fine-grained surroundings
294
+ self.window_size_of_key = self.window_size[0] * self.window_size[1] if self.expand_size == 0 else \
295
+ (4 * self.window_size[0] * self.window_size[1] - 4 * (self.window_size[0] - self.expand_size) * (self.window_size[0] - self.expand_size))
296
+ self.relative_position_bias_table_to_neighbors = nn.Parameter(
297
+ torch.zeros(1, num_heads, self.window_size[0] * self.window_size[1], self.window_size_of_key)) # Wh*Ww, nH, nSurrounding
298
+ trunc_normal_(self.relative_position_bias_table_to_neighbors, std=.02)
299
+
300
+ # get mask for rolled k and rolled v
301
+ mask_tl = torch.ones(self.window_size[0], self.window_size[1]); mask_tl[:-self.expand_size, :-self.expand_size] = 0
302
+ mask_tr = torch.ones(self.window_size[0], self.window_size[1]); mask_tr[:-self.expand_size, self.expand_size:] = 0
303
+ mask_bl = torch.ones(self.window_size[0], self.window_size[1]); mask_bl[self.expand_size:, :-self.expand_size] = 0
304
+ mask_br = torch.ones(self.window_size[0], self.window_size[1]); mask_br[self.expand_size:, self.expand_size:] = 0
305
+ mask_rolled = torch.stack((mask_tl, mask_tr, mask_bl, mask_br), 0).flatten(0)
306
+ self.register_buffer("valid_ind_rolled", mask_rolled.nonzero().view(-1))
307
+
308
+ if pool_method != "none" and focal_level > 1:
309
+ #self.relative_position_bias_table_to_windows = nn.ParameterList()
310
+ #self.relative_position_bias_table_to_windows_clips = nn.ParameterList()
311
+ #self.register_parameter('relative_position_bias_table_to_windows',[])
312
+ #self.register_parameter('relative_position_bias_table_to_windows_clips',[])
313
+ self.unfolds = nn.ModuleList()
314
+ self.unfolds_clips=nn.ModuleList()
315
+
316
+ # build relative position bias between local patch and pooled windows
317
+ for k in range(focal_level-1):
318
+ stride = 2**k
319
+ kernel_size = 2*(self.focal_window // 2) + 2**k + (2**k-1)
320
+ # define unfolding operations
321
+ self.unfolds += [nn.Unfold(
322
+ kernel_size=(kernel_size, kernel_size),
323
+ stride=stride, padding=kernel_size // 2)
324
+ ]
325
+
326
+ # define relative position bias table
327
+ relative_position_bias_table_to_windows = nn.Parameter(
328
+ torch.zeros(
329
+ self.num_heads,
330
+ (self.window_size[0] + self.focal_window + 2**k - 2) * (self.window_size[1] + self.focal_window + 2**k - 2),
331
+ )
332
+ )
333
+ trunc_normal_(relative_position_bias_table_to_windows, std=.02)
334
+ #self.relative_position_bias_table_to_windows.append(relative_position_bias_table_to_windows)
335
+ self.register_parameter('relative_position_bias_table_to_windows_{}'.format(k),relative_position_bias_table_to_windows)
336
+
337
+ # define relative position bias index
338
+ relative_position_index_k = get_relative_position_index(self.window_size, to_2tuple(self.focal_window + 2**k - 1))
339
+ # relative_position_index_k = get_relative_position_index3d(self.window_size, to_2tuple(self.focal_window + 2**k - 1), num_clips)
340
+ self.register_buffer("relative_position_index_{}".format(k), relative_position_index_k)
341
+
342
+ # define unfolding index for focal_level > 0
343
+ if k > 0:
344
+ mask = torch.zeros(kernel_size, kernel_size); mask[(2**k)-1:, (2**k)-1:] = 1
345
+ self.register_buffer("valid_ind_unfold_{}".format(k), mask.flatten(0).nonzero().view(-1))
346
+
347
+ for k in range(len(focal_l_clips)):
348
+ # kernel_size=focal_kernel_clips[k]
349
+ focal_l_big_flag=False
350
+ if focal_l_clips[k]>self.window_size[0]:
351
+ stride=1
352
+ padding=0
353
+ kernel_size=focal_kernel_clips[k]
354
+ kernel_size_true=kernel_size
355
+ focal_l_big_flag=True
356
+ # stride=math.ceil(self.window_size/focal_l_clips[k])
357
+ # padding=(kernel_size-stride)/2
358
+ else:
359
+ stride = focal_l_clips[k]
360
+ # kernel_size
361
+ # kernel_size = 2*(focal_kernel_clips[k]// 2) + 2**focal_l_clips[k] + (2**focal_l_clips[k]-1)
362
+ kernel_size = focal_kernel_clips[k] ## kernel_size must be odd
363
+ assert kernel_size%2==1
364
+ padding=kernel_size // 2
365
+ # kernel_size_true=focal_kernel_clips[k]+2**focal_l_clips[k]-1
366
+ kernel_size_true=kernel_size
367
+ # stride=math.ceil(self.window_size/focal_l_clips[k])
368
+
369
+ self.unfolds_clips += [nn.Unfold(
370
+ kernel_size=(kernel_size, kernel_size),
371
+ stride=stride,
372
+ padding=padding)
373
+ ]
374
+ relative_position_bias_table_to_windows = nn.Parameter(
375
+ torch.zeros(
376
+ self.num_heads,
377
+ (self.window_size[0] + kernel_size_true - 1) * (self.window_size[0] + kernel_size_true - 1),
378
+ )
379
+ )
380
+ trunc_normal_(relative_position_bias_table_to_windows, std=.02)
381
+ #self.relative_position_bias_table_to_windows_clips.append(relative_position_bias_table_to_windows)
382
+ self.register_parameter('relative_position_bias_table_to_windows_clips_{}'.format(k),relative_position_bias_table_to_windows)
383
+ relative_position_index_k = get_relative_position_index(self.window_size, to_2tuple(kernel_size_true))
384
+ self.register_buffer("relative_position_index_clips_{}".format(k), relative_position_index_k)
385
+ # if (not focal_l_big_flag) and focal_l_clips[k]>0:
386
+ # mask = torch.zeros(kernel_size, kernel_size); mask[(2**focal_l_clips[k])-1:, (2**focal_l_clips[k])-1:] = 1
387
+ # self.register_buffer("valid_ind_unfold_clips_{}".format(k), mask.flatten(0).nonzero().view(-1))
388
+
389
+
390
+
391
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
392
+ self.attn_drop = nn.Dropout(attn_drop)
393
+ self.proj = nn.Linear(dim, dim)
394
+ self.proj_drop = nn.Dropout(proj_drop)
395
+
396
+ self.softmax = nn.Softmax(dim=-1)
397
+ self.focal_l_clips=focal_l_clips
398
+ self.focal_kernel_clips=focal_kernel_clips
399
+
400
+ def forward(self, x_all, mask_all=None, batch_size=None, num_clips=None):
401
+ """
402
+ Args:
403
+ x_all (list[Tensors]): input features at different granularity
404
+ mask_all (list[Tensors/None]): masks for input features at different granularity
405
+ """
406
+ x = x_all[0][0] #
407
+
408
+ B0, nH, nW, C = x.shape
409
+ # assert B==batch_size*num_clips
410
+ assert B0==batch_size
411
+ qkv = self.qkv(x).reshape(B0, nH, nW, 3, C).permute(3, 0, 1, 2, 4).contiguous()
412
+ q, k, v = qkv[0], qkv[1], qkv[2] # B0, nH, nW, C
413
+
414
+ # partition q map
415
+ # print("x.shape: ", x.shape)
416
+ # print("q.shape: ", q.shape) # [4, 126, 126, 256]
417
+ (q_windows, k_windows, v_windows) = map(
418
+ lambda t: window_partition(t, self.window_size[0]).view(
419
+ -1, self.window_size[0] * self.window_size[0], self.num_heads, C // self.num_heads
420
+ ).transpose(1, 2),
421
+ (q, k, v)
422
+ )
423
+
424
+ # q_dim0, q_dim1, q_dim2, q_dim3=q_windows.shape
425
+ # q_windows=q_windows.view(batch_size, num_clips, (nH//self.window_size[0])*(nW//self.window_size[1]), q_dim1, q_dim2, q_dim3)
426
+ # q_windows=q_windows[:,-1].contiguous().view(-1, q_dim1, q_dim2, q_dim3) # query for the last frame (target frame)
427
+
428
+ # k_windows.shape [1296, 8, 49, 32]
429
+
430
+ if self.expand_size > 0 and self.focal_level > 0:
431
+ (k_tl, v_tl) = map(
432
+ lambda t: torch.roll(t, shifts=(-self.expand_size, -self.expand_size), dims=(1, 2)), (k, v)
433
+ )
434
+ (k_tr, v_tr) = map(
435
+ lambda t: torch.roll(t, shifts=(-self.expand_size, self.expand_size), dims=(1, 2)), (k, v)
436
+ )
437
+ (k_bl, v_bl) = map(
438
+ lambda t: torch.roll(t, shifts=(self.expand_size, -self.expand_size), dims=(1, 2)), (k, v)
439
+ )
440
+ (k_br, v_br) = map(
441
+ lambda t: torch.roll(t, shifts=(self.expand_size, self.expand_size), dims=(1, 2)), (k, v)
442
+ )
443
+
444
+ (k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows) = map(
445
+ lambda t: window_partition(t, self.window_size[0]).view(-1, self.window_size[0] * self.window_size[0], self.num_heads, C // self.num_heads),
446
+ (k_tl, k_tr, k_bl, k_br)
447
+ )
448
+ (v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows) = map(
449
+ lambda t: window_partition(t, self.window_size[0]).view(-1, self.window_size[0] * self.window_size[0], self.num_heads, C // self.num_heads),
450
+ (v_tl, v_tr, v_bl, v_br)
451
+ )
452
+ k_rolled = torch.cat((k_tl_windows, k_tr_windows, k_bl_windows, k_br_windows), 1).transpose(1, 2)
453
+ v_rolled = torch.cat((v_tl_windows, v_tr_windows, v_bl_windows, v_br_windows), 1).transpose(1, 2)
454
+
455
+ # mask out tokens in current window
456
+ # print("self.valid_ind_rolled.shape: ", self.valid_ind_rolled.shape) # [132]
457
+ # print("k_rolled.shape: ", k_rolled.shape) # [1296, 8, 196, 32]
458
+ k_rolled = k_rolled[:, :, self.valid_ind_rolled]
459
+ v_rolled = v_rolled[:, :, self.valid_ind_rolled]
460
+ k_rolled = torch.cat((k_windows, k_rolled), 2)
461
+ v_rolled = torch.cat((v_windows, v_rolled), 2)
462
+ else:
463
+            k_rolled = k_windows; v_rolled = v_windows
464
+
465
+ # print("k_rolled.shape: ", k_rolled.shape) # [1296, 8, 181, 32]
466
+
467
+ if self.pool_method != "none" and self.focal_level > 1:
468
+ k_pooled = []
469
+ v_pooled = []
470
+ for k in range(self.focal_level-1):
471
+ stride = 2**k
472
+ x_window_pooled = x_all[0][k+1] # B0, nWh, nWw, C
473
+ nWh, nWw = x_window_pooled.shape[1:3]
474
+
475
+ # generate mask for pooled windows
476
+ # print("x_window_pooled.shape: ", x_window_pooled.shape)
477
+ mask = x_window_pooled.new(nWh, nWw).fill_(1)
478
+ # print("here: ",x_window_pooled.shape, self.unfolds[k].kernel_size, self.unfolds[k](mask.unsqueeze(0).unsqueeze(1)).shape)
479
+ # print(mask.unique())
480
+ unfolded_mask = self.unfolds[k](mask.unsqueeze(0).unsqueeze(1)).view(
481
+ 1, 1, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
482
+ view(nWh*nWw // stride // stride, -1, 1)
483
+
484
+ if k > 0:
485
+ valid_ind_unfold_k = getattr(self, "valid_ind_unfold_{}".format(k))
486
+ unfolded_mask = unfolded_mask[:, valid_ind_unfold_k]
487
+
488
+ # print("unfolded_mask.shape: ", unfolded_mask.shape, unfolded_mask.unique())
489
+ x_window_masks = unfolded_mask.flatten(1).unsqueeze(0)
490
+ # print((x_window_masks == 0).sum(), (x_window_masks > 0).sum(), x_window_masks.unique())
491
+ x_window_masks = x_window_masks.masked_fill(x_window_masks == 0, float(-100.0)).masked_fill(x_window_masks > 0, float(0.0))
492
+ # print(x_window_masks.shape)
493
+ mask_all[0][k+1] = x_window_masks
494
+
495
+ # generate k and v for pooled windows
496
+ qkv_pooled = self.qkv(x_window_pooled).reshape(B0, nWh, nWw, 3, C).permute(3, 0, 4, 1, 2).contiguous()
497
+ k_pooled_k, v_pooled_k = qkv_pooled[1], qkv_pooled[2] # B0, C, nWh, nWw
498
+
499
+
500
+ (k_pooled_k, v_pooled_k) = map(
501
+ lambda t: self.unfolds[k](t).view(
502
+ B0, C, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
503
+ view(-1, self.unfolds[k].kernel_size[0]*self.unfolds[k].kernel_size[1], self.num_heads, C // self.num_heads).transpose(1, 2),
504
+ (k_pooled_k, v_pooled_k) # (B0 x (nH*nW)) x nHeads x (unfold_wsize x unfold_wsize) x head_dim
505
+ )
506
+
507
+ # print("k_pooled_k.shape: ", k_pooled_k.shape)
508
+ # print("valid_ind_unfold_k.shape: ", valid_ind_unfold_k.shape)
509
+
510
+ if k > 0:
511
+ (k_pooled_k, v_pooled_k) = map(
512
+ lambda t: t[:, :, valid_ind_unfold_k], (k_pooled_k, v_pooled_k)
513
+ )
514
+
515
+ # print("k_pooled_k.shape: ", k_pooled_k.shape)
516
+
517
+ k_pooled += [k_pooled_k]
518
+ v_pooled += [v_pooled_k]
519
+
520
+ for k in range(len(self.focal_l_clips)):
521
+ focal_l_big_flag=False
522
+ if self.focal_l_clips[k]>self.window_size[0]:
523
+ stride=1
524
+ focal_l_big_flag=True
525
+ else:
526
+ stride = self.focal_l_clips[k]
527
+ # if self.window_size>=focal_l_clips[k]:
528
+ # stride=math.ceil(self.window_size/focal_l_clips[k])
529
+ # # padding=(kernel_size-stride)/2
530
+ # else:
531
+ # stride=1
532
+ # padding=0
533
+ x_window_pooled = x_all[k+1]
534
+ nWh, nWw = x_window_pooled.shape[1:3]
535
+ mask = x_window_pooled.new(nWh, nWw).fill_(1)
536
+
537
+ # import pdb; pdb.set_trace()
538
+ # print(x_window_pooled.shape, self.unfolds_clips[k].kernel_size, self.unfolds_clips[k](mask.unsqueeze(0).unsqueeze(1)).shape)
539
+
540
+ unfolded_mask = self.unfolds_clips[k](mask.unsqueeze(0).unsqueeze(1)).view(
541
+ 1, 1, self.unfolds_clips[k].kernel_size[0], self.unfolds_clips[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
542
+ view(nWh*nWw // stride // stride, -1, 1)
543
+
544
+ # if (not focal_l_big_flag) and self.focal_l_clips[k]>0:
545
+ # valid_ind_unfold_k = getattr(self, "valid_ind_unfold_clips_{}".format(k))
546
+ # unfolded_mask = unfolded_mask[:, valid_ind_unfold_k]
547
+
548
+ # print("unfolded_mask.shape: ", unfolded_mask.shape, unfolded_mask.unique())
549
+ x_window_masks = unfolded_mask.flatten(1).unsqueeze(0)
550
+ # print((x_window_masks == 0).sum(), (x_window_masks > 0).sum(), x_window_masks.unique())
551
+ x_window_masks = x_window_masks.masked_fill(x_window_masks == 0, float(-100.0)).masked_fill(x_window_masks > 0, float(0.0))
552
+ # print(x_window_masks.shape)
553
+ mask_all[k+1] = x_window_masks
554
+
555
+ # generate k and v for pooled windows
556
+ qkv_pooled = self.qkv(x_window_pooled).reshape(B0, nWh, nWw, 3, C).permute(3, 0, 4, 1, 2).contiguous()
557
+ k_pooled_k, v_pooled_k = qkv_pooled[1], qkv_pooled[2] # B0, C, nWh, nWw
558
+
559
+ if (not focal_l_big_flag):
560
+ (k_pooled_k, v_pooled_k) = map(
561
+ lambda t: self.unfolds_clips[k](t).view(
562
+ B0, C, self.unfolds_clips[k].kernel_size[0], self.unfolds_clips[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
563
+ view(-1, self.unfolds_clips[k].kernel_size[0]*self.unfolds_clips[k].kernel_size[1], self.num_heads, C // self.num_heads).transpose(1, 2),
564
+ (k_pooled_k, v_pooled_k) # (B0 x (nH*nW)) x nHeads x (unfold_wsize x unfold_wsize) x head_dim
565
+ )
566
+ else:
567
+
568
+ (k_pooled_k, v_pooled_k) = map(
569
+ lambda t: self.unfolds_clips[k](t),
570
+ (k_pooled_k, v_pooled_k) # (B0 x (nH*nW)) x nHeads x (unfold_wsize x unfold_wsize) x head_dim
571
+ )
572
+ LLL=k_pooled_k.size(2)
573
+ LLL_h=int(LLL**0.5)
574
+ assert LLL_h**2==LLL
575
+ k_pooled_k=k_pooled_k.reshape(B0, -1, LLL_h, LLL_h)
576
+ v_pooled_k=v_pooled_k.reshape(B0, -1, LLL_h, LLL_h)
577
+
578
+
579
+
580
+ # print("k_pooled_k.shape: ", k_pooled_k.shape)
581
+ # print("valid_ind_unfold_k.shape: ", valid_ind_unfold_k.shape)
582
+ # if (not focal_l_big_flag) and self.focal_l_clips[k]:
583
+ # (k_pooled_k, v_pooled_k) = map(
584
+ # lambda t: t[:, :, valid_ind_unfold_k], (k_pooled_k, v_pooled_k)
585
+ # )
586
+
587
+ # print("k_pooled_k.shape: ", k_pooled_k.shape)
588
+
589
+ k_pooled += [k_pooled_k]
590
+ v_pooled += [v_pooled_k]
591
+
592
+ # qkv_pooled = self.qkv(x_window_pooled).reshape(B0, nWh, nWw, 3, C).permute(3, 0, 4, 1, 2).contiguous()
593
+ # k_pooled_k, v_pooled_k = qkv_pooled[1], qkv_pooled[2] # B0, C, nWh, nWw
594
+ # (k_pooled_k, v_pooled_k) = map(
595
+ # lambda t: self.unfolds[k](t).view(
596
+ # B0, C, self.unfolds[k].kernel_size[0], self.unfolds[k].kernel_size[1], -1).permute(0, 4, 2, 3, 1).contiguous().\
597
+ # view(-1, self.unfolds[k].kernel_size[0]*self.unfolds[k].kernel_size[1], self.num_heads, C // self.num_heads).transpose(1, 2),
598
+ # (k_pooled_k, v_pooled_k) # (B0 x (nH*nW)) x nHeads x (unfold_wsize x unfold_wsize) x head_dim
599
+ # )
600
+ # k_pooled += [k_pooled_k]
601
+ # v_pooled += [v_pooled_k]
602
+
603
+
604
+ k_all = torch.cat([k_rolled] + k_pooled, 2)
605
+ v_all = torch.cat([v_rolled] + v_pooled, 2)
606
+ else:
607
+ k_all = k_rolled
608
+ v_all = v_rolled
609
+
610
+ N = k_all.shape[-2]
611
+ q_windows = q_windows * self.scale
612
+ # print(q_windows.shape, k_all.shape, v_all.shape)
613
+ # exit()
614
+ # k_all_dim0, k_all_dim1, k_all_dim2, k_all_dim3=k_all.shape
615
+ # k_all=k_all.contiguous().view(batch_size, num_clips, (nH//self.window_size[0])*(nW//self.window_size[1]),
616
+ # k_all_dim1, k_all_dim2, k_all_dim3).permute(0,2,3,4,1,5).contiguous().view(-1, k_all_dim1, k_all_dim2*num_clips, k_all_dim3)
617
+ # v_all=v_all.contiguous().view(batch_size, num_clips, (nH//self.window_size[0])*(nW//self.window_size[1]),
618
+ # k_all_dim1, k_all_dim2, k_all_dim3).permute(0,2,3,4,1,5).contiguous().view(-1, k_all_dim1, k_all_dim2*num_clips, k_all_dim3)
619
+
620
+ # print(q_windows.shape, k_all.shape, v_all.shape, k_rolled.shape)
621
+ # exit()
622
+ attn = (q_windows @ k_all.transpose(-2, -1)) # B0*nW, nHead, window_size*window_size, focal_window_size*focal_window_size
623
+
624
+ window_area = self.window_size[0] * self.window_size[1]
625
+ # window_area_clips= num_clips*self.window_size[0] * self.window_size[1]
626
+ window_area_rolled = k_rolled.shape[2]
627
+
628
+ # add relative position bias for tokens inside window
629
+ relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
630
+ self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH
631
+ relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
632
+ # print(relative_position_bias.shape, attn.shape)
633
+ attn[:, :, :window_area, :window_area] = attn[:, :, :window_area, :window_area] + relative_position_bias.unsqueeze(0)
634
+
635
+ # relative_position_bias = self.relative_position_bias_table[self.relative_position_index[-window_area:, :window_area_clips].reshape(-1)].view(
636
+ # window_area, window_area_clips, -1) # Wh*Ww,Wd*Wh*Ww,nH
637
+ # relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous().view(self.num_heads,window_area,num_clips,window_area
638
+ # ).permute(0,1,3,2).contiguous().view(self.num_heads,window_area,window_area_clips).contiguous() # nH, Wh*Ww, Wh*Ww*Wd
639
+ # # attn_dim0, attn_dim1, attn_dim2, attn_dim3=attn.shape
640
+ # # attn=attn.view(attn_dim0,attn_dim1,attn_dim2,num_clips,-1)
641
+ # # print(attn.shape, relative_position_bias.shape)
642
+ # attn[:,:,:window_area, :window_area_clips]=attn[:,:,:window_area, :window_area_clips] + relative_position_bias.unsqueeze(0)
643
+ # attn = attn + relative_position_bias.unsqueeze(0) # B_, nH, N, N
644
+
645
+ # add relative position bias for patches inside a window
646
+ if self.expand_size > 0 and self.focal_level > 0:
647
+ attn[:, :, :window_area, window_area:window_area_rolled] = attn[:, :, :window_area, window_area:window_area_rolled] + self.relative_position_bias_table_to_neighbors
648
+
649
+ if self.pool_method != "none" and self.focal_level > 1:
650
+ # add relative position bias for different windows in an image
651
+ offset = window_area_rolled
652
+ # print(offset)
653
+ for k in range(self.focal_level-1):
654
+ # add relative position bias
655
+ relative_position_index_k = getattr(self, 'relative_position_index_{}'.format(k))
656
+ relative_position_bias_to_windows = getattr(self,'relative_position_bias_table_to_windows_{}'.format(k))[:, relative_position_index_k.view(-1)].view(
657
+ -1, self.window_size[0] * self.window_size[1], (self.focal_window+2**k-1)**2,
658
+ ) # nH, NWh*NWw,focal_region*focal_region
659
+ attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] = \
660
+ attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] + relative_position_bias_to_windows.unsqueeze(0)
661
+ # add attentional mask
662
+ if mask_all[0][k+1] is not None:
663
+ attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] = \
664
+ attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] + \
665
+ mask_all[0][k+1][:, :, None, None, :].repeat(attn.shape[0] // mask_all[0][k+1].shape[1], 1, 1, 1, 1).view(-1, 1, 1, mask_all[0][k+1].shape[-1])
666
+
667
+ offset += (self.focal_window+2**k-1)**2
668
+ # print(offset)
669
+ for k in range(len(self.focal_l_clips)):
670
+ focal_l_big_flag=False
671
+ if self.focal_l_clips[k]>self.window_size[0]:
672
+ stride=1
673
+ padding=0
674
+ kernel_size=self.focal_kernel_clips[k]
675
+ kernel_size_true=kernel_size
676
+ focal_l_big_flag=True
677
+ # stride=math.ceil(self.window_size/focal_l_clips[k])
678
+ # padding=(kernel_size-stride)/2
679
+ else:
680
+ stride = self.focal_l_clips[k]
681
+ # kernel_size
682
+ # kernel_size = 2*(self.focal_kernel_clips[k]// 2) + 2**self.focal_l_clips[k] + (2**self.focal_l_clips[k]-1)
683
+ kernel_size = self.focal_kernel_clips[k]
684
+ padding=kernel_size // 2
685
+ # kernel_size_true=self.focal_kernel_clips[k]+2**self.focal_l_clips[k]-1
686
+ kernel_size_true=kernel_size
687
+ relative_position_index_k = getattr(self, 'relative_position_index_clips_{}'.format(k))
688
+ relative_position_bias_to_windows = getattr(self,'relative_position_bias_table_to_windows_clips_{}'.format(k))[:, relative_position_index_k.view(-1)].view(
689
+ -1, self.window_size[0] * self.window_size[1], (kernel_size_true)**2,
690
+ )
691
+ attn[:, :, :window_area, offset:(offset + (kernel_size_true)**2)] = \
692
+ attn[:, :, :window_area, offset:(offset + (kernel_size_true)**2)] + relative_position_bias_to_windows.unsqueeze(0)
693
+ if mask_all[k+1] is not None:
694
+ attn[:, :, :window_area, offset:(offset + (kernel_size_true)**2)] = \
695
+ attn[:, :, :window_area, offset:(offset + (kernel_size_true)**2)] + \
696
+ mask_all[k+1][:, :, None, None, :].repeat(attn.shape[0] // mask_all[k+1].shape[1], 1, 1, 1, 1).view(-1, 1, 1, mask_all[k+1].shape[-1])
697
+ offset += (kernel_size_true)**2
698
+ # print(offset)
699
+ # relative_position_index_k = getattr(self, 'relative_position_index_{}'.format(k))
700
+ # # relative_position_bias_to_windows = self.relative_position_bias_table_to_windows[k][:, relative_position_index_k.view(-1)].view(
701
+ # # -1, self.window_size[0] * self.window_size[1], (self.focal_window+2**k-1)**2,
702
+ # # ) # nH, NWh*NWw,focal_region*focal_region
703
+ # # attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] = \
704
+ # # attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] + relative_position_bias_to_windows.unsqueeze(0)
705
+ # relative_position_bias_to_windows = self.relative_position_bias_table_to_windows[k][:, relative_position_index_k[-window_area:, :].view(-1)].view(
706
+ # -1, self.window_size[0] * self.window_size[1], num_clips*(self.focal_window+2**k-1)**2,
707
+ # ).contiguous() # nH, NWh*NWw, num_clips*focal_region*focal_region
708
+ # relative_position_bias_to_windows = relative_position_bias_to_windows.view(self.num_heads,
709
+ # window_area,num_clips,-1).permute(0,1,3,2).contiguous().view(self.num_heads,window_area,-1)
710
+ # attn[:, :, :window_area, offset:(offset + num_clips*(self.focal_window+2**k-1)**2)] = \
711
+ # attn[:, :, :window_area, offset:(offset + num_clips*(self.focal_window+2**k-1)**2)] + relative_position_bias_to_windows.unsqueeze(0)
712
+ # # add attentional mask
713
+ # if mask_all[k+1] is not None:
714
+ # # print("inside the mask, be careful 1")
715
+ # # attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] = \
716
+ # # attn[:, :, :window_area, offset:(offset + (self.focal_window+2**k-1)**2)] + \
717
+ # # mask_all[k+1][:, :, None, None, :].repeat(attn.shape[0] // mask_all[k+1].shape[1], 1, 1, 1, 1).view(-1, 1, 1, mask_all[k+1].shape[-1])
718
+ # # print("here: ", mask_all[k+1].shape, mask_all[k+1][:, :, None, None, :].shape)
719
+
720
+ # attn[:, :, :window_area, offset:(offset + num_clips*(self.focal_window+2**k-1)**2)] = \
721
+ # attn[:, :, :window_area, offset:(offset + num_clips*(self.focal_window+2**k-1)**2)] + \
722
+ # mask_all[k+1][:, :, None, None, :,None].repeat(attn.shape[0] // mask_all[k+1].shape[1], 1, 1, 1, 1, num_clips).view(-1, 1, 1, mask_all[k+1].shape[-1]*num_clips)
723
+ # # print()
724
+
725
+ # offset += (self.focal_window+2**k-1)**2
726
+
727
+ # print("mask_all[0]: ", mask_all[0])
728
+ # exit()
729
+ if mask_all[0][0] is not None:
730
+ print("inside the mask, be careful 0")
731
+ nW = mask_all[0].shape[0]
732
+ attn = attn.view(attn.shape[0] // nW, nW, self.num_heads, window_area, N)
733
+ attn[:, :, :, :, :window_area] = attn[:, :, :, :, :window_area] + mask_all[0][None, :, None, :, :]
734
+ attn = attn.view(-1, self.num_heads, window_area, N)
735
+ attn = self.softmax(attn)
736
+ else:
737
+ attn = self.softmax(attn)
738
+
739
+ attn = self.attn_drop(attn)
740
+
741
+ x = (attn @ v_all).transpose(1, 2).reshape(attn.shape[0], window_area, C)
742
+ x = self.proj(x)
743
+ x = self.proj_drop(x)
744
+ # print(x.shape)
745
+ # x = x.view(B/num_clips, nH, nW, C )
746
+ # exit()
747
+ return x
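A minimal, self-contained sketch of the expanded-window trick used near the top of this forward pass: the key/value maps are cyclically rolled toward the four diagonal neighbours, window-partitioned, and concatenated, after which the module drops the entries that fall back inside the central window via its precomputed valid_ind_rolled buffer. window_partition is re-implemented locally and all sizes are toy values, so treat this as an illustration rather than the repository code.

import torch

def window_partition(x, ws):
    # (B, H, W, C) -> (num_windows*B, ws, ws, C)
    B, H, W, C = x.shape
    x = x.view(B, H // ws, ws, W // ws, ws, C)
    return x.permute(0, 1, 3, 2, 4, 5).reshape(-1, ws, ws, C)

B, H, W, C, ws, expand = 2, 14, 14, 8, 7, 3
k = torch.randn(B, H, W, C)

# roll the key map towards the four diagonal directions (top-left, top-right, bottom-left, bottom-right)
rolled_maps = [torch.roll(k, shifts=(sh, sw), dims=(1, 2))
               for sh in (-expand, expand) for sw in (-expand, expand)]

# partition each rolled map into ws x ws windows and flatten the spatial axis
rolled_windows = [window_partition(t, ws).reshape(-1, ws * ws, C) for t in rolled_maps]
k_expanded = torch.cat(rolled_windows, dim=1)      # (num_windows*B, 4*ws*ws, C)
print(k_expanded.shape)
# the attention module then keeps only the subset indexed by valid_ind_rolled,
# i.e. the surrounding tokens that are not already inside the central window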
748
+
749
+ def extra_repr(self) -> str:
750
+ return f'dim={self.dim}, window_size={self.window_size}, num_heads={self.num_heads}'
751
+
752
+ def flops(self, N, window_size, unfold_size):
753
+ # calculate flops for 1 window with token length of N
754
+ flops = 0
755
+ # qkv = self.qkv(x)
756
+ flops += N * self.dim * 3 * self.dim
757
+ # attn = (q @ k.transpose(-2, -1))
758
+ flops += self.num_heads * N * (self.dim // self.num_heads) * N
759
+ if self.pool_method != "none" and self.focal_level > 1:
760
+ flops += self.num_heads * N * (self.dim // self.num_heads) * (unfold_size * unfold_size)
761
+ if self.expand_size > 0 and self.focal_level > 0:
762
+ flops += self.num_heads * N * (self.dim // self.num_heads) * ((window_size + 2*self.expand_size)**2-window_size**2)
763
+
764
+ # x = (attn @ v)
765
+ flops += self.num_heads * N * N * (self.dim // self.num_heads)
766
+ if self.pool_method != "none" and self.focal_level > 1:
767
+ flops += self.num_heads * N * (self.dim // self.num_heads) * (unfold_size * unfold_size)
768
+ if self.expand_size > 0 and self.focal_level > 0:
769
+ flops += self.num_heads * N * (self.dim // self.num_heads) * ((window_size + 2*self.expand_size)**2-window_size**2)
770
+
771
+ # x = self.proj(x)
772
+ flops += N * self.dim * self.dim
773
+ return flops
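The pooled-level keys and values in the forward pass above are gathered with nn.Unfold: every query window attends to a kernel x kernel neighbourhood of pooled window tokens. A hedged, standalone sketch of that reshape chain on toy shapes (not the repository module itself):

import torch
import torch.nn as nn

B, C, nWh, nWw, kernel = 2, 16, 12, 12, 7
unfold = nn.Unfold(kernel_size=(kernel, kernel), stride=1, padding=kernel // 2)

k_pooled = torch.randn(B, C, nWh, nWw)                 # one pooled token per window
patches = unfold(k_pooled)                             # (B, C*kernel*kernel, nWh*nWw)
patches = patches.view(B, C, kernel, kernel, -1)       # split channel / kernel dims
patches = patches.permute(0, 4, 2, 3, 1).reshape(-1, kernel * kernel, C)
print(patches.shape)                                   # (B*nWh*nWw, kernel*kernel, C)
# each row holds the kernel*kernel pooled neighbours one query window attends to;
# splitting C into (num_heads, C // num_heads) afterwards mirrors the code above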
774
+
775
+
776
+ class CffmTransformerBlock3d3(nn.Module):
777
+ r""" Focal Transformer Block.
778
+
779
+ Args:
780
+ dim (int): Number of input channels.
781
+        input_resolution (tuple[int]): Input resolution.
782
+ num_heads (int): Number of attention heads.
783
+ window_size (int): Window size.
784
+ expand_size (int): expand size at first focal level (finest level).
785
+ shift_size (int): Shift size for SW-MSA.
786
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
787
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
788
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
789
+ drop (float, optional): Dropout rate. Default: 0.0
790
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
791
+ drop_path (float, optional): Stochastic depth rate. Default: 0.0
792
+ act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
793
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
794
+ pool_method (str): window pooling method. Default: none, options: [none|fc|conv]
795
+ focal_level (int): number of focal levels. Default: 1.
796
+ focal_window (int): region size of focal attention. Default: 1
797
+        use_layerscale (bool): Whether to use layer scale for training stability. Default: False
798
+ layerscale_value (float): scaling value for layer scale. Default: 1e-4
799
+ """
800
+
801
+ def __init__(self, dim, input_resolution, num_heads, window_size=7, expand_size=0, shift_size=0,
802
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
803
+ act_layer=nn.GELU, norm_layer=nn.LayerNorm, pool_method="none",
804
+ focal_level=1, focal_window=1, use_layerscale=False, layerscale_value=1e-4, focal_l_clips=[7,2,4], focal_kernel_clips=[7,5,3]):
805
+ super().__init__()
806
+ self.dim = dim
807
+ self.input_resolution = input_resolution
808
+ self.num_heads = num_heads
809
+ self.window_size = window_size
810
+ self.shift_size = shift_size
811
+ self.expand_size = expand_size
812
+ self.mlp_ratio = mlp_ratio
813
+ self.pool_method = pool_method
814
+ self.focal_level = focal_level
815
+ self.focal_window = focal_window
816
+ self.use_layerscale = use_layerscale
817
+ self.focal_l_clips=focal_l_clips
818
+ self.focal_kernel_clips=focal_kernel_clips
819
+
820
+ if min(self.input_resolution) <= self.window_size:
821
+ # if window size is larger than input resolution, we don't partition windows
822
+ self.expand_size = 0
823
+ self.shift_size = 0
824
+ self.window_size = min(self.input_resolution)
825
+        assert 0 <= self.shift_size < self.window_size, "shift_size must be in 0-window_size"
826
+
827
+ self.window_size_glo = self.window_size
828
+
829
+ self.pool_layers = nn.ModuleList()
830
+ self.pool_layers_clips = nn.ModuleList()
831
+ if self.pool_method != "none":
832
+ for k in range(self.focal_level-1):
833
+ window_size_glo = math.floor(self.window_size_glo / (2 ** k))
834
+ if self.pool_method == "fc":
835
+ self.pool_layers.append(nn.Linear(window_size_glo * window_size_glo, 1))
836
+ self.pool_layers[-1].weight.data.fill_(1./(window_size_glo * window_size_glo))
837
+ self.pool_layers[-1].bias.data.fill_(0)
838
+ elif self.pool_method == "conv":
839
+ self.pool_layers.append(nn.Conv2d(dim, dim, kernel_size=window_size_glo, stride=window_size_glo, groups=dim))
840
+ for k in range(len(focal_l_clips)):
841
+ # window_size_glo = math.floor(self.window_size_glo / (2 ** k))
842
+ if focal_l_clips[k]>self.window_size:
843
+ window_size_glo = focal_l_clips[k]
844
+ else:
845
+ window_size_glo = math.floor(self.window_size_glo / (focal_l_clips[k]))
846
+ # window_size_glo = focal_l_clips[k]
847
+ if self.pool_method == "fc":
848
+ self.pool_layers_clips.append(nn.Linear(window_size_glo * window_size_glo, 1))
849
+ self.pool_layers_clips[-1].weight.data.fill_(1./(window_size_glo * window_size_glo))
850
+ self.pool_layers_clips[-1].bias.data.fill_(0)
851
+ elif self.pool_method == "conv":
852
+ self.pool_layers_clips.append(nn.Conv2d(dim, dim, kernel_size=window_size_glo, stride=window_size_glo, groups=dim))
853
+
854
+ self.norm1 = norm_layer(dim)
855
+
856
+ self.attn = WindowAttention3d3(
857
+ dim, expand_size=self.expand_size, window_size=to_2tuple(self.window_size),
858
+ focal_window=focal_window, focal_level=focal_level, num_heads=num_heads,
859
+ qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop, pool_method=pool_method, focal_l_clips=focal_l_clips, focal_kernel_clips=focal_kernel_clips)
860
+
861
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
862
+ self.norm2 = norm_layer(dim)
863
+ mlp_hidden_dim = int(dim * mlp_ratio)
864
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
865
+
866
+ # print("******self.shift_size: ", self.shift_size)
867
+
868
+ if self.shift_size > 0:
869
+ # calculate attention mask for SW-MSA
870
+ H, W = self.input_resolution
871
+ img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1
872
+ h_slices = (slice(0, -self.window_size),
873
+ slice(-self.window_size, -self.shift_size),
874
+ slice(-self.shift_size, None))
875
+ w_slices = (slice(0, -self.window_size),
876
+ slice(-self.window_size, -self.shift_size),
877
+ slice(-self.shift_size, None))
878
+ cnt = 0
879
+ for h in h_slices:
880
+ for w in w_slices:
881
+ img_mask[:, h, w, :] = cnt
882
+ cnt += 1
883
+
884
+ mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1
885
+ mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
886
+ attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
887
+ attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
888
+ else:
889
+ # print("here mask none")
890
+ attn_mask = None
891
+ self.register_buffer("attn_mask", attn_mask)
892
+
893
+ if self.use_layerscale:
894
+ self.gamma_1 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True)
895
+ self.gamma_2 = nn.Parameter(layerscale_value * torch.ones((dim)), requires_grad=True)
896
+
897
+ def forward(self, x):
898
+ H0, W0 = self.input_resolution
899
+ # B, L, C = x.shape
900
+ B0, D0, H0, W0, C = x.shape
901
+ shortcut = x
902
+ # assert L == H * W, "input feature has wrong size"
903
+ x=x.reshape(B0*D0,H0,W0,C).reshape(B0*D0,H0*W0,C)
904
+
905
+
906
+ x = self.norm1(x)
907
+ x = x.reshape(B0*D0, H0, W0, C)
908
+ # print("here")
909
+ # exit()
910
+
911
+ # pad feature maps to multiples of window size
912
+ pad_l = pad_t = 0
913
+ pad_r = (self.window_size - W0 % self.window_size) % self.window_size
914
+ pad_b = (self.window_size - H0 % self.window_size) % self.window_size
915
+ if pad_r > 0 or pad_b > 0:
916
+ x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b))
917
+
918
+ B, H, W, C = x.shape ## B=B0*D0
919
+
920
+ if self.shift_size > 0:
921
+ shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
922
+ else:
923
+ shifted_x = x
924
+
925
+ # print("shifted_x.shape: ", shifted_x.shape)
926
+ shifted_x=shifted_x.view(B0,D0,H,W,C)
927
+ x_windows_all = [shifted_x[:,-1]]
928
+ x_windows_all_clips=[]
929
+ x_window_masks_all = [self.attn_mask]
930
+ x_window_masks_all_clips=[]
931
+
932
+ if self.focal_level > 1 and self.pool_method != "none":
933
+ # if we add coarser granularity and the pool method is not none
934
+ # pooling_index=0
935
+ for k in range(self.focal_level-1):
936
+ window_size_glo = math.floor(self.window_size_glo / (2 ** k))
937
+ pooled_h = math.ceil(H / self.window_size) * (2 ** k)
938
+ pooled_w = math.ceil(W / self.window_size) * (2 ** k)
939
+ H_pool = pooled_h * window_size_glo
940
+ W_pool = pooled_w * window_size_glo
941
+
942
+ x_level_k = shifted_x[:,-1]
943
+ # trim or pad shifted_x depending on the required size
944
+ if H > H_pool:
945
+ trim_t = (H - H_pool) // 2
946
+ trim_b = H - H_pool - trim_t
947
+ x_level_k = x_level_k[:, trim_t:-trim_b]
948
+ elif H < H_pool:
949
+ pad_t = (H_pool - H) // 2
950
+ pad_b = H_pool - H - pad_t
951
+ x_level_k = F.pad(x_level_k, (0,0,0,0,pad_t,pad_b))
952
+
953
+ if W > W_pool:
954
+ trim_l = (W - W_pool) // 2
955
+ trim_r = W - W_pool - trim_l
956
+ x_level_k = x_level_k[:, :, trim_l:-trim_r]
957
+ elif W < W_pool:
958
+ pad_l = (W_pool - W) // 2
959
+ pad_r = W_pool - W - pad_l
960
+ x_level_k = F.pad(x_level_k, (0,0,pad_l,pad_r))
961
+
962
+ x_windows_noreshape = window_partition_noreshape(x_level_k.contiguous(), window_size_glo) # B0, nw, nw, window_size, window_size, C
963
+ nWh, nWw = x_windows_noreshape.shape[1:3]
964
+ if self.pool_method == "mean":
965
+ x_windows_pooled = x_windows_noreshape.mean([3, 4]) # B0, nWh, nWw, C
966
+ elif self.pool_method == "max":
967
+ x_windows_pooled = x_windows_noreshape.max(-2)[0].max(-2)[0].view(B0, nWh, nWw, C) # B0, nWh, nWw, C
968
+ elif self.pool_method == "fc":
969
+ x_windows_noreshape = x_windows_noreshape.view(B0, nWh, nWw, window_size_glo*window_size_glo, C).transpose(3, 4) # B0, nWh, nWw, C, wsize**2
970
+ x_windows_pooled = self.pool_layers[k](x_windows_noreshape).flatten(-2) # B0, nWh, nWw, C
971
+ elif self.pool_method == "conv":
972
+ x_windows_noreshape = x_windows_noreshape.view(-1, window_size_glo, window_size_glo, C).permute(0, 3, 1, 2).contiguous() # B0 * nw * nw, C, wsize, wsize
973
+ x_windows_pooled = self.pool_layers[k](x_windows_noreshape).view(B0, nWh, nWw, C) # B0, nWh, nWw, C
974
+
975
+ x_windows_all += [x_windows_pooled]
976
+ # print(x_windows_pooled.shape)
977
+ x_window_masks_all += [None]
978
+ # pooling_index=pooling_index+1
979
+
980
+ x_windows_all_clips += [x_windows_all]
981
+ x_window_masks_all_clips += [x_window_masks_all]
982
+ for k in range(len(self.focal_l_clips)):
983
+ if self.focal_l_clips[k]>self.window_size:
984
+ window_size_glo = self.focal_l_clips[k]
985
+ else:
986
+ window_size_glo = math.floor(self.window_size_glo / (self.focal_l_clips[k]))
987
+
988
+ pooled_h = math.ceil(H / self.window_size) * (self.focal_l_clips[k])
989
+ pooled_w = math.ceil(W / self.window_size) * (self.focal_l_clips[k])
990
+
991
+ H_pool = pooled_h * window_size_glo
992
+ W_pool = pooled_w * window_size_glo
993
+
994
+ x_level_k = shifted_x[:,k]
995
+ if H!=H_pool or W!=W_pool:
996
+ x_level_k=F.interpolate(x_level_k.permute(0,3,1,2), size=(H_pool, W_pool), mode='bilinear').permute(0,2,3,1)
997
+
998
+ # print(x_level_k.shape)
999
+ x_windows_noreshape = window_partition_noreshape(x_level_k.contiguous(), window_size_glo) # B0, nw, nw, window_size, window_size, C
1000
+ nWh, nWw = x_windows_noreshape.shape[1:3]
1001
+ if self.pool_method == "mean":
1002
+ x_windows_pooled = x_windows_noreshape.mean([3, 4]) # B0, nWh, nWw, C
1003
+ elif self.pool_method == "max":
1004
+ x_windows_pooled = x_windows_noreshape.max(-2)[0].max(-2)[0].view(B0, nWh, nWw, C) # B0, nWh, nWw, C
1005
+ elif self.pool_method == "fc":
1006
+ x_windows_noreshape = x_windows_noreshape.view(B0, nWh, nWw, window_size_glo*window_size_glo, C).transpose(3, 4) # B0, nWh, nWw, C, wsize**2
1007
+ x_windows_pooled = self.pool_layers_clips[k](x_windows_noreshape).flatten(-2) # B0, nWh, nWw, C
1008
+ elif self.pool_method == "conv":
1009
+ x_windows_noreshape = x_windows_noreshape.view(-1, window_size_glo, window_size_glo, C).permute(0, 3, 1, 2).contiguous() # B0 * nw * nw, C, wsize, wsize
1010
+ x_windows_pooled = self.pool_layers_clips[k](x_windows_noreshape).view(B0, nWh, nWw, C) # B0, nWh, nWw, C
1011
+
1012
+ x_windows_all_clips += [x_windows_pooled]
1013
+ # print(x_windows_pooled.shape)
1014
+ x_window_masks_all_clips += [None]
1015
+ # pooling_index=pooling_index+1
1016
+ # exit()
1017
+
1018
+ attn_windows = self.attn(x_windows_all_clips, mask_all=x_window_masks_all_clips, batch_size=B0, num_clips=D0) # nW*B0, window_size*window_size, C
1019
+
1020
+ attn_windows = attn_windows[:, :self.window_size ** 2]
1021
+
1022
+ # merge windows
1023
+ attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C)
1024
+ shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H(padded) W(padded) C
1025
+
1026
+ # reverse cyclic shift
1027
+ if self.shift_size > 0:
1028
+ x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
1029
+ else:
1030
+ x = shifted_x
1031
+ # x = x[:, :self.input_resolution[0], :self.input_resolution[1]].contiguous().view(B, -1, C)
1032
+ x = x[:, :H0, :W0].contiguous().view(B0, -1, C)
1033
+
1034
+ # FFN
1035
+ # x = shortcut + self.drop_path(x if (not self.use_layerscale) else (self.gamma_1 * x))
1036
+ # x = x + self.drop_path(self.mlp(self.norm2(x)) if (not self.use_layerscale) else (self.gamma_2 * self.mlp(self.norm2(x))))
1037
+
1038
+ # print(x.shape, shortcut[:,-1].view(B0, -1, C).shape)
1039
+ x = shortcut[:,-1].view(B0, -1, C) + self.drop_path(x if (not self.use_layerscale) else (self.gamma_1 * x))
1040
+ x = x + self.drop_path(self.mlp(self.norm2(x)) if (not self.use_layerscale) else (self.gamma_2 * self.mlp(self.norm2(x))))
1041
+
1042
+ # x=torch.cat([shortcut[:,:-1],x.view(B0,self.input_resolution[0],self.input_resolution[1],C).unsqueeze(1)],1)
1043
+ x=torch.cat([shortcut[:,:-1],x.view(B0,H0,W0,C).unsqueeze(1)],1)
1044
+
1045
+ assert x.shape==shortcut.shape
1046
+
1047
+ # exit()
1048
+
1049
+ return x
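The pool_method == "fc" branch in the forward pass above compresses every window into a single token with a Linear layer over the window_size**2 spatial positions, initialised so that it starts out as mean pooling. A standalone sketch with a locally re-implemented window_partition_noreshape and toy sizes (an illustration, not the repository helper):

import torch
import torch.nn as nn

def window_partition_noreshape(x, ws):
    # (B, H, W, C) -> (B, H//ws, W//ws, ws, ws, C)
    B, H, W, C = x.shape
    x = x.view(B, H // ws, ws, W // ws, ws, C)
    return x.permute(0, 1, 3, 2, 4, 5).contiguous()

B, H, W, C, ws = 2, 28, 28, 32, 7
pool_fc = nn.Linear(ws * ws, 1)
pool_fc.weight.data.fill_(1.0 / (ws * ws))    # starts as mean pooling, as in __init__ above
pool_fc.bias.data.fill_(0)

x = torch.randn(B, H, W, C)
win = window_partition_noreshape(x, ws)                   # B, nWh, nWw, ws, ws, C
nWh, nWw = win.shape[1:3]
win = win.view(B, nWh, nWw, ws * ws, C).transpose(3, 4)   # B, nWh, nWw, C, ws*ws
pooled = pool_fc(win).flatten(-2)                         # B, nWh, nWw, C
print(pooled.shape)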
1050
+
1051
+ def extra_repr(self) -> str:
1052
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " \
1053
+ f"window_size={self.window_size}, shift_size={self.shift_size}, mlp_ratio={self.mlp_ratio}"
1054
+
1055
+ def flops(self):
1056
+ flops = 0
1057
+ H, W = self.input_resolution
1058
+ # norm1
1059
+ flops += self.dim * H * W
1060
+
1061
+ # W-MSA/SW-MSA
1062
+ nW = H * W / self.window_size / self.window_size
1063
+ flops += nW * self.attn.flops(self.window_size * self.window_size, self.window_size, self.focal_window)
1064
+
1065
+ if self.pool_method != "none" and self.focal_level > 1:
1066
+ for k in range(self.focal_level-1):
1067
+ window_size_glo = math.floor(self.window_size_glo / (2 ** k))
1068
+ nW_glo = nW * (2**k)
1069
+ # (sub)-window pooling
1070
+ flops += nW_glo * self.dim * window_size_glo * window_size_glo
1071
+ # qkv for global levels
1072
+ # NOTE: in our implementation, we pass the pooled window embedding to qkv embedding layer,
1073
+            # but theoretically, we only need to compute k and v.
1074
+ flops += nW_glo * self.dim * 3 * self.dim
1075
+
1076
+ # mlp
1077
+ flops += 2 * H * W * self.dim * self.dim * self.mlp_ratio
1078
+ # norm2
1079
+ flops += self.dim * H * W
1080
+ return flops
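The attn_mask registered in __init__ above is the standard Swin-style shifted-window mask: pixels are labelled by the shifted region they came from, and attention between different regions inside the same window is suppressed with a large negative bias. A standalone sketch on toy sizes:

import torch

H = W = 14
window_size, shift_size = 7, 3

img_mask = torch.zeros((1, H, W, 1))
slices = (slice(0, -window_size), slice(-window_size, -shift_size), slice(-shift_size, None))
cnt = 0
for h in slices:
    for w in slices:
        img_mask[:, h, w, :] = cnt
        cnt += 1

# partition the label map into windows and compare labels pairwise
mask_windows = img_mask.view(1, H // window_size, window_size, W // window_size, window_size, 1)
mask_windows = mask_windows.permute(0, 1, 3, 2, 4, 5).reshape(-1, window_size * window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0, -100.0).masked_fill(attn_mask == 0, 0.0)
print(attn_mask.shape)    # (num_windows, ws*ws, ws*ws); -100 entries block cross-region attention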
1081
+
1082
+
1083
+ class BasicLayer3d3(nn.Module):
1084
+ """ A basic Focal Transformer layer for one stage.
1085
+
1086
+ Args:
1087
+ dim (int): Number of input channels.
1088
+ input_resolution (tuple[int]): Input resolution.
1089
+ depth (int): Number of blocks.
1090
+ num_heads (int): Number of attention heads.
1091
+ window_size (int): Local window size.
1092
+ expand_size (int): expand size for focal level 1.
1093
+ expand_layer (str): expand layer. Default: all
1094
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.0.
1095
+ qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
1096
+ qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
1097
+ drop (float, optional): Dropout rate. Default: 0.0
1098
+ attn_drop (float, optional): Attention dropout rate. Default: 0.0
1099
+ drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0
1100
+ norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
1101
+ pool_method (str): Window pooling method. Default: none.
1102
+ focal_level (int): Number of focal levels. Default: 1.
1103
+ focal_window (int): region size at each focal level. Default: 1.
1104
+        use_conv_embed (bool): Whether to use an overlapped convolutional patch embedding layer. Default: False
1105
+        use_shift (bool): Whether to use window shift as in Swin Transformer. Default: False
1106
+        use_pre_norm (bool): Whether to use pre-norm before patch embedding projection for stability. Default: False
1107
+ downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
1108
+ use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
1109
+        use_layerscale (bool): Whether to use layer scale for stability. Default: False.
1110
+ layerscale_value (float): Layerscale value. Default: 1e-4.
1111
+ """
1112
+
1113
+ def __init__(self, dim, input_resolution, depth, num_heads, window_size, expand_size, expand_layer="all",
1114
+ mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
1115
+ drop_path=0., norm_layer=nn.LayerNorm, pool_method="none",
1116
+ focal_level=1, focal_window=1, use_conv_embed=False, use_shift=False, use_pre_norm=False,
1117
+ downsample=None, use_checkpoint=False, use_layerscale=False, layerscale_value=1e-4, focal_l_clips=[16,8,2], focal_kernel_clips=[7,5,3]):
1118
+
1119
+ super().__init__()
1120
+ self.dim = dim
1121
+ self.input_resolution = input_resolution
1122
+ self.depth = depth
1123
+ self.use_checkpoint = use_checkpoint
1124
+
1125
+ if expand_layer == "even":
1126
+ expand_factor = 0
1127
+ elif expand_layer == "odd":
1128
+ expand_factor = 1
1129
+ elif expand_layer == "all":
1130
+ expand_factor = -1
1131
+
1132
+ # build blocks
1133
+ self.blocks = nn.ModuleList([
1134
+ CffmTransformerBlock3d3(dim=dim, input_resolution=input_resolution,
1135
+ num_heads=num_heads, window_size=window_size,
1136
+ shift_size=(0 if (i % 2 == 0) else window_size // 2) if use_shift else 0,
1137
+ expand_size=0 if (i % 2 == expand_factor) else expand_size,
1138
+ mlp_ratio=mlp_ratio,
1139
+ qkv_bias=qkv_bias, qk_scale=qk_scale,
1140
+ drop=drop,
1141
+ attn_drop=attn_drop,
1142
+ drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path,
1143
+ norm_layer=norm_layer,
1144
+ pool_method=pool_method,
1145
+ focal_level=focal_level,
1146
+ focal_window=focal_window,
1147
+ use_layerscale=use_layerscale,
1148
+ layerscale_value=layerscale_value,
1149
+ focal_l_clips=focal_l_clips,
1150
+ focal_kernel_clips=focal_kernel_clips)
1151
+ for i in range(depth)])
1152
+
1153
+ # patch merging layer
1154
+ if downsample is not None:
1155
+ self.downsample = downsample(
1156
+ img_size=input_resolution, patch_size=2, in_chans=dim, embed_dim=2*dim,
1157
+ use_conv_embed=use_conv_embed, norm_layer=norm_layer, use_pre_norm=use_pre_norm,
1158
+ is_stem=False
1159
+ )
1160
+ else:
1161
+ self.downsample = None
1162
+
1163
+ def forward(self, x, batch_size=None, num_clips=None, reg_tokens=None):
1164
+ B, D, C, H, W = x.shape
1165
+ x = rearrange(x, 'b d c h w -> b d h w c')
1166
+ for blk in self.blocks:
1167
+ if self.use_checkpoint:
1168
+ x = checkpoint.checkpoint(blk, x)
1169
+ else:
1170
+ x = blk(x)
1171
+
1172
+ if self.downsample is not None:
1173
+ x = x.view(x.shape[0], self.input_resolution[0], self.input_resolution[1], -1).permute(0, 3, 1, 2).contiguous()
1174
+ x = self.downsample(x)
1175
+ x = rearrange(x, 'b d h w c -> b d c h w')
1176
+ return x
1177
+
1178
+ def extra_repr(self) -> str:
1179
+ return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}"
1180
+
1181
+ def flops(self):
1182
+ flops = 0
1183
+ for blk in self.blocks:
1184
+ flops += blk.flops()
1185
+ if self.downsample is not None:
1186
+ flops += self.downsample.flops()
1187
+ return flops
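When use_checkpoint is set, BasicLayer3d3.forward above wraps each block in torch.utils.checkpoint so activations are recomputed during backward instead of being stored. A hedged sketch of the same pattern with toy blocks standing in for CffmTransformerBlock3d3:

import torch
import torch.nn as nn
import torch.utils.checkpoint as checkpoint

blocks = nn.ModuleList([nn.Sequential(nn.Linear(64, 64), nn.GELU()) for _ in range(4)])
x = torch.randn(8, 64, requires_grad=True)

use_checkpoint = True
for blk in blocks:
    if use_checkpoint:
        x = checkpoint.checkpoint(blk, x)   # blk is re-run during backward
    else:
        x = blk(x)
x.sum().backward()                          # memory is traded for an extra forward pass per block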
models/SpaTrackV2/models/depth_refiner/stablizer.py ADDED
@@ -0,0 +1,342 @@
1
+ import numpy as np
2
+ import torch.nn as nn
3
+ import torch
4
+ # from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule
5
+ from collections import OrderedDict
6
+ # from mmseg.ops import resize
7
+ from torch.nn.functional import interpolate as resize
8
+ # from builder import HEADS
9
+ from models.SpaTrackV2.models.depth_refiner.decode_head import BaseDecodeHead, BaseDecodeHead_clips, BaseDecodeHead_clips_flow
10
+ # from mmseg.models.utils import *
11
+ import attr
12
+ from IPython import embed
13
+ from models.SpaTrackV2.models.depth_refiner.stablilization_attention import BasicLayer3d3
14
+ import cv2
15
+ from models.SpaTrackV2.models.depth_refiner.network import *
16
+ import warnings
17
+ # from mmcv.utils import Registry, build_from_cfg
18
+ from torch import nn
19
+ from einops import rearrange
20
+ import torch.nn.functional as F
21
+ from models.SpaTrackV2.models.blocks import (
22
+ AttnBlock, CrossAttnBlock, Mlp
23
+ )
24
+
25
+ class MLP(nn.Module):
26
+ """
27
+ Linear Embedding
28
+ """
29
+ def __init__(self, input_dim=2048, embed_dim=768):
30
+ super().__init__()
31
+ self.proj = nn.Linear(input_dim, embed_dim)
32
+
33
+ def forward(self, x):
34
+ x = x.flatten(2).transpose(1, 2)
35
+ x = self.proj(x)
36
+ return x
37
+
38
+
39
+ def scatter_multiscale_fast(
40
+ track2d: torch.Tensor,
41
+ trackfeature: torch.Tensor,
42
+ H: int,
43
+ W: int,
44
+ kernel_sizes = [1]
45
+ ) -> torch.Tensor:
46
+ """
47
+ Scatter sparse track features onto a dense image grid with weighted multi-scale pooling to handle zero-value gaps.
48
+
49
+ This function scatters sparse track features into a dense image grid and applies multi-scale average pooling
50
+ while excluding zero-value holes. The weight mask ensures that only valid feature regions contribute to the pooling,
51
+ avoiding dilution by empty pixels.
52
+
53
+ Args:
54
+ track2d (torch.Tensor): Float tensor of shape (B, T, N, 2) containing (x, y) pixel coordinates
55
+ for each track point across batches, frames, and points.
56
+ trackfeature (torch.Tensor): Float tensor of shape (B, T, N, C) with C-dimensional features
57
+ for each track point.
58
+ H (int): Height of the target output image.
59
+ W (int): Width of the target output image.
60
+        kernel_sizes (List[int]): Odd average-pooling kernel sizes used for the multi-scale blur. Default: [1].
61
+
62
+ Returns:
63
+ torch.Tensor: Multi-scale fused feature map of shape (B, T, C, H, W) with hole-resistant pooling.
64
+ """
65
+ B, T, N, C = trackfeature.shape
66
+ device = trackfeature.device
67
+
68
+ # 1. Flatten coordinates and filter valid points within image bounds
69
+ coords_flat = track2d.round().long().reshape(-1, 2) # (B*T*N, 2)
70
+ x = coords_flat[:, 0] # x coordinates
71
+ y = coords_flat[:, 1] # y coordinates
72
+ feat_flat = trackfeature.reshape(-1, C) # Flatten features
73
+
74
+ valid_mask = (x >= 0) & (x < W) & (y >= 0) & (y < H)
75
+ x = x[valid_mask]
76
+ y = y[valid_mask]
77
+ feat_flat = feat_flat[valid_mask]
78
+ valid_count = x.shape[0]
79
+
80
+ if valid_count == 0:
81
+ return torch.zeros(B, T, C, H, W, device=device) # Handle no-valid-point case
82
+
83
+ # 2. Calculate linear indices and batch-frame indices for scattering
84
+ lin_idx = y * W + x # Linear index within a single frame (H*W range)
85
+
86
+ # Generate batch-frame indices (e.g., 0~B*T-1 for each frame in batch)
87
+ bt_idx_raw = (
88
+ torch.arange(B * T, device=device)
89
+ .view(B, T, 1)
90
+ .expand(B, T, N)
91
+ .reshape(-1)
92
+ )
93
+ bt_idx = bt_idx_raw[valid_mask] # Indices for valid points across batch and frames
94
+
95
+ # 3. Create accumulation buffers for features and weights
96
+ total_space = B * T * H * W
97
+ img_accum_flat = torch.zeros(total_space, C, device=device) # Feature accumulator
98
+ weight_accum_flat = torch.zeros(total_space, 1, device=device) # Weight accumulator (counts)
99
+
100
+ # 4. Scatter features and weights into accumulation buffers
101
+ idx_in_accum = bt_idx * (H * W) + lin_idx # Global index: batch_frame * H*W + pixel_index
102
+
103
+ # Add features to corresponding indices (index_add_ is efficient for sparse updates)
104
+ img_accum_flat.index_add_(0, idx_in_accum, feat_flat)
105
+ weight_accum_flat.index_add_(0, idx_in_accum, torch.ones((valid_count, 1), device=device))
106
+
107
+ # 5. Normalize features by valid weights, keep zeros for invalid regions
108
+ valid_mask_flat = weight_accum_flat > 0 # Binary mask for valid pixels
109
+ img_accum_flat = img_accum_flat / (weight_accum_flat + 1e-6) # Avoid division by zero
110
+ img_accum_flat = img_accum_flat * valid_mask_flat.float() # Mask out invalid regions
111
+
112
+ # 6. Reshape to (B, T, C, H, W) for further processing
113
+ img = (
114
+ img_accum_flat.view(B, T, H, W, C)
115
+ .permute(0, 1, 4, 2, 3)
116
+ .contiguous()
117
+ ) # Shape: (B, T, C, H, W)
118
+
119
+ # 7. Multi-scale pooling with weight masking to exclude zero holes
120
+ blurred_outputs = []
121
+ for k in kernel_sizes:
122
+ pad = k // 2
123
+ img_bt = img.view(B*T, C, H, W) # Flatten batch and time for pooling
124
+
125
+ # Create weight mask for valid regions (1 where features exist, 0 otherwise)
126
+ weight_mask = (
127
+ weight_accum_flat.view(B, T, 1, H, W) > 0
128
+ ).float().view(B*T, 1, H, W) # Shape: (B*T, 1, H, W)
129
+
130
+ # Calculate number of valid neighbors in each pooling window
131
+ weight_sum = F.conv2d(
132
+ weight_mask,
133
+ torch.ones((1, 1, k, k), device=device),
134
+ stride=1,
135
+ padding=pad
136
+ ) # Shape: (B*T, 1, H, W)
137
+
138
+ # Sum features only in valid regions
139
+ feat_sum = F.conv2d(
140
+ img_bt * weight_mask, # Mask out invalid regions before summing
141
+ torch.ones((1, 1, k, k), device=device).expand(C, 1, k, k),
142
+ stride=1,
143
+ padding=pad,
144
+ groups=C
145
+ ) # Shape: (B*T, C, H, W)
146
+
147
+ # Compute average only over valid neighbors
148
+ feat_avg = feat_sum / (weight_sum + 1e-6)
149
+ blurred_outputs.append(feat_avg)
150
+
151
+ # 8. Fuse multi-scale results by averaging across kernel sizes
152
+ fused = torch.stack(blurred_outputs).mean(dim=0) # Average over kernel sizes
153
+ return fused.view(B, T, C, H, W) # Restore original shape
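A hedged, toy-sized sketch of the two ideas used in this function: index_add_ accumulates per-point features plus a hit count for every pixel, and the multi-scale blur divides a summed feature map by a convolved validity mask so empty pixels do not dilute the average.

import torch
import torch.nn.functional as F

H, W, C, N, k = 6, 8, 3, 5, 3
coords = torch.tensor([[1, 1], [1, 1], [4, 2], [6, 5], [0, 3]])        # (x, y) per point
feats = torch.randn(N, C)

# scatter: accumulate features and hit counts at each pixel, then average
lin_idx = coords[:, 1] * W + coords[:, 0]
feat_accum = torch.zeros(H * W, C).index_add_(0, lin_idx, feats)
count = torch.zeros(H * W, 1).index_add_(0, lin_idx, torch.ones(N, 1))
dense = (feat_accum / (count + 1e-6) * (count > 0).float()).view(H, W, C)
dense = dense.permute(2, 0, 1)[None]                                    # (1, C, H, W)

# hole-aware average: sum features and valid-pixel counts with an all-ones kernel
mask = (count > 0).float().view(1, 1, H, W)
valid = F.conv2d(mask, torch.ones(1, 1, k, k), padding=k // 2)
summed = F.conv2d(dense * mask, torch.ones(C, 1, k, k), padding=k // 2, groups=C)
blurred = summed / (valid + 1e-6)                                       # (1, C, H, W)
print(blurred.shape)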
154
+
155
+ #@HEADS.register_module()
156
+ class Stabilization_Network_Cross_Attention(BaseDecodeHead_clips_flow):
157
+
158
+ def __init__(self, feature_strides, **kwargs):
159
+ super(Stabilization_Network_Cross_Attention, self).__init__(input_transform='multiple_select', **kwargs)
160
+ self.training = False
161
+ assert len(feature_strides) == len(self.in_channels)
162
+ assert min(feature_strides) == feature_strides[0]
163
+ self.feature_strides = feature_strides
164
+
165
+ c1_in_channels, c2_in_channels, c3_in_channels, c4_in_channels = self.in_channels
166
+
167
+ decoder_params = kwargs['decoder_params']
168
+ embedding_dim = decoder_params['embed_dim']
169
+
170
+ self.linear_c4 = MLP(input_dim=c4_in_channels, embed_dim=embedding_dim)
171
+ self.linear_c3 = MLP(input_dim=c3_in_channels, embed_dim=embedding_dim)
172
+ self.linear_c2 = MLP(input_dim=c2_in_channels, embed_dim=embedding_dim)
173
+ self.linear_c1 = MLP(input_dim=c1_in_channels, embed_dim=embedding_dim)
174
+
175
+ self.linear_fuse = nn.Sequential(nn.Conv2d(embedding_dim*4, embedding_dim, kernel_size=(1, 1), stride=(1, 1), bias=False),\
176
+ nn.ReLU(inplace=True))
177
+
178
+ self.proj_track = nn.Conv2d(100, 128, kernel_size=(1, 1), stride=(1, 1), bias=True)
179
+
180
+ depths = decoder_params['depths']
181
+
182
+ self.reg_tokens = nn.Parameter(torch.zeros(1, 2, embedding_dim))
183
+ self.global_patch = nn.Conv2d(embedding_dim, embedding_dim, kernel_size=(8, 8), stride=(8, 8), bias=True)
184
+
185
+ self.att_temporal = nn.ModuleList(
186
+ [
187
+ AttnBlock(embedding_dim, 8,
188
+ mlp_ratio=4, flash=True, ckpt_fwd=True)
189
+ for _ in range(8)
190
+ ]
191
+ )
192
+ self.att_spatial = nn.ModuleList(
193
+ [
194
+ AttnBlock(embedding_dim, 8,
195
+ mlp_ratio=4, flash=True, ckpt_fwd=True)
196
+ for _ in range(8)
197
+ ]
198
+ )
199
+ self.scale_shift_head = nn.Sequential(nn.Linear(embedding_dim, embedding_dim), nn.GELU(), nn.Linear(embedding_dim, 4))
200
+
201
+
202
+ # Initialize reg tokens
203
+ nn.init.trunc_normal_(self.reg_tokens, std=0.02)
204
+
205
+ self.decoder_focal=BasicLayer3d3(dim=embedding_dim,
206
+ input_resolution=(96,
207
+ 96),
208
+ depth=depths,
209
+ num_heads=8,
210
+ window_size=7,
211
+ mlp_ratio=4.,
212
+ qkv_bias=True,
213
+ qk_scale=None,
214
+ drop=0.,
215
+ attn_drop=0.,
216
+ drop_path=0.,
217
+ norm_layer=nn.LayerNorm,
218
+ pool_method='fc',
219
+ downsample=None,
220
+ focal_level=2,
221
+ focal_window=5,
222
+ expand_size=3,
223
+ expand_layer="all",
224
+ use_conv_embed=False,
225
+ use_shift=False,
226
+ use_pre_norm=False,
227
+ use_checkpoint=False,
228
+ use_layerscale=False,
229
+ layerscale_value=1e-4,
230
+ focal_l_clips=[7,4,2],
231
+ focal_kernel_clips=[7,5,3])
232
+
233
+ self.ffm2 = FFM(inchannels= 256, midchannels= 256, outchannels = 128)
234
+ self.ffm1 = FFM(inchannels= 128, midchannels= 128, outchannels = 64)
235
+ self.ffm0 = FFM(inchannels= 64, midchannels= 64, outchannels = 32,upfactor=1)
236
+ self.AO = AO(32, outchannels=3, upfactor=1)
237
+ self._c2 = None
238
+ self._c_further = None
239
+
240
+ def buffer_forward(self, inputs, num_clips=None, imgs=None):#,infermode=1):
241
+
242
+ # input: B T 7 H W (7 means 3 rgb + 3 pointmap + 1 uncertainty) normalized
243
+ if self.training:
244
+ assert self.num_clips==num_clips
245
+
246
+ x = self._transform_inputs(inputs) # len=4, 1/4,1/8,1/16,1/32
247
+ c1, c2, c3, c4 = x
248
+
249
+ ############## MLP decoder on C1-C4 ###########
250
+ n, _, h, w = c4.shape
251
+ batch_size = n // num_clips
252
+
253
+ _c4 = self.linear_c4(c4).permute(0,2,1).reshape(n, -1, c4.shape[2], c4.shape[3])
254
+ _c4 = resize(_c4, size=c1.size()[2:],mode='bilinear',align_corners=False)
255
+
256
+ _c3 = self.linear_c3(c3).permute(0,2,1).reshape(n, -1, c3.shape[2], c3.shape[3])
257
+ _c3 = resize(_c3, size=c1.size()[2:],mode='bilinear',align_corners=False)
258
+
259
+ _c2 = self.linear_c2(c2).permute(0,2,1).reshape(n, -1, c2.shape[2], c2.shape[3])
260
+ _c2 = resize(_c2, size=c1.size()[2:],mode='bilinear',align_corners=False)
261
+
262
+ _c1 = self.linear_c1(c1).permute(0,2,1).reshape(n, -1, c1.shape[2], c1.shape[3])
263
+ _c = self.linear_fuse(torch.cat([_c4, _c3, _c2, _c1], dim=1))
264
+
265
+ _, _, h, w=_c.shape
266
+ _c_further=_c.reshape(batch_size, num_clips, -1, h, w) #h2w2
267
+
268
+ # Expand reg_tokens to match batch size
269
+ reg_tokens = self.reg_tokens.expand(batch_size*num_clips, -1, -1) # [B, 2, C]
270
+
271
+ _c2=self.decoder_focal(_c_further, batch_size=batch_size, num_clips=num_clips, reg_tokens=reg_tokens)
272
+
273
+ assert _c_further.shape==_c2.shape
274
+ self._c2 = _c2
275
+ self._c_further = _c_further
276
+
277
+ # compute the scale and shift of the global patch
278
+ global_patch = self.global_patch(_c2.view(batch_size*num_clips, -1, h, w)).view(batch_size*num_clips, _c2.shape[2], -1).permute(0,2,1)
279
+ global_patch = torch.cat([global_patch, reg_tokens], dim=1)
280
+ for i in range(8):
281
+ global_patch = self.att_temporal[i](global_patch)
282
+ global_patch = rearrange(global_patch, '(b t) n c -> (b n) t c', b=batch_size, t=num_clips, c=_c2.shape[2])
283
+ global_patch = self.att_spatial[i](global_patch)
284
+ global_patch = rearrange(global_patch, '(b n) t c -> (b t) n c', b=batch_size, t=num_clips, c=_c2.shape[2])
285
+
286
+ reg_tokens = global_patch[:, -2:, :]
287
+ s_ = self.scale_shift_head(reg_tokens)
288
+ scale = 1 + s_[:, 0, :1].view(batch_size, num_clips, 1, 1, 1)
289
+ shift = s_[:, 1, 1:].view(batch_size, num_clips, 3, 1, 1)
290
+ shift[:,:,:2,...] = 0
291
+ return scale, shift
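buffer_forward returns a per-frame scale of shape (B, T, 1, 1, 1) and a shift of shape (B, T, 3, 1, 1) whose x/y channels are zeroed, so only the depth channel is shifted. The consumer of these values sits outside this file, so the following is only an assumed illustration of how such a per-frame affine correction would typically be applied to a pointmap:

import torch

B, T, H, W = 2, 4, 48, 64
pointmap = torch.randn(B, T, 3, H, W)             # per-pixel (x, y, z)
scale = 1 + 0.05 * torch.randn(B, T, 1, 1, 1)     # stand-ins for the head outputs
shift = torch.zeros(B, T, 3, 1, 1)
shift[:, :, 2:] = 0.1 * torch.randn(B, T, 1, 1, 1)

refined = scale * pointmap + shift                # broadcast over channels and pixels
print(refined.shape)                              # (B, T, 3, H, W)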
292
+
293
+ def forward(self, inputs, edge_feat, edge_feat1, tracks, tracks_uvd, num_clips=None, imgs=None, vis_track=None):#,infermode=1):
294
+
295
+ if self._c2 is None:
296
+ scale, shift = self.buffer_forward(inputs,num_clips,imgs)
297
+
298
+ B, T, N, _ = tracks.shape
299
+
300
+ _c2 = self._c2
301
+ _c_further = self._c_further
302
+
303
+ # skip and head
304
+ _c_further = rearrange(_c_further, 'b t c h w -> (b t) c h w', b=B, t=T)
305
+ _c2 = rearrange(_c2, 'b t c h w -> (b t) c h w', b=B, t=T)
306
+
307
+ outframe = self.ffm2(_c_further, _c2)
308
+
309
+ tracks_uv = tracks_uvd[...,:2].clone()
310
+ track_feature = scatter_multiscale_fast(tracks_uv/2, tracks, outframe.shape[-2], outframe.shape[-1], kernel_sizes=[1, 3, 5])
311
+ # visualize track_feature as video
312
+ # import cv2
313
+ # import imageio
314
+ # import os
315
+ # BT, C, H, W = outframe.shape
316
+ # track_feature_vis = track_feature.view(B, T, 3, H, W).float().detach().cpu().numpy()
317
+ # track_feature_vis = track_feature_vis.transpose(0,1,3,4,2)
318
+ # track_feature_vis = (track_feature_vis - track_feature_vis.min()) / (track_feature_vis.max() - track_feature_vis.min() + 1e-6)
319
+ # track_feature_vis = (track_feature_vis * 255).astype(np.uint8)
320
+ # imgs =(imgs.detach() + 1) * 127.5
321
+ # vis_track.visualize(video=imgs, tracks=tracks_uv, filename="test")
322
+ # for b in range(B):
323
+ # frames = []
324
+ # for t in range(T):
325
+ # frame = track_feature_vis[b,t]
326
+ # frame = cv2.applyColorMap(frame[...,0], cv2.COLORMAP_JET)
327
+ # frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
328
+ # frames.append(frame)
329
+ # # Save as gif
330
+ # imageio.mimsave(f'track_feature_b{b}.gif', frames, duration=0.1)
331
+ # import pdb; pdb.set_trace()
332
+ track_feature = rearrange(track_feature, 'b t c h w -> (b t) c h w')
333
+ track_feature = self.proj_track(track_feature)
334
+ outframe = self.ffm1(edge_feat1 + track_feature,outframe)
335
+ outframe = self.ffm0(edge_feat,outframe)
336
+ outframe = self.AO(outframe)
337
+
338
+ return outframe
339
+
340
+ def reset_success(self):
341
+ self._c2 = None
342
+ self._c_further = None
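The att_temporal / att_spatial loop in buffer_forward alternates attention over the tokens of each frame with attention across frames for each token, using einops to regroup the batch axis. A standalone sketch of that regrouping with a vanilla nn.MultiheadAttention standing in for the repo's AttnBlock (an illustration, not the repository block):

import torch
import torch.nn as nn
from einops import rearrange

B, T, N, C = 2, 4, 146, 64                            # e.g. pooled patches + 2 reg tokens per frame
attn = nn.MultiheadAttention(C, num_heads=8, batch_first=True)

x = torch.randn(B * T, N, C)                          # (b t) n c: tokens of each frame
x, _ = attn(x, x, x)                                  # attend within a frame
x = rearrange(x, '(b t) n c -> (b n) t c', b=B, t=T)  # regroup by token index
x, _ = attn(x, x, x)                                  # attend across the T frames
x = rearrange(x, '(b n) t c -> (b t) n c', b=B, t=T)
print(x.shape)                                        # back to (B*T, N, C)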
models/SpaTrackV2/models/predictor.py ADDED
@@ -0,0 +1,153 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+
10
+ from tqdm import tqdm
11
+ from models.SpaTrackV2.models.SpaTrack import SpaTrack2
12
+ from typing import Literal
13
+ import numpy as np
14
+ from pathlib import Path
15
+ from typing import Union, Optional
16
+ import cv2
17
+ import os
18
+ import decord
19
+
20
+ class Predictor(torch.nn.Module):
21
+ def __init__(self, args=None):
22
+ super().__init__()
23
+ self.args = args
24
+ self.spatrack = SpaTrack2(loggers=[None, None, None], **args)
25
+ self.S_wind = args.Track_cfg.s_wind
26
+ self.overlap = args.Track_cfg.overlap
27
+
28
+ def to(self, device: Union[str, torch.device]):
29
+ self.spatrack.to(device)
30
+ self.spatrack.base_model.to(device)
31
+
32
+ @classmethod
33
+ def from_pretrained(
34
+ cls,
35
+ pretrained_model_name_or_path: Union[str, Path],
36
+ *,
37
+ force_download: bool = False,
38
+ cache_dir: Optional[str] = None,
39
+ device: Optional[Union[str, torch.device]] = None,
40
+ model_cfg: Optional[dict] = None,
41
+ **kwargs,
42
+    ) -> "Predictor":
43
+ """
44
+ Load a pretrained model from a local file or a remote repository.
45
+
46
+ Args:
47
+ pretrained_model_name_or_path (str or Path):
48
+ - Path to a local model file (e.g., `./model.pth`).
49
+ - HuggingFace Hub model ID (e.g., `username/model-name`).
50
+ force_download (bool, optional):
51
+ Whether to force re-download even if cached. Default: False.
52
+ cache_dir (str, optional):
53
+ Custom cache directory. Default: None (use default cache).
54
+ device (str or torch.device, optional):
55
+ Target device (e.g., "cuda", "cpu"). Default: None (keep original).
56
+ **kwargs:
57
+ Additional config overrides.
58
+
59
+ Returns:
60
+            Predictor: Loaded predictor wrapping the pretrained SpaTrack2 weights.
61
+ """
62
+ # (1) check the path is local or remote
63
+ if isinstance(pretrained_model_name_or_path, Path):
64
+ model_path = str(pretrained_model_name_or_path)
65
+ else:
66
+ model_path = pretrained_model_name_or_path
67
+ # (2) if the path is remote, download it
68
+ if not os.path.exists(model_path):
69
+ raise NotImplementedError("Remote download not implemented yet. Use a local path.")
70
+ # (3) load the model weights
71
+
72
+ state_dict = torch.load(model_path, map_location="cpu")
73
+ # (4) initialize the model (can load config.json if exists)
74
+ config_path = os.path.join(os.path.dirname(model_path), "config.json")
75
+ config = {}
76
+ if os.path.exists(config_path):
77
+ import json
78
+ with open(config_path, "r") as f:
79
+ config.update(json.load(f))
80
+ config.update(kwargs) # allow override the config
81
+ if model_cfg is not None:
82
+ config = model_cfg
83
+ model = cls(config)
84
+ if "model" in state_dict:
85
+ model.spatrack.load_state_dict(state_dict["model"], strict=False)
86
+ else:
87
+ model.spatrack.load_state_dict(state_dict, strict=False)
88
+ # (5) device management
89
+ if device is not None:
90
+ model.to(device)
91
+
92
+ return model
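A hedged usage sketch for the loader above. The checkpoint path is a placeholder, remote paths raise NotImplementedError, and model_cfg is assumed to be the OmegaConf config the repo builds elsewhere (it must expose the attributes read in __init__, e.g. Track_cfg.s_wind and Track_cfg.overlap):

import torch
from omegaconf import OmegaConf
from models.SpaTrackV2.models.predictor import Predictor

cfg = OmegaConf.load("config/magic_infer_moge.yaml")          # assumed config file from this repo
model = Predictor.from_pretrained(
    "./checkpoints/SpaTrack2.pth",                            # hypothetical local checkpoint path
    model_cfg=cfg.model if "model" in cfg else cfg,           # whichever node holds the model args
    device="cuda" if torch.cuda.is_available() else "cpu",
)
model.spatrack.eval()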
93
+
94
+ def forward(self, video: str|torch.Tensor|np.ndarray,
95
+ depth: str|torch.Tensor|np.ndarray=None,
96
+ unc_metric: str|torch.Tensor|np.ndarray=None,
97
+ intrs: str|torch.Tensor|np.ndarray=None,
98
+ extrs: str|torch.Tensor|np.ndarray=None,
99
+ queries=None, queries_3d=None, iters_track=4,
100
+ full_point=False, fps=30, track2d_gt=None,
101
+ fixed_cam=False, query_no_BA=False, stage=0,
102
+ support_frame=0, replace_ratio=0.6):
103
+ """
104
+ video: this could be a path to a video, a tensor of shape (T, C, H, W) or a numpy array of shape (T, C, H, W)
105
+ queries: (B, N, 2)
106
+ """
107
+
108
+ if isinstance(video, str):
109
+ video = decord.VideoReader(video)
110
+ video = video[::fps].asnumpy() # Convert to numpy array
111
+ video = np.array(video) # Ensure numpy array
112
+ video = torch.from_numpy(video).permute(0, 3, 1, 2).float()
113
+ elif isinstance(video, np.ndarray):
114
+ video = torch.from_numpy(video).float()
115
+
116
+ if isinstance(depth, np.ndarray):
117
+ depth = torch.from_numpy(depth).float()
118
+ if isinstance(intrs, np.ndarray):
119
+ intrs = torch.from_numpy(intrs).float()
120
+ if isinstance(extrs, np.ndarray):
121
+ extrs = torch.from_numpy(extrs).float()
122
+ if isinstance(unc_metric, np.ndarray):
123
+ unc_metric = torch.from_numpy(unc_metric).float()
124
+
125
+ T_, C, H, W = video.shape
126
+ step_slide = self.S_wind - self.overlap
127
+ if T_ > self.S_wind:
128
+
129
+ num_windows = (T_ - self.S_wind + step_slide) // step_slide
130
+ T = num_windows * step_slide + self.S_wind
131
+ pad_len = T - T_
132
+
133
+ video = torch.cat([video, video[-1:].repeat(T-video.shape[0], 1, 1, 1)], dim=0)
134
+ if depth is not None:
135
+ depth = torch.cat([depth, depth[-1:].repeat(T-depth.shape[0], 1, 1)], dim=0)
136
+ if intrs is not None:
137
+ intrs = torch.cat([intrs, intrs[-1:].repeat(T-intrs.shape[0], 1, 1)], dim=0)
138
+ if extrs is not None:
139
+ extrs = torch.cat([extrs, extrs[-1:].repeat(T-extrs.shape[0], 1, 1)], dim=0)
140
+ if unc_metric is not None:
141
+ unc_metric = torch.cat([unc_metric, unc_metric[-1:].repeat(T-unc_metric.shape[0], 1, 1)], dim=0)
142
+ with torch.no_grad():
143
+ ret = self.spatrack.forward_stream(video, queries, T_org=T_,
144
+ depth=depth, intrs=intrs, unc_metric_in=unc_metric, extrs=extrs, queries_3d=queries_3d,
145
+ window_len=self.S_wind, overlap_len=self.overlap, track2d_gt=track2d_gt, full_point=full_point, iters_track=iters_track,
146
+ fixed_cam=fixed_cam, query_no_BA=query_no_BA, stage=stage, support_frame=support_frame, replace_ratio=replace_ratio) + (video[:T_],)
147
+
148
+
149
+ return ret
150
+
151
+
152
+
153
+
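A minimal call sketch for the forward pass above, reusing the `model` instance from the loader sketch earlier (the video path and query pixels are placeholders; note that `fps` acts as a frame stride when a path is given):

import torch
queries = torch.tensor([[[320.0, 180.0], [100.0, 200.0]]])    # (B, N, 2) pixel coordinates
with torch.no_grad():
    ret = model("path/to/video.mp4", queries=queries, fps=1)  # fps=1 keeps every decoded frame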
models/SpaTrackV2/models/tracker3D/TrackRefiner.py ADDED
@@ -0,0 +1,1478 @@
1
+ import os, sys
2
+ import torch
3
+ import torch.amp
4
+ from models.SpaTrackV2.models.tracker3D.co_tracker.cotracker_base import CoTrackerThreeOffline, get_1d_sincos_pos_embed_from_grid
5
+ import torch.nn.functional as F
6
+ from models.SpaTrackV2.utils.visualizer import Visualizer
7
+ from models.SpaTrackV2.utils.model_utils import sample_features5d
8
+ from models.SpaTrackV2.models.blocks import bilinear_sampler
9
+ import torch.nn as nn
10
+ from models.SpaTrackV2.models.tracker3D.co_tracker.utils import (
11
+ EfficientUpdateFormer, AttnBlock, Attention, CrossAttnBlock,
12
+ sequence_BCE_loss, sequence_loss, sequence_prob_loss, sequence_dyn_prob_loss, sequence_loss_xyz, balanced_binary_cross_entropy
13
+ )
14
+ from torchvision.io import write_video
15
+ import math
16
+ from models.SpaTrackV2.models.tracker3D.co_tracker.utils import (
17
+ Mlp, BasicEncoder, EfficientUpdateFormer, GeometryEncoder, NeighborTransformer, CorrPointformer
18
+ )
19
+ from models.SpaTrackV2.utils.embeddings import get_3d_sincos_pos_embed_from_grid
20
+ from einops import rearrange, repeat
21
+ from models.SpaTrackV2.models.tracker3D.spatrack_modules.utils import (
22
+ EfficientUpdateFormer3D, weighted_procrustes_torch, posenc, key_fr_wprocrustes, get_topo_mask,
23
+ TrackFusion, get_nth_visible_time_index
24
+ )
25
+ from models.SpaTrackV2.models.tracker3D.spatrack_modules.ba import extract_static_from_3DTracks, ba_pycolmap
26
+ from models.SpaTrackV2.models.tracker3D.spatrack_modules.pointmap_updator import PointMapUpdator
27
+ from models.SpaTrackV2.models.depth_refiner.depth_refiner import TrackStablizer
28
+ from models.SpaTrackV2.models.tracker3D.spatrack_modules.alignment import affine_invariant_global_loss
29
+ from models.SpaTrackV2.models.tracker3D.delta_utils.upsample_transformer import UpsampleTransformerAlibi
30
+
31
+ class TrackRefiner3D(CoTrackerThreeOffline):
32
+
33
+ def __init__(self, args=None):
34
+ super().__init__(**args.base)
35
+
36
+ """
37
+ This is a 3D wrapper around CoTracker: it loads the pretrained CoTracker weights and
38
+ jointly refines the `camera pose`, `3D tracks`, `video depth`, `visibility` and `conf`.
39
+ """
40
+ self.updateformer3D = EfficientUpdateFormer3D(self.updateformer)
41
+ self.corr_depth_mlp = Mlp(in_features=256, hidden_features=256, out_features=256)
42
+ self.rel_pos_mlp = Mlp(in_features=75, hidden_features=128, out_features=128)
43
+ self.rel_pos_glob_mlp = Mlp(in_features=75, hidden_features=128, out_features=256)
44
+ self.corr_xyz_mlp = Mlp(in_features=256, hidden_features=128, out_features=128)
45
+ self.xyz_mlp = Mlp(in_features=126, hidden_features=128, out_features=84)
46
+ # self.track_feat_mlp = Mlp(in_features=1110, hidden_features=128, out_features=128)
47
+ self.proj_xyz_embed = Mlp(in_features=1210+50, hidden_features=1110, out_features=1110)
48
+ # get the anchor point's embedding, and init the pts refiner
49
+ update_pts = True
50
+ # self.corr_transformer = nn.ModuleList([
51
+ # CorrPointformer(
52
+ # dim=128,
53
+ # num_heads=8,
54
+ # head_dim=128 // 8,
55
+ # mlp_ratio=4.0,
56
+ # )
57
+ # for _ in range(self.corr_levels)
58
+ # ])
59
+ self.corr_transformer = nn.ModuleList([
60
+ CorrPointformer(
61
+ dim=128,
62
+ num_heads=8,
63
+ head_dim=128 // 8,
64
+ mlp_ratio=4.0,
65
+ )
66
+ ]
67
+ )
68
+ self.fnet = BasicEncoder(input_dim=3,
69
+ output_dim=self.latent_dim, stride=self.stride)
70
+ self.corr3d_radius = 3
71
+
72
+ if args.stablizer:
73
+ self.scale_shift_tokens = nn.Parameter(torch.randn(1, 2, self.latent_dim, requires_grad=True))
74
+ self.upsample_kernel_size = 5
75
+ self.residual_embedding = nn.Parameter(torch.randn(
76
+ self.latent_dim, self.model_resolution[0]//16,
77
+ self.model_resolution[1]//16, requires_grad=True))
78
+ self.dense_mlp = nn.Conv2d(2*self.latent_dim+63, self.latent_dim, kernel_size=1, stride=1, padding=0)
79
+ self.upsample_factor = 4
80
+ self.upsample_transformer = UpsampleTransformerAlibi(
81
+ kernel_size=self.upsample_kernel_size, # kernel_size=3, #
82
+ stride=self.stride,
83
+ latent_dim=self.latent_dim,
84
+ num_attn_blocks=2,
85
+ upsample_factor=4,
86
+ )
87
+ else:
88
+ self.update_pointmap = None
89
+
90
+ self.mode = args.mode
91
+ if self.mode == "online":
92
+ self.s_wind = args.s_wind
93
+ self.overlap = args.overlap
94
+
95
+ def upsample_with_mask(
96
+ self, inp: torch.Tensor, mask: torch.Tensor
97
+ ) -> torch.Tensor:
98
+ """Upsample a dense field (B, C, H, W) -> (B, C, factor*H, factor*W) using a learned convex combination over k x k neighborhoods"""
99
+ H, W = inp.shape[-2:]
100
+ up_inp = F.unfold(
101
+ inp, [self.upsample_kernel_size, self.upsample_kernel_size], padding=(self.upsample_kernel_size - 1) // 2
102
+ )
103
+ up_inp = rearrange(up_inp, "b c (h w) -> b c h w", h=H, w=W)
104
+ up_inp = F.interpolate(up_inp, scale_factor=self.upsample_factor, mode="nearest")
105
+ up_inp = rearrange(
106
+ up_inp, "b (c i j) h w -> b c (i j) h w", i=self.upsample_kernel_size, j=self.upsample_kernel_size
107
+ )
108
+
109
+ up_inp = torch.sum(mask * up_inp, dim=2)
110
+ return up_inp
111
+
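Shape walk-through for upsample_with_mask above (k = upsample_kernel_size, f = upsample_factor); for the weighted sum to be a convex combination, the mask should be non-negative and normalized (e.g. softmaxed) over its k*k dimension:

#   inp  : (B, C, H, W)  --unfold-->  (B, C*k*k, H, W)  --interpolate x f-->  (B, C*k*k, f*H, f*W)
#   then reshaped to (B, C, k*k, f*H, f*W), multiplied by a mask broadcasting as (B, 1, k*k, f*H, f*W)
#   and summed over dim=2, giving an output of shape (B, C, f*H, f*W)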
112
+ def track_from_cam(self, queries, c2w_traj, intrs,
113
+ rgbs=None, visualize=False):
114
+ """
115
+ Initialize tracks by transforming the query points with the camera trajectory.
116
+
117
+ Args:
118
+ queries: B T N 4
119
+ c2w_traj: B T 4 4
120
+ intrs: B T 3 3
121
+ """
122
+ B, T, N, _ = queries.shape
123
+ query_t = queries[:,0,:,0].to(torch.int64) # B N
124
+ query_c2w = torch.gather(c2w_traj,
125
+ dim=1, index=query_t[..., None, None].expand(-1, -1, 4, 4)) # B N 4 4
126
+ query_intr = torch.gather(intrs,
127
+ dim=1, index=query_t[..., None, None].expand(-1, -1, 3, 3)) # B N 3 3
128
+ query_pts = queries[:,0,:,1:4].clone() # B N 3
129
+ query_d = queries[:,0,:,3:4] # B N 3
130
+ query_pts[...,2] = 1
131
+
132
+ cam_pts = torch.einsum("bnij,bnj->bni", torch.inverse(query_intr), query_pts)*query_d # B N 3
133
+ # convert to world
134
+ cam_pts_h = torch.zeros(B, N, 4, device=cam_pts.device)
135
+ cam_pts_h[..., :3] = cam_pts
136
+ cam_pts_h[..., 3] = 1
137
+ world_pts = torch.einsum("bnij,bnj->bni", query_c2w, cam_pts_h)
138
+ # convert to other frames
139
+ cam_other_pts_ = torch.einsum("btnij,btnj->btni",
140
+ torch.inverse(c2w_traj[:,:,None].float().repeat(1,1,N,1,1)),
141
+ world_pts[:,None].repeat(1,T,1,1))
142
+ cam_depth = cam_other_pts_[...,2:3]
143
+ cam_other_pts = cam_other_pts_[...,:3] / (cam_other_pts_[...,2:3].abs()+1e-6)
144
+ cam_other_pts = torch.einsum("btnij,btnj->btni", intrs[:,:,None].repeat(1,1,N,1,1), cam_other_pts[...,:3])
145
+ cam_other_pts[..., 2:] = cam_depth
146
+
147
+ if visualize:
148
+ viser = Visualizer(save_dir=".", grayscale=True,
149
+ fps=10, pad_value=50, tracks_leave_trace=0)
150
+ cam_other_pts[..., 0] /= self.factor_x
151
+ cam_other_pts[..., 1] /= self.factor_y
152
+ viser.visualize(video=rgbs, tracks=cam_other_pts[..., :2], filename="test")
153
+
154
+
155
+ init_xyzs = cam_other_pts
156
+
157
+ return init_xyzs, world_pts[..., :3], cam_other_pts_[..., :3]
158
+
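The chain implemented above, reduced to a single query point (the intrinsics K and the two camera-to-world poses are placeholder values, not model tensors): unproject the pixel with its depth, lift it to world space with the pose of its query frame, then express it in another frame and reproject:

import torch
K = torch.tensor([[500.0, 0.0, 320.0], [0.0, 500.0, 240.0], [0.0, 0.0, 1.0]])
c2w_src, c2w_dst = torch.eye(4), torch.eye(4)               # camera-to-world poses (placeholders)
u, v, d = 100.0, 150.0, 2.0                                 # query pixel and its depth
cam_src = torch.inverse(K) @ torch.tensor([u, v, 1.0]) * d  # unproject into the source camera
world = c2w_src[:3, :3] @ cam_src + c2w_src[:3, 3]          # source camera -> world
w2c_dst = torch.inverse(c2w_dst)
cam_dst = w2c_dst[:3, :3] @ world + w2c_dst[:3, 3]          # world -> target camera
uv_dst = (K @ (cam_dst / cam_dst[2].abs()))[:2]             # reproject; abs() mirrors the sign handling above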
159
+ def cam_from_track(self, tracks, intrs,
160
+ dyn_prob=None, metric_unc=None,
161
+ vis_est=None, only_cam_pts=False,
162
+ track_feat_concat=None,
163
+ tracks_xyz=None,
164
+ query_pts=None,
165
+ fixed_cam=False,
166
+ depth_unproj=None,
167
+ cam_gt=None,
168
+ init_pose=False,
169
+ ):
170
+ """
171
+ Estimate the camera trajectory from the predicted tracks (optionally refining the 3D tracks and intrinsics).
172
+
173
+ Args:
174
+ tracks: B T N 3
175
+ scale_est: 1 1
176
+ shift_est: 1 1
177
+ intrs: B T 3 3
178
+ dyn_prob: B T N
179
+ metric_unc: B N 1
180
+ query_pts: B T N 3
181
+ """
182
+ if tracks_xyz is not None:
183
+ B, T, N, _ = tracks.shape
184
+ cam_pts = tracks_xyz
185
+ intr_repeat = intrs[:,:,None].repeat(1,1,N,1,1)
186
+ else:
187
+ B, T, N, _ = tracks.shape
188
+ # get the pts in cam coordinate
189
+ tracks_xy = tracks[...,:3].clone().detach() # B T N 3
190
+ # tracks_z = 1/(tracks[...,2:] * scale_est + shift_est) # B T N 1
191
+ tracks_z = tracks[...,2:].detach() # B T N 1
192
+ tracks_xy[...,2] = 1
193
+ intr_repeat = intrs[:,:,None].repeat(1,1,N,1,1)
194
+ cam_pts = torch.einsum("bnij,bnj->bni",
195
+ torch.inverse(intr_repeat.view(B*T,N,3,3)).float(),
196
+ tracks_xy.view(B*T, N, 3))*(tracks_z.view(B*T,N,1).abs()) # B*T N 3
197
+ cam_pts[...,2] *= torch.sign(tracks_z.view(B*T,N))
198
+ # get the normalized cam pts, and pts refiner
199
+ mask_z = (tracks_z.max(dim=1)[0]<200).squeeze()
200
+ cam_pts = cam_pts.view(B, T, N, 3)
201
+
202
+ if only_cam_pts:
203
+ return cam_pts
204
+ dyn_prob = dyn_prob.mean(dim=1)[..., None]
205
+ # B T N 3 -> local frames coordinates. transformer static points B T N 3 -> B T N 3 static (B T N 3) -> same -> dynamic points @ C2T.inverse()
206
+ # get the cam pose
207
+ vis_est_ = vis_est[:,:,None,:]
208
+ graph_matrix = (vis_est_*vis_est_.permute(0, 2,1,3)).detach()
209
+ # find the max connected component
210
+ key_fr_idx = [0]
211
+ weight_final = (metric_unc) # * vis_est
212
+
213
+
214
+ with torch.amp.autocast(enabled=False, device_type='cuda'):
215
+ if fixed_cam:
216
+ c2w_traj_init = self.c2w_est_curr
217
+ c2w_traj_glob = c2w_traj_init
218
+ cam_pts_refine = cam_pts
219
+ intrs_refine = intrs
220
+ xy_refine = query_pts[...,1:3]
221
+ world_tracks_init = torch.einsum("btij,btnj->btni", c2w_traj_init[:,:,:3,:3], cam_pts) + c2w_traj_init[:,:,None,:3,3]
222
+ world_tracks_refined = world_tracks_init
223
+ # extract the stable static points for refine the camera pose
224
+ intrs_dn = intrs.clone()
225
+ intrs_dn[...,0,:] *= self.factor_x
226
+ intrs_dn[...,1,:] *= self.factor_y
227
+ _, query_world_pts, _ = self.track_from_cam(query_pts, c2w_traj_init, intrs_dn)
228
+ world_tracks_static, mask_static, mask_topk, vis_mask_static, tracks2d_static = extract_static_from_3DTracks(world_tracks_init,
229
+ dyn_prob, query_world_pts,
230
+ vis_est, tracks, img_size=self.image_size,
231
+ K=0)
232
+ world_static_refine = world_tracks_static
233
+
234
+ else:
235
+
236
+ if (not self.training):
237
+ # if (self.c2w_est_curr==torch.eye(4, device=cam_pts.device).repeat(B, T, 1, 1)).all():
238
+ campts_update = torch.einsum("btij,btnj->btni", self.c2w_est_curr[...,:3,:3], cam_pts) + self.c2w_est_curr[...,None,:3,3]
239
+ # campts_update = cam_pts
240
+ c2w_traj_init_update = key_fr_wprocrustes(campts_update, graph_matrix,
241
+ (weight_final*(1-dyn_prob)).permute(0,2,1), vis_est_.permute(0,1,3,2))
242
+ c2w_traj_init = [email protected]_est_curr
243
+ # else:
244
+ # c2w_traj_init = self.c2w_est_curr # extract the stable static points for refine the camera pose
245
+ else:
246
+ # if (self.c2w_est_curr==torch.eye(4, device=cam_pts.device).repeat(B, T, 1, 1)).all():
247
+ campts_update = torch.einsum("btij,btnj->btni", self.c2w_est_curr[...,:3,:3], cam_pts) + self.c2w_est_curr[...,None,:3,3]
248
+ # campts_update = cam_pts
249
+ c2w_traj_init_update = key_fr_wprocrustes(campts_update, graph_matrix,
250
+ (weight_final*(1-dyn_prob)).permute(0,2,1), vis_est_.permute(0,1,3,2))
251
+ c2w_traj_init = [email protected]_est_curr
252
+ # else:
253
+ # c2w_traj_init = self.c2w_est_curr # extract the stable static points for refine the camera pose
254
+
255
+ intrs_dn = intrs.clone()
256
+ intrs_dn[...,0,:] *= self.factor_x
257
+ intrs_dn[...,1,:] *= self.factor_y
258
+ _, query_world_pts, _ = self.track_from_cam(query_pts, c2w_traj_init, intrs_dn)
259
+ # refine the world tracks
260
+ world_tracks_init = torch.einsum("btij,btnj->btni", c2w_traj_init[:,:,:3,:3], cam_pts) + c2w_traj_init[:,:,None,:3,3]
261
+ world_tracks_static, mask_static, mask_topk, vis_mask_static, tracks2d_static = extract_static_from_3DTracks(world_tracks_init,
262
+ dyn_prob, query_world_pts,
263
+ vis_est, tracks, img_size=self.image_size,
264
+ K=150 if self.training else 1500)
265
+ # calculate the efficient ba
266
+ cam_tracks_static = cam_pts[:,:,mask_static.squeeze(),:][:,:,mask_topk.squeeze(),:]
267
+ cam_tracks_static[...,2] = depth_unproj.view(B, T, N)[:,:,mask_static.squeeze()][:,:,mask_topk.squeeze()]
268
+
269
+ c2w_traj_glob, world_static_refine, intrs_refine = ba_pycolmap(world_tracks_static, intrs,
270
+ c2w_traj_init, vis_mask_static,
271
+ tracks2d_static, self.image_size,
272
+ cam_tracks_static=cam_tracks_static,
273
+ training=self.training, query_pts=query_pts)
274
+ c2w_traj_glob = c2w_traj_glob.view(B, T, 4, 4)
275
+ world_tracks_refined = world_tracks_init
276
+
277
+ #NOTE: merge the index of static points and topk points
278
+ # merge_idx = torch.where(mask_static.squeeze()>0)[0][mask_topk.squeeze()]
279
+ # world_tracks_refined[:,:,merge_idx] = world_static_refine
280
+
281
+ # test the procrustes
282
+ w2c_traj_glob = torch.inverse(c2w_traj_init.detach())
283
+ cam_pts_refine = torch.einsum("btij,btnj->btni", w2c_traj_glob[:,:,:3,:3], world_tracks_refined) + w2c_traj_glob[:,:,None,:3,3]
284
+ # get the xyz_refine
285
+ #TODO: refiner
286
+ cam_pts4_proj = cam_pts_refine.clone()
287
+ cam_pts4_proj[...,2] *= torch.sign(cam_pts4_proj[...,2:3].view(B*T,N))
288
+ xy_refine = torch.einsum("btnij,btnj->btni", intrs_refine.view(B,T,1,3,3).repeat(1,1,N,1,1), cam_pts4_proj/cam_pts4_proj[...,2:3].abs())
289
+ xy_refine[..., 2] = cam_pts4_proj[...,2:3].view(B*T,N)
290
+ # xy_refine = torch.zeros_like(cam_pts_refine)[...,:2]
291
+ return c2w_traj_glob, cam_pts_refine, intrs_refine, xy_refine, world_tracks_init, world_tracks_refined, c2w_traj_init
292
+
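The pose initialization above leans on a weighted Procrustes alignment of (mostly static) camera-frame points via `key_fr_wprocrustes`; as a reference for that sub-step, a standalone weighted Kabsch solver for one pair of point sets looks roughly like this (P, Q, w are placeholders, not the batched tensors used above):

import torch

def weighted_rigid_align(P, Q, w):
    """R, t minimizing sum_i w_i * ||R @ P_i + t - Q_i||^2, with P, Q: (N, 3) and w: (N,)."""
    w = w / w.sum()
    mu_p, mu_q = (w[:, None] * P).sum(0), (w[:, None] * Q).sum(0)
    X, Y = P - mu_p, Q - mu_q
    H = (w[:, None] * X).T @ Y                   # 3x3 weighted cross-covariance
    U, S, Vt = torch.linalg.svd(H)
    D = torch.eye(3, dtype=P.dtype)
    D[2, 2] = torch.sign(torch.det(Vt.T @ U.T))  # guard against reflections
    R = Vt.T @ D @ U.T
    t = mu_q - R @ mu_p
    return R, t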
293
+ def extract_img_feat(self, video, fmaps_chunk_size=200):
294
+ B, T, C, H, W = video.shape
295
+ dtype = video.dtype
296
+ H4, W4 = H // self.stride, W // self.stride
297
+ # Compute convolutional features for the video or for the current chunk in case of online mode
298
+ if T > fmaps_chunk_size:
299
+ fmaps = []
300
+ for t in range(0, T, fmaps_chunk_size):
301
+ video_chunk = video[:, t : t + fmaps_chunk_size]
302
+ fmaps_chunk = self.fnet(video_chunk.reshape(-1, C, H, W))
303
+ T_chunk = video_chunk.shape[1]
304
+ C_chunk, H_chunk, W_chunk = fmaps_chunk.shape[1:]
305
+ fmaps.append(fmaps_chunk.reshape(B, T_chunk, C_chunk, H_chunk, W_chunk))
306
+ fmaps = torch.cat(fmaps, dim=1).reshape(-1, C_chunk, H_chunk, W_chunk)
307
+ else:
308
+ fmaps = self.fnet(video.reshape(-1, C, H, W))
309
+ fmaps = fmaps.permute(0, 2, 3, 1)
310
+ fmaps = fmaps / torch.sqrt(
311
+ torch.maximum(
312
+ torch.sum(torch.square(fmaps), axis=-1, keepdims=True),
313
+ torch.tensor(1e-12, device=fmaps.device),
314
+ )
315
+ )
316
+ fmaps = fmaps.permute(0, 3, 1, 2).reshape(
317
+ B, -1, self.latent_dim, H // self.stride, W // self.stride
318
+ )
319
+ fmaps = fmaps.to(dtype)
320
+
321
+ return fmaps
322
+
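# Note on extract_img_feat above: the division by sqrt(max(sum(fmaps**2), 1e-12)) L2-normalizes each
# feature vector along the channel dimension; since sqrt(max(s, 1e-12)) == max(sqrt(s), 1e-6) for s >= 0,
# it is equivalent to F.normalize(fmaps, dim=-1, eps=1e-6) before the channels are permuted back.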
323
+ def norm_xyz(self, xyz):
324
+ """
325
+ xyz can be (B T N 3) or (B T 3 H W) or (B N 3)
326
+ """
327
+ if xyz.ndim == 3:
328
+ min_pts = self.min_pts
329
+ max_pts = self.max_pts
330
+ return (xyz - min_pts[None,None,:]) / (max_pts - min_pts)[None,None,:] * 2 - 1
331
+ elif xyz.ndim == 4:
332
+ min_pts = self.min_pts
333
+ max_pts = self.max_pts
334
+ return (xyz - min_pts[None,None,None,:]) / (max_pts - min_pts)[None,None,None,:] * 2 - 1
335
+ elif xyz.ndim == 5:
336
+ if xyz.shape[2] == 3:
337
+ min_pts = self.min_pts
338
+ max_pts = self.max_pts
339
+ return (xyz - min_pts[None,None,:,None,None]) / (max_pts - min_pts)[None,None,:,None,None] * 2 - 1
340
+ elif xyz.shape[-1] == 3:
341
+ min_pts = self.min_pts
342
+ max_pts = self.max_pts
343
+ return (xyz - min_pts[None,None,None,None,:]) / (max_pts - min_pts)[None,None,None,None,:] * 2 - 1
344
+
345
+ def denorm_xyz(self, xyz):
346
+ """
347
+ xyz can be (B T N 3) or (B T 3 H W) or (B N 3)
348
+ """
349
+ if xyz.ndim == 3:
350
+ min_pts = self.min_pts
351
+ max_pts = self.max_pts
352
+ return (xyz + 1) / 2 * (max_pts - min_pts)[None,None,:] + min_pts[None,None,:]
353
+ elif xyz.ndim == 4:
354
+ min_pts = self.min_pts
355
+ max_pts = self.max_pts
356
+ return (xyz + 1) / 2 * (max_pts - min_pts)[None,None,None,:] + min_pts[None,None,None,:]
357
+ elif xyz.ndim == 5:
358
+ if xyz.shape[2] == 3:
359
+ min_pts = self.min_pts
360
+ max_pts = self.max_pts
361
+ return (xyz + 1) / 2 * (max_pts - min_pts)[None,None,:,None,None] + min_pts[None,None,:,None,None]
362
+ elif xyz.shape[-1] == 3:
363
+ min_pts = self.min_pts
364
+ max_pts = self.max_pts
365
+ return (xyz + 1) / 2 * (max_pts - min_pts)[None,None,None,None,:] + min_pts[None,None,None,None,:]
366
+
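norm_xyz / denorm_xyz above map coordinates into [-1, 1] per axis using the min_pts / max_pts statistics cached in forward (mean ± 3·std of the query point map) and are exact inverses; a standalone mirror of the (..., 3) case, relying on broadcasting instead of the explicit ndim branches:

import torch

def norm_xyz(xyz, min_pts, max_pts):      # (..., 3) -> [-1, 1] per axis
    return (xyz - min_pts) / (max_pts - min_pts) * 2 - 1

def denorm_xyz(xyz, min_pts, max_pts):    # exact inverse of norm_xyz
    return (xyz + 1) / 2 * (max_pts - min_pts) + min_pts

pts = torch.randn(2, 8, 64, 3)            # dummy (B, T, N, 3) points
mn = pts.mean((0, 1, 2)) - 3 * pts.std((0, 1, 2))
mx = pts.mean((0, 1, 2)) + 3 * pts.std((0, 1, 2))
assert torch.allclose(denorm_xyz(norm_xyz(pts, mn, mx), mn, mx), pts, atol=1e-5)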
367
+ def forward(
368
+ self,
369
+ video,
370
+ metric_depth,
371
+ metric_unc,
372
+ point_map,
373
+ queries,
374
+ pts_q_3d=None,
375
+ overlap_d=None,
376
+ iters=4,
377
+ add_space_attn=True,
378
+ fmaps_chunk_size=200,
379
+ intrs=None,
380
+ traj3d_gt=None,
381
+ custom_vid=False,
382
+ vis_gt=None,
383
+ prec_fx=None,
384
+ prec_fy=None,
385
+ cam_gt=None,
386
+ init_pose=False,
387
+ support_pts_q=None,
388
+ update_pointmap=True,
389
+ fixed_cam=False,
390
+ query_no_BA=False,
391
+ stage=0,
392
+ cache=None,
393
+ points_map_gt=None,
394
+ valid_only=False,
395
+ replace_ratio=0.6,
396
+ ):
397
+ """Predict tracks
398
+
399
+ Args:
400
+ video (FloatTensor[B, T, 3, H, W]): input videos.
401
+ queries (FloatTensor[B, N, 3]): point queries.
402
+ iters (int, optional): number of updates. Defaults to 4.
403
+ vdp_feats_cache: last layer's feature of depth
404
+ tracks_init: B T N 3 the initialization of 3D tracks computed by cam pose
405
+ Returns:
406
+ - coords_predicted (FloatTensor[B, T, N, 2]):
407
+ - vis_predicted (FloatTensor[B, T, N]):
408
+ - train_data: `None` if `is_train` is false, otherwise:
409
+ - all_vis_predictions (List[FloatTensor[B, S, N, 1]]):
410
+ - all_coords_predictions (List[FloatTensor[B, S, N, 2]]):
411
+ - mask (BoolTensor[B, T, N]):
412
+ """
413
+ self.stage = stage
414
+
415
+ if cam_gt is not None:
416
+ cam_gt = cam_gt.clone()
417
+ cam_gt = torch.inverse(cam_gt[:,:1,...])@cam_gt
418
+ B, T, C, _, _ = video.shape
419
+ _, _, H_, W_ = metric_depth.shape
420
+ _, _, N, __ = queries.shape
421
+ if (vis_gt is not None)&(queries.shape[1] == T):
422
+ aug_visb = True
423
+ if aug_visb:
424
+ number_visible = vis_gt.sum(dim=1)
425
+ ratio_rand = torch.rand(B, N, device=vis_gt.device)
426
+ # first_positive_inds = get_nth_visible_time_index(vis_gt, 1)
427
+ first_positive_inds = get_nth_visible_time_index(vis_gt, (number_visible*ratio_rand).long().clamp(min=1, max=T))
428
+
429
+ assert (torch.gather(vis_gt, 1, first_positive_inds[:, None, :].repeat(1, T, 1)) < 0).sum() == 0
430
+ else:
431
+ __, first_positive_inds = torch.max(vis_gt, dim=1)
432
+ first_positive_inds = first_positive_inds.long()
433
+ gather = torch.gather(
434
+ queries, 1, first_positive_inds[:, :, None, None].repeat(1, 1, N, 2)
435
+ )
436
+ xys = torch.diagonal(gather, dim1=1, dim2=2).permute(0, 2, 1)
437
+ gather_xyz = torch.gather(
438
+ traj3d_gt, 1, first_positive_inds[:, :, None, None].repeat(1, 1, N, 3)
439
+ )
440
+ z_gt_query = torch.diagonal(gather_xyz, dim1=1, dim2=2).permute(0, 2, 1)[...,2]
441
+ queries = torch.cat([first_positive_inds[:, :, None], xys], dim=-1)
442
+ queries = torch.cat([queries, support_pts_q[:,0]], dim=1)
443
+ else:
444
+ # Generate the 768 points randomly in the whole video
445
+ queries = queries.squeeze(1)
446
+ ba_len = queries.shape[1]
447
+ z_gt_query = None
448
+ if support_pts_q is not None:
449
+ queries = torch.cat([queries, support_pts_q[:,0]], dim=1)
450
+
451
+ if (abs(prec_fx-1.0) > 1e-4) & (self.training) & (traj3d_gt is not None):
452
+ traj3d_gt[..., 0] /= prec_fx
453
+ traj3d_gt[..., 1] /= prec_fy
454
+ queries[...,1] /= prec_fx
455
+ queries[...,2] /= prec_fy
456
+
457
+ video_vis = F.interpolate(video.clone().view(B*T, 3, video.shape[-2], video.shape[-1]), (H_, W_), mode="bilinear", align_corners=False).view(B, T, 3, H_, W_)
458
+
459
+ self.image_size = torch.tensor([H_, W_])
460
+ # self.model_resolution = (H_, W_)
461
+ # resize the queries and intrs
462
+ self.factor_x = self.model_resolution[1]/W_
463
+ self.factor_y = self.model_resolution[0]/H_
464
+ queries[...,1] *= self.factor_x
465
+ queries[...,2] *= self.factor_y
466
+ intrs_org = intrs.clone()
467
+ intrs[...,0,:] *= self.factor_x
468
+ intrs[...,1,:] *= self.factor_y
469
+
470
+ # get the fmaps and color features
471
+ video = F.interpolate(video.view(B*T, 3, video.shape[-2], video.shape[-1]),
472
+ (self.model_resolution[0], self.model_resolution[1])).view(B, T, 3, self.model_resolution[0], self.model_resolution[1])
473
+ _, _, _, H, W = video.shape
474
+ if cache is not None:
475
+ T_cache = cache["fmaps"].shape[0]
476
+ fmaps = self.extract_img_feat(video[:,T_cache:], fmaps_chunk_size=fmaps_chunk_size)
477
+ fmaps = torch.cat([cache["fmaps"][None], fmaps], dim=1)
478
+ else:
479
+ fmaps = self.extract_img_feat(video, fmaps_chunk_size=fmaps_chunk_size)
480
+ fmaps_org = fmaps.clone()
481
+
482
+ metric_depth = F.interpolate(metric_depth.view(B*T, 1, H_, W_),
483
+ (self.model_resolution[0], self.model_resolution[1]),mode="nearest").view(B*T, 1, self.model_resolution[0], self.model_resolution[1]).clamp(0.01, 200)
484
+ self.metric_unc_org = metric_unc.clone()
485
+ metric_unc = F.interpolate(metric_unc.view(B*T, 1, H_, W_),
486
+ (self.model_resolution[0], self.model_resolution[1]),mode="nearest").view(B*T, 1, self.model_resolution[0], self.model_resolution[1])
487
+ if (self.stage == 2) & (self.training):
488
+ scale_rand = (torch.rand(B, T, device=video.device) - 0.5) + 1
489
+ point_map = scale_rand.view(B*T,1,1,1) * point_map
490
+
491
+ point_map_org = point_map.permute(0,3,1,2).view(B*T, 3, H_, W_).clone()
492
+ point_map = F.interpolate(point_map_org.clone(),
493
+ (self.model_resolution[0], self.model_resolution[1]),mode="nearest").view(B*T, 3, self.model_resolution[0], self.model_resolution[1])
494
+ # align the point map
495
+ point_map_org_train = point_map_org.view(B*T, 3, H_, W_).clone()
496
+
497
+ if (stage == 2):
498
+ # align the point map
499
+ try:
500
+ self.pred_points, scale_gt, shift_gt = affine_invariant_global_loss(
501
+ point_map_org_train.permute(0,2,3,1),
502
+ points_map_gt,
503
+ mask=self.metric_unc_org[:,0]>0.5,
504
+ align_resolution=32,
505
+ only_align=True
506
+ )
507
+ except:
508
+ scale_gt, shift_gt = torch.ones(B*T).to(video.device), torch.zeros(B*T,3).to(video.device)
509
+ self.scale_gt, self.shift_gt = scale_gt, shift_gt
510
+ else:
511
+ scale_est, shift_est = None, None
512
+
513
+ # extract the pts features
514
+ device = queries.device
515
+ assert H % self.stride == 0 and W % self.stride == 0
516
+
517
+ B, N, __ = queries.shape
518
+ queries_z = sample_features5d(metric_depth.view(B, T, 1, H, W),
519
+ queries[:,None], interp_mode="nearest").squeeze(1)
520
+ queries_z_unc = sample_features5d(metric_unc.view(B, T, 1, H, W),
521
+ queries[:,None], interp_mode="nearest").squeeze(1)
522
+
523
+ queries_rgb = sample_features5d(video.view(B, T, C, H, W),
524
+ queries[:,None], interp_mode="nearest").squeeze(1)
525
+ queries_point_map = sample_features5d(point_map.view(B, T, 3, H, W),
526
+ queries[:,None], interp_mode="nearest").squeeze(1)
527
+ if ((queries_z > 100)*(queries_z == 0)).sum() > 0:
528
+ import pdb; pdb.set_trace()
529
+
530
+ if overlap_d is not None:
531
+ queries_z[:,:overlap_d.shape[1],:] = overlap_d[...,None]
532
+ queries_point_map[:,:overlap_d.shape[1],2:] = overlap_d[...,None]
533
+
534
+ if pts_q_3d is not None:
535
+ scale_factor = (pts_q_3d[...,-1].permute(0,2,1) / queries_z[:,:pts_q_3d.shape[2],:]).squeeze().median()
536
+ queries_z[:,:pts_q_3d.shape[2],:] = pts_q_3d[...,-1].permute(0,2,1) / scale_factor
537
+ queries_point_map[:,:pts_q_3d.shape[2],2:] = pts_q_3d[...,-1].permute(0,2,1) / scale_factor
538
+
539
+ # normalize the points
540
+ self.min_pts, self.max_pts = queries_point_map.mean(dim=(0,1)) - 3*queries_point_map.std(dim=(0,1)), queries_point_map.mean(dim=(0,1)) + 3*queries_point_map.std(dim=(0,1))
541
+ queries_point_map = self.norm_xyz(queries_point_map)
542
+ queries_point_map_ = queries_point_map.reshape(B, 1, N, 3).expand(B, T, N, 3).clone()
543
+ point_map = self.norm_xyz(point_map.view(B, T, 3, H, W)).view(B*T, 3, H, W)
544
+
545
+ if z_gt_query is not None:
546
+ queries_z[:,:z_gt_query.shape[1],:] = z_gt_query[:,:,None]
547
+ mask_traj_gt = ((queries_z[:,:z_gt_query.shape[1],:] - z_gt_query[:,:,None])).abs() < 0.1
548
+ else:
549
+ if traj3d_gt is not None:
550
+ mask_traj_gt = torch.ones_like(queries_z[:, :traj3d_gt.shape[2]]).bool()
551
+ else:
552
+ mask_traj_gt = torch.ones_like(queries_z).bool()
553
+
554
+ queries_xyz = torch.cat([queries, queries_z], dim=-1)[:,None].repeat(1, T, 1, 1)
555
+ if cache is not None:
556
+ cache_T, cache_N = cache["track2d_pred_cache"].shape[0], cache["track2d_pred_cache"].shape[1]
557
+ cachexy = cache["track2d_pred_cache"].clone()
558
+ cachexy[...,0] = cachexy[...,0] * self.factor_x
559
+ cachexy[...,1] = cachexy[...,1] * self.factor_y
560
+ # initialize the 2d points with cache
561
+ queries_xyz[:,:cache_T,:cache_N,1:] = cachexy
562
+ queries_xyz[:,cache_T:,:cache_N,1:] = cachexy[-1:]
563
+ # initialize the 3d points with cache
564
+ queries_point_map_[:,:cache_T,:cache_N,:] = self.norm_xyz(cache["track3d_pred_cache"][None])
565
+ queries_point_map_[:,cache_T:,:cache_N,:] = self.norm_xyz(cache["track3d_pred_cache"][-1:][None])
566
+
567
+ if cam_gt is not None:
568
+ q_static_proj, q_xyz_world, q_xyz_cam = self.track_from_cam(queries_xyz, cam_gt,
569
+ intrs, rgbs=video_vis, visualize=False)
570
+ q_static_proj[..., 0] /= self.factor_x
571
+ q_static_proj[..., 1] /= self.factor_y
572
+
573
+
574
+ assert T >= 1 # at least one frame is required (two or more are needed for actual tracking)
575
+ video = 2 * (video / 255.0) - 1.0
576
+ dtype = video.dtype
577
+ queried_frames = queries[:, :, 0].long()
578
+
579
+ queried_coords = queries[..., 1:3]
580
+ queried_coords = queried_coords / self.stride
581
+
582
+ # We store our predictions here
583
+ (all_coords_predictions, all_coords_xyz_predictions,all_vis_predictions,
584
+ all_confidence_predictions, all_cam_predictions, all_dynamic_prob_predictions,
585
+ all_cam_pts_predictions, all_world_tracks_predictions, all_world_tracks_refined_predictions,
586
+ all_scale_est, all_shift_est) = (
587
+ [],
588
+ [],
589
+ [],
590
+ [],
591
+ [],
592
+ [],
593
+ [],
594
+ [],
595
+ [],
596
+ [],
597
+ []
598
+ )
599
+
600
+ # We compute track features
601
+ fmaps_pyramid = []
602
+ point_map_pyramid = []
603
+ track_feat_pyramid = []
604
+ track_feat_support_pyramid = []
605
+ track_feat3d_pyramid = []
606
+ track_feat_support3d_pyramid = []
607
+ track_depth_support_pyramid = []
608
+ track_point_map_pyramid = []
609
+ track_point_map_support_pyramid = []
610
+ fmaps_pyramid.append(fmaps)
611
+ metric_depth = metric_depth
612
+ point_map = point_map
613
+ metric_depth_align = F.interpolate(metric_depth, scale_factor=0.25, mode='nearest')
614
+ point_map_align = F.interpolate(point_map, scale_factor=0.25, mode='nearest')
615
+ point_map_pyramid.append(point_map_align.view(B, T, 3, point_map_align.shape[-2], point_map_align.shape[-1]))
616
+ for i in range(self.corr_levels - 1):
617
+ fmaps_ = fmaps.reshape(
618
+ B * T, self.latent_dim, fmaps.shape[-2], fmaps.shape[-1]
619
+ )
620
+ fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
621
+ fmaps = fmaps_.reshape(
622
+ B, T, self.latent_dim, fmaps_.shape[-2], fmaps_.shape[-1]
623
+ )
624
+ fmaps_pyramid.append(fmaps)
625
+ # downsample the depth
626
+ metric_depth_ = metric_depth_align.reshape(B*T,1,metric_depth_align.shape[-2],metric_depth_align.shape[-1])
627
+ metric_depth_ = F.interpolate(metric_depth_, scale_factor=0.5, mode='nearest')
628
+ metric_depth_align = metric_depth_.reshape(B,T,1,metric_depth_.shape[-2], metric_depth_.shape[-1])
629
+ # downsample the point map
630
+ point_map_ = point_map_align.reshape(B*T,3,point_map_align.shape[-2],point_map_align.shape[-1])
631
+ point_map_ = F.interpolate(point_map_, scale_factor=0.5, mode='nearest')
632
+ point_map_align = point_map_.reshape(B,T,3,point_map_.shape[-2], point_map_.shape[-1])
633
+ point_map_pyramid.append(point_map_align)
634
+
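# Pyramid layout (illustrative, assuming stride=4, corr_levels=3 and a 384x512 model resolution):
# level 0 is 96x128, level 1 is 48x64, level 2 is 24x32; the feature maps and the point maps are
# pooled in lockstep so the correlation lookups below see matching resolutions at every level.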
635
+ for i in range(self.corr_levels):
636
+ if cache is not None:
637
+ cache_N = cache["track_feat_pyramid"][i].shape[2]
638
+ track_feat_cached, track_feat_support_cached = cache["track_feat_pyramid"][i], cache["track_feat_support_pyramid"][i]
639
+ track_feat3d_cached, track_feat_support3d_cached = cache["track_feat3d_pyramid"][i], cache["track_feat_support3d_pyramid"][i]
640
+ track_point_map_cached, track_point_map_support_cached = self.norm_xyz(cache["track_point_map_pyramid"][i]), self.norm_xyz(cache["track_point_map_support_pyramid"][i])
641
+ queried_coords_new = queried_coords[:,cache_N:,:] / 2**i
642
+ queried_frames_new = queried_frames[:,cache_N:]
643
+ else:
644
+ queried_coords_new = queried_coords / 2**i
645
+ queried_frames_new = queried_frames
646
+ track_feat, track_feat_support = self.get_track_feat(
647
+ fmaps_pyramid[i],
648
+ queried_frames_new,
649
+ queried_coords_new,
650
+ support_radius=self.corr_radius,
651
+ )
652
+ # get 3d track feat
653
+ track_point_map, track_point_map_support = self.get_track_feat(
654
+ point_map_pyramid[i],
655
+ queried_frames_new,
656
+ queried_coords_new,
657
+ support_radius=self.corr3d_radius,
658
+ )
659
+ track_feat3d, track_feat_support3d = self.get_track_feat(
660
+ fmaps_pyramid[i],
661
+ queried_frames_new,
662
+ queried_coords_new,
663
+ support_radius=self.corr3d_radius,
664
+ )
665
+ if cache is not None:
666
+ track_feat = torch.cat([track_feat_cached, track_feat], dim=2)
667
+ track_point_map = torch.cat([track_point_map_cached, track_point_map], dim=2)
668
+ track_feat_support = torch.cat([track_feat_support_cached[:,0], track_feat_support], dim=2)
669
+ track_point_map_support = torch.cat([track_point_map_support_cached[:,0], track_point_map_support], dim=2)
670
+ track_feat3d = torch.cat([track_feat3d_cached, track_feat3d], dim=2)
671
+ track_feat_support3d = torch.cat([track_feat_support3d_cached[:,0], track_feat_support3d], dim=2)
672
+ track_feat_pyramid.append(track_feat.repeat(1, T, 1, 1))
673
+ track_feat_support_pyramid.append(track_feat_support.unsqueeze(1))
674
+ track_feat3d_pyramid.append(track_feat3d.repeat(1, T, 1, 1))
675
+ track_feat_support3d_pyramid.append(track_feat_support3d.unsqueeze(1))
676
+ track_point_map_pyramid.append(track_point_map.repeat(1, T, 1, 1))
677
+ track_point_map_support_pyramid.append(track_point_map_support.unsqueeze(1))
678
+
679
+
680
+ D_coords = 2
681
+ (coord_preds, coords_xyz_preds, vis_preds, confidence_preds,
682
+ dynamic_prob_preds, cam_preds, pts3d_cam_pred, world_tracks_pred,
683
+ world_tracks_refined_pred, point_map_preds, scale_ests, shift_ests) = (
684
+ [], [], [], [], [], [], [], [], [], [], [], []
685
+ )
686
+
687
+ c2w_ests = []
688
+ vis = torch.zeros((B, T, N), device=device).float()
689
+ confidence = torch.zeros((B, T, N), device=device).float()
690
+ dynamic_prob = torch.zeros((B, T, N), device=device).float()
691
+ pro_analysis_w = torch.zeros((B, T, N), device=device).float()
692
+
693
+ coords = queries_xyz[...,1:].clone()
694
+ coords[...,:2] /= self.stride
695
+ # coords[...,:2] = queried_coords.reshape(B, 1, N, 2).expand(B, T, N, 2).float()[...,:2]
696
+ # initialize the 3d points
697
+ coords_xyz = queries_point_map_.clone()
698
+
699
+ # if cache is not None:
700
+ # viser = Visualizer(save_dir=".", grayscale=True,
701
+ # fps=10, pad_value=50, tracks_leave_trace=0)
702
+ # coords_clone = coords.clone()
703
+ # coords_clone[...,:2] *= self.stride
704
+ # coords_clone[..., 0] /= self.factor_x
705
+ # coords_clone[..., 1] /= self.factor_y
706
+ # viser.visualize(video=video_vis, tracks=coords_clone[..., :2], filename="test")
707
+ # import pdb; pdb.set_trace()
708
+
709
+ if init_pose:
710
+ q_init_proj, q_xyz_world, q_xyz_cam = self.track_from_cam(queries_xyz, cam_gt,
711
+ intrs, rgbs=video_vis, visualize=False)
712
+ q_init_proj[..., 0] /= self.stride
713
+ q_init_proj[..., 1] /= self.stride
714
+
715
+ r = 2 * self.corr_radius + 1
716
+ r_depth = 2 * self.corr3d_radius + 1
717
+ anchor_loss = 0
718
+ # two current states
719
+ self.c2w_est_curr = torch.eye(4, device=device).repeat(B, T , 1, 1)
720
+ coords_proj_curr = coords.view(B * T, N, 3)[...,:2]
721
+ if init_pose:
722
+ self.c2w_est_curr = cam_gt.to(coords_proj_curr.device).to(coords_proj_curr.dtype)
723
+ sync_loss = 0
724
+ if stage == 2:
725
+ extra_sparse_tokens = self.scale_shift_tokens[:,:,None,:].repeat(B, 1, T, 1)
726
+ extra_dense_tokens = self.residual_embedding[None,None].repeat(B, T, 1, 1, 1)
727
+ xyz_pos_enc = posenc(point_map_pyramid[-2].permute(0,1,3,4,2), min_deg=0, max_deg=10).permute(0,1,4,2,3)
728
+ extra_dense_tokens = torch.cat([xyz_pos_enc, extra_dense_tokens, fmaps_pyramid[-2]], dim=2)
729
+ extra_dense_tokens = rearrange(extra_dense_tokens, 'b t c h w -> (b t) c h w')
730
+ extra_dense_tokens = self.dense_mlp(extra_dense_tokens)
731
+ extra_dense_tokens = rearrange(extra_dense_tokens, '(b t) c h w -> b t c h w', b=B, t=T)
732
+ else:
733
+ extra_sparse_tokens = None
734
+ extra_dense_tokens = None
735
+
736
+ scale_est, shift_est = torch.ones(B, T, 1, 1, device=device), torch.zeros(B, T, 1, 3, device=device)
737
+ residual_point = torch.zeros(B, T, 3, self.model_resolution[0]//self.stride,
738
+ self.model_resolution[1]//self.stride, device=device)
739
+
740
+ for it in range(iters):
741
+ # query points scale and shift
742
+ scale_est_query = torch.gather(scale_est, dim=1, index=queries[:,:,None,:1].long())
743
+ shift_est_query = torch.gather(shift_est, dim=1, index=queries[:,:,None,:1].long().repeat(1, 1, 1, 3))
744
+
745
+ coords = coords.detach() # B T N 3
746
+ coords_xyz = coords_xyz.detach()
747
+ vis = vis.detach()
748
+ confidence = confidence.detach()
749
+ dynamic_prob = dynamic_prob.detach()
750
+ pro_analysis_w = pro_analysis_w.detach()
751
+ coords_init = coords.view(B * T, N, 3)
752
+ coords_xyz_init = coords_xyz.view(B * T, N, 3)
753
+ corr_embs = []
754
+ corr_depth_embs = []
755
+ corr_feats = []
756
+ for i in range(self.corr_levels):
757
+ # K_level = int(32*0.8**(i))
758
+ K_level = 16
759
+ corr_feat = self.get_correlation_feat(
760
+ fmaps_pyramid[i], coords_init[...,:2] / 2**i
761
+ )
762
+ #NOTE: update the point map
763
+ residual_point_i = F.interpolate(residual_point.view(B*T,3,residual_point.shape[-2],residual_point.shape[-1]),
764
+ size=(point_map_pyramid[i].shape[-2], point_map_pyramid[i].shape[-1]), mode='nearest')
765
+ point_map_pyramid_i = (self.denorm_xyz(point_map_pyramid[i]) * scale_est[...,None]
766
+ + shift_est.permute(0,1,3,2)[...,None] + residual_point_i.view(B,T,3,point_map_pyramid[i].shape[-2], point_map_pyramid[i].shape[-1])).clone().detach()
767
+
768
+ corr_point_map = self.get_correlation_feat(
769
+ self.norm_xyz(point_map_pyramid_i), coords_proj_curr / 2**i, radius=self.corr3d_radius
770
+ )
771
+
772
+ corr_point_feat = self.get_correlation_feat(
773
+ fmaps_pyramid[i], coords_proj_curr / 2**i, radius=self.corr3d_radius
774
+ )
775
+ track_feat_support = (
776
+ track_feat_support_pyramid[i]
777
+ .view(B, 1, r, r, N, self.latent_dim)
778
+ .squeeze(1)
779
+ .permute(0, 3, 1, 2, 4)
780
+ )
781
+ track_feat_support3d = (
782
+ track_feat_support3d_pyramid[i]
783
+ .view(B, 1, r_depth, r_depth, N, self.latent_dim)
784
+ .squeeze(1)
785
+ .permute(0, 3, 1, 2, 4)
786
+ )
787
+ #NOTE: update the point map
788
+ track_point_map_support_pyramid_i = (self.denorm_xyz(track_point_map_support_pyramid[i]) * scale_est_query.view(B,1,1,N,1)
789
+ + shift_est_query.view(B,1,1,N,3)).clone().detach()
790
+
791
+ track_point_map_support = (
792
+ self.norm_xyz(track_point_map_support_pyramid_i)
793
+ .view(B, 1, r_depth, r_depth, N, 3)
794
+ .squeeze(1)
795
+ .permute(0, 3, 1, 2, 4)
796
+ )
797
+ corr_volume = torch.einsum(
798
+ "btnhwc,bnijc->btnhwij", corr_feat, track_feat_support
799
+ )
800
+ corr_emb = self.corr_mlp(corr_volume.reshape(B, T, N, r * r * r * r))
801
+
802
+ with torch.no_grad():
803
+ rel_pos_query_ = track_point_map_support - track_point_map_support[:,:,self.corr3d_radius,self.corr3d_radius,:][...,None,None,:]
804
+ rel_pos_target_ = corr_point_map - coords_xyz_init.view(B, T, N, 1, 1, 3)
805
+ # select the top 9 points
806
+ rel_pos_query_idx = rel_pos_query_.norm(dim=-1).view(B, N, -1).topk(K_level+1, dim=-1, largest=False)[1][...,1:,None]
807
+ rel_pos_target_idx = rel_pos_target_.norm(dim=-1).view(B, T, N, -1).topk(K_level+1, dim=-1, largest=False)[1][...,1:,None]
808
+ rel_pos_query_ = torch.gather(rel_pos_query_.view(B, N, -1, 3), dim=-2, index=rel_pos_query_idx.expand(B, N, K_level, 3))
809
+ rel_pos_target_ = torch.gather(rel_pos_target_.view(B, T, N, -1, 3), dim=-2, index=rel_pos_target_idx.expand(B, T, N, K_level, 3))
810
+ rel_pos_query = rel_pos_query_
811
+ rel_pos_target = rel_pos_target_
812
+ rel_pos_query = posenc(rel_pos_query, min_deg=0, max_deg=12)
813
+ rel_pos_target = posenc(rel_pos_target, min_deg=0, max_deg=12)
814
+ rel_pos_target = self.rel_pos_mlp(rel_pos_target)
815
+ rel_pos_query = self.rel_pos_mlp(rel_pos_query)
816
+ with torch.no_grad():
817
+ # integrate with feature
818
+ track_feat_support_ = rearrange(track_feat_support3d, 'b n r k c -> b n (r k) c', r=r_depth, k=r_depth, n=N, b=B)
819
+ track_feat_support_ = torch.gather(track_feat_support_, dim=-2, index=rel_pos_query_idx.expand(B, N, K_level, 128))
820
+ queried_feat = torch.cat([rel_pos_query, track_feat_support_], dim=-1)
821
+ corr_feat_ = rearrange(corr_point_feat, 'b t n r k c -> b t n (r k) c', t=T, n=N, b=B)
822
+ corr_feat_ = torch.gather(corr_feat_, dim=-2, index=rel_pos_target_idx.expand(B, T, N, K_level, 128))
823
+ target_feat = torch.cat([rel_pos_target, corr_feat_], dim=-1)
824
+
825
+ # 3d attention
826
+ queried_feat = self.corr_xyz_mlp(queried_feat)
827
+ target_feat = self.corr_xyz_mlp(target_feat)
828
+ queried_feat = repeat(queried_feat, 'b n k c -> b t n k c', k=K_level, t=T, n=N, b=B)
829
+ corr_depth_emb = self.corr_transformer[0](queried_feat.reshape(B*T*N,-1,128),
830
+ target_feat.reshape(B*T*N,-1,128),
831
+ target_rel_pos=rel_pos_target.reshape(B*T*N,-1,128))
832
+ corr_depth_emb = rearrange(corr_depth_emb, '(b t n) 1 c -> b t n c', t=T, n=N, b=B)
833
+ corr_depth_emb = self.corr_depth_mlp(corr_depth_emb)
834
+ valid_mask = self.denorm_xyz(coords_xyz_init).view(B, T, N, -1)[...,2:3] > 0
835
+ corr_depth_embs.append(corr_depth_emb*valid_mask)
836
+
837
+ corr_embs.append(corr_emb)
838
+ corr_embs = torch.cat(corr_embs, dim=-1)
839
+ corr_embs = corr_embs.view(B, T, N, corr_embs.shape[-1])
840
+ corr_depth_embs = torch.cat(corr_depth_embs, dim=-1)
841
+ corr_depth_embs = corr_depth_embs.view(B, T, N, corr_depth_embs.shape[-1])
842
+ transformer_input = [vis[..., None], confidence[..., None], corr_embs]
843
+ transformer_input_depth = [vis[..., None], confidence[..., None], corr_depth_embs]
844
+
845
+ rel_coords_forward = coords[:,:-1,...,:2] - coords[:,1:,...,:2]
846
+ rel_coords_backward = coords[:, 1:,...,:2] - coords[:, :-1,...,:2]
847
+
848
+ rel_xyz_forward = coords_xyz[:,:-1,...,:3] - coords_xyz[:,1:,...,:3]
849
+ rel_xyz_backward = coords_xyz[:, 1:,...,:3] - coords_xyz[:, :-1,...,:3]
850
+
851
+ rel_coords_forward = torch.nn.functional.pad(
852
+ rel_coords_forward, (0, 0, 0, 0, 0, 1)
853
+ )
854
+ rel_coords_backward = torch.nn.functional.pad(
855
+ rel_coords_backward, (0, 0, 0, 0, 1, 0)
856
+ )
857
+ rel_xyz_forward = torch.nn.functional.pad(
858
+ rel_xyz_forward, (0, 0, 0, 0, 0, 1)
859
+ )
860
+ rel_xyz_backward = torch.nn.functional.pad(
861
+ rel_xyz_backward, (0, 0, 0, 0, 1, 0)
862
+ )
863
+
864
+ scale = (
865
+ torch.tensor(
866
+ [self.model_resolution[1], self.model_resolution[0]],
867
+ device=coords.device,
868
+ )
869
+ / self.stride
870
+ )
871
+ rel_coords_forward = rel_coords_forward / scale
872
+ rel_coords_backward = rel_coords_backward / scale
873
+
874
+ rel_pos_emb_input = posenc(
875
+ torch.cat([rel_coords_forward, rel_coords_backward], dim=-1),
876
+ min_deg=0,
877
+ max_deg=10,
878
+ ) # batch, num_points, num_frames, 84
879
+ rel_xyz_emb_input = posenc(
880
+ torch.cat([rel_xyz_forward, rel_xyz_backward], dim=-1),
881
+ min_deg=0,
882
+ max_deg=10,
883
+ ) # batch, num_points, num_frames, 126
884
+ rel_xyz_emb_input = self.xyz_mlp(rel_xyz_emb_input)
885
+ transformer_input.append(rel_pos_emb_input)
886
+ transformer_input_depth.append(rel_xyz_emb_input)
887
+ # get the queries world
888
+ with torch.no_grad():
889
+ # update the query points with scale and shift
890
+ queries_xyz_i = queries_xyz.clone().detach()
891
+ queries_xyz_i[..., -1] = queries_xyz_i[..., -1] * scale_est_query.view(B,1,N) + shift_est_query.view(B,1,N,3)[...,2]
892
+ _, _, q_xyz_cam = self.track_from_cam(queries_xyz_i, self.c2w_est_curr,
893
+ intrs, rgbs=None, visualize=False)
894
+ q_xyz_cam = self.norm_xyz(q_xyz_cam)
895
+
896
+ query_t = queries[:,None,:,:1].repeat(B, T, 1, 1)
897
+ q_xyz_cam = torch.cat([query_t/T, q_xyz_cam], dim=-1)
898
+ T_all = torch.arange(T, device=device)[None,:,None,None].repeat(B, 1, N, 1)
899
+ current_xyzt = torch.cat([T_all/T, coords_xyz_init.view(B, T, N, -1)], dim=-1)
900
+ rel_pos_query_glob = q_xyz_cam - current_xyzt
901
+ # embed the confidence and dynamic probability
902
+ confidence_curr = torch.sigmoid(confidence[...,None])
903
+ dynamic_prob_curr = torch.sigmoid(dynamic_prob[...,None]).mean(dim=1, keepdim=True).repeat(1,T,1,1)
904
+ # embed the confidence and dynamic probability
905
+ rel_pos_query_glob = torch.cat([rel_pos_query_glob, confidence_curr, dynamic_prob_curr], dim=-1)
906
+ rel_pos_query_glob = posenc(rel_pos_query_glob, min_deg=0, max_deg=12)
907
+ transformer_input_depth.append(rel_pos_query_glob)
908
+
909
+ x = (
910
+ torch.cat(transformer_input, dim=-1)
911
+ .permute(0, 2, 1, 3)
912
+ .reshape(B * N, T, -1)
913
+ )
914
+ x_depth = (
915
+ torch.cat(transformer_input_depth, dim=-1)
916
+ .permute(0, 2, 1, 3)
917
+ .reshape(B * N, T, -1)
918
+ )
919
+ x_depth = self.proj_xyz_embed(x_depth)
920
+
921
+ x = x + self.interpolate_time_embed(x, T)
922
+ x = x.view(B, N, T, -1) # (B N) T D -> B N T D
923
+ x_depth = x_depth + self.interpolate_time_embed(x_depth, T)
924
+ x_depth = x_depth.view(B, N, T, -1) # (B N) T D -> B N T D
925
+ delta, delta_depth, delta_dynamic_prob, delta_pro_analysis_w, scale_shift_out, dense_res_out = self.updateformer3D(
926
+ x,
927
+ x_depth,
928
+ self.updateformer,
929
+ add_space_attn=add_space_attn,
930
+ extra_sparse_tokens=extra_sparse_tokens,
931
+ extra_dense_tokens=extra_dense_tokens,
932
+ )
933
+ # update the scale and shift
934
+ if scale_shift_out is not None:
935
+ extra_sparse_tokens = extra_sparse_tokens + scale_shift_out[...,:128]
936
+ scale_update = scale_shift_out[:,:1,:,-1].permute(0,2,1)[...,None]
937
+ shift_update = scale_shift_out[:,1:,:,-1].permute(0,2,1)[...,None]
938
+ scale_est = scale_est + scale_update
939
+ shift_est[...,2:] = shift_est[...,2:] + shift_update / 10
940
+ # dense tokens update
941
+ extra_dense_tokens = extra_dense_tokens + dense_res_out[:,:,-128:]
942
+ res_low = dense_res_out[:,:,:3]
943
+ up_mask = self.upsample_transformer(extra_dense_tokens.mean(dim=1), res_low)
944
+ up_mask = repeat(up_mask, "b k h w -> b s k h w", s=T)
945
+ up_mask = rearrange(up_mask, "b s c h w -> (b s) 1 c h w")
946
+ res_up = self.upsample_with_mask(
947
+ rearrange(res_low, 'b t c h w -> (b t) c h w'),
948
+ up_mask,
949
+ )
950
+ res_up = rearrange(res_up, "(b t) c h w -> b t c h w", b=B, t=T)
951
+ # residual_point = residual_point + res_up
952
+
953
+ delta_coords = delta[..., :D_coords].permute(0, 2, 1, 3)
954
+ delta_vis = delta[..., D_coords].permute(0, 2, 1)
955
+ delta_confidence = delta[..., D_coords + 1].permute(0, 2, 1)
956
+
957
+ vis = vis + delta_vis
958
+ confidence = confidence + delta_confidence
959
+ dynamic_prob = dynamic_prob + delta_dynamic_prob[...,0].permute(0, 2, 1)
960
+ pro_analysis_w = pro_analysis_w + delta_pro_analysis_w[...,0].permute(0, 2, 1)
961
+ # update the depth
962
+ vis_est = torch.sigmoid(vis.detach())
963
+
964
+ delta_xyz = delta_depth[...,:3].permute(0,2,1,3)
965
+ denorm_delta_depth = (self.denorm_xyz(coords_xyz+delta_xyz)-self.denorm_xyz(coords_xyz))[...,2:3]
966
+
967
+
968
+ delta_depth_ = denorm_delta_depth.detach()
969
+ delta_coords = torch.cat([delta_coords, delta_depth_],dim=-1)
970
+ coords = coords + delta_coords
971
+ coords_append = coords.clone()
972
+ coords_xyz_append = self.denorm_xyz(coords_xyz + delta_xyz).clone()
973
+
974
+ coords_append[..., :2] = coords_append[..., :2] * float(self.stride)
975
+ coords_append[..., 0] /= self.factor_x
976
+ coords_append[..., 1] /= self.factor_y
977
+
978
+ # get the camera pose from tracks
979
+ dynamic_prob_curr = torch.sigmoid(dynamic_prob.detach())*torch.sigmoid(pro_analysis_w)
980
+ mask_out = (coords_append[...,0]<W_)&(coords_append[...,0]>0)&(coords_append[...,1]<H_)&(coords_append[...,1]>0)
981
+ if query_no_BA:
982
+ dynamic_prob_curr[:,:,:ba_len] = torch.ones_like(dynamic_prob_curr[:,:,:ba_len])
983
+ point_map_org_i = scale_est.view(B*T,1,1,1)*point_map_org.clone().detach() + shift_est.view(B*T,3,1,1)
984
+ # depth_unproj = bilinear_sampler(point_map_org_i, coords_append[...,:2].view(B*T, N, 1, 2), mode="nearest")[:,2,:,0].detach()
985
+
986
+ depth_unproj_neg = self.get_correlation_feat(
987
+ point_map_org_i.view(B,T,3,point_map_org_i.shape[-2], point_map_org_i.shape[-1]),
988
+ coords_append[...,:2].view(B*T, N, 2), radius=self.corr3d_radius
989
+ )[..., 2]
990
+ depth_diff = (depth_unproj_neg.view(B,T,N,-1) - coords_append[...,2:]).abs()
991
+ idx_neg = torch.argmin(depth_diff, dim=-1)
992
+ depth_unproj = depth_unproj_neg.view(B,T,N,-1)[torch.arange(B)[:, None, None, None],
993
+ torch.arange(T)[None, :, None, None],
994
+ torch.arange(N)[None, None, :, None],
995
+ idx_neg.view(B,T,N,1)].view(B*T, N)
996
+
997
+ unc_unproj = bilinear_sampler(self.metric_unc_org, coords_append[...,:2].view(B*T, N, 1, 2), mode="nearest")[:,0,:,0].detach()
998
+ depth_unproj[unc_unproj<0.5] = 0.0
999
+
1000
+ # replace the depth for visible and solid points
1001
+ conf_est = torch.sigmoid(confidence.detach())
1002
+ replace_mask = (depth_unproj.view(B,T,N)>0.0) * (vis_est>0.5) # * (conf_est>0.5)
1003
+ #NOTE: way1: find the jitter points
1004
+ depth_rel = (depth_unproj.view(B, T, N) - queries_z.permute(0, 2, 1))
1005
+ depth_ddt1 = depth_rel[:, 1:, :] - depth_rel[:, :-1, :]
1006
+ depth_ddt2 = depth_rel[:, 2:, :] - 2 * depth_rel[:, 1:-1, :] + depth_rel[:, :-2, :]
1007
+ jitter_mask = torch.zeros_like(depth_rel, dtype=torch.bool)
1008
+ if depth_ddt2.abs().max()>0:
1009
+ thre2 = torch.quantile(depth_ddt2.abs()[depth_ddt2.abs()>0], replace_ratio)
1010
+ jitter_mask[:, 1:-1, :] = (depth_ddt2.abs() < thre2)
1011
+ thre1 = torch.quantile(depth_ddt1.abs()[depth_ddt1.abs()>0], replace_ratio)
1012
+ jitter_mask[:, :-1, :] *= (depth_ddt1.abs() < thre1)
1013
+ replace_mask = replace_mask * jitter_mask
1014
+
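# In words: depth_rel compares the unprojected depth against the query depth, depth_ddt1/depth_ddt2 are
# its first/second temporal differences, and only points whose differences fall below the replace_ratio
# quantile (i.e. temporally smooth, non-jittery tracks) keep replace_mask set, so only those have their
# predicted depth replaced by the unprojected depth below.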
1015
+ #NOTE: way2: top k topological change detection
1016
+ # coords_2d_lift = coords_append.clone()
1017
+ # coords_2d_lift[...,2][replace_mask] = depth_unproj.view(B,T,N)[replace_mask]
1018
+ # coords_2d_lift = self.cam_from_track(coords_2d_lift.clone(), intrs_org, only_cam_pts=True)
1019
+ # coords_2d_lift[~replace_mask] = coords_xyz_append[~replace_mask]
1020
+ # import pdb; pdb.set_trace()
1021
+ # jitter_mask = get_topo_mask(coords_xyz_append, coords_2d_lift, replace_ratio)
1022
+ # replace_mask = replace_mask * jitter_mask
1023
+
1024
+ # replace the depth
1025
+ if self.training:
1026
+ replace_mask = torch.zeros_like(replace_mask)
1027
+ coords_append[...,2][replace_mask] = depth_unproj.view(B,T,N)[replace_mask]
1028
+ coords_xyz_unproj = self.cam_from_track(coords_append.clone(), intrs_org, only_cam_pts=True)
1029
+ coords[...,2][replace_mask] = depth_unproj.view(B,T,N)[replace_mask]
1030
+ # coords_xyz_append[replace_mask] = coords_xyz_unproj[replace_mask]
1031
+ coords_xyz_append_refine = coords_xyz_append.clone()
1032
+ coords_xyz_append_refine[replace_mask] = coords_xyz_unproj[replace_mask]
1033
+
1034
+ c2w_traj_est, cam_pts_est, intrs_refine, coords_refine, world_tracks, world_tracks_refined, c2w_traj_init = self.cam_from_track(coords_append.clone(),
1035
+ intrs_org, dynamic_prob_curr, queries_z_unc, conf_est*vis_est*mask_out.float(),
1036
+ track_feat_concat=x_depth, tracks_xyz=coords_xyz_append_refine, init_pose=init_pose,
1037
+ query_pts=queries_xyz_i, fixed_cam=fixed_cam, depth_unproj=depth_unproj, cam_gt=cam_gt)
1038
+ intrs_org = intrs_refine.view(B, T, 3, 3).to(intrs_org.dtype)
1039
+
1040
+ # get the queries world
1041
+ self.c2w_est_curr = c2w_traj_est.detach()
1042
+
1043
+ # update coords and coords_append
1044
+ coords[..., 2] = (cam_pts_est)[...,2]
1045
+ coords_append[..., 2] = (cam_pts_est)[...,2]
1046
+
1047
+ # update coords_xyz_append
1048
+ # coords_xyz_append = cam_pts_est
1049
+ coords_xyz = self.norm_xyz(cam_pts_est)
1050
+
1051
+
1052
+ # proj
1053
+ coords_xyz_de = coords_xyz_append.clone()
1054
+ coords_xyz_de[coords_xyz_de[...,2].abs()<1e-6] = -1e-4
1055
+ mask_nan = coords_xyz_de[...,2].abs()<1e-2
1056
+ coords_proj = torch.einsum("btij,btnj->btni", intrs_org, coords_xyz_de/coords_xyz_de[...,2:3].abs())[...,:2]
1057
+ coords_proj[...,0] *= self.factor_x
1058
+ coords_proj[...,1] *= self.factor_y
1059
+ coords_proj[...,:2] /= float(self.stride)
1060
+ # make sure it is aligned with 2d tracking
1061
+ coords_proj_curr = coords[...,:2].view(B*T, N, 2).detach()
1062
+ vis_est = (vis_est>0.5).float()
1063
+ sync_loss += (vis_est.detach()[...,None]*(coords_proj_curr - coords_proj).norm(dim=-1, keepdim=True)*(1-mask_nan[...,None].float())).mean()
1064
+ # coords_proj_curr[~mask_nan.view(B*T, N)] = coords_proj.view(B*T, N, 2)[~mask_nan.view(B*T, N)].to(coords_proj_curr.dtype)
1065
+ # if torch.isnan(coords_proj_curr).sum()>0:
1066
+ # import pdb; pdb.set_trace()
1067
+
1068
+ if False:
1069
+ point_map_resize = point_map.clone().view(B, T, 3, H, W)
1070
+ update_input = torch.cat([point_map_resize, metric_unc.view(B,T,1,H,W)], dim=2)
1071
+ coords_append_resize = coords.clone().detach()
1072
+ coords_append_resize[..., :2] = coords_append_resize[..., :2] * float(self.stride)
1073
+ update_track_input = self.norm_xyz(cam_pts_est)*5
1074
+ update_track_input = torch.cat([update_track_input, vis_est[...,None]], dim=-1)
1075
+ update_track_input = posenc(update_track_input, min_deg=0, max_deg=12)
1076
+ update = self.update_pointmap.stablizer(update_input,
1077
+ update_track_input, coords_append_resize)#, imgs=video, vis_track=viser)
1078
+ #NOTE: update the point map
1079
+ point_map_resize += update
1080
+ point_map_refine_out = F.interpolate(point_map_resize.view(B*T, -1, H, W),
1081
+ size=(self.image_size[0].item(), self.image_size[1].item()), mode='nearest')
1082
+ point_map_refine_out = rearrange(point_map_refine_out, '(b t) c h w -> b t c h w', t=T, b=B)
1083
+ point_map_preds.append(self.denorm_xyz(point_map_refine_out))
1084
+ point_map_org = self.denorm_xyz(point_map_refine_out).view(B*T, 3, H_, W_)
1085
+
1086
+ # if torch.isnan(coords).sum()>0:
1087
+ # import pdb; pdb.set_trace()
1088
+ #NOTE: the 2d tracking + unproject depth
1089
+ fix_cam_est = coords_append.clone()
1090
+ fix_cam_est[...,2] = depth_unproj
1091
+ fix_cam_pts = self.cam_from_track(
1092
+ fix_cam_est, intrs_org, only_cam_pts=True
1093
+ )
1094
+
1095
+ coord_preds.append(coords_append)
1096
+ coords_xyz_preds.append(coords_xyz_append)
1097
+ vis_preds.append(vis)
1098
+ cam_preds.append(c2w_traj_init)
1099
+ pts3d_cam_pred.append(cam_pts_est)
1100
+ world_tracks_pred.append(world_tracks)
1101
+ world_tracks_refined_pred.append(world_tracks_refined)
1102
+ confidence_preds.append(confidence)
1103
+ dynamic_prob_preds.append(dynamic_prob)
1104
+ scale_ests.append(scale_est)
1105
+ shift_ests.append(shift_est)
1106
+
1107
+ if stage!=0:
1108
+ all_coords_predictions.append([coord for coord in coord_preds])
1109
+ all_coords_xyz_predictions.append([coord_xyz for coord_xyz in coords_xyz_preds])
1110
+ all_vis_predictions.append(vis_preds)
1111
+ all_confidence_predictions.append(confidence_preds)
1112
+ all_dynamic_prob_predictions.append(dynamic_prob_preds)
1113
+ all_cam_predictions.append([cam for cam in cam_preds])
1114
+ all_cam_pts_predictions.append([pts for pts in pts3d_cam_pred])
1115
+ all_world_tracks_predictions.append([world_tracks for world_tracks in world_tracks_pred])
1116
+ all_world_tracks_refined_predictions.append([world_tracks_refined for world_tracks_refined in world_tracks_refined_pred])
1117
+ all_scale_est.append(scale_ests)
1118
+ all_shift_est.append(shift_ests)
1119
+ if stage!=0:
1120
+ train_data = (
1121
+ all_coords_predictions,
1122
+ all_coords_xyz_predictions,
1123
+ all_vis_predictions,
1124
+ all_confidence_predictions,
1125
+ all_dynamic_prob_predictions,
1126
+ all_cam_predictions,
1127
+ all_cam_pts_predictions,
1128
+ all_world_tracks_predictions,
1129
+ all_world_tracks_refined_predictions,
1130
+ all_scale_est,
1131
+ all_shift_est,
1132
+ torch.ones_like(vis_preds[-1], device=vis_preds[-1].device),
1133
+ )
1134
+ else:
1135
+ train_data = None
1136
+ # resize back
1137
+ # init the trajectories by camera motion
1138
+
1139
+ # if cache is not None:
1140
+ # viser = Visualizer(save_dir=".", grayscale=True,
1141
+ # fps=10, pad_value=50, tracks_leave_trace=0)
1142
+ # coords_clone = coords.clone()
1143
+ # coords_clone[...,:2] *= self.stride
1144
+ # coords_clone[..., 0] /= self.factor_x
1145
+ # coords_clone[..., 1] /= self.factor_y
1146
+ # viser.visualize(video=video_vis, tracks=coords_clone[..., :2], filename="test_refine")
1147
+ # import pdb; pdb.set_trace()
1148
+
1149
+ if train_data is not None:
1150
+ # get the gt pts in the world coordinate
1151
+ self_supervised = False
1152
+ if (traj3d_gt is not None):
1153
+ if traj3d_gt[...,2].abs().max()>0:
1154
+ gt_cam_pts = self.cam_from_track(
1155
+ traj3d_gt, intrs_org, only_cam_pts=True
1156
+ )
1157
+ else:
1158
+ self_supervised = True
1159
+ else:
1160
+ self_supervised = True
1161
+
1162
+ if self_supervised:
1163
+ gt_cam_pts = self.cam_from_track(
1164
+ coord_preds[-1].detach(), intrs_org, only_cam_pts=True
1165
+ )
1166
+
1167
+ if cam_gt is not None:
1168
+ gt_world_pts = torch.einsum(
1169
+ "btij,btnj->btni",
1170
+ cam_gt[...,:3,:3],
1171
+ gt_cam_pts
1172
+ ) + cam_gt[...,None, :3,3] # B T N 3
1173
+ else:
1174
+ gt_world_pts = torch.einsum(
1175
+ "btij,btnj->btni",
1176
+ self.c2w_est_curr[...,:3,:3],
1177
+ gt_cam_pts
1178
+ ) + self.c2w_est_curr[...,None, :3,3] # B T N 3
1179
+ # update the query points with scale and shift
1180
+ queries_xyz_i = queries_xyz.clone().detach()
1181
+ queries_xyz_i[..., -1] = queries_xyz_i[..., -1] * scale_est_query.view(B,1,N) + shift_est_query.view(B,1,N,3)[...,2]
1182
+ q_static_proj, q_xyz_world, q_xyz_cam = self.track_from_cam(queries_xyz_i,
1183
+ self.c2w_est_curr,
1184
+ intrs, rgbs=video_vis, visualize=False)
1185
+
1186
+ q_static_proj[..., 0] /= self.factor_x
1187
+ q_static_proj[..., 1] /= self.factor_y
1188
+ cam_gt = self.c2w_est_curr[:,:,:3,:]
1189
+
1190
+ if traj3d_gt is not None:
1191
+ ret_loss = self.loss(train_data, traj3d_gt,
1192
+ vis_gt, None, cam_gt, queries_z_unc,
1193
+ q_xyz_world, q_static_proj, anchor_loss=anchor_loss, fix_cam_pts=fix_cam_pts, video_vis=video_vis, stage=stage,
1194
+ gt_world_pts=gt_world_pts, mask_traj_gt=mask_traj_gt, intrs=intrs_org, custom_vid=custom_vid, valid_only=valid_only,
1195
+ c2w_ests=c2w_ests, point_map_preds=point_map_preds, points_map_gt=points_map_gt, metric_unc=metric_unc, scale_est=scale_est,
1196
+ shift_est=shift_est, point_map_org_train=point_map_org_train)
1197
+ else:
1198
+ ret_loss = self.loss(train_data, traj3d_gt,
1199
+ vis_gt, None, cam_gt, queries_z_unc,
1200
+ q_xyz_world, q_static_proj, anchor_loss=anchor_loss, fix_cam_pts=fix_cam_pts, video_vis=video_vis, stage=stage,
1201
+ gt_world_pts=gt_world_pts, mask_traj_gt=mask_traj_gt, intrs=intrs_org, custom_vid=custom_vid, valid_only=valid_only,
1202
+ c2w_ests=c2w_ests, point_map_preds=point_map_preds, points_map_gt=points_map_gt, metric_unc=metric_unc, scale_est=scale_est,
1203
+ shift_est=shift_est, point_map_org_train=point_map_org_train)
1204
+ if custom_vid:
1205
+ sync_loss = 0*sync_loss
1206
+ if (sync_loss > 50) and (stage==1):
1207
+ ret_loss = (0*sync_loss, 0*sync_loss, 0*sync_loss, 0*sync_loss, 0*sync_loss, 0*sync_loss, 0*sync_loss) + (0*sync_loss,)
1208
+ else:
1209
+ ret_loss = ret_loss+(10*sync_loss,)
1210
+
1211
+ else:
1212
+ ret_loss = None
1213
+
1214
+ color_pts = torch.cat([pts3d_cam_pred[-1], queries_rgb[:,None].repeat(1, T, 1, 1)], dim=-1)
1215
+
1216
+ #TODO: For evaluation. We found our model has some bias on invisible points after training (to be fixed).
1217
+ vis_pred_out = torch.sigmoid(vis_preds[-1]) + 0.2
1218
+
1219
+ ret = {"preds": coord_preds[-1], "vis_pred": vis_pred_out,
1220
+ "conf_pred": torch.sigmoid(confidence_preds[-1]),
1221
+ "cam_pred": self.c2w_est_curr,"loss": ret_loss}
1222
+
1223
+ cache = {
1224
+ "fmaps": fmaps_org[0].detach(),
1225
+ "track_feat_support3d_pyramid": [track_feat_support3d_pyramid[i].detach() for i in range(len(track_feat_support3d_pyramid))],
1226
+ "track_point_map_support_pyramid": [self.denorm_xyz(track_point_map_support_pyramid[i].detach()) for i in range(len(track_point_map_support_pyramid))],
1227
+ "track_feat3d_pyramid": [track_feat3d_pyramid[i].detach() for i in range(len(track_feat3d_pyramid))],
1228
+ "track_point_map_pyramid": [self.denorm_xyz(track_point_map_pyramid[i].detach()) for i in range(len(track_point_map_pyramid))],
1229
+ "track_feat_pyramid": [track_feat_pyramid[i].detach() for i in range(len(track_feat_pyramid))],
1230
+ "track_feat_support_pyramid": [track_feat_support_pyramid[i].detach() for i in range(len(track_feat_support_pyramid))],
1231
+ "track2d_pred_cache": coord_preds[-1][0].clone().detach(),
1232
+ "track3d_pred_cache": pts3d_cam_pred[-1][0].clone().detach(),
1233
+ }
1234
+ #NOTE: update the point map
1235
+ point_map_org = scale_est.view(B*T,1,1,1)*point_map_org + shift_est.view(B*T,3,1,1)
1236
+ point_map_org_refined = point_map_org
1237
+ return ret, torch.sigmoid(dynamic_prob_preds[-1])*queries_z_unc[:,None,:,0], coord_preds[-1], color_pts, intrs_org, point_map_org_refined, cache
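A minimal sketch (not from the commit, toy shapes assumed) of the final point-map update just above: the predicted per-frame scalar scale and per-frame 3-vector shift are simply broadcast over the dense point map.

import torch

B, T, H, W = 1, 2, 4, 4
point_map = torch.randn(B * T, 3, H, W)        # per-frame camera-space point map
scale_est = torch.rand(B * T)                  # one scalar scale per frame
shift_est = torch.randn(B * T, 3)              # one XYZ shift per frame
aligned = scale_est.view(B * T, 1, 1, 1) * point_map + shift_est.view(B * T, 3, 1, 1)
assert aligned.shape == point_map.shape        # (B*T, 3, H, W), metrically re-aligned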
1238
+
1239
+ def track_d2_loss(self, tracks3d, stride=[1,2,3], dyn_prob=None, mask=None):
1240
+ """
1241
+ tracks3d: B T N 3
1242
+ dyn_prob: B T N 1
1243
+ """
1244
+ r = 0.8
1245
+ t_diff_total = 0.0
1246
+ for i, s_ in enumerate(stride):
1247
+ w_ = r**i
1248
+ tracks3d_stride = tracks3d[:, ::s_, :, :] # B T//s_ N 3
1249
+ t_diff_tracks3d = (tracks3d_stride[:, 1:, :, :] - tracks3d_stride[:, :-1, :, :])
1250
+ t_diff2 = (t_diff_tracks3d[:, 1:, :, :] - t_diff_tracks3d[:, :-1, :, :])
1251
+ t_diff_total += w_*(t_diff2.norm(dim=-1).mean())
1252
+
1253
+ return 1e2*t_diff_total
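A sketch (editorial, toy shapes assumed) of what track_d2_loss penalizes: the second temporal difference of each 3D track, i.e. its acceleration, so a constant-velocity track incurs zero penalty.

import torch

T = 6
t = torch.arange(T, dtype=torch.float32).view(1, T, 1, 1)
tracks = t * torch.tensor([1.0, 0.0, 0.0])     # B T N 3, x advances 1 unit per frame
d1 = tracks[:, 1:] - tracks[:, :-1]            # first difference (velocity)
d2 = d1[:, 1:] - d1[:, :-1]                    # second difference (acceleration)
print(d2.norm(dim=-1).mean())                  # tensor(0.) -> no smoothness penalty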
1254
+
1255
+ def loss(self, train_data, traj3d_gt=None,
1256
+ vis_gt=None, static_tracks_gt=None, cam_gt=None,
1257
+ z_unc=None, q_xyz_world=None, q_static_proj=None, anchor_loss=0, valid_only=False,
1258
+ gt_world_pts=None, mask_traj_gt=None, intrs=None, c2w_ests=None, custom_vid=False, video_vis=None, stage=0,
1259
+ fix_cam_pts=None, point_map_preds=None, points_map_gt=None, metric_unc=None, scale_est=None, shift_est=None, point_map_org_train=None):
1260
+ """
1261
+ Compute the losses for the 3D tracking problem.
1262
+
1263
+ """
1264
+
1265
+ (
1266
+ coord_predictions, coords_xyz_predictions, vis_predictions, confidence_predictions,
1267
+ dynamic_prob_predictions, camera_predictions, cam_pts_predictions, world_tracks_predictions,
1268
+ world_tracks_refined_predictions, scale_ests, shift_ests, valid_mask
1269
+ ) = train_data
1270
+ B, T, _, _ = cam_gt.shape
1271
+ if (stage == 2) and self.training:
1272
+ # get the scale and shift gt
1273
+ self.metric_unc_org[:,0] = self.metric_unc_org[:,0] * (points_map_gt.norm(dim=-1)>0).float() * (self.metric_unc_org[:,0]>0.5).float()
1274
+ if not (self.scale_gt==torch.ones(B*T).to(self.scale_gt.device)).all():
1275
+ scale_gt, shift_gt = self.scale_gt, self.shift_gt
1276
+ scale_re = scale_gt[:4].mean()
1277
+ scale_loss = 0.0
1278
+ shift_loss = 0.0
1279
+ for i_scale in range(len(scale_ests[0])):
1280
+ scale_loss += 0.8**(len(scale_ests[0])-i_scale-1)*10*(scale_gt - scale_re*scale_ests[0][i_scale].view(-1)).abs().mean()
1281
+ shift_loss += 0.8**(len(shift_ests[0])-i_scale-1)*10*(shift_gt - scale_re*shift_ests[0][i_scale].view(-1,3)).abs().mean()
1282
+ else:
1283
+ scale_loss = 0.0 * scale_ests[0][0].mean()
1284
+ shift_loss = 0.0 * shift_ests[0][0].mean()
1285
+ scale_re = 1.0
1286
+ else:
1287
+ scale_loss = 0.0
1288
+ shift_loss = 0.0
1289
+
1290
+ if len(point_map_preds)>0:
1291
+ point_map_loss = 0.0
1292
+ for i in range(len(point_map_preds)):
1293
+ point_map_preds_i = point_map_preds[i]
1294
+ point_map_preds_i = rearrange(point_map_preds_i, 'b t c h w -> (b t) c h w', b=B, t=T)
1295
+ base_loss = ((self.pred_points - points_map_gt).norm(dim=-1) * self.metric_unc_org[:,0]).mean()
1296
+ point_map_loss_i = ((point_map_preds_i - points_map_gt.permute(0,3,1,2)).norm(dim=1) * self.metric_unc_org[:,0]).mean()
1297
+ point_map_loss += point_map_loss_i
1298
+ # point_map_loss += ((point_map_org_train - points_map_gt.permute(0,3,1,2)).norm(dim=1) * self.metric_unc_org[:,0]).mean()
1299
+ if scale_loss == 0.0:
1300
+ point_map_loss = 0*point_map_preds_i.sum()
1301
+ else:
1302
+ point_map_loss = 0.0
1303
+
1304
+ # camera loss
1305
+ cam_loss = 0.0
1306
+ dyn_loss = 0.0
1307
+ N_gt = gt_world_pts.shape[2]
1308
+
1309
+ # self-supervised dynamic mask: points whose 2D tracks deviate from the static-point projection under the estimated camera motion
1310
+ H_org, W_org = self.image_size[0], self.image_size[1]
1311
+ q_static_proj[torch.isnan(q_static_proj)] = -200
1312
+ in_view_mask = (q_static_proj[...,0]>0) & (q_static_proj[...,0]<W_org) & (q_static_proj[...,1]>0) & (q_static_proj[...,1]<H_org)
1313
+ dyn_mask_final = (((coord_predictions[0][-1] - q_static_proj))[...,:2].norm(dim=-1) * in_view_mask)
1314
+ dyn_mask_final = dyn_mask_final.sum(dim=1) / (in_view_mask.sum(dim=1) + 1e-2)
1315
+ dyn_mask_final = dyn_mask_final > 6
1316
+
1317
+ for iter_, cam_pred_i in enumerate(camera_predictions[0]):
1318
+ # points loss
1319
+ pts_i_world = world_tracks_predictions[0][iter_].view(B, T, -1, 3)
1320
+
1321
+ coords_xyz_i_world = coords_xyz_predictions[0][iter_].view(B, T, -1, 3)
1322
+ coords_i = coord_predictions[0][iter_].view(B, T, -1, 3)[..., :2]
1323
+ pts_i_world_refined = torch.einsum(
1324
+ "btij,btnj->btni",
1325
+ cam_gt[...,:3,:3],
1326
+ coords_xyz_i_world
1327
+ ) + cam_gt[...,None, :3,3] # B T N 3
1328
+
1329
+ # pts_i_world_refined = world_tracks_refined_predictions[0][iter_].view(B, T, -1, 3)
1330
+ pts_world = pts_i_world
1331
+ dyn_prob_i_logits = dynamic_prob_predictions[0][iter_].mean(dim=1)
1332
+ dyn_prob_i = torch.sigmoid(dyn_prob_i_logits).detach()
1333
+ mask = pts_world.norm(dim=-1) < 200
1334
+
1335
+ # general
1336
+ vis_i_logits = vis_predictions[0][iter_]
1337
+ vis_i = torch.sigmoid(vis_i_logits).detach()
1338
+ if mask_traj_gt is not None:
1339
+ try:
1340
+ N_gt_mask = mask_traj_gt.shape[1]
1341
+ align_loss = (gt_world_pts - q_xyz_world[:,None,:N_gt,:,]).norm(dim=-1)[...,:N_gt_mask] * (mask_traj_gt.permute(0,2,1))
1342
+ visb_traj = (align_loss * vis_i[:,:,:N_gt_mask]).sum(dim=1)/vis_i[:,:,:N_gt_mask].sum(dim=1)
1343
+ except:
1344
+ import pdb; pdb.set_trace()
1345
+ else:
1346
+ visb_traj = ((gt_world_pts - q_xyz_world[:,None,:N_gt,:,]).norm(dim=-1) * vis_i[:,:,:N_gt]).sum(dim=1)/vis_i[:,:,:N_gt].sum(dim=1)
1347
+
1348
+ # pts_loss = ((q_xyz_world[:,None,...] - pts_world)[:,:,:N_gt,:].norm(dim=-1)*(1-dyn_prob_i[:,None,:N_gt])) # - 0.1*(1-dyn_prob_i[:,None,:N_gt]).log()
1349
+ pts_loss = 0
1350
+ static_mask = ~dyn_mask_final # more strict for static points
1351
+ dyn_mask = dyn_mask_final
1352
+ pts_loss_refined = ((q_xyz_world[:,None,...] - pts_i_world_refined).norm(dim=-1)*static_mask[:,None,:]).sum()/static_mask.sum() # - 0.1*(1-dyn_prob_i[:,None,:N_gt]).log()
1353
+ vis_logits_final = vis_predictions[0][-1].detach()
1354
+ vis_final = torch.sigmoid(vis_logits_final)+0.2 > 0.5 # lenient visibility test (effective threshold 0.3), matching the +0.2 evaluation bias above
1355
+ dyn_vis_mask = dyn_mask*vis_final * (fix_cam_pts[...,2] > 0.1)
1356
+ pts_loss_dynamic = ((fix_cam_pts - coords_xyz_i_world).norm(dim=-1)*dyn_vis_mask[:,None,:]).sum()/dyn_vis_mask.sum()
1357
+
1358
+ # pts_loss_refined = 0
1359
+ if traj3d_gt is not None:
1360
+ tap_traj = (gt_world_pts[:,:-1,...] - gt_world_pts[:,1:,...]).norm(dim=-1).sum(dim=1)[...,:N_gt_mask]
1361
+ mask_dyn = tap_traj>0.5
1362
+ if mask_traj_gt.sum() > 0:
1363
+ dyn_loss_i = 20*balanced_binary_cross_entropy(dyn_prob_i_logits[:,:N_gt_mask][mask_traj_gt.squeeze(-1)],
1364
+ mask_dyn.float()[mask_traj_gt.squeeze(-1)])
1365
+ else:
1366
+ dyn_loss_i = 0
1367
+ else:
1368
+ dyn_loss_i = 10*balanced_binary_cross_entropy(dyn_prob_i_logits, dyn_mask_final.float())
1369
+
1370
+ dyn_loss += dyn_loss_i
1371
+
1372
+ # visible loss for out of view points
1373
+ vis_i_train = torch.sigmoid(vis_i_logits)
1374
+ out_of_view_mask = (coords_i[...,0]<0)|(coords_i[...,0]>self.image_size[1])|(coords_i[...,1]<0)|(coords_i[...,1]>self.image_size[0])
1375
+ vis_loss_out_of_view = vis_i_train[out_of_view_mask].sum() / out_of_view_mask.sum()
1376
+
1377
+
1378
+ if traj3d_gt is not None:
1379
+ world_pts_loss = (((gt_world_pts - pts_i_world_refined[:,:,:gt_world_pts.shape[2],...]).norm(dim=-1))[...,:N_gt_mask] * mask_traj_gt.permute(0,2,1)).sum() / mask_traj_gt.sum()
1380
+ # world_pts_init_loss = (((gt_world_pts - pts_i_world[:,:,:gt_world_pts.shape[2],...]).norm(dim=-1))[...,:N_gt_mask] * mask_traj_gt.permute(0,2,1)).sum() / mask_traj_gt.sum()
1381
+ else:
1382
+ world_pts_loss = 0
1383
+
1384
+ # cam regress
1385
+ t_err = (cam_pred_i[...,:3,3] - cam_gt[...,:3,3]).norm(dim=-1).sum()
1386
+
1387
+ # xyz loss
1388
+ in_view_mask_large = (q_static_proj[...,0]>-50) & (q_static_proj[...,0]<W_org+50) & (q_static_proj[...,1]>-50) & (q_static_proj[...,1]<H_org+50)
1389
+ static_vis_mask = (q_static_proj[...,2]>0.05).float() * static_mask[:,None,:] * in_view_mask_large
1390
+ xyz_loss = ((coord_predictions[0][iter_] - q_static_proj)).abs()[...,:2].norm(dim=-1)*static_vis_mask
1391
+ xyz_loss = xyz_loss.sum()/static_vis_mask.sum()
1392
+
1393
+ # visualize the q_static_proj
1394
+ # viser = Visualizer(save_dir=".", grayscale=True,
1395
+ # fps=10, pad_value=50, tracks_leave_trace=0)
1396
+ # video_vis_ = F.interpolate(video_vis.view(B*T,3,video_vis.shape[-2],video_vis.shape[-1]), (H_org, W_org), mode='bilinear', align_corners=False)
1397
+ # viser.visualize(video=video_vis_, tracks=q_static_proj[:,:,dyn_mask_final.squeeze(), :2], filename="test")
1398
+ # viser.visualize(video=video_vis_, tracks=coord_predictions[0][-1][:,:,dyn_mask_final.squeeze(), :2], filename="test_pred")
1399
+ # import pdb; pdb.set_trace()
1400
+
1401
+ # temporal loss
1402
+ t_loss = self.track_d2_loss(pts_i_world_refined, [1,2,3], dyn_prob=dyn_prob_i, mask=mask)
1403
+ R_err = (cam_pred_i[...,:3,:3] - cam_gt[...,:3,:3]).abs().sum(dim=-1).mean()
1404
+ if self.stage == 1:
1405
+ cam_loss += 0.8**(len(camera_predictions[0])-iter_-1)*(10*t_err + 500*R_err + 20*pts_loss_refined + 10*xyz_loss + 20*pts_loss_dynamic + 10*vis_loss_out_of_view) #+ 5*(pts_loss + pts_loss_refined + world_pts_loss) + t_loss)
1406
+ elif self.stage == 3:
1407
+ cam_loss += 0.8**(len(camera_predictions[0])-iter_-1)*(10*t_err + 500*R_err + 10*vis_loss_out_of_view) #+ 5*(pts_loss + pts_loss_refined + world_pts_loss) + t_loss)
1408
+ else:
1409
+ cam_loss += 0*vis_loss_out_of_view
1410
+
1411
+ if (cam_loss > 20000)|(torch.isnan(cam_loss)):
1412
+ cam_loss = torch.zeros_like(cam_loss)
1413
+
1414
+
1415
+ if traj3d_gt is None:
1416
+ # ================ Condition 1: The self-supervised signals from the self-consistency ===================
1417
+ return cam_loss, train_data[0][0][0].mean()*0, dyn_loss, train_data[0][0][0].mean()*0, point_map_loss, scale_loss, shift_loss
1418
+
1419
+
1420
+ # ================ Condition 2: The supervision signal given by the ground truth trajectories ===================
1421
+ if (
1422
+ (torch.isnan(traj3d_gt).any()
1423
+ or traj3d_gt.abs().max() > 2000) and (custom_vid==False)
1424
+ ):
1425
+ return cam_loss, train_data[0][0][0].mean()*0, dyn_loss, train_data[0][0][0].mean()*0, point_map_loss, scale_loss, shift_loss
1426
+
1427
+
1428
+ vis_gts = [vis_gt.float()]
1429
+ invis_gts = [1-vis_gt.float()]
1430
+ traj_gts = [traj3d_gt]
1431
+ valids_gts = [valid_mask]
1432
+ seq_loss_all = sequence_loss(
1433
+ coord_predictions,
1434
+ traj_gts,
1435
+ valids_gts,
1436
+ vis=vis_gts,
1437
+ gamma=0.8,
1438
+ add_huber_loss=False,
1439
+ loss_only_for_visible=False if custom_vid==False else True,
1440
+ z_unc=z_unc,
1441
+ mask_traj_gt=mask_traj_gt
1442
+ )
1443
+
1444
+ confidence_loss = sequence_prob_loss(
1445
+ coord_predictions, confidence_predictions, traj_gts, vis_gts
1446
+ )
1447
+
1448
+ seq_loss_xyz = sequence_loss_xyz(
1449
+ coords_xyz_predictions,
1450
+ traj_gts,
1451
+ valids_gts,
1452
+ intrs=intrs,
1453
+ vis=vis_gts,
1454
+ gamma=0.8,
1455
+ add_huber_loss=False,
1456
+ loss_only_for_visible=False,
1457
+ mask_traj_gt=mask_traj_gt
1458
+ )
1459
+
1460
+ # filter the blinking points
1461
+ mask_vis = vis_gts[0].clone() # B T N
1462
+ mask_vis[mask_vis==0] = -1
1463
+ blink_mask = mask_vis[:,:-1,:] * mask_vis[:,1:,:] # +1 where visibility is unchanged across consecutive frames, -1 at a flip ('blink'); B (T-1) N
1464
+ mask_vis[:,:-1,:], mask_vis[:,-1,:] = (blink_mask == 1), 0
1465
+
1466
+ vis_loss = sequence_BCE_loss(vis_predictions, vis_gts, mask=[mask_vis])
1467
+
1468
+ track_loss_out = (seq_loss_all+2*seq_loss_xyz + cam_loss)
1469
+ if valid_only:
1470
+ vis_loss = 0.0*vis_loss
1471
+ if custom_vid:
1472
+ return seq_loss_all, 0.0*seq_loss_all, 0.0*seq_loss_all, 10*vis_loss, 0.0*seq_loss_all, 0.0*seq_loss_all, 0.0*seq_loss_all
1473
+
1474
+ return track_loss_out, confidence_loss, dyn_loss, 10*vis_loss, point_map_loss, scale_loss, shift_loss
1475
+
1476
+
1477
+
1478
+
models/SpaTrackV2/models/tracker3D/co_tracker/cotracker_base.py ADDED
@@ -0,0 +1,418 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from models.SpaTrackV2.utils.model_utils import sample_features5d, bilinear_sampler
11
+
12
+ from models.SpaTrackV2.models.tracker3D.co_tracker.utils import (
13
+ Mlp, BasicEncoder, EfficientUpdateFormer
14
+ )
15
+
16
+ torch.manual_seed(0)
17
+
18
+
19
+ def get_1d_sincos_pos_embed_from_grid(
20
+ embed_dim: int, pos: torch.Tensor
21
+ ) -> torch.Tensor:
22
+ """
23
+ This function generates a 1D positional embedding from a given grid using sine and cosine functions.
24
+
25
+ Args:
26
+ - embed_dim: The embedding dimension.
27
+ - pos: The position to generate the embedding from.
28
+
29
+ Returns:
30
+ - emb: The generated 1D positional embedding.
31
+ """
32
+ assert embed_dim % 2 == 0
33
+ omega = torch.arange(embed_dim // 2, dtype=torch.double)
34
+ omega /= embed_dim / 2.0
35
+ omega = 1.0 / 10000**omega # (D/2,)
36
+
37
+ pos = pos.reshape(-1) # (M,)
38
+ out = torch.einsum("m,d->md", pos, omega) # (M, D/2), outer product
39
+
40
+ emb_sin = torch.sin(out) # (M, D/2)
41
+ emb_cos = torch.cos(out) # (M, D/2)
42
+
43
+ emb = torch.cat([emb_sin, emb_cos], dim=1) # (M, D)
44
+ return emb[None].float()
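A quick shape check (editorial sketch) for the sinusoidal embedding above: an embedding dimension D and M positions yield a (1, M, D) tensor, half sine and half cosine frequencies.

import torch
emb = get_1d_sincos_pos_embed_from_grid(64, torch.arange(8).float())
print(emb.shape)   # torch.Size([1, 8, 64])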
45
+
46
+ def posenc(x, min_deg, max_deg):
47
+ """Cat x with a positional encoding of x with scales 2^[min_deg, max_deg-1].
48
+ Instead of computing [sin(x), cos(x)], we use the trig identity
49
+ cos(x) = sin(x + pi/2) and do one vectorized call to sin([x, x+pi/2]).
50
+ Args:
51
+ x: torch.Tensor, variables to be encoded. Note that x should be in [-pi, pi].
52
+ min_deg: int, the minimum (inclusive) degree of the encoding.
53
+ max_deg: int, the maximum (exclusive) degree of the encoding.
54
+ legacy_posenc_order: bool, keep the same ordering as the original tf code.
55
+ Returns:
56
+ encoded: torch.Tensor, encoded variables.
57
+ """
58
+ if min_deg == max_deg:
59
+ return x
60
+ scales = torch.tensor(
61
+ [2**i for i in range(min_deg, max_deg)], dtype=x.dtype, device=x.device
62
+ )
63
+
64
+ xb = (x[..., None, :] * scales[:, None]).reshape(list(x.shape[:-1]) + [-1])
65
+ four_feat = torch.sin(torch.cat([xb, xb + 0.5 * torch.pi], dim=-1))
66
+ return torch.cat([x] + [four_feat], dim=-1)
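Editorial note: posenc concatenates the raw input with sin/cos features at scales 2^[min_deg, max_deg-1], so the output width is d * (1 + 2 * (max_deg - min_deg)). For the 4-channel relative-coordinate input used later with min_deg=0, max_deg=10 this gives 4 * 21 = 84, matching the "84" noted in the transformer input. A minimal check with toy shapes:

import torch
x = torch.zeros(2, 5, 4)            # (batch, tokens, 4 channels)
print(posenc(x, 0, 10).shape)       # torch.Size([2, 5, 84])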
67
+
68
+ class CoTrackerThreeBase(nn.Module):
69
+ def __init__(
70
+ self,
71
+ window_len=8,
72
+ stride=4,
73
+ corr_radius=3,
74
+ corr_levels=4,
75
+ num_virtual_tracks=64,
76
+ model_resolution=(384, 512),
77
+ add_space_attn=True,
78
+ linear_layer_for_vis_conf=True,
79
+ ):
80
+ super(CoTrackerThreeBase, self).__init__()
81
+ self.window_len = window_len
82
+ self.stride = stride
83
+ self.corr_radius = corr_radius
84
+ self.corr_levels = corr_levels
85
+ self.hidden_dim = 256
86
+ self.latent_dim = 128
87
+
88
+ self.linear_layer_for_vis_conf = linear_layer_for_vis_conf
89
+ self.fnet = BasicEncoder(input_dim=3, output_dim=self.latent_dim, stride=stride)
90
+
91
+ highres_dim = 128
92
+ lowres_dim = 256
93
+
94
+ self.num_virtual_tracks = num_virtual_tracks
95
+ self.model_resolution = model_resolution
96
+
97
+ self.input_dim = 1110
98
+
99
+ self.updateformer = EfficientUpdateFormer(
100
+ space_depth=3,
101
+ time_depth=3,
102
+ input_dim=self.input_dim,
103
+ hidden_size=384,
104
+ output_dim=4,
105
+ mlp_ratio=4.0,
106
+ num_virtual_tracks=num_virtual_tracks,
107
+ add_space_attn=add_space_attn,
108
+ linear_layer_for_vis_conf=linear_layer_for_vis_conf,
109
+ )
110
+ self.corr_mlp = Mlp(in_features=49 * 49, hidden_features=384, out_features=256)
111
+
112
+ time_grid = torch.linspace(0, window_len - 1, window_len).reshape(
113
+ 1, window_len, 1
114
+ )
115
+
116
+ self.register_buffer(
117
+ "time_emb", get_1d_sincos_pos_embed_from_grid(self.input_dim, time_grid[0])
118
+ )
119
+
120
+ def get_support_points(self, coords, r, reshape_back=True):
121
+ B, _, N, _ = coords.shape
122
+ device = coords.device
123
+ centroid_lvl = coords.reshape(B, N, 1, 1, 3)
124
+
125
+ dx = torch.linspace(-r, r, 2 * r + 1, device=device)
126
+ dy = torch.linspace(-r, r, 2 * r + 1, device=device)
127
+
128
+ xgrid, ygrid = torch.meshgrid(dy, dx, indexing="ij")
129
+ zgrid = torch.zeros_like(xgrid, device=device)
130
+ delta = torch.stack([zgrid, xgrid, ygrid], axis=-1)
131
+ delta_lvl = delta.view(1, 1, 2 * r + 1, 2 * r + 1, 3)
132
+ coords_lvl = centroid_lvl + delta_lvl
133
+
134
+ if reshape_back:
135
+ return coords_lvl.reshape(B, N, (2 * r + 1) ** 2, 3).permute(0, 2, 1, 3)
136
+ else:
137
+ return coords_lvl
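Editorial sketch of the support grid built above: each query is surrounded by a (2r+1) x (2r+1) patch of (t, x, y) offsets with zero temporal offset (r=3, the default corr_radius, is assumed here).

import torch
r = 3
dx = torch.linspace(-r, r, 2 * r + 1)
xgrid, ygrid = torch.meshgrid(dx, dx, indexing="ij")
delta = torch.stack([torch.zeros_like(xgrid), xgrid, ygrid], dim=-1)
print(delta.view(-1, 3).shape)      # torch.Size([49, 3]) -> 49 support offsets per query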
138
+
139
+ def get_track_feat(self, fmaps, queried_frames, queried_coords, support_radius=0):
140
+
141
+ sample_frames = queried_frames[:, None, :, None]
142
+ sample_coords = torch.cat(
143
+ [
144
+ sample_frames,
145
+ queried_coords[:, None],
146
+ ],
147
+ dim=-1,
148
+ )
149
+ support_points = self.get_support_points(sample_coords, support_radius)
150
+ support_track_feats = sample_features5d(fmaps, support_points)
151
+ return (
152
+ support_track_feats[:, None, support_track_feats.shape[1] // 2],
153
+ support_track_feats,
154
+ )
155
+
156
+ def get_correlation_feat(self, fmaps, queried_coords, radius=None, padding_mode="border"):
157
+ B, T, D, H_, W_ = fmaps.shape
158
+ N = queried_coords.shape[1]
159
+ if radius is None:
160
+ r = self.corr_radius
161
+ else:
162
+ r = radius
163
+ sample_coords = torch.cat(
164
+ [torch.zeros_like(queried_coords[..., :1]), queried_coords], dim=-1
165
+ )[:, None]
166
+ support_points = self.get_support_points(sample_coords, r, reshape_back=False)
167
+ correlation_feat = bilinear_sampler(
168
+ fmaps.reshape(B * T, D, 1, H_, W_), support_points, padding_mode=padding_mode
169
+ )
170
+ return correlation_feat.view(B, T, D, N, (2 * r + 1), (2 * r + 1)).permute(
171
+ 0, 1, 3, 4, 5, 2
172
+ )
173
+
174
+ def interpolate_time_embed(self, x, t):
175
+ previous_dtype = x.dtype
176
+ T = self.time_emb.shape[1]
177
+
178
+ if t == T:
179
+ return self.time_emb
180
+
181
+ time_emb = self.time_emb.float()
182
+ time_emb = F.interpolate(
183
+ time_emb.permute(0, 2, 1), size=t, mode="linear"
184
+ ).permute(0, 2, 1)
185
+ return time_emb.to(previous_dtype)
186
+
187
+ class CoTrackerThreeOffline(CoTrackerThreeBase):
188
+ def __init__(self, **args):
189
+ super(CoTrackerThreeOffline, self).__init__(**args)
190
+
191
+ def forward(
192
+ self,
193
+ video,
194
+ queries,
195
+ iters=4,
196
+ is_train=False,
197
+ add_space_attn=True,
198
+ fmaps_chunk_size=200,
199
+ ):
200
+ """Predict tracks
201
+
202
+ Args:
203
+ video (FloatTensor[B, T, 3, H, W]): input videos.
204
+ queries (FloatTensor[B, N, 3]): point queries.
205
+ iters (int, optional): number of updates. Defaults to 4.
206
+ is_train (bool, optional): enables training mode. Defaults to False.
207
+ Returns:
208
+ - coords_predicted (FloatTensor[B, T, N, 2]):
209
+ - vis_predicted (FloatTensor[B, T, N]):
210
+ - train_data: `None` if `is_train` is false, otherwise:
211
+ - all_vis_predictions (List[FloatTensor[B, S, N, 1]]):
212
+ - all_coords_predictions (List[FloatTensor[B, S, N, 2]]):
213
+ - mask (BoolTensor[B, T, N]):
214
+ """
215
+
216
+ B, T, C, H, W = video.shape
217
+ device = queries.device
218
+ assert H % self.stride == 0 and W % self.stride == 0
219
+
220
+ B, N, __ = queries.shape
221
+ # B = batch size
222
+ # S_trimmed = actual number of frames in the window
223
+ # N = number of tracks
224
+ # C = color channels (3 for RGB)
225
+ # E = positional embedding size
226
+ # LRR = local receptive field radius
227
+ # D = dimension of the transformer input tokens
228
+
229
+ # video = B T C H W
230
+ # queries = B N 3
231
+ # coords_init = B T N 2
232
+ # vis_init = B T N 1
233
+
234
+ assert T >= 1 # at least one frame is required here; meaningful tracking needs two or more
235
+
236
+ video = 2 * (video / 255.0) - 1.0
237
+ dtype = video.dtype
238
+ queried_frames = queries[:, :, 0].long()
239
+
240
+ queried_coords = queries[..., 1:3]
241
+ queried_coords = queried_coords / self.stride
242
+
243
+ # We store our predictions here
244
+ all_coords_predictions, all_vis_predictions, all_confidence_predictions = (
245
+ [],
246
+ [],
247
+ [],
248
+ )
249
+ C_ = C
250
+ H4, W4 = H // self.stride, W // self.stride
251
+ # Compute convolutional features for the video or for the current chunk in case of online mode
252
+
253
+ if T > fmaps_chunk_size:
254
+ fmaps = []
255
+ for t in range(0, T, fmaps_chunk_size):
256
+ video_chunk = video[:, t : t + fmaps_chunk_size]
257
+ fmaps_chunk = self.fnet(video_chunk.reshape(-1, C_, H, W))
258
+ T_chunk = video_chunk.shape[1]
259
+ C_chunk, H_chunk, W_chunk = fmaps_chunk.shape[1:]
260
+ fmaps.append(fmaps_chunk.reshape(B, T_chunk, C_chunk, H_chunk, W_chunk))
261
+ fmaps = torch.cat(fmaps, dim=1).reshape(-1, C_chunk, H_chunk, W_chunk)
262
+ else:
263
+ fmaps = self.fnet(video.reshape(-1, C_, H, W))
264
+ fmaps = fmaps.permute(0, 2, 3, 1)
265
+ fmaps = fmaps / torch.sqrt(
266
+ torch.maximum(
267
+ torch.sum(torch.square(fmaps), axis=-1, keepdims=True),
268
+ torch.tensor(1e-12, device=fmaps.device),
269
+ )
270
+ )
271
+ fmaps = fmaps.permute(0, 3, 1, 2).reshape(
272
+ B, -1, self.latent_dim, H // self.stride, W // self.stride
273
+ )
274
+ fmaps = fmaps.to(dtype)
275
+
276
+ # We compute track features
277
+ fmaps_pyramid = []
278
+ track_feat_pyramid = []
279
+ track_feat_support_pyramid = []
280
+ fmaps_pyramid.append(fmaps)
281
+ for i in range(self.corr_levels - 1):
282
+ fmaps_ = fmaps.reshape(
283
+ B * T, self.latent_dim, fmaps.shape[-2], fmaps.shape[-1]
284
+ )
285
+ fmaps_ = F.avg_pool2d(fmaps_, 2, stride=2)
286
+ fmaps = fmaps_.reshape(
287
+ B, T, self.latent_dim, fmaps_.shape[-2], fmaps_.shape[-1]
288
+ )
289
+ fmaps_pyramid.append(fmaps)
290
+
291
+ for i in range(self.corr_levels):
292
+ track_feat, track_feat_support = self.get_track_feat(
293
+ fmaps_pyramid[i],
294
+ queried_frames,
295
+ queried_coords / 2**i,
296
+ support_radius=self.corr_radius,
297
+ )
298
+ track_feat_pyramid.append(track_feat.repeat(1, T, 1, 1))
299
+ track_feat_support_pyramid.append(track_feat_support.unsqueeze(1))
300
+
301
+ D_coords = 2
302
+
303
+ coord_preds, vis_preds, confidence_preds = [], [], []
304
+
305
+ vis = torch.zeros((B, T, N), device=device).float()
306
+ confidence = torch.zeros((B, T, N), device=device).float()
307
+ coords = queried_coords.reshape(B, 1, N, 2).expand(B, T, N, 2).float()
308
+
309
+ r = 2 * self.corr_radius + 1
310
+
311
+ for it in range(iters):
312
+ coords = coords.detach() # B T N 2
313
+ coords_init = coords.view(B * T, N, 2)
314
+ corr_embs = []
315
+ corr_feats = []
316
+ for i in range(self.corr_levels):
317
+ corr_feat = self.get_correlation_feat(
318
+ fmaps_pyramid[i], coords_init / 2**i
319
+ )
320
+ track_feat_support = (
321
+ track_feat_support_pyramid[i]
322
+ .view(B, 1, r, r, N, self.latent_dim)
323
+ .squeeze(1)
324
+ .permute(0, 3, 1, 2, 4)
325
+ )
326
+ corr_volume = torch.einsum(
327
+ "btnhwc,bnijc->btnhwij", corr_feat, track_feat_support
328
+ )
329
+ corr_emb = self.corr_mlp(corr_volume.reshape(B * T * N, r * r * r * r))
330
+ corr_embs.append(corr_emb)
331
+ corr_embs = torch.cat(corr_embs, dim=-1)
332
+ corr_embs = corr_embs.view(B, T, N, corr_embs.shape[-1])
333
+
334
+ transformer_input = [vis[..., None], confidence[..., None], corr_embs]
335
+
336
+ rel_coords_forward = coords[:, :-1] - coords[:, 1:]
337
+ rel_coords_backward = coords[:, 1:] - coords[:, :-1]
338
+
339
+ rel_coords_forward = torch.nn.functional.pad(
340
+ rel_coords_forward, (0, 0, 0, 0, 0, 1)
341
+ )
342
+ rel_coords_backward = torch.nn.functional.pad(
343
+ rel_coords_backward, (0, 0, 0, 0, 1, 0)
344
+ )
345
+ scale = (
346
+ torch.tensor(
347
+ [self.model_resolution[1], self.model_resolution[0]],
348
+ device=coords.device,
349
+ )
350
+ / self.stride
351
+ )
352
+ rel_coords_forward = rel_coords_forward / scale
353
+ rel_coords_backward = rel_coords_backward / scale
354
+
355
+ rel_pos_emb_input = posenc(
356
+ torch.cat([rel_coords_forward, rel_coords_backward], dim=-1),
357
+ min_deg=0,
358
+ max_deg=10,
359
+ ) # batch, num_points, num_frames, 84
360
+ transformer_input.append(rel_pos_emb_input)
361
+
362
+ x = (
363
+ torch.cat(transformer_input, dim=-1)
364
+ .permute(0, 2, 1, 3)
365
+ .reshape(B * N, T, -1)
366
+ )
367
+
368
+ x = x + self.interpolate_time_embed(x, T)
369
+ x = x.view(B, N, T, -1) # (B N) T D -> B N T D
370
+
371
+ delta = self.updateformer(
372
+ x,
373
+ add_space_attn=add_space_attn,
374
+ )
375
+
376
+ delta_coords = delta[..., :D_coords].permute(0, 2, 1, 3)
377
+ delta_vis = delta[..., D_coords].permute(0, 2, 1)
378
+ delta_confidence = delta[..., D_coords + 1].permute(0, 2, 1)
379
+
380
+ vis = vis + delta_vis
381
+ confidence = confidence + delta_confidence
382
+
383
+ coords = coords + delta_coords
384
+ coords_append = coords.clone()
385
+ coords_append[..., :2] = coords_append[..., :2] * float(self.stride)
386
+ coord_preds.append(coords_append)
387
+ vis_preds.append(torch.sigmoid(vis))
388
+ confidence_preds.append(torch.sigmoid(confidence))
389
+
390
+ if is_train:
391
+ all_coords_predictions.append([coord[..., :2] for coord in coord_preds])
392
+ all_vis_predictions.append(vis_preds)
393
+ all_confidence_predictions.append(confidence_preds)
394
+
395
+ if is_train:
396
+ train_data = (
397
+ all_coords_predictions,
398
+ all_vis_predictions,
399
+ all_confidence_predictions,
400
+ torch.ones_like(vis_preds[-1], device=vis_preds[-1].device),
401
+ )
402
+ else:
403
+ train_data = None
404
+
405
+ return coord_preds[-1][..., :2], vis_preds[-1], confidence_preds[-1], train_data
406
+
407
+
408
+ if __name__ == "__main__":
409
+ cotrack_cktp = "/data0/xyx/scaled_offline.pth"
410
+ cotracker = CoTrackerThreeOffline(
411
+ stride=4, corr_radius=3, window_len=60
412
+ )
413
+ with open(cotrack_cktp, "rb") as f:
414
+ state_dict = torch.load(f, map_location="cpu")
415
+ if "model" in state_dict:
416
+ state_dict = state_dict["model"]
417
+ cotracker.load_state_dict(state_dict)
418
+ import pdb; pdb.set_trace()
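Beyond the checkpoint-loading snippet above, a minimal editorial usage sketch (not part of the commit; random weights, shapes only, and it assumes the repo's model_utils import cleanly): queries follow the (start_frame, x, y) pixel format documented in forward(), and H, W must be divisible by the stride. The exact attention kernels used at runtime depend on the installed PyTorch version.

import torch

model = CoTrackerThreeOffline(stride=4, corr_radius=3, window_len=60).eval()
video = torch.zeros(1, 8, 3, 384, 512)                          # B, T, C, H, W in [0, 255]
queries = torch.tensor([[[0.0, 100.0, 150.0],
                         [0.0, 200.0, 50.0]]])                  # B, N, (t, x, y)
with torch.no_grad():
    coords, vis, conf, _ = model(video, queries, iters=1)
print(coords.shape, vis.shape)                                  # (1, 8, 2, 2) (1, 8, 2)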
models/SpaTrackV2/models/tracker3D/co_tracker/utils.py ADDED
@@ -0,0 +1,929 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from functools import partial
5
+ from typing import Callable, List
6
+ import collections
7
+ from torch import Tensor
8
+ from itertools import repeat
9
+ from models.SpaTrackV2.utils.model_utils import bilinear_sampler
10
+ from models.SpaTrackV2.models.blocks import CrossAttnBlock as CrossAttnBlock_F
11
+ from torch.nn.functional import scaled_dot_product_attention
12
+ from torch.nn.attention import sdpa_kernel, SDPBackend
13
+ # import flash_attn
14
+ EPS = 1e-6
15
+
16
+
17
+ class ResidualBlock(nn.Module):
18
+ def __init__(self, in_planes, planes, norm_fn="group", stride=1):
19
+ super(ResidualBlock, self).__init__()
20
+
21
+ self.conv1 = nn.Conv2d(
22
+ in_planes,
23
+ planes,
24
+ kernel_size=3,
25
+ padding=1,
26
+ stride=stride,
27
+ padding_mode="zeros",
28
+ )
29
+ self.conv2 = nn.Conv2d(
30
+ planes, planes, kernel_size=3, padding=1, padding_mode="zeros"
31
+ )
32
+ self.relu = nn.ReLU(inplace=True)
33
+
34
+ num_groups = planes // 8
35
+
36
+ if norm_fn == "group":
37
+ self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
38
+ self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
39
+ if not stride == 1:
40
+ self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes)
41
+
42
+ elif norm_fn == "batch":
43
+ self.norm1 = nn.BatchNorm2d(planes)
44
+ self.norm2 = nn.BatchNorm2d(planes)
45
+ if not stride == 1:
46
+ self.norm3 = nn.BatchNorm2d(planes)
47
+
48
+ elif norm_fn == "instance":
49
+ self.norm1 = nn.InstanceNorm2d(planes)
50
+ self.norm2 = nn.InstanceNorm2d(planes)
51
+ if not stride == 1:
52
+ self.norm3 = nn.InstanceNorm2d(planes)
53
+
54
+ elif norm_fn == "none":
55
+ self.norm1 = nn.Sequential()
56
+ self.norm2 = nn.Sequential()
57
+ if not stride == 1:
58
+ self.norm3 = nn.Sequential()
59
+
60
+ if stride == 1:
61
+ self.downsample = None
62
+
63
+ else:
64
+ self.downsample = nn.Sequential(
65
+ nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3
66
+ )
67
+
68
+ def forward(self, x):
69
+ y = x
70
+ y = self.relu(self.norm1(self.conv1(y)))
71
+ y = self.relu(self.norm2(self.conv2(y)))
72
+
73
+ if self.downsample is not None:
74
+ x = self.downsample(x)
75
+
76
+ return self.relu(x + y)
77
+
78
+ def reduce_masked_mean(input, mask, dim=None, keepdim=False):
79
+ r"""Masked mean
80
+
81
+ `reduce_masked_mean(x, mask)` computes the mean of a tensor :attr:`input`
82
+ over a mask :attr:`mask`, returning
83
+
84
+ .. math::
85
+ \text{output} =
86
+ \frac
87
+ {\sum_{i=1}^N \text{input}_i \cdot \text{mask}_i}
88
+ {\epsilon + \sum_{i=1}^N \text{mask}_i}
89
+
90
+ where :math:`N` is the number of elements in :attr:`input` and
91
+ :attr:`mask`, and :math:`\epsilon` is a small constant to avoid
92
+ division by zero.
93
+
94
+ `reduced_masked_mean(x, mask, dim)` computes the mean of a tensor
95
+ :attr:`input` over a mask :attr:`mask` along a dimension :attr:`dim`.
96
+ Optionally, the dimension can be kept in the output by setting
97
+ :attr:`keepdim` to `True`. Tensor :attr:`mask` must be broadcastable to
98
+ the same dimension as :attr:`input`.
99
+
100
+ The interface is similar to `torch.mean()`.
101
+
102
+ Args:
103
+ input (Tensor): input tensor.
104
+ mask (Tensor): mask.
105
+ dim (int, optional): Dimension to sum over. Defaults to None.
106
+ keepdim (bool, optional): Keep the summed dimension. Defaults to False.
107
+
108
+ Returns:
109
+ Tensor: mean tensor.
110
+ """
111
+
112
+ mask = mask.expand_as(input)
113
+
114
+ prod = input * mask
115
+
116
+ if dim is None:
117
+ numer = torch.sum(prod)
118
+ denom = torch.sum(mask)
119
+ else:
120
+ numer = torch.sum(prod, dim=dim, keepdim=keepdim)
121
+ denom = torch.sum(mask, dim=dim, keepdim=keepdim)
122
+
123
+ mean = numer / (EPS + denom)
124
+ return mean
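A tiny editorial example of reduce_masked_mean: only unmasked entries contribute to the mean.

import torch
x = torch.tensor([1.0, 2.0, 3.0, 4.0])
m = torch.tensor([1.0, 1.0, 0.0, 0.0])
print(reduce_masked_mean(x, m))     # tensor(1.5000) -- mean over the first two entries only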
125
+
126
+ class GeometryEncoder(nn.Module):
127
+ def __init__(self, input_dim=3, output_dim=128, stride=4):
128
+ super(GeometryEncoder, self).__init__()
129
+ self.stride = stride
130
+ self.norm_fn = "instance"
131
+ self.in_planes = output_dim // 2
132
+ self.norm1 = nn.InstanceNorm2d(self.in_planes)
133
+ self.norm2 = nn.InstanceNorm2d(output_dim * 2)
134
+ self.conv1 = nn.Conv2d(
135
+ input_dim,
136
+ self.in_planes,
137
+ kernel_size=7,
138
+ stride=2,
139
+ padding=3,
140
+ padding_mode="zeros",
141
+ )
142
+ self.relu1 = nn.ReLU(inplace=True)
143
+ self.layer1 = self._make_layer(output_dim // 2, stride=1)
144
+ self.layer2 = self._make_layer(output_dim // 4 * 3, stride=2)
145
+
146
+ self.conv2 = nn.Conv2d(
147
+ output_dim * 5 // 4,
148
+ output_dim,
149
+ kernel_size=3,
150
+ padding=1,
151
+ padding_mode="zeros",
152
+ )
153
+ self.relu2 = nn.ReLU(inplace=True)
154
+ self.conv3 = nn.Conv2d(output_dim, output_dim, kernel_size=1)
155
+ for m in self.modules():
156
+ if isinstance(m, nn.Conv2d):
157
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
158
+ elif isinstance(m, (nn.InstanceNorm2d)):
159
+ if m.weight is not None:
160
+ nn.init.constant_(m.weight, 1)
161
+ if m.bias is not None:
162
+ nn.init.constant_(m.bias, 0)
163
+
164
+ def _make_layer(self, dim, stride=1):
165
+ layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
166
+ layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
167
+ layers = (layer1, layer2)
168
+
169
+ self.in_planes = dim
170
+ return nn.Sequential(*layers)
171
+
172
+ def forward(self, x):
173
+ _, _, H, W = x.shape
174
+ x = self.conv1(x)
175
+ x = self.norm1(x)
176
+ x = self.relu1(x)
177
+ a = self.layer1(x)
178
+ b = self.layer2(a)
179
+ def _bilinear_intepolate(x):
180
+ return F.interpolate(
181
+ x,
182
+ (H // self.stride, W // self.stride),
183
+ mode="bilinear",
184
+ align_corners=True,
185
+ )
186
+ a = _bilinear_intepolate(a)
187
+ b = _bilinear_intepolate(b)
188
+ x = self.conv2(torch.cat([a, b], dim=1))
189
+ x = self.norm2(x)
190
+ x = self.relu2(x)
191
+ x = self.conv3(x)
192
+ return x
193
+
194
+ class BasicEncoder(nn.Module):
195
+ def __init__(self, input_dim=3, output_dim=128, stride=4):
196
+ super(BasicEncoder, self).__init__()
197
+ self.stride = stride
198
+ self.norm_fn = "instance"
199
+ self.in_planes = output_dim // 2
200
+ self.norm1 = nn.InstanceNorm2d(self.in_planes)
201
+ self.norm2 = nn.InstanceNorm2d(output_dim * 2)
202
+
203
+ self.conv1 = nn.Conv2d(
204
+ input_dim,
205
+ self.in_planes,
206
+ kernel_size=7,
207
+ stride=2,
208
+ padding=3,
209
+ padding_mode="zeros",
210
+ )
211
+ self.relu1 = nn.ReLU(inplace=True)
212
+ self.layer1 = self._make_layer(output_dim // 2, stride=1)
213
+ self.layer2 = self._make_layer(output_dim // 4 * 3, stride=2)
214
+ self.layer3 = self._make_layer(output_dim, stride=2)
215
+ self.layer4 = self._make_layer(output_dim, stride=2)
216
+
217
+ self.conv2 = nn.Conv2d(
218
+ output_dim * 3 + output_dim // 4,
219
+ output_dim * 2,
220
+ kernel_size=3,
221
+ padding=1,
222
+ padding_mode="zeros",
223
+ )
224
+ self.relu2 = nn.ReLU(inplace=True)
225
+ self.conv3 = nn.Conv2d(output_dim * 2, output_dim, kernel_size=1)
226
+ for m in self.modules():
227
+ if isinstance(m, nn.Conv2d):
228
+ nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
229
+ elif isinstance(m, (nn.InstanceNorm2d)):
230
+ if m.weight is not None:
231
+ nn.init.constant_(m.weight, 1)
232
+ if m.bias is not None:
233
+ nn.init.constant_(m.bias, 0)
234
+
235
+ def _make_layer(self, dim, stride=1):
236
+ layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride)
237
+ layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1)
238
+ layers = (layer1, layer2)
239
+
240
+ self.in_planes = dim
241
+ return nn.Sequential(*layers)
242
+
243
+ def forward(self, x):
244
+ _, _, H, W = x.shape
245
+
246
+ x = self.conv1(x)
247
+ x = self.norm1(x)
248
+ x = self.relu1(x)
249
+
250
+ a = self.layer1(x)
251
+ b = self.layer2(a)
252
+ c = self.layer3(b)
253
+ d = self.layer4(c)
254
+
255
+ def _bilinear_intepolate(x):
256
+ return F.interpolate(
257
+ x,
258
+ (H // self.stride, W // self.stride),
259
+ mode="bilinear",
260
+ align_corners=True,
261
+ )
262
+
263
+ a = _bilinear_intepolate(a)
264
+ b = _bilinear_intepolate(b)
265
+ c = _bilinear_intepolate(c)
266
+ d = _bilinear_intepolate(d)
267
+
268
+ x = self.conv2(torch.cat([a, b, c, d], dim=1))
269
+ x = self.norm2(x)
270
+ x = self.relu2(x)
271
+ x = self.conv3(x)
272
+ return x
273
+
274
+ # From PyTorch internals
275
+ def _ntuple(n):
276
+ def parse(x):
277
+ if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
278
+ return tuple(x)
279
+ return tuple(repeat(x, n))
280
+
281
+ return parse
282
+
283
+
284
+ def exists(val):
285
+ return val is not None
286
+
287
+
288
+ def default(val, d):
289
+ return val if exists(val) else d
290
+
291
+
292
+ to_2tuple = _ntuple(2)
293
+
294
+
295
+ class Mlp(nn.Module):
296
+ """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
297
+
298
+ def __init__(
299
+ self,
300
+ in_features,
301
+ hidden_features=None,
302
+ out_features=None,
303
+ act_layer=nn.GELU,
304
+ norm_layer=None,
305
+ bias=True,
306
+ drop=0.0,
307
+ use_conv=False,
308
+ ):
309
+ super().__init__()
310
+ out_features = out_features or in_features
311
+ hidden_features = hidden_features or in_features
312
+ bias = to_2tuple(bias)
313
+ drop_probs = to_2tuple(drop)
314
+ linear_layer = partial(nn.Conv2d, kernel_size=1) if use_conv else nn.Linear
315
+
316
+ self.fc1 = linear_layer(in_features, hidden_features, bias=bias[0])
317
+ self.act = act_layer()
318
+ self.drop1 = nn.Dropout(drop_probs[0])
319
+ self.norm = (
320
+ norm_layer(hidden_features) if norm_layer is not None else nn.Identity()
321
+ )
322
+ self.fc2 = linear_layer(hidden_features, out_features, bias=bias[1])
323
+ self.drop2 = nn.Dropout(drop_probs[1])
324
+
325
+ def forward(self, x):
326
+ x = self.fc1(x)
327
+ x = self.act(x)
328
+ x = self.drop1(x)
329
+ x = self.fc2(x)
330
+ x = self.drop2(x)
331
+ return x
332
+
333
+
334
+ class Attention(nn.Module):
335
+ def __init__(
336
+ self, query_dim, context_dim=None, num_heads=8, dim_head=48, qkv_bias=False
337
+ ):
338
+ super().__init__()
339
+ inner_dim = dim_head * num_heads
340
+ self.inner_dim = inner_dim
341
+ context_dim = default(context_dim, query_dim)
342
+ self.scale = dim_head**-0.5
343
+ self.heads = num_heads
344
+
345
+ self.to_q = nn.Linear(query_dim, inner_dim, bias=qkv_bias)
346
+ self.to_kv = nn.Linear(context_dim, inner_dim * 2, bias=qkv_bias)
347
+ self.to_out = nn.Linear(inner_dim, query_dim)
348
+
349
+ def forward(self, x, context=None, attn_bias=None, flash=True):
350
+ B, N1, C = x.shape
351
+ h = self.heads
352
+
353
+ q = self.to_q(x).reshape(B, N1, h, self.inner_dim // h).permute(0, 2, 1, 3)
354
+ context = default(context, x)
355
+ k, v = self.to_kv(context).chunk(2, dim=-1)
356
+
357
+ N2 = context.shape[1]
358
+ k = k.reshape(B, N2, h, self.inner_dim // h).permute(0, 2, 1, 3)
359
+ v = v.reshape(B, N2, h, self.inner_dim // h).permute(0, 2, 1, 3)
360
+
361
+ if (
362
+ (N1 < 64 and N2 < 64) or
363
+ (B > 1e4) or
364
+ (q.shape[1] != k.shape[1]) or
365
+ (q.shape[1] % k.shape[1] != 0)
366
+ ):
367
+ flash = False
368
+
369
+
370
+ if flash == False:
371
+ sim = (q @ k.transpose(-2, -1)) * self.scale
372
+ if attn_bias is not None:
373
+ sim = sim + attn_bias
374
+ if sim.abs().max() > 1e2:
375
+ import pdb; pdb.set_trace()
376
+ attn = sim.softmax(dim=-1)
377
+ x = (attn @ v).transpose(1, 2).reshape(B, N1, self.inner_dim)
378
+ else:
379
+
380
+ input_args = [x.contiguous() for x in [q, k, v]]
381
+ try:
382
+ # print(f"q.shape: {q.shape}, dtype: {q.dtype}, device: {q.device}")
383
+ # print(f"Flash SDP available: {torch.backends.cuda.flash_sdp_enabled()}")
384
+ # print(f"Flash SDP allowed: {torch.backends.cuda.enable_flash_sdp}")
385
+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
386
+ x = F.scaled_dot_product_attention(*input_args).permute(0,2,1,3).reshape(B,N1,-1) # type: ignore
387
+ except Exception as e:
388
+ print(e)
389
+
390
+ if self.to_out.bias.dtype != x.dtype:
391
+ x = x.to(self.to_out.bias.dtype)
392
+
393
+ return self.to_out(x)
394
+
395
+ class CrossAttnBlock(nn.Module):
396
+ def __init__(
397
+ self, hidden_size, context_dim, num_heads=1, mlp_ratio=4.0, **block_kwargs
398
+ ):
399
+ super().__init__()
400
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
401
+ self.norm_context = nn.LayerNorm(context_dim)
402
+ self.cross_attn = Attention(
403
+ hidden_size,
404
+ context_dim=context_dim,
405
+ num_heads=num_heads,
406
+ qkv_bias=True,
407
+ **block_kwargs
408
+ )
409
+
410
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
411
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
412
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
413
+ self.mlp = Mlp(
414
+ in_features=hidden_size,
415
+ hidden_features=mlp_hidden_dim,
416
+ act_layer=approx_gelu,
417
+ drop=0,
418
+ )
419
+
420
+ def forward(self, x, context, mask=None):
421
+ attn_bias = None
422
+ if mask is not None:
423
+ if mask.shape[1] == x.shape[1]:
424
+ mask = mask[:, None, :, None].expand(
425
+ -1, self.cross_attn.heads, -1, context.shape[1]
426
+ )
427
+ else:
428
+ mask = mask[:, None, None].expand(
429
+ -1, self.cross_attn.heads, x.shape[1], -1
430
+ )
431
+
432
+ max_neg_value = -torch.finfo(x.dtype).max
433
+ attn_bias = (~mask) * max_neg_value
434
+ x = x + self.cross_attn(
435
+ self.norm1(x), context=self.norm_context(context), attn_bias=attn_bias
436
+ )
437
+ x = x + self.mlp(self.norm2(x))
438
+ return x
439
+
440
+ class AttnBlock(nn.Module):
441
+ def __init__(
442
+ self,
443
+ hidden_size,
444
+ num_heads,
445
+ attn_class: Callable[..., nn.Module] = Attention,
446
+ mlp_ratio=4.0,
447
+ **block_kwargs
448
+ ):
449
+ super().__init__()
450
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
451
+ self.attn = attn_class(
452
+ hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs
453
+ )
454
+
455
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
456
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
457
+ approx_gelu = lambda: nn.GELU(approximate="tanh")
458
+ self.mlp = Mlp(
459
+ in_features=hidden_size,
460
+ hidden_features=mlp_hidden_dim,
461
+ act_layer=approx_gelu,
462
+ drop=0,
463
+ )
464
+
465
+ def forward(self, x, mask=None):
466
+ attn_bias = mask
467
+ if mask is not None:
468
+ mask = (
469
+ (mask[:, None] * mask[:, :, None])
470
+ .unsqueeze(1)
471
+ .expand(-1, self.attn.num_heads, -1, -1)
472
+ )
473
+ max_neg_value = -torch.finfo(x.dtype).max
474
+ attn_bias = (~mask) * max_neg_value
475
+ x = x + self.attn(self.norm1(x), attn_bias=attn_bias)
476
+ x = x + self.mlp(self.norm2(x))
477
+ return x
478
+
479
+ class EfficientUpdateFormer(nn.Module):
480
+ """
481
+ Transformer model that updates track estimates.
482
+ """
483
+
484
+ def __init__(
485
+ self,
486
+ space_depth=6,
487
+ time_depth=6,
488
+ input_dim=320,
489
+ hidden_size=384,
490
+ num_heads=8,
491
+ output_dim=130,
492
+ mlp_ratio=4.0,
493
+ num_virtual_tracks=64,
494
+ add_space_attn=True,
495
+ linear_layer_for_vis_conf=False,
496
+ patch_feat=False,
497
+ patch_dim=128,
498
+ ):
499
+ super().__init__()
500
+ self.out_channels = 2
501
+ self.num_heads = num_heads
502
+ self.hidden_size = hidden_size
503
+ self.input_transform = torch.nn.Linear(input_dim, hidden_size, bias=True)
504
+ if linear_layer_for_vis_conf:
505
+ self.flow_head = torch.nn.Linear(hidden_size, output_dim - 2, bias=True)
506
+ self.vis_conf_head = torch.nn.Linear(hidden_size, 2, bias=True)
507
+ else:
508
+ self.flow_head = torch.nn.Linear(hidden_size, output_dim, bias=True)
509
+
510
+ if patch_feat==False:
511
+ self.virual_tracks = nn.Parameter(
512
+ torch.randn(1, num_virtual_tracks, 1, hidden_size)
513
+ )
514
+ self.num_virtual_tracks = num_virtual_tracks
515
+ else:
516
+ self.patch_proj = nn.Linear(patch_dim, hidden_size, bias=True)
517
+
518
+ self.add_space_attn = add_space_attn
519
+ self.linear_layer_for_vis_conf = linear_layer_for_vis_conf
520
+ self.time_blocks = nn.ModuleList(
521
+ [
522
+ AttnBlock(
523
+ hidden_size,
524
+ num_heads,
525
+ mlp_ratio=mlp_ratio,
526
+ attn_class=Attention,
527
+ )
528
+ for _ in range(time_depth)
529
+ ]
530
+ )
531
+
532
+ if add_space_attn:
533
+ self.space_virtual_blocks = nn.ModuleList(
534
+ [
535
+ AttnBlock(
536
+ hidden_size,
537
+ num_heads,
538
+ mlp_ratio=mlp_ratio,
539
+ attn_class=Attention,
540
+ )
541
+ for _ in range(space_depth)
542
+ ]
543
+ )
544
+ self.space_point2virtual_blocks = nn.ModuleList(
545
+ [
546
+ CrossAttnBlock(
547
+ hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
548
+ )
549
+ for _ in range(space_depth)
550
+ ]
551
+ )
552
+ self.space_virtual2point_blocks = nn.ModuleList(
553
+ [
554
+ CrossAttnBlock(
555
+ hidden_size, hidden_size, num_heads, mlp_ratio=mlp_ratio
556
+ )
557
+ for _ in range(space_depth)
558
+ ]
559
+ )
560
+ assert len(self.time_blocks) >= len(self.space_virtual2point_blocks)
561
+ self.initialize_weights()
562
+
563
+ def initialize_weights(self):
564
+ def _basic_init(module):
565
+ if isinstance(module, nn.Linear):
566
+ torch.nn.init.xavier_uniform_(module.weight)
567
+ if module.bias is not None:
568
+ nn.init.constant_(module.bias, 0)
569
+ torch.nn.init.trunc_normal_(self.flow_head.weight, std=0.001)
570
+ if self.linear_layer_for_vis_conf:
571
+ torch.nn.init.trunc_normal_(self.vis_conf_head.weight, std=0.001)
572
+
573
+ def _trunc_init(module):
574
+ """ViT weight initialization, original timm impl (for reproducibility)"""
575
+ if isinstance(module, nn.Linear):
576
+ torch.nn.init.trunc_normal_(module.weight, std=0.02)
577
+ if module.bias is not None:
578
+ nn.init.zeros_(module.bias)
579
+
580
+ self.apply(_basic_init)
581
+
582
+ def forward(self, input_tensor, mask=None, add_space_attn=True, patch_feat=None):
583
+ tokens = self.input_transform(input_tensor)
584
+
585
+ B, _, T, _ = tokens.shape
586
+ if patch_feat is None:
587
+ virtual_tokens = self.virual_tracks.repeat(B, 1, T, 1)
588
+ tokens = torch.cat([tokens, virtual_tokens], dim=1)
589
+ else:
590
+ patch_feat = self.patch_proj(patch_feat.detach())
591
+ tokens = torch.cat([tokens, patch_feat], dim=1)
592
+ self.num_virtual_tracks = patch_feat.shape[1]
593
+
594
+ _, N, _, _ = tokens.shape
595
+ j = 0
596
+ layers = []
597
+ for i in range(len(self.time_blocks)):
598
+ time_tokens = tokens.contiguous().view(B * N, T, -1) # B N T C -> (B N) T C
599
+ time_tokens = torch.utils.checkpoint.checkpoint(
600
+ self.time_blocks[i],
601
+ time_tokens
602
+ )
603
+
604
+ tokens = time_tokens.view(B, N, T, -1) # (B N) T C -> B N T C
605
+ if (
606
+ add_space_attn
607
+ and hasattr(self, "space_virtual_blocks")
608
+ and (i % (len(self.time_blocks) // len(self.space_virtual_blocks)) == 0)
609
+ ):
610
+ space_tokens = (
611
+ tokens.permute(0, 2, 1, 3).contiguous().view(B * T, N, -1)
612
+ ) # B N T C -> (B T) N C
613
+
614
+ point_tokens = space_tokens[:, : N - self.num_virtual_tracks]
615
+ virtual_tokens = space_tokens[:, N - self.num_virtual_tracks :]
616
+
617
+ virtual_tokens = torch.utils.checkpoint.checkpoint(
618
+ self.space_virtual2point_blocks[j],
619
+ virtual_tokens, point_tokens, mask
620
+ )
621
+
622
+ virtual_tokens = torch.utils.checkpoint.checkpoint(
623
+ self.space_virtual_blocks[j],
624
+ virtual_tokens
625
+ )
626
+
627
+ point_tokens = torch.utils.checkpoint.checkpoint(
628
+ self.space_point2virtual_blocks[j],
629
+ point_tokens, virtual_tokens, mask
630
+ )
631
+
632
+ space_tokens = torch.cat([point_tokens, virtual_tokens], dim=1)
633
+ tokens = space_tokens.view(B, T, N, -1).permute(
634
+ 0, 2, 1, 3
635
+ ) # (B T) N C -> B N T C
636
+ j += 1
637
+ tokens = tokens[:, : N - self.num_virtual_tracks]
638
+
639
+ flow = self.flow_head(tokens)
640
+ if self.linear_layer_for_vis_conf:
641
+ vis_conf = self.vis_conf_head(tokens)
642
+ flow = torch.cat([flow, vis_conf], dim=-1)
643
+
644
+ return flow
645
+
646
+ def focal_loss(logits, targets, alpha=0.25, gamma=2.0):
647
+ probs = torch.sigmoid(logits)
648
+ ce_loss = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
649
+ p_t = probs * targets + (1 - probs) * (1 - targets)
650
+ loss = alpha * (1 - p_t) ** gamma * ce_loss
651
+ return loss.mean()
652
+
653
+ def balanced_binary_cross_entropy(logits, targets, balance_weight=1.0, eps=1e-6, reduction="mean", pos_bias=0.0, mask=None):
654
+ """
655
+ logits: Tensor of arbitrary shape
656
+ targets: same shape as logits
657
+ balance_weight: scaling the loss
658
+ reduction: 'mean', 'sum', or 'none'
659
+ """
660
+ targets = targets.float()
661
+ positive = (targets == 1).float().sum()
662
+ total = targets.numel()
663
+ positive_ratio = positive / (total + eps)
664
+
665
+ pos_weight = (1 - positive_ratio) / (positive_ratio + eps)
666
+ pos_weight = pos_weight.clamp(min=0.1, max=10.0)
667
+ loss = F.binary_cross_entropy_with_logits(
668
+ logits,
669
+ targets,
670
+ pos_weight=pos_weight+pos_bias,
671
+ reduction=reduction
672
+ )
673
+ if mask is not None:
674
+ loss = (loss * mask).sum() / (mask.sum() + eps)
675
+ return balance_weight * loss
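Editorial sketch of the class balancing above: with 10% positives, pos_weight becomes 0.9/0.1 = 9 (clamped to [0.1, 10]), so missed positives are penalized roughly nine times as hard as missed negatives. Toy values assumed.

import torch
import torch.nn.functional as F

logits = torch.zeros(100)
targets = torch.zeros(100)
targets[:10] = 1.0                                   # 10% positives
balanced = balanced_binary_cross_entropy(logits, targets)
plain = F.binary_cross_entropy_with_logits(logits, targets)
print(balanced.item() > plain.item())                # True: positives are up-weighted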
676
+
677
+ def sequence_loss(
+     flow_preds,
+     flow_gt,
+     valids,
+     vis=None,
+     gamma=0.8,
+     add_huber_loss=False,
+     loss_only_for_visible=False,
+     depth_sample=None,
+     z_unc=None,
+     mask_traj_gt=None
+ ):
+     """Loss function defined over sequence of flow predictions"""
+     total_flow_loss = 0.0
+     for j in range(len(flow_gt)):
+         B, S, N, D = flow_gt[j].shape
+         B, S2, N = valids[j].shape
+         assert S == S2
+         n_predictions = len(flow_preds[j])
+         flow_loss = 0.0
+         for i in range(n_predictions):
+             i_weight = gamma ** (n_predictions - i - 1)
+             flow_pred = flow_preds[j][i][:, :, :flow_gt[j].shape[2]]
+             if flow_pred.shape[-1] == 3:
+                 flow_pred[..., 2] = flow_pred[..., 2]
+             track_z_loss = 0.0  # default, so the huber branch below leaves it defined
+             if add_huber_loss:
+                 i_loss = huber_loss(flow_pred, flow_gt[j], delta=6.0)
+             else:
+                 if flow_gt[j][..., 2].abs().max() != 0:
+                     track_z_loss = (flow_pred - flow_gt[j])[..., 2].abs().mean()
+                     if mask_traj_gt is not None:
+                         track_z_loss = ((flow_pred - flow_gt[j])[..., 2].abs() * mask_traj_gt.permute(0, 2, 1)).sum() / (mask_traj_gt.sum(dim=1) + 1e-6)
+                 else:
+                     track_z_loss = 0
+                 i_loss = (flow_pred[..., :2] - flow_gt[j][..., :2]).abs()  # B, S, N, 2
+             # print((flow_pred - flow_gt[j])[...,2].abs()[vis[j].bool()].mean())
+             i_loss = torch.mean(i_loss, dim=3)  # B, S, N
+             valid_ = valids[j].clone()[:, :, :flow_gt[j].shape[2]]  # ensure valid_ has the same shape as i_loss
+             valid_ = valid_ * (flow_gt[j][..., :2].norm(dim=-1) > 0).float()
+             if loss_only_for_visible:
+                 valid_ = valid_ * vis[j]
+             # print(reduce_masked_mean(i_loss, valid_).item(), track_z_loss.item()/16)
+             flow_loss += i_weight * (reduce_masked_mean(i_loss, valid_) + track_z_loss + 10 * reduce_masked_mean(i_loss, valid_ * vis[j]))
+             # if flow_loss > 5e2:
+             #     import pdb; pdb.set_trace()
+         flow_loss = flow_loss / n_predictions
+         total_flow_loss += flow_loss
+     return total_flow_loss / len(flow_gt)
+
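+ # A minimal shape sketch for sequence_loss. The sizes and the helper name are
+ # illustrative assumptions: flow_preds is a list (one entry per supervision group,
+ # as indexed by j above) of lists of iterative predictions, and the call relies on
+ # reduce_masked_mean being available in this module.
+ def _sequence_loss_example():
+     B, S, N = 2, 8, 64
+     flow_gt = [torch.randn(B, S, N, 3)]
+     flow_preds = [[torch.randn(B, S, N, 3) for _ in range(4)]]   # 4 refinement iterations
+     valids = [torch.ones(B, S, N)]
+     vis = [torch.ones(B, S, N)]
+     return sequence_loss(flow_preds, flow_gt, valids, vis=vis, gamma=0.8)
+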
+ def sequence_loss_xyz(
+     flow_preds,
+     flow_gt,
+     valids,
+     intrs,
+     vis=None,
+     gamma=0.8,
+     add_huber_loss=False,
+     loss_only_for_visible=False,
+     mask_traj_gt=None
+ ):
+     """Loss over a sequence of flow predictions, with the targets unprojected to camera coordinates using the intrinsics"""
+     total_flow_loss = 0.0
+     for j in range(len(flow_gt)):
+         B, S, N, D = flow_gt[j].shape
+         B, S2, N = valids[j].shape
+         assert S == S2
+         n_predictions = len(flow_preds[j])
+         flow_loss = 0.0
+         for i in range(n_predictions):
+             i_weight = gamma ** (n_predictions - i - 1)
+             flow_pred = flow_preds[j][i][:, :, :flow_gt[j].shape[2]]
+             flow_gt_ = flow_gt[j]
+             flow_gt_one = torch.cat([flow_gt_[..., :2], torch.ones_like(flow_gt_[:, :, :, :1])], dim=-1)
+             flow_gt_cam = torch.einsum('btsc,btnc->btns', torch.inverse(intrs), flow_gt_one)
+             flow_gt_cam *= flow_gt_[..., 2:3].abs()
+             flow_gt_cam[..., 2] *= torch.sign(flow_gt_cam[..., 2])
+
+             if add_huber_loss:
+                 i_loss = huber_loss(flow_pred, flow_gt_cam, delta=6.0)
+             else:
+                 i_loss = (flow_pred - flow_gt_cam).norm(dim=-1, keepdim=True)  # B, S, N, 1
+
+             # print((flow_pred - flow_gt[j])[...,2].abs()[vis[j].bool()].mean())
+             i_loss = torch.mean(i_loss, dim=3)  # B, S, N
+             valid_ = valids[j].clone()[:, :, :flow_gt[j].shape[2]]  # ensure valid_ has the same shape as i_loss
+             if loss_only_for_visible:
+                 valid_ = valid_ * vis[j]
+             # print(reduce_masked_mean(i_loss, valid_).item(), track_z_loss.item()/16)
+             flow_loss += i_weight * reduce_masked_mean(i_loss, valid_) * 1000
+             # if flow_loss > 5e2:
+             #     import pdb; pdb.set_trace()
+         flow_loss = flow_loss / n_predictions
+         total_flow_loss += flow_loss
+     return total_flow_loss / len(flow_gt)
+
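+ # A minimal shape sketch for sequence_loss_xyz. The sizes and the helper name are
+ # illustrative assumptions; the einsum above implies per-frame intrinsics of shape
+ # (B, S, 3, 3), the ground truth stores (u, v, depth) per point, and the call relies
+ # on reduce_masked_mean being available in this module.
+ def _sequence_loss_xyz_example():
+     B, S, N = 2, 8, 64
+     flow_gt = [torch.rand(B, S, N, 3) * torch.tensor([512.0, 512.0, 5.0])]
+     flow_preds = [[torch.randn(B, S, N, 3) for _ in range(4)]]
+     valids = [torch.ones(B, S, N)]
+     intrs = torch.tensor([[500.0, 0.0, 256.0],
+                           [0.0, 500.0, 256.0],
+                           [0.0, 0.0, 1.0]]).repeat(B, S, 1, 1)
+     return sequence_loss_xyz(flow_preds, flow_gt, valids, intrs, gamma=0.8)
+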
+ def huber_loss(x, y, delta=1.0):
+     """Calculate element-wise Huber loss between x and y"""
+     diff = x - y
+     abs_diff = diff.abs()
+     flag = (abs_diff <= delta).float()
+     return flag * 0.5 * diff**2 + (1 - flag) * delta * (abs_diff - 0.5 * delta)
+
+
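+ # For reference, this is the standard element-wise Huber loss; on recent PyTorch
+ # versions the same values should be obtainable with
+ # F.huber_loss(x, y, reduction="none", delta=delta).
+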
+ def sequence_BCE_loss(vis_preds, vis_gts, mask=None):
+     total_bce_loss = 0.0
+     for j in range(len(vis_preds)):
+         n_predictions = len(vis_preds[j])
+         bce_loss = 0.0
+         for i in range(n_predictions):
+             N_gt = vis_gts[j].shape[-1]
+             if mask is not None:
+                 vis_loss = balanced_binary_cross_entropy(vis_preds[j][i][..., :N_gt], vis_gts[j], mask=mask[j], reduction="none")
+             else:
+                 vis_loss = balanced_binary_cross_entropy(vis_preds[j][i][..., :N_gt], vis_gts[j]) + focal_loss(vis_preds[j][i][..., :N_gt], vis_gts[j])
+             # print(vis_loss, ((torch.sigmoid(vis_preds[j][i][...,:N_gt])>0.5).float() - vis_gts[j]).abs().sum())
+             bce_loss += vis_loss
+         bce_loss = bce_loss / n_predictions
+         total_bce_loss += bce_loss
+     return total_bce_loss / len(vis_preds)
+
+
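+ # A minimal shape sketch for sequence_BCE_loss. The helper name and sizes are
+ # illustrative assumptions: vis_preds mirrors the nested list layout used above,
+ # and its logits may carry extra (padded) tracks that get cropped to N_gt.
+ def _sequence_bce_example():
+     B, S, N = 2, 8, 64
+     vis_gts = [(torch.rand(B, S, N) > 0.5).float()]
+     vis_preds = [[torch.randn(B, S, N + 16) for _ in range(4)]]  # 16 padded tracks are cropped
+     return sequence_BCE_loss(vis_preds, vis_gts)
+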
+ def sequence_prob_loss(
+     tracks: torch.Tensor,
+     confidence: torch.Tensor,
+     target_points: torch.Tensor,
+     visibility: torch.Tensor,
+     expected_dist_thresh: float = 12.0,
+ ):
+     """Loss for classifying if a point is within pixel threshold of its target."""
+     # Points with an error larger than 12 pixels are likely to be useless; marking
+     # them as occluded will actually improve Jaccard metrics and give
+     # qualitatively better results.
+     total_logprob_loss = 0.0
+     for j in range(len(tracks)):
+         n_predictions = len(tracks[j])
+         logprob_loss = 0.0
+         for i in range(n_predictions):
+             N_gt = target_points[j].shape[2]
+             err = torch.sum((tracks[j][i].detach()[:, :, :N_gt, :2] - target_points[j][..., :2]) ** 2, dim=-1)
+             valid = (err <= expected_dist_thresh**2).float()
+             logprob = balanced_binary_cross_entropy(confidence[j][i][..., :N_gt], valid, reduction="none")
+             logprob *= visibility[j]
+             logprob = torch.mean(logprob, dim=[1, 2])
+             logprob_loss += logprob
+         logprob_loss = logprob_loss / n_predictions
+         total_logprob_loss += logprob_loss
+     return total_logprob_loss / len(tracks)
+
+
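+ # A minimal shape sketch for sequence_prob_loss. Names and sizes are illustrative
+ # assumptions: the confidence logits are supervised to predict whether the
+ # (detached) track lands within expected_dist_thresh pixels of its target.
+ def _sequence_prob_loss_example():
+     B, S, N = 2, 8, 64
+     tracks = [[torch.randn(B, S, N, 2) * 100 for _ in range(4)]]
+     confidence = [[torch.randn(B, S, N) for _ in range(4)]]
+     target_points = [torch.randn(B, S, N, 2) * 100]
+     visibility = [torch.ones(B, S, N)]
+     return sequence_prob_loss(tracks, confidence, target_points, visibility)
+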
+ def sequence_dyn_prob_loss(
+     tracks: torch.Tensor,
+     confidence: torch.Tensor,
+     target_points: torch.Tensor,
+     visibility: torch.Tensor,
+     expected_dist_thresh: float = 6.0,
+ ):
+     """Loss for classifying if a point is within pixel threshold of its target."""
+     # Points with an error larger than the threshold are likely to be useless; marking
+     # them as occluded will actually improve Jaccard metrics and give
+     # qualitatively better results.
+     total_logprob_loss = 0.0
+     for j in range(len(tracks)):
+         n_predictions = len(tracks[j])
+         logprob_loss = 0.0
+         for i in range(n_predictions):
+             err = torch.sum((tracks[j][i].detach() - target_points[j]) ** 2, dim=-1)
+             valid = (err <= expected_dist_thresh**2).float()
+             valid = (valid.sum(dim=1) > 0).float()
+             logprob = balanced_binary_cross_entropy(confidence[j][i].mean(dim=1), valid, reduction="none")
+             # logprob *= visibility[j]
+             logprob = torch.mean(logprob, dim=[0, 1])
+             logprob_loss += logprob
+         logprob_loss = logprob_loss / n_predictions
+         total_logprob_loss += logprob_loss
+     return total_logprob_loss / len(tracks)
+
+
+ def masked_mean(data: torch.Tensor, mask: torch.Tensor, dim: List[int]):
+     if mask is None:
+         return data.mean(dim=dim, keepdim=True)
+     mask = mask.float()
+     mask_sum = torch.sum(mask, dim=dim, keepdim=True)
+     mask_mean = torch.sum(data * mask, dim=dim, keepdim=True) / torch.clamp(
+         mask_sum, min=1.0
+     )
+     return mask_mean
+
+
+ def masked_mean_var(data: torch.Tensor, mask: torch.Tensor, dim: List[int]):
+     if mask is None:
+         return data.mean(dim=dim, keepdim=True), data.var(dim=dim, keepdim=True)
+     mask = mask.float()
+     mask_sum = torch.sum(mask, dim=dim, keepdim=True)
+     mask_mean = torch.sum(data * mask, dim=dim, keepdim=True) / torch.clamp(
+         mask_sum, min=1.0
+     )
+     mask_var = torch.sum(
+         mask * (data - mask_mean) ** 2, dim=dim, keepdim=True
+     ) / torch.clamp(mask_sum, min=1.0)
+     return mask_mean.squeeze(dim), mask_var.squeeze(dim)
+
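+ # A minimal usage sketch for masked_mean / masked_mean_var. The helper name and
+ # sizes are illustrative assumptions; masked_mean keeps the reduced dims, while
+ # masked_mean_var squeezes them away (squeeze with a list of dims needs PyTorch >= 2.0).
+ def _masked_stats_example():
+     data = torch.randn(2, 8, 64)
+     mask = (torch.rand(2, 8, 64) > 0.5).float()
+     mean_keepdim = masked_mean(data, mask, dim=[1, 2])   # (2, 1, 1)
+     mean, var = masked_mean_var(data, mask, dim=[1, 2])  # (2,), (2,)
+     return mean_keepdim, mean, var
+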
+ class NeighborTransformer(nn.Module):
+     def __init__(self, dim: int, num_heads: int, head_dim: int, mlp_ratio: float):
+         super().__init__()
+         self.dim = dim
+         self.output_token_1 = nn.Parameter(torch.randn(1, dim))
+         self.output_token_2 = nn.Parameter(torch.randn(1, dim))
+         self.xblock1_2 = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
+         self.xblock2_1 = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
+         self.aggr1 = Attention(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim)
+         self.aggr2 = Attention(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim)
+
+     def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+         from einops import rearrange, repeat
+         import torch.utils.checkpoint as checkpoint
+
+         assert len(x.shape) == 3, "x should be of shape (B, N, D)"
+         assert len(y.shape) == 3, "y should be of shape (B, N, D)"
+
+         # note: this block does not work so well in practice ...
+
+         def forward_chunk(x, y):
+             new_x = self.xblock1_2(x, y)
+             new_y = self.xblock2_1(y, x)
+             out1 = self.aggr1(repeat(self.output_token_1, 'n d -> b n d', b=x.shape[0]), context=new_x)
+             out2 = self.aggr2(repeat(self.output_token_2, 'n d -> b n d', b=x.shape[0]), context=new_y)
+             return out1 + out2
+
+         return checkpoint.checkpoint(forward_chunk, x, y)
+
+
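+ # A minimal usage sketch for NeighborTransformer. The sizes are illustrative
+ # assumptions, and it relies on the CrossAttnBlock / Attention modules defined
+ # elsewhere in this repo; each input is a set of (B, N, D) tokens and the output
+ # is a single aggregated (B, 1, D) token.
+ def _neighbor_transformer_example():
+     model = NeighborTransformer(dim=128, num_heads=4, head_dim=32, mlp_ratio=4.0)
+     x = torch.randn(2, 16, 128)
+     y = torch.randn(2, 16, 128)
+     return model(x, y)  # (2, 1, 128)
+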
+ class CorrPointformer(nn.Module):
+     def __init__(self, dim: int, num_heads: int, head_dim: int, mlp_ratio: float):
+         super().__init__()
+         self.dim = dim
+         self.xblock1_2 = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
+         # self.xblock2_1 = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
+         self.aggr = CrossAttnBlock(dim, context_dim=dim, num_heads=num_heads, dim_head=head_dim, mlp_ratio=mlp_ratio)
+         self.out_proj = nn.Linear(dim, 2 * dim)
+
+     def forward(self, query: torch.Tensor, target: torch.Tensor, target_rel_pos: torch.Tensor) -> torch.Tensor:
+         from einops import rearrange, repeat
+         import torch.utils.checkpoint as checkpoint
+
+         def forward_chunk(query, target, target_rel_pos):
+             new_query = self.xblock1_2(query, target).mean(dim=1, keepdim=True)
+             # new_target = self.xblock2_1(target, query).mean(dim=1, keepdim=True)
+             # new_aggr = new_query + new_target
+             out = self.aggr(new_query, target + target_rel_pos)  # (potential delta xyz) (target - center)
+             out = self.out_proj(out)
+             return out
+
+         return checkpoint.checkpoint(forward_chunk, query, target, target_rel_pos)
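+
+ # A minimal usage sketch for CorrPointformer. Sizes are illustrative assumptions:
+ # the query tokens attend into the target (correlation) tokens plus their relative
+ # positional encoding, and the pooled result is projected to 2 * dim channels.
+ def _corr_pointformer_example():
+     model = CorrPointformer(dim=128, num_heads=4, head_dim=32, mlp_ratio=4.0)
+     query = torch.randn(2, 9, 128)            # e.g. a 3x3 neighborhood of query features
+     target = torch.randn(2, 49, 128)          # e.g. a 7x7 correlation neighborhood
+     target_rel_pos = torch.randn(2, 49, 128)  # positional encoding of target offsets
+     return model(query, target, target_rel_pos)  # (2, 1, 256)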