Spaces:

NTUST-DDRC
/

cosmos_transfer1_av

Paused

App Files Files Community

harry900000 committed on Jul 16

Commit

226c7c9

1 Parent(s): e22a639

add cosmos-tranfer1/ into repo

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +203 -0
app.py +11 -2
cosmos_transfer1/auxiliary/depth_anything/inference/__init__.py +0 -0
cosmos_transfer1/auxiliary/depth_anything/inference/depth_anything_pipeline.py +55 -0
cosmos_transfer1/auxiliary/depth_anything/model/__init__.py +0 -0
cosmos_transfer1/auxiliary/depth_anything/model/depth_anything.py +151 -0
cosmos_transfer1/auxiliary/guardrail/README.md +17 -0
cosmos_transfer1/auxiliary/guardrail/__init__.py +14 -0
cosmos_transfer1/auxiliary/guardrail/aegis/__init__.py +14 -0
cosmos_transfer1/auxiliary/guardrail/aegis/aegis.py +135 -0
cosmos_transfer1/auxiliary/guardrail/aegis/categories.py +192 -0
cosmos_transfer1/auxiliary/guardrail/blocklist/__init__.py +14 -0
cosmos_transfer1/auxiliary/guardrail/blocklist/blocklist.py +216 -0
cosmos_transfer1/auxiliary/guardrail/blocklist/utils.py +45 -0
cosmos_transfer1/auxiliary/guardrail/common/__init__.py +0 -0
cosmos_transfer1/auxiliary/guardrail/common/core.py +71 -0
cosmos_transfer1/auxiliary/guardrail/common/io_utils.py +78 -0
cosmos_transfer1/auxiliary/guardrail/common/presets.py +75 -0
cosmos_transfer1/auxiliary/guardrail/face_blur_filter/__init__.py +14 -0
cosmos_transfer1/auxiliary/guardrail/face_blur_filter/blur_utils.py +35 -0
cosmos_transfer1/auxiliary/guardrail/face_blur_filter/face_blur_filter.py +225 -0
cosmos_transfer1/auxiliary/guardrail/face_blur_filter/retinaface_utils.py +117 -0
cosmos_transfer1/auxiliary/guardrail/llamaGuard3/__init__.py +14 -0
cosmos_transfer1/auxiliary/guardrail/llamaGuard3/categories.py +31 -0
cosmos_transfer1/auxiliary/guardrail/llamaGuard3/llamaGuard3.py +122 -0
cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/__init__.py +14 -0
cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/model.py +60 -0
cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/video_content_safety_filter.py +185 -0
cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/vision_encoder.py +46 -0
cosmos_transfer1/auxiliary/human_keypoint/human_keypoint.py +155 -0
cosmos_transfer1/auxiliary/robot_augmentation/README.md +112 -0
cosmos_transfer1/auxiliary/robot_augmentation/spatial_temporal_weight.py +577 -0
cosmos_transfer1/auxiliary/sam2/sam2_model.py +392 -0
cosmos_transfer1/auxiliary/sam2/sam2_pipeline.py +126 -0
cosmos_transfer1/auxiliary/sam2/sam2_utils.py +168 -0
cosmos_transfer1/auxiliary/tokenizer/inference/__init__.py +14 -0
cosmos_transfer1/auxiliary/tokenizer/inference/image_cli.py +188 -0
cosmos_transfer1/auxiliary/tokenizer/inference/image_lib.py +124 -0
cosmos_transfer1/auxiliary/tokenizer/inference/utils.py +402 -0
cosmos_transfer1/auxiliary/tokenizer/inference/video_cli.py +210 -0
cosmos_transfer1/auxiliary/tokenizer/inference/video_lib.py +146 -0
cosmos_transfer1/auxiliary/tokenizer/modules/__init__.py +61 -0
cosmos_transfer1/auxiliary/tokenizer/modules/distributions.py +42 -0
cosmos_transfer1/auxiliary/tokenizer/modules/layers2d.py +329 -0
cosmos_transfer1/auxiliary/tokenizer/modules/layers3d.py +969 -0
cosmos_transfer1/auxiliary/tokenizer/modules/patching.py +311 -0
cosmos_transfer1/auxiliary/tokenizer/modules/quantizers.py +513 -0
cosmos_transfer1/auxiliary/tokenizer/modules/utils.py +116 -0
cosmos_transfer1/auxiliary/tokenizer/networks/__init__.py +39 -0
cosmos_transfer1/auxiliary/tokenizer/networks/configs.py +147 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,203 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+#poetry.toml
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+#pdm.lock
+#pdm.toml
+.pdm-python
+.pdm-build/
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+#pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+# Streamlit
+.streamlit/secrets.toml

app.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import os
 from typing import List, Tuple
 import gradio as gr
@@ -33,14 +35,16 @@ download_checkpoints(hf_token="", output_dir=CHECKPOINTS_PATH, model="7b_av")
 from test_environment import main as check_environment
 from test_environment import setup_environment
-setup_environment()
 # setup env
 os.environ["CUDA_HOME"] = "/usr/local/cuda"
 os.environ["LD_LIBRARY_PATH"] = "$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH"
 os.environ["PATH"] = "$CUDA_HOME/bin:/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:$PATH"
-check_environment()
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Workaround to suppress MP warning
@@ -279,6 +283,9 @@ def generate_video(
     else:
         actual_seed = seed
     args, control_inputs = parse_arguments(
         controlnet_specs_in={
             "hdmap": {"control_weight": 0.3, "input_control": hdmap_video_input},
@@ -294,6 +301,8 @@ def generate_video(
         seed=seed,
     )
     videos, prompts = inference(args, control_inputs)
     video = videos[0]
     return video, video, actual_seed

 import os
+import sys
+import time
 from typing import List, Tuple
 import gradio as gr
 from test_environment import main as check_environment
 from test_environment import setup_environment
 # setup env
 os.environ["CUDA_HOME"] = "/usr/local/cuda"
 os.environ["LD_LIBRARY_PATH"] = "$CUDA_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH"
 os.environ["PATH"] = "$CUDA_HOME/bin:/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:$PATH"
+if not check_environment():
+    setup_environment()
+    if not check_environment():
+        sys.exit(1)
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Workaround to suppress MP warning
     else:
         actual_seed = seed
+    log.info(f"actual_seed: {actual_seed}")
+    start_time = time.time()
     args, control_inputs = parse_arguments(
         controlnet_specs_in={
             "hdmap": {"control_weight": 0.3, "input_control": hdmap_video_input},
         seed=seed,
     )
     videos, prompts = inference(args, control_inputs)
+    end_time = time.time()
+    log.info(f"Time taken: {end_time - start_time} s")
     video = videos[0]
     return video, video, actual_seed

cosmos_transfer1/auxiliary/depth_anything/inference/__init__.py ADDED Viewed

File without changes

cosmos_transfer1/auxiliary/depth_anything/inference/depth_anything_pipeline.py ADDED Viewed

	@@ -0,0 +1,55 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from PIL import Image
+from cosmos_transfer1.auxiliary.depth_anything.model.depth_anything import DepthAnythingModel
+def parse_args():
+    """Build the depth-estimation command line and return the parsed arguments."""
+    cli = argparse.ArgumentParser(description="Depth Estimation using Depth Anything V2")
+    # Both paths are mandatory; only the processing mode has a default.
+    cli.add_argument("--input", type=str, required=True, help="Path to input image or video file")
+    cli.add_argument("--output", type=str, required=True, help="Path to save the output image or video")
+    supported_modes = ["image", "video"]
+    cli.add_argument(
+        "--mode",
+        type=str,
+        choices=supported_modes,
+        default="image",
+        help="Processing mode: 'image' for a single image, 'video' for a video file",
+    )
+    parsed = cli.parse_args()
+    return parsed
+def main():
+    """Run depth estimation on one image or one video, as selected by ``--mode``."""
+    args = parse_args()
+    model = DepthAnythingModel()
+    if args.mode == "image":
+        # Load the input image and predict its depth
+        image = Image.open(args.input).convert("RGB")
+        depth_image = model.predict_depth(image)
+        depth_image.save(args.output)
+        print(f"Depth image saved to {args.output}")
+    elif args.mode == "video":
+        # Process the video and save the output.
+        # DepthAnythingModel exposes video processing through __call__; it has no
+        # predict_depth_video method, so calling that name raised AttributeError.
+        out_path = model(args.input, args.output)
+        # The model returns a falsy value when the input video cannot be opened.
+        if out_path:
+            print(f"Depth video saved to {out_path}")
+if __name__ == "__main__":
+    main()

cosmos_transfer1/auxiliary/depth_anything/model/__init__.py ADDED Viewed

File without changes

cosmos_transfer1/auxiliary/depth_anything/model/depth_anything.py ADDED Viewed

	@@ -0,0 +1,151 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import cv2
+import imageio
+import numpy as np
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor, AutoModelForDepthEstimation
+from cosmos_transfer1.checkpoints import DEPTH_ANYTHING_MODEL_CHECKPOINT
+from cosmos_transfer1.utils import log
+class DepthAnythingModel:
+    """
+    Depth estimation wrapper around the checkpoint named by DEPTH_ANYTHING_MODEL_CHECKPOINT.
+    Single images go through predict_depth(); video files go through __call__().
+    """
+    def __init__(self):
+        """
+        Initialize the Depth Anything model and its image processor.
+        The model is placed on CUDA when available, otherwise on the CPU.
+        """
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Load image processor and model with half precision
+        print(f"Loading Depth Anything model - {DEPTH_ANYTHING_MODEL_CHECKPOINT}...")
+        self.image_processor = AutoImageProcessor.from_pretrained(
+            DEPTH_ANYTHING_MODEL_CHECKPOINT,
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+        )
+        self.model = AutoModelForDepthEstimation.from_pretrained(
+            DEPTH_ANYTHING_MODEL_CHECKPOINT,
+            torch_dtype=torch.float16,
+            trust_remote_code=True,
+        ).to(self.device)
+    def predict_depth(self, image: Image.Image) -> Image.Image:
+        """
+        Process a single PIL image and return a depth map as a uint16 PIL Image.
+        """
+        # Prepare inputs for the model
+        inputs = self.image_processor(images=image, return_tensors="pt")
+        # Move all tensors to the proper device with half precision
+        inputs = {k: v.to(self.device, dtype=torch.float16) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            predicted_depth = outputs.predicted_depth
+        # Interpolate the predicted depth to the original image size
+        prediction = torch.nn.functional.interpolate(
+            predicted_depth.unsqueeze(1),
+            size=image.size[::-1],  # PIL image size is (width, height), interpolate expects (height, width)
+            mode="bicubic",
+            align_corners=False,
+        )
+        # Convert the output tensor to a numpy array and save as a depth image
+        output = prediction.squeeze().cpu().numpy()
+        depth_image = DepthAnythingModel.save_depth(output)
+        return depth_image
+    def __call__(self, input_video: str, output_video: str = "depth.mp4") -> str:
+        """
+        Process a video file frame-by-frame to produce a depth-estimated video.
+        The output video is saved as an MP4 file.
+        Depth values are normalized to 0..255 over the whole clip (not per frame) before writing.
+        Returns the output path, or None when the input video cannot be opened.
+        Raises ValueError when the video opens but yields no frames.
+        """
+        log.info(f"Processing video: {input_video} to generate depth video: {output_video}")
+        assert os.path.exists(input_video)
+        cap = cv2.VideoCapture(input_video)
+        if not cap.isOpened():
+            print("Error: Cannot open video file.")
+            return None
+        depths = []
+        try:
+            # Retrieve video properties
+            frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+            frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+            fps = cap.get(cv2.CAP_PROP_FPS)
+            while True:
+                ret, frame = cap.read()
+                if not ret:
+                    break
+                # Convert frame from BGR to RGB and then to PIL Image
+                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+                inputs = self.image_processor(images=image, return_tensors="pt")
+                inputs = {k: v.to(self.device, dtype=torch.float16) for k, v in inputs.items()}
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+                    predicted_depth = outputs.predicted_depth
+                # For video processing, take the first output and interpolate to original size
+                prediction = torch.nn.functional.interpolate(
+                    predicted_depth[0].unsqueeze(0).unsqueeze(0),
+                    size=(frame_height, frame_width),
+                    mode="bicubic",
+                    align_corners=False,
+                )
+                depths.append(prediction.squeeze().cpu().numpy())
+        finally:
+            # Release the capture handle even when inference fails part-way through
+            cap.release()
+        if not depths:
+            # np.stack([]) would raise an opaque ValueError; say what actually went wrong
+            raise ValueError(f"No frames could be read from video: {input_video}")
+        depths = np.stack(depths)
+        depths_normed = (depths - depths.min()) / (depths.max() - depths.min() + 1e-8) * 255.0
+        depths_normed = depths_normed.astype(np.uint8)
+        # dirname is "" for a bare filename such as the default "depth.mp4", and
+        # os.makedirs("") raises, so only create the directory when there is one
+        output_dir = os.path.dirname(output_video)
+        if output_dir:
+            os.makedirs(output_dir, exist_ok=True)
+        self.write_video(depths_normed, output_video, fps=fps)
+        return output_video
+    @staticmethod
+    def save_depth(output: np.ndarray) -> Image.Image:
+        """
+        Convert the raw depth output (float values) into a uint16 PIL Image.
+        Values are rescaled to span 0..65535; a constant input maps to all zeros.
+        """
+        depth_min = output.min()
+        depth_max = output.max()
+        max_val = (2**16) - 1  # Maximum value for uint16
+        if depth_max - depth_min > np.finfo("float").eps:
+            out_array = max_val * (output - depth_min) / (depth_max - depth_min)
+        else:
+            # Flat depth map: avoid dividing by (near) zero
+            out_array = np.zeros_like(output)
+        formatted = out_array.astype("uint16")
+        depth_image = Image.fromarray(formatted, mode="I;16")
+        return depth_image
+    @staticmethod
+    def write_video(frames, output_path, fps=30):
+        """
+        Write the given frames to output_path with imageio at the given frame rate.
+        2-D (single channel) frames are replicated into three channels before writing.
+        """
+        with imageio.get_writer(output_path, fps=fps, macro_block_size=8) as writer:
+            for frame in frames:
+                if len(frame.shape) == 2:  # single channel
+                    frame = frame[:, :, None].repeat(3, axis=2)
+                writer.append_data(frame)

cosmos_transfer1/auxiliary/guardrail/README.md ADDED Viewed

	@@ -0,0 +1,17 @@

+# Cosmos Guardrail
+This page outlines a set of tools to ensure content safety in Cosmos. For implementation details, please consult the [Cosmos paper](https://research.nvidia.com/publication/2025-01_cosmos-world-foundation-model-platform-physical-ai).
+## Overview
+Our guardrail system consists of two stages: pre-Guard and post-Guard.
+Cosmos pre-Guard models are applied to text input, including input prompts and upsampled prompts.
+* Blocklist: a keyword list checker for detecting harmful keywords
+* Llama Guard 3: an LLM-based approach for blocking harmful prompts
+Cosmos post-Guard models are applied to video frames generated by Cosmos models.
+* Video Content Safety Filter: a classifier trained to distinguish between safe and unsafe video frames
+* Face Blur Filter: a face detection and blurring module

cosmos_transfer1/auxiliary/guardrail/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos_transfer1/auxiliary/guardrail/aegis/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos_transfer1/auxiliary/guardrail/aegis/aegis.py ADDED Viewed

	@@ -0,0 +1,135 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import torch
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from cosmos_transfer1.auxiliary.guardrail.aegis.categories import UNSAFE_CATEGORIES
+from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner
+from cosmos_transfer1.checkpoints import AEGIS_MODEL_CHECKPOINT, LLAMA_GUARD_MODEL_CHECKPOINT
+from cosmos_transfer1.utils import log, misc
+SAFE = misc.Color.green("SAFE")
+UNSAFE = misc.Color.red("UNSAFE")
+class Aegis(ContentSafetyGuardrail):
+    """Prompt guardrail that asks a LlamaGuard-7b model with the Aegis adapter for a safe/unsafe verdict."""
+    def __init__(
+        self,
+        checkpoint_dir: str,
+        device="cuda" if torch.cuda.is_available() else "cpu",
+    ) -> None:
+        """Load the base model, its tokenizer and the Aegis adapter, then put the model in eval mode.
+        Args:
+            checkpoint_dir: Root folder; each model id is joined onto it to form that model's cache_dir.
+            device: Device the model is moved to. The default is evaluated once, when the class is defined.
+        """
+        self.checkpoint_dir = checkpoint_dir
+        self.device = device
+        self.dtype = torch.bfloat16
+        base_model_id = "meta-llama/LlamaGuard-7b"
+        aegis_adapter = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
+        base_model_dir = os.path.join(self.checkpoint_dir, base_model_id)
+        aegis_adapter_dir = os.path.join(self.checkpoint_dir, aegis_adapter)
+        # Models are requested by id; the per-model folders are passed only as cache_dir.
+        base_model = AutoModelForCausalLM.from_pretrained(base_model_id, cache_dir=base_model_dir)
+        self.tokenizer = AutoTokenizer.from_pretrained(base_model_id, cache_dir=base_model_dir)
+        self.model = PeftModel.from_pretrained(base_model, aegis_adapter, cache_dir=aegis_adapter_dir)
+        self.model.to(self.device, dtype=self.dtype).eval()
+    def get_moderation_prompt(self, user_prompt: str) -> str:
+        """Create the moderation prompt for the Aegis model."""
+        # The "instruction" text of every entry in UNSAFE_CATEGORIES is concatenated into the policy section.
+        full_prompt = f"""[INST] Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.
+        <BEGIN UNSAFE CONTENT CATEGORIES>
+        {"".join([f"{category['instruction']}" for category in UNSAFE_CATEGORIES])}
+        <END UNSAFE CONTENT CATEGORIES>
+        <BEGIN CONVERSATION>
+        """
+        full_prompt += f"User: {user_prompt}\n\n"
+        full_prompt += """<END CONVERSATION>
+        Provide your safety assessment for User in the above conversation:
+        - First line must read 'safe' or 'unsafe'.
+        - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""
+        return full_prompt
+    def get_aegis_block_message(self, moderation_output: str) -> str:
+        """Extract the blocked category and reason from the Aegis model output."""
+        # Generic fallback, returned whenever the output cannot be mapped to a known category.
+        block_msg = "Prompt blocked by Aegis."
+        try:
+            # Extract blocked category and reason
+            if len(moderation_output.split("\n")) >= 2:
+                moderation_output_list = moderation_output.split("\n")
+                # The second line is parsed as a single code such as "O3": leading "O"s are stripped and the
+                # number is shifted to a 0-based index. Anything else (e.g. several comma-separated codes)
+                # makes int() raise, which is caught below and leaves the generic message in place.
+                blocked_category = int(moderation_output_list[1].lstrip("O")) - 1
+                if 0 <= blocked_category < len(UNSAFE_CATEGORIES):
+                    blocked_reason = UNSAFE_CATEGORIES[blocked_category]["blocked_reason"]
+                    blocked_category_name = UNSAFE_CATEGORIES[blocked_category]["category"]
+                    block_msg = f"{blocked_category_name}: {blocked_reason}"
+        except Exception as e:
+            log.warning(f"Unable to extract blocked category and reason from Aegis output: {e}")
+        return block_msg
+    def filter_aegis_output(self, prompt: str) -> tuple[bool, str]:
+        """Filter the Aegis model output and return the safety status and message."""
+        full_prompt = self.get_moderation_prompt(prompt)
+        inputs = self.tokenizer([full_prompt], add_special_tokens=False, return_tensors="pt").to(self.device)
+        output = self.model.generate(**inputs, max_new_tokens=100, pad_token_id=self.tokenizer.eos_token_id)
+        # Decode only the tokens generated after the prompt.
+        prompt_len = inputs["input_ids"].shape[-1]
+        moderation_output = self.tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
+        # Case-insensitive substring check on the generated text.
+        if "unsafe" in moderation_output.lower():
+            block_msg = self.get_aegis_block_message(moderation_output)
+            return False, block_msg
+        else:
+            return True, ""
+    def is_safe(self, prompt: str) -> tuple[bool, str]:
+        """Check if the input prompt is safe according to the Aegis model."""
+        try:
+            return self.filter_aegis_output(prompt)
+        except Exception as e:
+            log.error(f"Unexpected error occurred when running Aegis guardrail: {e}")
+            # NOTE(review): any failure reports the prompt as safe (fail-open) - confirm this is intended.
+            return True, "Unexpected error occurred when running Aegis guardrail."
+def parse_args():
+    """Parse the command line for the standalone Aegis safety check.
+    Returns:
+        Namespace with ``prompt`` and ``checkpoint_dir``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
+    # The previous default referenced DEFAULT_CHECKPOINT_DIR, a name this module never
+    # defines or imports, so building the parser raised NameError. The intended default
+    # is not known here, so the folder must be given explicitly.
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        required=True,
+        help="Path to the Aegis checkpoint folder",
+    )
+    return parser.parse_args()
+def main(args):
+    """Run the Aegis guardrail on ``args.prompt`` and log the verdict (and the block message if unsafe)."""
+    guardrail = Aegis(checkpoint_dir=args.checkpoint_dir)
+    safety_runner = GuardrailRunner(safety_models=[guardrail])
+    with misc.timer("aegis safety check"):
+        safety, message = safety_runner.run_safety_check(args.prompt)
+    verdict = "SAFE" if safety else "UNSAFE"
+    log.info(f"Input is: {verdict}")
+    # The message is only logged for blocked prompts.
+    if not safety:
+        log.info(f"Message: {message}")
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

cosmos_transfer1/auxiliary/guardrail/aegis/categories.py ADDED Viewed

	@@ -0,0 +1,192 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+VIOLENCE = {
+    "category": "violence",
+    "blocked_reason": """This prompt is blocked due to the presence of violent content.""",
+    "instruction": """O1: Violence.
+Should not
+- Help people plan or engage in violence.
+- Encourage people to engage in violence.
+- Contain unpleasant vivid violent imagery or violent description that arouses discomfort in the reader.
+Can
+- Provide information on violence.
+- Discuss the topics of violence, and can discuss historical events involving violence.
+""",
+}
+SEXUAL = {
+    "category": "sexual",
+    "blocked_reason": """This prompt is blocked due to the presence of sexual content.""",
+    "instruction": """O2: Sexual.
+Should not
+- Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
+- Engage or encourage sexual activity that is considered illegal.
+Can
+- Discuss topics related to sexuality and sexual education.
+- Provide general information regarding sexual activities.
+- Provide guidance on topics related to sexual health.
+""",
+}
+CRIMINAL_PLANNING = {
+    "category": "criminal_planning",
+    "blocked_reason": """This prompt is blocked due to the presence of criminal planning/confessions content.""",
+    "instruction": """O3: Criminal Planning/Confessions.
+Should not
+- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
+- Encourage people to engage in criminal activities or condone criminal behavior.
+- Have a confession of crime that was committed or will be committed in future.
+Can
+- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.
+- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.
+""",
+}
+WEAPONS = {
+    "category": "weapons",
+    "blocked_reason": """This prompt is blocked due to the presence of weapons content.""",
+    "instruction": """O4: Guns and Illegal Weapons.
+Should not
+- Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
+- Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
+- Assist or encourage people to execute or own other illegal weapons for example biological agents, or chemical weapons, including instructions on how to create or use them.
+Can
+- Discuss firearms and the arguments for and against firearm ownership.
+""",
+}
+SUBSTANCE_ABUSE = {
+    "category": "substance_abuse",
+    "blocked_reason": """This prompt is blocked due to the presence of substance abuse content.""",
+    "instruction": """O5: Controlled/Regulated Substances.
+Should not
+- Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
+- Assist or encourage people to create such substances when it is illegal to do so.
+Can
+- Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are).
+- Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation.
+""",
+}
+# Category entry for suicide and self-harm content (labelled "O6" in the instruction text).
+# Besides "Should not" and "Can", this instruction also carries a "Should" section.
+SUICIDE = {
+    "category": "suicide",
+    "blocked_reason": "This prompt is blocked due to the presence of suicide and self harm content.",
+    "instruction": """O6: Suicide and Self Harm.
+Should not
+- Encourage people to harm themselves.
+- Romanticize or condone self-harm or suicide.
+- Provide information regarding the methods of suicide or self-harm.
+- Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
+- Contain confessions or intent of self harm
+Can
+- Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions).
+Should
+- Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves.
+""",
+}
+# Category entry for sexual content involving minors (labelled "O7" in the instruction text).
+# Same three keys as the other category dicts: "category", "blocked_reason", "instruction".
+CSAM = {
+    "category": "csam",
+    "blocked_reason": "This prompt contains material related to Child Sexual Abuse",
+    "instruction": """O7: Sexual (minor).
+Should not
+- Assist, engage or encourage intention to engage in sexual activity with an underage/minor.
+- Discuss or plan sexual acts, sharing of media of sexual nature, personal information or intention to meet in person for sexual activity with a minor.
+- Should not include discussion where private information is shared with underage minors for purposes of acting on them and taking actions that are sexual in nature including meeting or sharing media.
+- Should not discuss child abuse or engage, encourage, assist in sexual grooming/trafficking of children.
+Can
+- Provide information about sex education for use in schools.
+""",
+}
+# Category entry for hate / identity-hate content (labelled "O8" in the instruction text).
+# Same three keys as the other category dicts: "category", "blocked_reason", "instruction".
+HATRED = {
+    "category": "hatred",
+    "blocked_reason": "This prompt is blocked due to the presence of hate speech content.",
+    "instruction": """O8: Hate/Identity Hate.
+Should not
+- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their
+race, color, ethnicity, religion, national origin, sexual orientation, gender, gender identity, or disability or caste.
+- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
+- Should not discuss prejudice towards a group classified as protected according to the law of Germany, UK, USA or any other country.
+- Should not contain slurs.
+- Should not contain text that is distasteful, offensive, or hateful that causes others to feel grief, anger, or fear.
+Can
+- Provide information on discrimination.
+- Discuss the topics of discrimination, and can discuss historical events involving discrimination.
+- Discuss neutral statements referring to group identity and  contextualized hate speech (that talks about hate but does not incite hate).
+""",
+}
+# Category entry for harassment content (labelled "O10" in the instruction text).
+# NOTE(review): the preceding entry is labelled O8 and no O9 entry is visible in this part
+# of the file — confirm the gap in numbering is intentional.
+HARASSMENT = {
+    "category": "harassment",
+    "blocked_reason": "This prompt is blocked due to the presence of harassment content.",
+    "instruction": """O10: Harassment.
+Should not
+- Have content that may be used to torment or annoy individuals in real life, or make harassment more likely to occur.
+- Engage in harassment, misconduct and unlawful discrimination
+- Single out an individual or marginalization and/or retaliation based on protected characteristics like Race, Color, Gender, Sex,
+Sexual orientation, Gender identity, and gender expression, National origin, Ethnicity, Disability (including being regarded as disabled),
+Religion, Age (40+), Pregnancy (including pregnancy, childbirth or related medical conditions), Genetic information, Military or veteran status, Citizenship status, Political activity, or affiliation,
+- Taking or requesting statutorily protected leave Body characteristics.
+- Promise to give a benefit, or a threat to retaliate or take an adverse action based on a response to harassment and misconduct.
+- Bullying.
+- Being derogatory and humiliating toward an individual but not necessarily protected characteristics under law.
+- Making rude or insulting comments, demeaning, and objectifying terms toward an individual.
+Can
+- Discuss negative impacts of harassment and bullying based on protected characteristics like race, color, gender, sex, nationality, within a contextualized setting.
+- Provide information on protected characteristics or refers, cites material that describes what protected characteristics are.
+- Provide information or refer or cite material that informs what harassment is.
+""",
+}
+# Category entry for threatening content (labelled "O11" in the instruction text).
+# Same three keys as the other category dicts: "category", "blocked_reason", "instruction".
+THREAT = {
+    "category": "threat",
+    "blocked_reason": "This prompt is blocked due to the presence of threat content.",
+    "instruction": """O11: Threat.
+Should not
+- Discuss a communication of intent to inflict harm or loss on another person.
+- Should not contain discussion of intention of intimidation which  is a tactic used between conflicting parties to make the other timid or psychologically insecure for coercion or control.
+- Should not contain intention of knowingly putting another person in fear of bodily injury.
+- Should not assist or plan injury, physical or mental damage, act or instance of injury, or material  or any kind of loss to a person.
+- Should not contain threats that are forbidden by law as intent to obtain a monetary advantage or to compel a person to act against their will.
+Can
+- Provide information about threat is and the meaning of threat.
+""",
+}
+# Category entry for profanity (labelled "O12" in the instruction text).
+# Unlike the other visible entries, this instruction has only a "Should not" section.
+PROFANITY = {
+    "category": "profanity",
+    "blocked_reason": "This prompt is blocked due to the presence of profanity content.",
+    "instruction": """O12: Profanity.
+Should not
+- Contain swear words, curse words, or other obscene or profane language
+- This includes offensive words used without any intention to act on them or without any explicit targeted recipient
+""",
+}
+# All category dicts collected into one list. The entries visible in this part of the
+# file (WEAPONS .. PROFANITY) appear in ascending order of their "O<n>" labels.
+# NOTE(review): VIOLENCE, SEXUAL and CRIMINAL_PLANNING are defined earlier in the file —
+# presumably labelled O1..O3; confirm before relying on list position.
+UNSAFE_CATEGORIES = [
+    VIOLENCE,
+    SEXUAL,
+    CRIMINAL_PLANNING,
+    WEAPONS,
+    SUBSTANCE_ABUSE,
+    SUICIDE,
+    CSAM,
+    HATRED,
+    HARASSMENT,
+    THREAT,
+    PROFANITY,
+]

cosmos_transfer1/auxiliary/guardrail/blocklist/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos_transfer1/auxiliary/guardrail/blocklist/blocklist.py ADDED Viewed

	@@ -0,0 +1,216 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import re
+import string
+from difflib import SequenceMatcher
+import nltk
+from better_profanity import profanity
+from cosmos_transfer1.auxiliary.guardrail.blocklist.utils import read_keyword_list_from_dir, to_ascii
+from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner
+from cosmos_transfer1.utils import log, misc
+# Replacement character passed to better-profanity's censor(); Blocklist.censor_prompt
+# treats its presence in the censored text as the signal that the prompt is blocked.
+# NOTE(review): misc.Color.red presumably wraps "*" in terminal color codes — confirm in misc.
+CENSOR = misc.Color.red("*")
+class Blocklist(ContentSafetyGuardrail):
+    def __init__(
+        self,
+        checkpoint_dir: str,
+        guardrail_partial_match_min_chars: int = 6,
+        guardrail_partial_match_letter_count: float = 0.4,
+    ) -> None:
+        self.checkpoint_dir = os.path.join(checkpoint_dir, "nvidia/Cosmos-Guardrail1/blocklist")
+        nltk.data.path.append(os.path.join(self.checkpoint_dir, "nltk_data"))
+        self.lemmatizer = nltk.WordNetLemmatizer()
+        self.profanity = profanity
+        self.guardrail_partial_match_min_chars = guardrail_partial_match_min_chars
+        self.guardrail_partial_match_letter_count = guardrail_partial_match_letter_count
+        # Load blocklist and whitelist keywords
+        self.blocklist_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "custom"))
+        self.whitelist_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "whitelist"))
+        self.exact_match_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "exact_match"))
+        self.profanity.load_censor_words(custom_words=self.blocklist_words, whitelist_words=self.whitelist_words)
+        log.debug(f"Loaded {len(self.blocklist_words)} words/phrases from blocklist")
+        log.debug(f"Whitelisted {len(self.whitelist_words)} words/phrases from whitelist")
+        log.debug(f"Loaded {len(self.exact_match_words)} exact match words/phrases from blocklist")
+    def uncensor_whitelist(self, input_prompt: str, censored_prompt: str) -> str:
+        """Explicitly uncensor words that are in the whitelist."""
+        input_words = input_prompt.split()
+        censored_words = censored_prompt.split()
+        whitelist_words = set(self.whitelist_words)
+        for i, token in enumerate(input_words):
+            if token.strip(string.punctuation).lower() in whitelist_words:
+                censored_words[i] = token
+        censored_prompt = " ".join(censored_words)
+        return censored_prompt
+    def censor_prompt(self, input_prompt: str) -> tuple[bool, str]:
+        """Censor the prompt using the blocklist with better-profanity fuzzy matching.
+        Args:
+            input_prompt: input prompt to censor
+        Returns:
+            bool: True if the prompt is blocked, False otherwise
+            str: A message indicating why the prompt was blocked
+        """
+        censored_prompt = self.profanity.censor(input_prompt, censor_char=CENSOR)
+        # Uncensor whitelisted words that were censored from blocklist fuzzy matching
+        censored_prompt = self.uncensor_whitelist(input_prompt, censored_prompt)
+        if CENSOR in censored_prompt:
+            return True, f"Prompt blocked by censorship: Censored Prompt: {censored_prompt}"
+        return False, ""
+    @staticmethod
+    def check_partial_match(
+        normalized_prompt: str, normalized_word: str, guardrail_partial_match_letter_count: float
+    ) -> tuple[bool, str]:
+        """
+        Check robustly if normalized word and the matching target have a difference of up to guardrail_partial_match_letter_count characters.
+        Args:
+            normalized_prompt: a string with many words
+            normalized_word: a string with one or multiple words, its length is smaller than normalized_prompt
+            guardrail_partial_match_letter_count: maximum allowed difference in characters (float to allow partial characters)
+        Returns:
+            bool: True if a match is found, False otherwise
+            str: A message indicating why the prompt was blocked
+        """
+        prompt_words = normalized_prompt.split()
+        word_length = len(normalized_word.split())
+        max_similarity_ratio = (len(normalized_word) - float(guardrail_partial_match_letter_count)) / float(
+            len(normalized_word)
+        )
+        for i in range(len(prompt_words) - word_length + 1):
+            # Extract a substring from the prompt with the same number of words as the normalized_word
+            substring = " ".join(prompt_words[i : i + word_length])
+            similarity_ratio = SequenceMatcher(None, substring, normalized_word).ratio()
+            if similarity_ratio >= max_similarity_ratio:
+                return (
+                    True,
+                    f"Prompt blocked by partial match blocklist: Prompt: {normalized_prompt}, Partial Match Word: {normalized_word}",
+                )
+        return False, ""
+    @staticmethod
+    def check_against_whole_word_blocklist(
+        prompt: str,
+        blocklist: list[str],
+        guardrail_partial_match_min_chars: int = 6,
+        guardrail_partial_match_letter_count: float = 0.4,
+    ) -> bool:
+        """
+        Check if the prompt contains any whole words from the blocklist.
+        The match is case insensitive and robust to multiple spaces between words.
+        Args:
+            prompt: input prompt to check
+            blocklist: list of words to check against
+            guardrail_partial_match_min_chars: minimum number of characters in a word to check for partial match
+            guardrail_partial_match_letter_count: maximum allowed difference in characters for partial match
+        Returns:
+            bool: True if a match is found, False otherwise
+            str: A message indicating why the prompt was blocked
+        """
+        # Normalize spaces and convert to lowercase
+        normalized_prompt = re.sub(r"\s+", " ", prompt).strip().lower()
+        for word in blocklist:
+            # Normalize spaces and convert to lowercase for each blocklist word
+            normalized_word = re.sub(r"\s+", " ", word).strip().lower()
+            # Use word boundaries to ensure whole word match
+            if re.search(r"\b" + re.escape(normalized_word) + r"\b", normalized_prompt):
+                return True, f"Prompt blocked by exact match blocklist: Prompt: {prompt}, Exact Match Word: {word}"
+            # Check for partial match if the word is long enough
+            if len(normalized_word) >= guardrail_partial_match_min_chars:
+                match, message = Blocklist.check_partial_match(
+                    normalized_prompt, normalized_word, guardrail_partial_match_letter_count
+                )
+                if match:
+                    return True, message
+        return False, ""
+    def is_safe(self, input_prompt: str = "") -> tuple[bool, str]:
+        """Check if the input prompt is safe using the blocklist."""
+        # Check if the input is empty
+        if not input_prompt:
+            return False, "Input is empty"
+        input_prompt = to_ascii(input_prompt)
+        # Check full sentence for censored words
+        censored, message = self.censor_prompt(input_prompt)
+        if censored:
+            return False, message
+        # Check lemmatized words for censored words
+        tokens = nltk.word_tokenize(input_prompt)
+        lemmas = [self.lemmatizer.lemmatize(token) for token in tokens]
+        lemmatized_prompt = " ".join(lemmas)
+        censored, message = self.censor_prompt(lemmatized_prompt)
+        if censored:
+            return False, message
+        # Check for exact match blocklist words
+        censored, message = self.check_against_whole_word_blocklist(
+            input_prompt,
+            self.exact_match_words,
+            self.guardrail_partial_match_min_chars,
+            self.guardrail_partial_match_letter_count,
+        )
+        if censored:
+            return False, message
+        # If all these checks pass, the input is safe
+        return True, "Input is safe"
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        help="Path to the Blocklist checkpoint folder",
+    )
+    return parser.parse_args()
+def main(args):
+    blocklist = Blocklist(checkpoint_dir=args.checkpoint_dir)
+    runner = GuardrailRunner(safety_models=[blocklist])
+    with misc.timer("blocklist safety check"):
+        safety, message = runner.run_safety_check(args.prompt)
+    log.info(f"Input is: {'SAFE' if safety else 'UNSAFE'}")
+    log.info(f"Message: {message}") if not safety else None
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

cosmos_transfer1/auxiliary/guardrail/blocklist/utils.py ADDED Viewed

	@@ -0,0 +1,45 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import re
+from cosmos_transfer1.utils import log
+def read_keyword_list_from_dir(folder_path: str) -> list[str]:
+    """Read keyword list from all files in a folder."""
+    output_list = []
+    file_list = []
+    # Get list of files in the folder
+    for file in os.listdir(folder_path):
+        if os.path.isfile(os.path.join(folder_path, file)):
+            file_list.append(file)
+    # Process each file
+    for file in file_list:
+        file_path = os.path.join(folder_path, file)
+        try:
+            with open(file_path, "r") as f:
+                output_list.extend([line.strip() for line in f.readlines()])
+        except Exception as e:
+            log.error(f"Error reading file {file}: {str(e)}")
+    return output_list
+def to_ascii(prompt: str) -> str:
+    """Convert prompt to ASCII."""
+    return re.sub(r"[^\x00-\x7F]+", " ", prompt)

cosmos_transfer1/auxiliary/guardrail/common/__init__.py ADDED Viewed

File without changes

cosmos_transfer1/auxiliary/guardrail/common/core.py ADDED Viewed

	@@ -0,0 +1,71 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any, Tuple
+import numpy as np
+from cosmos_transfer1.utils import log
+class ContentSafetyGuardrail:
+    def is_safe(self, **kwargs) -> Tuple[bool, str]:
+        raise NotImplementedError("Child classes must implement the is_safe method")
+class PostprocessingGuardrail:
+    def postprocess(self, frames: np.ndarray) -> np.ndarray:
+        raise NotImplementedError("Child classes must implement the postprocess method")
+class GuardrailRunner:
+    def __init__(
+        self,
+        safety_models: list[ContentSafetyGuardrail] | None = None,
+        generic_block_msg: str = "",
+        generic_safe_msg: str = "",
+        postprocessors: list[PostprocessingGuardrail] | None = None,
+    ):
+        self.safety_models = safety_models
+        self.generic_block_msg = generic_block_msg
+        self.generic_safe_msg = generic_safe_msg if generic_safe_msg else "Prompt is safe"
+        self.postprocessors = postprocessors
+    def run_safety_check(self, input: Any) -> Tuple[bool, str]:
+        """Run the safety check on the input."""
+        if not self.safety_models:
+            log.warning("No safety models found, returning safe")
+            return True, self.generic_safe_msg
+        for guardrail in self.safety_models:
+            guardrail_name = str(guardrail.__class__.__name__).upper()
+            log.debug(f"Running guardrail: {guardrail_name}")
+            safe, message = guardrail.is_safe(input)
+            if not safe:
+                reasoning = self.generic_block_msg if self.generic_block_msg else f"{guardrail_name}: {message}"
+                return False, reasoning
+        return True, self.generic_safe_msg
+    def postprocess(self, frames: np.ndarray) -> np.ndarray:
+        """Run the postprocessing on the video frames."""
+        if not self.postprocessors:
+            log.warning("No postprocessors found, returning original frames")
+            return frames
+        for guardrail in self.postprocessors:
+            guardrail_name = str(guardrail.__class__.__name__).upper()
+            log.debug(f"Running guardrail: {guardrail_name}")
+            frames = guardrail.postprocess(frames)
+        return frames

cosmos_transfer1/auxiliary/guardrail/common/io_utils.py ADDED Viewed

	@@ -0,0 +1,78 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import glob
+from dataclasses import dataclass
+import imageio
+import numpy as np
+from cosmos_transfer1.utils import log
+@dataclass
+class VideoData:
+    """Frames and metadata of a video, as produced by read_video."""
+    frames: np.ndarray  # Shape: [B, H, W, C]
+    # read_video fills fps and duration from the reader metadata via .get(), so either
+    # is None when the key is missing.
+    # NOTE(review): the reader may report these as floats — confirm the `int` annotations.
+    fps: int
+    duration: int  # in seconds
+def get_video_filepaths(input_dir: str) -> list[str]:
+    """Get a list of filepaths for all videos in the input directory."""
+    paths = glob.glob(f"{input_dir}/**/*.mp4", recursive=True)
+    paths += glob.glob(f"{input_dir}/**/*.avi", recursive=True)
+    paths += glob.glob(f"{input_dir}/**/*.mov", recursive=True)
+    paths = sorted(paths)
+    log.debug(f"Found {len(paths)} videos")
+    return paths
+def read_video(filepath: str) -> VideoData:
+    """Read a video file and extract its frames and metadata."""
+    try:
+        reader = imageio.get_reader(filepath, "ffmpeg")
+    except Exception as e:
+        raise ValueError(f"Failed to read video file: {filepath}") from e
+    # Extract metadata from the video file
+    try:
+        metadata = reader.get_meta_data()
+        fps = metadata.get("fps")
+        duration = metadata.get("duration")
+    except Exception as e:
+        reader.close()
+        raise ValueError(f"Failed to extract metadata from video file: {filepath}") from e
+    # Extract frames from the video file
+    try:
+        frames = np.array([frame for frame in reader])
+    except Exception as e:
+        raise ValueError(f"Failed to extract frames from video file: {filepath}") from e
+    finally:
+        reader.close()
+    return VideoData(frames=frames, fps=fps, duration=duration)
+def save_video(filepath: str, frames: np.ndarray, fps: int) -> None:
+    """Save a video file from a sequence of frames."""
+    try:
+        writer = imageio.get_writer(filepath, fps=fps, macro_block_size=1)
+        for frame in frames:
+            writer.append_data(frame)
+    except Exception as e:
+        raise ValueError(f"Failed to save video file to {filepath}") from e
+    finally:
+        writer.close()

cosmos_transfer1/auxiliary/guardrail/common/presets.py ADDED Viewed

	@@ -0,0 +1,75 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import numpy as np
+from cosmos_transfer1.auxiliary.guardrail.blocklist.blocklist import Blocklist
+from cosmos_transfer1.auxiliary.guardrail.common.core import GuardrailRunner
+from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.face_blur_filter import RetinaFaceFilter
+from cosmos_transfer1.auxiliary.guardrail.llamaGuard3.llamaGuard3 import LlamaGuard3
+from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.video_content_safety_filter import (
+    VideoContentSafetyFilter,
+)
+from cosmos_transfer1.utils import log
+def create_text_guardrail_runner(checkpoint_dir: str) -> GuardrailRunner:
+    """Create the text guardrail runner."""
+    return GuardrailRunner(safety_models=[Blocklist(checkpoint_dir), LlamaGuard3(checkpoint_dir)])
+def create_video_guardrail_runner(checkpoint_dir: str) -> GuardrailRunner:
+    """Create the video guardrail runner."""
+    return GuardrailRunner(
+        safety_models=[VideoContentSafetyFilter(checkpoint_dir)],
+        postprocessors=[RetinaFaceFilter(checkpoint_dir)],
+    )
+def run_text_guardrail(prompt: str, guardrail_runner: GuardrailRunner) -> bool:
+    """Run the text guardrail on the prompt, checking for content safety.
+    Args:
+        prompt: The text prompt.
+        guardrail_runner: The text guardrail runner.
+    Returns:
+        bool: Whether the prompt is safe.
+    """
+    is_safe, message = guardrail_runner.run_safety_check(prompt)
+    if not is_safe:
+        log.critical(f"GUARDRAIL BLOCKED: {message}")
+    return is_safe
+def run_video_guardrail(frames: np.ndarray, guardrail_runner: GuardrailRunner) -> np.ndarray | None:
+    """Run the video guardrail on the frames, checking for content safety and applying face blur.
+    Args:
+        frames: The frames of the generated video.
+        guardrail_runner: The video guardrail runner.
+    Returns:
+        The processed frames if safe, otherwise None.
+    """
+    is_safe, message = guardrail_runner.run_safety_check(frames)
+    if not is_safe:
+        log.critical(f"GUARDRAIL BLOCKED: {message}")
+        return None
+    frames = guardrail_runner.postprocess(frames)
+    return frames

cosmos_transfer1/auxiliary/guardrail/face_blur_filter/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos_transfer1/auxiliary/guardrail/face_blur_filter/blur_utils.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import cv2
+import numpy as np
+def pixelate_face(face_img: np.ndarray, blocks: int = 5) -> np.ndarray:
+    """
+    Pixelate a face region by reducing resolution and then upscaling.
+    Args:
+        face_img: Face region to pixelate
+        blocks: Number of blocks to divide the face into (in each dimension)
+    Returns:
+        Pixelated face region
+    """
+    h, w = face_img.shape[:2]
+    # Shrink the image and scale back up to create pixelation effect
+    temp = cv2.resize(face_img, (blocks, blocks), interpolation=cv2.INTER_LINEAR)
+    pixelated = cv2.resize(temp, (w, h), interpolation=cv2.INTER_NEAREST)
+    return pixelated

cosmos_transfer1/auxiliary/guardrail/face_blur_filter/face_blur_filter.py ADDED Viewed

	@@ -0,0 +1,225 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import numpy as np
+import torch
+from retinaface.data import cfg_re50
+from retinaface.layers.functions.prior_box import PriorBox
+from retinaface.models.retinaface import RetinaFace
+from torch.utils.data import DataLoader, TensorDataset
+from tqdm import tqdm
+from cosmos_transfer1.auxiliary.guardrail.common.core import GuardrailRunner, PostprocessingGuardrail
+from cosmos_transfer1.auxiliary.guardrail.common.io_utils import get_video_filepaths, read_video, save_video
+from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.blur_utils import pixelate_face
+from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.retinaface_utils import (
+    decode_batch,
+    filter_detected_boxes,
+    load_model,
+)
+from cosmos_transfer1.utils import log, misc
+# RetinaFace model constants from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
+# NOTE(review): none of these is referenced in the visible part of this file — presumably
+# they are passed to filter_detected_boxes (candidate limits and the NMS overlap threshold);
+# confirm at the call site.
+TOP_K = 5_000
+KEEP_TOP_K = 750
+NMS_THRESHOLD = 0.4
+class RetinaFaceFilter(PostprocessingGuardrail):
+    def __init__(
+        self,
+        checkpoint_dir: str,
+        batch_size: int = 1,
+        confidence_threshold: float = 0.7,
+        device: str = "cuda" if torch.cuda.is_available() else "cpu",
+    ) -> None:
+        """
+        Initialize the RetinaFace model for face detection and blurring.
+        Args:
+            checkpoint_dir: Root checkpoint folder; the weights are loaded from
+                "nvidia/Cosmos-Guardrail1/face_blur_filter/Resnet50_Final.pth" inside it
+            batch_size: Batch size for RetinaFace inference and processing
+            confidence_threshold: Minimum confidence score to consider a face detection
+            device: Device the model is moved to; the default is chosen once, when the
+                method is defined, as "cuda" if available and "cpu" otherwise
+        """
+        self.checkpoint = f"{checkpoint_dir}/nvidia/Cosmos-Guardrail1/face_blur_filter/Resnet50_Final.pth"
+        # NOTE(review): this binds the imported cfg_re50 object itself, not a copy, so the
+        # "pretrain" assignment below also changes it for any other user of that object.
+        self.cfg = cfg_re50
+        self.batch_size = batch_size
+        self.confidence_threshold = confidence_threshold
+        self.device = device
+        self.dtype = torch.float32
+        # Disable loading ResNet pretrained weights
+        # (must be set before RetinaFace is constructed from the config)
+        self.cfg["pretrain"] = False
+        self.net = RetinaFace(cfg=self.cfg, phase="test")
+        # Flag handed to load_model; True only when device is exactly the string "cpu"
+        cpu = self.device == "cpu"
+        # Load from RetinaFace pretrained checkpoint
+        self.net = load_model(self.net, self.checkpoint, cpu)
+        self.net.to(self.device, dtype=self.dtype).eval()
+    def preprocess_frames(self, frames: np.ndarray) -> torch.Tensor:
+        """Preprocess a sequence of frames for face detection.
+        Args:
+            frames: Input frames
+        Returns:
+            Preprocessed frames tensor
+        """
+        with torch.no_grad():
+            frames_tensor = torch.from_numpy(frames).to(self.device, dtype=self.dtype)  # Shape: [T, H, W, C]
+            frames_tensor = frames_tensor.permute(0, 3, 1, 2)  # Shape: [T, C, H, W]
+            frames_tensor = frames_tensor[:, [2, 1, 0], :, :]  # RGB to BGR to match RetinaFace model input
+            means = torch.tensor([104.0, 117.0, 123.0], device=self.device, dtype=self.dtype).view(1, 3, 1, 1)
+            frames_tensor = frames_tensor - means  # Subtract mean BGR values for each channel
+            return frames_tensor
+    def blur_detected_faces(
+        self,
+        frames: np.ndarray,
+        batch_loc: torch.Tensor,
+        batch_conf: torch.Tensor,
+        prior_data: torch.Tensor,
+        scale: torch.Tensor,
+        min_size: tuple[int] = (20, 20),
+    ) -> list[np.ndarray]:
+        """Blur detected faces in a batch of frames using RetinaFace predictions.
+        Args:
+            frames: Input frames
+            batch_loc: Batched location predictions
+            batch_conf: Batched confidence scores
+            prior_data: Prior boxes for the video
+            scale: Scale factor for resizing detections
+            min_size: Minimum size of a detected face region in pixels
+        Returns:
+            Processed frames with pixelated faces
+        """
+        with torch.no_grad():
+            batch_boxes = decode_batch(batch_loc, prior_data, self.cfg["variance"])
+            batch_boxes = batch_boxes * scale
+        blurred_frames = []
+        for i, boxes in enumerate(batch_boxes):
+            boxes = boxes.detach().cpu().numpy()
+            scores = batch_conf[i, :, 1].detach().cpu().numpy()
+            filtered_boxes = filter_detected_boxes(
+                boxes,
+                scores,
+                confidence_threshold=self.confidence_threshold,
+                nms_threshold=NMS_THRESHOLD,
+                top_k=TOP_K,
+                keep_top_k=KEEP_TOP_K,
+            )
+            frame = frames[i]
+            for box in filtered_boxes:
+                x1, y1, x2, y2 = map(int, box)
+                # Ignore bounding boxes smaller than the minimum size
+                if x2 - x1 < min_size[0] or y2 - y1 < min_size[1]:
+                    continue
+                max_h, max_w = frame.shape[:2]
+                face_roi = frame[max(y1, 0) : min(y2, max_h), max(x1, 0) : min(x2, max_w)]
+                blurred_face = pixelate_face(face_roi)
+                frame[max(y1, 0) : min(y2, max_h), max(x1, 0) : min(x2, max_w)] = blurred_face
+            blurred_frames.append(frame)
+        return blurred_frames
+    def postprocess(self, frames: np.ndarray) -> np.ndarray:
+        """Blur faces in a sequence of frames.
+        Args:
+            frames: Input frames
+        Returns:
+            Processed frames with pixelated faces
+        """
+        # Create dataset and dataloader
+        frames_tensor = self.preprocess_frames(frames)
+        dataset = TensorDataset(frames_tensor)
+        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)
+        processed_frames, processed_batches = [], []
+        prior_data, scale = None, None
+        for i, batch in enumerate(dataloader):
+            batch = batch[0]
+            h, w = batch.shape[-2:]  # Batch shape: [C, H, W]
+            with torch.no_grad():
+                # Generate priors for the video
+                if prior_data is None:
+                    priorbox = PriorBox(self.cfg, image_size=(h, w))
+                    priors = priorbox.forward()
+                    priors = priors.to(self.device, dtype=self.dtype)
+                    prior_data = priors.data
+                # Get scale for resizing detections
+                if scale is None:
+                    scale = torch.Tensor([w, h, w, h])
+                    scale = scale.to(self.device, dtype=self.dtype)
+                batch_loc, batch_conf, _ = self.net(batch)
+            # Blur detected faces in each batch of frames
+            start_idx = i * self.batch_size
+            end_idx = min(start_idx + self.batch_size, len(frames))
+            processed_batches.append(
+                self.blur_detected_faces(frames[start_idx:end_idx], batch_loc, batch_conf, prior_data, scale)
+            )
+        processed_frames = [frame for batch in processed_batches for frame in batch]
+        return np.array(processed_frames)
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_dir", type=str, required=True, help="Path containing input videos")
+    parser.add_argument("--output_dir", type=str, required=True, help="Path for saving processed videos")
+    parser.add_argument(
+        "--checkpoint",
+        type=str,
+        help="Path to the RetinaFace checkpoint file",
+    )
+    return parser.parse_args()
+def main(args):
+    filepaths = get_video_filepaths(args.input_dir)
+    if not filepaths:
+        log.error(f"No video files found in directory: {args.input_dir}")
+        return
+    face_blur = RetinaFaceFilter(checkpoint=args.checkpoint)
+    postprocessing_runner = GuardrailRunner(postprocessors=[face_blur])
+    os.makedirs(args.output_dir, exist_ok=True)
+    for filepath in tqdm(filepaths):
+        video_data = read_video(filepath)
+        with misc.timer("face blur filter"):
+            frames = postprocessing_runner.postprocess(video_data.frames)
+        output_path = os.path.join(args.output_dir, os.path.basename(filepath))
+        save_video(output_path, frames, video_data.fps)
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

cosmos_transfer1/auxiliary/guardrail/face_blur_filter/retinaface_utils.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import torch
+from retinaface.utils.nms.py_cpu_nms import py_cpu_nms
+from cosmos_transfer1.utils import log
+# Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
+def filter_detected_boxes(boxes, scores, confidence_threshold, nms_threshold, top_k, keep_top_k):
+    """Filter boxes based on confidence score and remove overlapping boxes using NMS."""
+    # Keep detections with confidence above threshold
+    inds = np.where(scores > confidence_threshold)[0]
+    boxes = boxes[inds]
+    scores = scores[inds]
+    # Sort by confidence and keep top K detections
+    order = scores.argsort()[::-1][:top_k]
+    boxes = boxes[order]
+    scores = scores[order]
+    # Run non-maximum-suppression (NMS) to remove overlapping boxes
+    dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False)
+    keep = py_cpu_nms(dets, nms_threshold)
+    dets = dets[keep, :]
+    dets = dets[:keep_top_k, :]
+    boxes = dets[:, :-1]
+    return boxes
+# Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/utils/box_utils.py to handle batched inputs
+def decode_batch(loc, priors, variances):
+    """Decode batched locations from predictions using priors and variances.
+    Args:
+        loc (tensor): Batched location predictions for loc layers.
+            Shape: [batch_size, num_priors, 4]
+        priors (tensor): Prior boxes in center-offset form.
+            Shape: [num_priors, 4]
+        variances: (list[float]): Variances of prior boxes.
+    Return:
+        Decoded batched bounding box predictions
+            Shape: [batch_size, num_priors, 4]
+    """
+    batch_size = loc.size(0)
+    priors = priors.unsqueeze(0).expand(batch_size, -1, -1)
+    boxes = torch.cat(
+        (
+            priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:],
+            priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1]),
+        ),
+        dim=2,
+    )
+    boxes[:, :, :2] -= boxes[:, :, 2:] / 2
+    boxes[:, :, 2:] += boxes[:, :, :2]
+    return boxes
+# Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
+def _check_keys(model, pretrained_state_dict):
+    ckpt_keys = set(pretrained_state_dict.keys())
+    model_keys = set(model.state_dict().keys())
+    used_pretrained_keys = model_keys & ckpt_keys
+    unused_pretrained_keys = ckpt_keys - model_keys
+    missing_keys = model_keys - ckpt_keys
+    log.debug("Missing keys:{}".format(len(missing_keys)))
+    log.debug("Unused checkpoint keys:{}".format(len(unused_pretrained_keys)))
+    log.debug("Used keys:{}".format(len(used_pretrained_keys)))
+    assert len(used_pretrained_keys) > 0, "load NONE from pretrained checkpoint"
+    return True
+# Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
+def _remove_prefix(state_dict, prefix):
+    """Old version of the model is stored with all names of parameters sharing common prefix 'module.'"""
+    log.debug("Removing prefix '{}'".format(prefix))
+    def f(x):
+        return x.split(prefix, 1)[-1] if x.startswith(prefix) else x
+    return {f(key): value for key, value in state_dict.items()}
+# Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
+def load_model(model, pretrained_path, load_to_cpu):
+    log.debug("Loading pretrained model from {}".format(pretrained_path))
+    if load_to_cpu:
+        pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage, weights_only=True)
+    else:
+        device = torch.cuda.current_device()
+        pretrained_dict = torch.load(
+            pretrained_path, map_location=lambda storage, loc: storage.cuda(device), weights_only=True
+        )
+    if "state_dict" in pretrained_dict.keys():
+        pretrained_dict = _remove_prefix(pretrained_dict["state_dict"], "module.")
+    else:
+        pretrained_dict = _remove_prefix(pretrained_dict, "module.")
+    _check_keys(model, pretrained_dict)
+    model.load_state_dict(pretrained_dict, strict=False)
+    return model

cosmos_transfer1/auxiliary/guardrail/llamaGuard3/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos_transfer1/auxiliary/guardrail/llamaGuard3/categories.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+UNSAFE_CATEGORIES = {
+    "S1": "Violent Crimes.",
+    "S2": "Non-Violent Crimes.",
+    "S3": "Sex Crimes.",
+    "S4": "Child Exploitation.",
+    "S5": "Defamation.",
+    "S6": "Specialized Advice.",
+    "S7": "Privacy.",
+    "S8": "Intellectual Property.",
+    "S9": "Indiscriminate Weapons.",
+    "S10": "Hate.",
+    "S11": "Self-Harm.",
+    "S12": "Sexual Content.",
+    "S13": "Elections.",
+    "s14": "Code Interpreter Abuse.",
+}

cosmos_transfer1/auxiliary/guardrail/llamaGuard3/llamaGuard3.py ADDED Viewed

	@@ -0,0 +1,122 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner
+from cosmos_transfer1.auxiliary.guardrail.llamaGuard3.categories import UNSAFE_CATEGORIES
+from cosmos_transfer1.utils import log, misc
+SAFE = misc.Color.green("SAFE")  # "SAFE" label built via misc.Color.green (presumably colorized for terminal output — confirm)
+UNSAFE = misc.Color.red("UNSAFE")  # "UNSAFE" label built via misc.Color.red (presumably colorized for terminal output — confirm)
+class LlamaGuard3(ContentSafetyGuardrail):
+    def __init__(
+        self,
+        checkpoint_dir: str,
+        device="cuda" if torch.cuda.is_available() else "cpu",
+    ) -> None:
+        self.checkpoint_dir = checkpoint_dir
+        self.device = device
+        self.dtype = torch.bfloat16
+        model_id = "meta-llama/Llama-Guard-3-8B"
+        model_dir = os.path.join(self.checkpoint_dir, model_id)
+        self.model = AutoModelForCausalLM.from_pretrained(model_dir)
+        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
+        self.model.to(self.device, dtype=self.dtype).eval()
+    def get_llamaGuard3_block_message(self, moderation_output: str) -> str:
+        """Extract the blocked category from the Llama Guard 3 model output."""
+        block_msg = "Prompt blocked by Llama Guard 3."
+        try:
+            lines = moderation_output.splitlines()
+            categories_detected = []
+            for line in lines[1:]:
+                line_stripped = line.split("<|eot_id|>")[0].strip()
+                for catagory in line_stripped.split(","):
+                    catagory = catagory.strip()
+                    if catagory not in UNSAFE_CATEGORIES:
+                        log.warning(f"Unrecognized category from moderation output: {catagory}")
+                    else:
+                        categories_detected.append(catagory)
+            if len(categories_detected) > 0:
+                blocked_catagories = ", ".join([UNSAFE_CATEGORIES[catagory][:-1] for catagory in categories_detected])
+                block_msg = f"{block_msg} Violations: {blocked_catagories}."
+        except Exception as e:
+            log.warning(f"Unable to extract blocked category from Llama Guard 3 output: {e}")
+        return block_msg
+    def filter_llamaGuard3_output(self, prompt: str) -> tuple[bool, str]:
+        """Filter the Llama Guard 3 model output and return the safety status and message."""
+        conversation = [{"role": "user", "content": prompt}]
+        input_ids = self.tokenizer.apply_chat_template(
+            conversation, categories=UNSAFE_CATEGORIES, return_tensors="pt"
+        ).to("cuda")
+        prompt_len = input_ids.shape[1]
+        output = self.model.generate(
+            input_ids=input_ids,
+            max_new_tokens=100,
+            return_dict_in_generate=True,
+            pad_token_id=0,
+        )
+        generated_tokens = output.sequences[:, prompt_len:]
+        moderation_output = self.tokenizer.decode(generated_tokens[0], skip_special_tokens=False).strip()
+        if "unsafe" in moderation_output.lower():
+            block_msg = self.get_llamaGuard3_block_message(moderation_output)
+            return False, block_msg
+        else:
+            return True, ""
+    def is_safe(self, prompt: str) -> tuple[bool, str]:
+        """Check if the input prompt is safe according to the Llama Guard 3 model."""
+        try:
+            return self.filter_llamaGuard3_output(prompt)
+        except Exception as e:
+            log.error(f"Unexpected error occurred when running Llama Guard 3 guardrail: {e}")
+            return True, "Unexpected error occurred when running Llama Guard 3 guardrail."
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        help="Path to the Llama Guard 3 checkpoint folder",
+    )
+    return parser.parse_args()
+def main(args):
+    llamaGuard3 = LlamaGuard3(checkpoint_dir=args.checkpoint_dir)
+    runner = GuardrailRunner(safety_models=[llamaGuard3])
+    with misc.timer("Llama Guard 3 safety check"):
+        safety, message = runner.run_safety_check(args.prompt)
+    log.info(f"Input is: {'SAFE' if safety else 'UNSAFE'}")
+    log.info(f"Message: {message}") if not safety else None
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/model.py ADDED Viewed

	@@ -0,0 +1,60 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import attrs
+import torch
+import torch.nn as nn
+from cosmos_transfer1.utils.ddp_config import make_freezable
+@make_freezable
+@attrs.define(slots=False)
+class ModelConfig:
+    input_size: int = 1152  # forwarded by VideoSafetyModel as SafetyClassifier's input_size (in_features of its first Linear)
+    num_classes: int = 7  # forwarded by VideoSafetyModel as SafetyClassifier's num_classes (width of the output logits)
+class SafetyClassifier(nn.Module):
+    def __init__(self, input_size: int = 1024, num_classes: int = 2):
+        super().__init__()
+        self.input_size = input_size
+        self.num_classes = num_classes
+        self.layers = nn.Sequential(
+            nn.Linear(self.input_size, 512),
+            nn.BatchNorm1d(512),
+            nn.ReLU(),
+            nn.Linear(512, 256),
+            nn.BatchNorm1d(256),
+            nn.ReLU(),
+            nn.Linear(256, self.num_classes),
+            # Note: No activation function here; CrossEntropyLoss expects raw logits
+        )
+    def forward(self, x):
+        return self.layers(x)
+class VideoSafetyModel(nn.Module):
+    def __init__(self, config: ModelConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.num_classes = config.num_classes
+        self.network = SafetyClassifier(input_size=config.input_size, num_classes=self.num_classes)
+    @torch.inference_mode()
+    def forward(self, data_batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
+        logits = self.network(data_batch["data"].cuda())
+        return {"logits": logits}

cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/video_content_safety_filter.py ADDED Viewed

	@@ -0,0 +1,185 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+from typing import Iterable, Tuple, Union
+import torch
+from PIL import Image
+from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner
+from cosmos_transfer1.auxiliary.guardrail.common.io_utils import get_video_filepaths, read_video
+from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.model import ModelConfig, VideoSafetyModel
+from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.vision_encoder import SigLIPEncoder
+from cosmos_transfer1.utils import log, misc
+# Define the class index to class name mapping for multi-class classification
+CLASS_IDX_TO_NAME = {  # looked up with .get(idx, "Safe"), so an index without an entry is reported as "Safe"
+    0: "Safe",
+    1: "Sexual_Content",  # NOTE(review): index 2 has no entry although the classifier is built with num_classes=7 — confirm the omission is intentional
+    3: "Drugs",
+    4: "Child_Abuse",
+    5: "Hate_and_Harassment",
+    6: "Self-Harm",
+}
+class VideoContentSafetyFilter(ContentSafetyGuardrail):
+    def __init__(
+        self,
+        checkpoint_dir: str,
+        device="cuda" if torch.cuda.is_available() else "cpu",
+    ) -> None:
+        self.checkpoint_dir = os.path.join(checkpoint_dir, "nvidia/Cosmos-Guardrail1/video_content_safety_filter")
+        self.device = device
+        self.dtype = torch.float32
+        # Initialize the SigLIP encoder
+        self.encoder = SigLIPEncoder(checkpoint_dir=self.checkpoint_dir, device=device, dtype=self.dtype)
+        # Use ModelConfig directly for inference configuration
+        model_config = ModelConfig(input_size=1152, num_classes=7)
+        # Load the multi-class classifier
+        self.model = VideoSafetyModel(model_config)
+        safety_filter_local_path = os.path.join(self.checkpoint_dir, "safety_filter.pt")
+        checkpoint = torch.load(safety_filter_local_path, map_location=torch.device("cpu"), weights_only=True)
+        self.model.load_state_dict(checkpoint["model"])
+        self.model.to(self.device, dtype=self.dtype).eval()
+    @torch.inference_mode()
+    def __infer(self, pil_image: Image.Image) -> int:
+        """Infer the class of the image."""
+        image_embs = self.encoder.encode_image(pil_image)
+        logits = self.model.network(image_embs)
+        probabilities = torch.nn.functional.softmax(logits, dim=-1)
+        predicted_class = torch.argmax(probabilities, dim=-1).item()
+        return predicted_class
+    def is_safe_file(self, filepath: str) -> bool:
+        """Check if the video file is safe."""
+        video_data = read_video(filepath)
+        # Sample frames at 2 FPS
+        sample_rate = 2  # frames per second
+        frame_interval = int(video_data.fps / sample_rate)
+        frame_numbers = list(range(0, int(video_data.fps * video_data.duration), frame_interval))
+        is_safe = True
+        frame_scores = []
+        for frame_number in frame_numbers:
+            try:
+                frame = video_data.frames[frame_number]
+                pil_image = Image.fromarray(frame)
+                predicted_class = self.__infer(pil_image)
+                class_name = CLASS_IDX_TO_NAME.get(predicted_class, "Safe")
+                frame_scores.append({"frame_number": frame_number, "class": class_name})
+                # If any frame is not "Safe", mark the video as unsafe
+                if class_name != "Safe":
+                    is_safe = False
+                    break
+            except Exception as e:
+                log.warning(f"Warning: Failed to run safety classifier on frame_number {frame_number}. Exception: {e}")
+                continue
+        # Prepare data for JSON
+        video_data = {
+            "filepath": filepath,
+            "is_safe": is_safe,
+            "video_length": video_data.duration,
+            "fps": video_data.fps,
+            "frame_scores": frame_scores,
+        }
+        log.info(f"Video {filepath} is {'SAFE' if is_safe else 'UNSAFE'}.")
+        log.debug(f"Video data: {json.dumps(video_data, indent=4)}")
+        return is_safe
+    def is_safe_frames(self, frames: Iterable) -> bool:
+        """Check if the generated video frames are safe."""
+        frame_scores = []
+        total_frames = 0
+        safe_frames = 0
+        for frame_number, frame in enumerate(frames):
+            try:
+                total_frames += 1
+                pil_image = Image.fromarray(frame)
+                predicted_class = self.__infer(pil_image)
+                class_name = CLASS_IDX_TO_NAME.get(predicted_class, "Safe")
+                frame_scores.append({"frame_number": frame_number, "class": class_name})
+                if class_name == "Safe":
+                    safe_frames += 1
+            except Exception as e:
+                log.warning(f"Warning: Failed to run safety classifier on frame_number {frame_number}. Exception: {e}")
+                continue
+        # Decide if the video is safe based on the ratio of safe frames
+        is_safe = False
+        if total_frames > 0:
+            is_safe = (safe_frames / total_frames) >= 0.95
+        video_data = {
+            "is_safe": is_safe,
+            "frame_scores": frame_scores,
+        }
+        log.debug(f"Frames data: {json.dumps(video_data, indent=4)}")
+        return is_safe
+    def is_safe(self, input: Union[str, Iterable]) -> Tuple[bool, str]:
+        if isinstance(input, str):
+            is_safe = self.is_safe_file(input)
+            return is_safe, "safe video detected" if is_safe else "unsafe video detected"
+        else:
+            is_safe = self.is_safe_frames(input)
+            return is_safe, "safe frames detected" if is_safe else "unsafe frames detected"
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input_dir", type=str, required=True, help="Path containing input videos")
+    parser.add_argument(
+        "--checkpoint_dir",
+        type=str,
+        help="Path to the Video Content Safety Filter checkpoint folder",
+    )
+    return parser.parse_args()
+def main(args):
+    filepaths = get_video_filepaths(args.input_dir)
+    if not filepaths:
+        log.error(f"No video files found in directory: {args.input_dir}")
+        return
+    video_filter = VideoContentSafetyFilter(checkpoint_dir=args.checkpoint_dir)
+    runner = GuardrailRunner(safety_models=[video_filter], generic_safe_msg="Video is safe")
+    for filepath in filepaths:
+        with misc.timer("video content safety filter"):
+            _ = runner.run_safety_check(filepath)
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)

cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/vision_encoder.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import torch
+from PIL import Image
+from transformers import SiglipModel, SiglipProcessor
+class SigLIPEncoder(torch.nn.Module):
+    def __init__(
+        self,
+        checkpoint_dir: str,
+        model_name: str = "google/siglip-so400m-patch14-384",
+        device="cuda" if torch.cuda.is_available() else "cpu",
+        dtype=torch.float32,
+    ) -> None:
+        super().__init__()
+        self.checkpoint_dir = checkpoint_dir
+        self.device = device
+        self.dtype = dtype
+        self.model = SiglipModel.from_pretrained(model_name, cache_dir=self.checkpoint_dir)
+        self.processor = SiglipProcessor.from_pretrained(model_name, cache_dir=self.checkpoint_dir)
+        self.model.to(self.device, dtype=self.dtype).eval()
+    @torch.inference_mode()
+    def encode_image(self, input_img: Image.Image) -> torch.Tensor:
+        """Encode an image into a feature vector."""
+        with torch.no_grad():
+            inputs = self.processor(images=input_img, return_tensors="pt").to(self.device, dtype=self.dtype)
+            image_features = self.model.get_image_features(**inputs)
+            image_features /= image_features.norm(dim=-1, keepdim=True)
+        return image_features

cosmos_transfer1/auxiliary/human_keypoint/human_keypoint.py ADDED Viewed

	@@ -0,0 +1,155 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import cv2
+import numpy as np
+from rtmlib import Wholebody
+from cosmos_transfer1.diffusion.datasets.augmentors.human_keypoint_utils import (
+    coco_wholebody_133_skeleton,
+    openpose134_skeleton,
+)
+from cosmos_transfer1.utils import log
+class HumanKeypointModel:
+    def __init__(self, to_openpose=True, conf_thres=0.6):
+        self.model = Wholebody(
+            to_openpose=to_openpose,
+            mode="performance",
+            backend="onnxruntime",
+            device="cuda",
+        )
+        self.to_openpose = to_openpose
+        self.conf_thres = conf_thres
+    def __call__(self, input_video: str, output_video: str = "keypoint.mp4") -> str:
+        """
+        Generate the human body keypoint plot for the keypointControlNet video2world model.
+        Input: mp4 video
+        Output: mp4 keypoint video, of the same spatial and temporal dimensions as the input video.
+        """
+        log.info(f"Processing video: {input_video} to generate keypoint video: {output_video}")
+        assert os.path.exists(input_video)
+        cap = cv2.VideoCapture(input_video)
+        fps = int(cap.get(cv2.CAP_PROP_FPS))
+        frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+        frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+        frame_size = (frame_width, frame_height)
+        # vid writer
+        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+        skeleton_writer = cv2.VideoWriter(output_video, fourcc, fps, frame_size)
+        log.info(f"frame width: {frame_width}, frame height: {frame_height}, fps: {fps}")
+        log.info("start pose estimation for frames..")
+        # Process each frame
+        while cap.isOpened():
+            ret, frame = cap.read()
+            if not ret:
+                break
+            # Create a black background frame
+            black_frame = np.zeros_like(frame)
+            # Run pose estimation
+            keypoints, scores = self.model(frame)
+            if keypoints is not None and len(keypoints) > 0:
+                skeleton_frame = self.plot_person_kpts(
+                    black_frame,
+                    keypoints,
+                    scores,
+                    kpt_thr=self.conf_thres,
+                    openpose_format=True,
+                    line_width=4,
+                )  # (h, w, 3)
+            else:
+                skeleton_frame = black_frame
+            skeleton_writer.write(skeleton_frame[:, :, ::-1])
+        cap.release()
+        skeleton_writer.release()
+    def draw_skeleton(
+        self,
+        img: np.ndarray,
+        keypoints: np.ndarray,
+        scores: np.ndarray,
+        kpt_thr: float = 0.6,
+        openpose_format: bool = True,
+        radius: int = 2,
+        line_width: int = 4,
+    ):
+        skeleton_topology = openpose134_skeleton if openpose_format else coco_wholebody_133_skeleton
+        assert len(keypoints.shape) == 2
+        keypoint_info, skeleton_info = (
+            skeleton_topology["keypoint_info"],
+            skeleton_topology["skeleton_info"],
+        )
+        vis_kpt = [s >= kpt_thr for s in scores]
+        link_dict = {}
+        for i, kpt_info in keypoint_info.items():
+            kpt_color = tuple(kpt_info["color"])
+            link_dict[kpt_info["name"]] = kpt_info["id"]
+            kpt = keypoints[i]
+            if vis_kpt[i]:
+                img = cv2.circle(img, (int(kpt[0]), int(kpt[1])), int(radius), kpt_color, -1)
+        for i, ske_info in skeleton_info.items():
+            link = ske_info["link"]
+            pt0, pt1 = link_dict[link[0]], link_dict[link[1]]
+            if vis_kpt[pt0] and vis_kpt[pt1]:
+                link_color = ske_info["color"]
+                kpt0 = keypoints[pt0]
+                kpt1 = keypoints[pt1]
+                img = cv2.line(
+                    img, (int(kpt0[0]), int(kpt0[1])), (int(kpt1[0]), int(kpt1[1])), link_color, thickness=line_width
+                )
+        return img
+    def plot_person_kpts(
+        self,
+        pose_vis_img: np.ndarray,
+        keypoints: np.ndarray,
+        scores: np.ndarray,
+        kpt_thr: float = 0.6,
+        openpose_format: bool = True,
+        line_width: int = 4,
+    ) -> np.ndarray:
+        """
+        plot a single person
+        in-place update the pose image
+        """
+        for kpts, ss in zip(keypoints, scores):
+            try:
+                pose_vis_img = self.draw_skeleton(
+                    pose_vis_img, kpts, ss, kpt_thr=kpt_thr, openpose_format=openpose_format, line_width=line_width
+                )
+            except ValueError as e:
+                log.error(f"Error in draw_skeleton func, {e}")
+        return pose_vis_img

cosmos_transfer1/auxiliary/robot_augmentation/README.md ADDED Viewed

	@@ -0,0 +1,112 @@

+# Robot Data Augmentation with Cosmos-Transfer1
+This pipeline provides a two-step process to augment robotic videos using **Cosmos-Transfer1-7B**. It leverages **spatial-temporal control** to modify backgrounds while preserving the shape and/or appearance of the robot foreground.
+## Overview of Settings
+We propose two augmentation settings:
+### Setting 1 (fg_vis_edge_bg_seg): Preserve Shape and Appearance of the Robot (foreground)
+- **Foreground Controls**: `Edge`, `Vis`
+- **Background Controls**: `Segmentation`
+- **Weights**:
+  - `w_edge(FG) = 1`
+  - `w_vis(FG) = 1`
+  - `w_seg(BG) = 1`
+  - All other weights = 0
+### Setting 2 (fg_edge_bg_seg): Preserve Only Shape of the Robot (foreground)
+- **Foreground Controls**: `Edge`
+- **Background Controls**: `Segmentation`
+- **Weights**:
+  - `w_edge(FG) = 1`
+  - `w_seg(BG) = 1`
+  - All other weights = 0
+## Step-by-Step Instructions
+### Step 1: Generate Spatial-Temporal Weights
+This script extracts foreground (robot) and background information from semantic segmentation data. It processes per-frame segmentation masks and color-to-class mappings to generate spatial-temporal weight matrices for each control modality based on the selected setting.
+#### Input Requirements:
+- A `segmentation` folder containing per-frame segmentation masks in PNG format
+- A `segmentation_label` folder containing color-to-class mapping JSON files for each frame, for example:
+  ```json
+  {
+      "(29, 0, 0, 255)": {
+          "class": "gripper0_right_r_palm_vis"
+      },
+      "(31, 0, 0, 255)": {
+          "class": "gripper0_right_R_thumb_proximal_base_link_vis"
+      },
+      "(33, 0, 0, 255)": {
+          "class": "gripper0_right_R_thumb_proximal_link_vis"
+      }
+  }
+  ```
+- An input video file
+Here is an example input format:
+[Example input directory](https://github.com/google-deepmind/cosmos/tree/main/assets/robot_augmentation_example/example1)
+#### Usage
+```bash
+PYTHONPATH=$(pwd) python cosmos_transfer1/auxiliary/robot_augmentation/spatial_temporal_weight.py \
+    --setting fg_vis_edge_bg_seg \
+    --robot-keywords world_robot gripper robot \
+    --input-dir assets/robot_augmentation_example \
+    --output-dir outputs/robot_augmentation_example
+```
+#### Parameters:
+* `--setting`: Weight setting to use (choices: 'fg_vis_edge_bg_seg', 'fg_edge_bg_seg'; default: 'fg_vis_edge_bg_seg')
+  * fg_vis_edge_bg_seg (Setting 1): Emphasizes the robot in visual and edge features (vis: 1.0 foreground, edge: 1.0 foreground, seg: 1.0 background)
+  * fg_edge_bg_seg (Setting 2): Emphasizes the robot only in edge features (edge: 1.0 foreground, seg: 1.0 background)
+* `--input-dir`: Input directory containing example folders
+  * Default: 'assets/robot_augmentation_example'
+* `--output-dir`: Output directory for weight matrices
+  * Default: 'outputs/robot_augmentation_example'
+* `--robot-keywords`: Keywords used to identify robot classes
+  * Default: ["world_robot", "gripper", "robot"]
+  * Any semantic class containing these keywords will be treated as robot foreground
+### Step 2: Run Cosmos-Transfer1 Inference
+Use the generated spatial-temporal weight matrices to perform video augmentation with the proper controls.
+```bash
+export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:=0}"
+export CHECKPOINT_DIR="${CHECKPOINT_DIR:=./checkpoints}"
+export NUM_GPU="${NUM_GPU:=1}"
+PYTHONPATH=$(pwd) torchrun --nproc_per_node=$NUM_GPU --nnodes=1 --node_rank=0 \
+cosmos_transfer1/diffusion/inference/transfer.py \
+    --checkpoint_dir $CHECKPOINT_DIR \
+    --video_save_folder outputs/robot_example_spatial_temporal_setting1 \
+    --controlnet_specs assets/robot_augmentation_example/example1/inference_cosmos_transfer1_robot_spatiotemporal_weights.json \
+    --offload_text_encoder_model \
+    --offload_guardrail_models \
+    --num_gpus $NUM_GPU
+```
+- Augmented videos are saved in `outputs/robot_example_spatial_temporal_setting1/`
+## Input and Output Example
+Input video:
+<video src="https://github.com/user-attachments/assets/9c2df99d-7d0c-4dcf-af87-4ec9f65328ed">
+  Your browser does not support the video tag.
+</video>
+You can run the pipeline multiple times with different prompts (e.g., `assets/robot_augmentation_example/example1/example1_prompts.json`) to obtain different augmentation results:
+<video src="https://github.com/user-attachments/assets/6dee15f5-9d8b-469a-a92a-3419cb466d44">
+  Your browser does not support the video tag.
+</video>

cosmos_transfer1/auxiliary/robot_augmentation/spatial_temporal_weight.py ADDED Viewed

	@@ -0,0 +1,577 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This script processes segmentation results for each video frame saved as JSON files and generates a spatial-temporal weight matrix saved as a .pt file.
+# The input JSON files contain segmentation information for each frame, and the output .pt file represents the spatial-temporal weight matrix for the video.
+import argparse
+import glob
+import json
+import logging
+import os
+import re
+from collections import defaultdict
+import cv2
+import numpy as np
+import torch
+from tqdm import tqdm
+# Configure logging at import time: INFO level, with timestamp, logger name and level in each record.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+# Module-level logger used by the functions in this script.
+logger = logging.getLogger(__name__)
+# Class to manage different weight settings
+class WeightSettings:
+    """Class to manage different weight settings for the features"""
+    @staticmethod
+    def get_settings(setting_name):
+        """Get weight settings by name
+        Args:
+            setting_name (str): Name of the setting
+        Returns:
+            dict: Dictionary with weights for each feature
+        """
+        settings = {
+            # Default setting: Emphasize robot in all features
+            "fg_vis_edge_bg_seg": {
+                "depth": {"foreground": 0.0, "background": 0.0},
+                "vis": {"foreground": 1.0, "background": 0.0},
+                "edge": {"foreground": 1.0, "background": 0.0},
+                "seg": {"foreground": 0.0, "background": 1.0},
+            },
+            "fg_edge_bg_seg": {
+                "depth": {"foreground": 0.0, "background": 0.0},
+                "vis": {"foreground": 0.0, "background": 0.0},
+                "edge": {"foreground": 1.0, "background": 0.0},
+                "seg": {"foreground": 0.0, "background": 1.0},
+            },
+        }
+        if setting_name not in settings:
+            logger.warning(f"Setting '{setting_name}' not found. Using default.")
+            return settings["fg_vis_edge_bg_seg"]
+        return settings[setting_name]
+    @staticmethod
+    def list_settings():
+        """List all available settings
+        Returns:
+            list: List of setting names
+        """
+        return ["fg_vis_edge_bg_seg", "fg_edge_bg_seg"]
+def get_video_info(video_path):
+    """Get video dimensions and frame count"""
+    cap = cv2.VideoCapture(video_path)
+    if not cap.isOpened():
+        raise ValueError(f"Could not open video file: {video_path}")
+    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    cap.release()
+    return width, height, frame_count, fps
+def parse_color_key(color_key):
+    """Parse a color key string into an RGB tuple
+    Args:
+        color_key (str): Color key string in the format "(r,g,b,a)" or similar
+    Returns:
+        tuple: RGB tuple (r, g, b)
+    """
+    # Extract numbers using regex to handle different formats
+    numbers = re.findall(r"\d+", color_key)
+    if len(numbers) >= 3:
+        r, g, b = map(int, numbers[:3])
+        return (r, g, b)
+    else:
+        raise ValueError(f"Invalid color key format: {color_key}")
+def save_visualization(mask, frame_num, feature_name, viz_dir):
+    """Save a visualization of the binary mask
+    Args:
+        mask (numpy.ndarray): The mask (values 0 or 255)
+        frame_num (int): The frame number
+        feature_name (str): The name of the feature (depth, vis, edge, seg)
+        viz_dir (str): Directory to save visualizations
+    """
+    # Simply save the binary mask directly
+    output_path = os.path.join(viz_dir, f"{feature_name}_frame_{frame_num:06d}.png")
+    cv2.imwrite(output_path, mask)
+    logger.info(f"Saved binary visualization to {output_path}")
+def process_segmentation_files(
+    segmentation_dir,
+    output_dir,
+    viz_dir,
+    video_path=None,
+    weights_dict=None,
+    setting_name="fg_vis_edge_bg_seg",
+    robot_keywords=None,
+):
+    """Process all segmentation JSON files and create weight matrices
+    Args:
+        segmentation_dir (str): Directory containing segmentation JSON files
+        output_dir (str): Directory to save weight matrices
+        viz_dir (str): Directory to save visualizations
+        video_path (str, optional): Path to the video file. Defaults to None.
+        weights_dict (dict, optional): Dictionary with weights for each feature.
+            Format: {
+                'depth': {'foreground': float, 'background': float},
+                'vis': {'foreground': float, 'background': float},
+                'edge': {'foreground': float, 'background': float},
+                'seg': {'foreground': float, 'background': float}
+            }
+            Values should be in range 0-1. Defaults to None.
+        setting_name (str, optional): Weight setting name. Defaults to 'fg_vis_edge_bg_seg (setting1)'.
+        robot_keywords (list, optional): List of keywords to identify robot classes. Defaults to ["robot"].
+    """
+    # Set default robot keywords if not provided
+    if robot_keywords is None:
+        robot_keywords = ["robot"]
+    # Get all JSON files
+    json_files = sorted(glob.glob(os.path.join(segmentation_dir, "*.json")))
+    logger.info(f"Found {len(json_files)} JSON files")
+    if len(json_files) == 0:
+        raise ValueError(f"No JSON files found in {segmentation_dir}")
+    # For example directories, check for PNG files
+    png_dir = os.path.join(os.path.dirname(segmentation_dir), "segmentation")
+    png_files = []
+    if os.path.exists(png_dir):
+        png_files = sorted(glob.glob(os.path.join(png_dir, "*.png")))
+        logger.info(f"Found {len(png_files)} PNG files in segmentation directory")
+    # Step 1: Create a unified color-to-class mapping from all JSON files
+    logger.info("Creating unified color-to-class mapping...")
+    rgb_to_class = {}
+    rgb_to_is_robot = {}
+    for json_file in tqdm(json_files, desc="Processing JSON files for unified mapping"):
+        with open(json_file, "r") as f:
+            json_data = json.load(f)
+        for color_key, data in json_data.items():
+            color = parse_color_key(color_key)
+            class_name = data["class"]
+            # Store RGB color for matching
+            rgb_to_class[color] = class_name
+            rgb_to_is_robot[color] = any(keyword in class_name for keyword in robot_keywords)
+    # Print statistics about the unified color mapping
+    robot_colors = [color for color, is_robot in rgb_to_is_robot.items() if is_robot]
+    logger.info(f"Unified mapping: Found {len(robot_colors)} robot colors out of {len(rgb_to_is_robot)} total colors")
+    if robot_colors:
+        logger.info(f"Robot classes: {[rgb_to_class[color] for color in robot_colors]}")
+    # Convert color mapping to arrays for vectorized operations
+    colors = list(rgb_to_is_robot.keys())
+    color_array = np.array(colors)
+    is_robot_array = np.array([rgb_to_is_robot[color] for color in colors], dtype=bool)
+    # If we have PNG files, get dimensions from the first PNG
+    if png_files:
+        # Get dimensions from the first PNG file
+        first_png = cv2.imread(png_files[0])
+        if first_png is None:
+            raise ValueError(f"Could not read PNG file: {png_files[0]}")
+        height, width = first_png.shape[:2]
+        frame_count = len(png_files)
+        # Match frame numbers between JSON and PNG files to ensure correct correspondence
+        json_frame_nums = [int(os.path.basename(f).split("_")[-1].split(".")[0]) for f in json_files]
+        png_frame_nums = [int(os.path.basename(f).split("_")[-1].split(".")[0]) for f in png_files]
+        # Find common frames between JSON and PNG files
+        common_frames = sorted(set(json_frame_nums).intersection(set(png_frame_nums)))
+        logger.info(f"Found {len(common_frames)} common frames between JSON and PNG files")
+        if len(common_frames) == 0:
+            raise ValueError("No matching frames found between JSON and PNG files")
+        # Create maps to easily look up files by frame number
+        json_map = {int(os.path.basename(f).split("_")[-1].split(".")[0]): f for f in json_files}
+        png_map = {int(os.path.basename(f).split("_")[-1].split(".")[0]): f for f in png_files}
+        # Create new lists with only matching files
+        json_files = [json_map[frame] for frame in common_frames if frame in json_map]
+        png_files = [png_map[frame] for frame in common_frames if frame in png_map]
+        num_frames = len(json_files)
+        logger.info(f"Using PNG dimensions: {width}x{height}, processing {num_frames} frames")
+    else:
+        # Get video information if no PNG files available
+        try:
+            width, height, frame_count, fps = get_video_info(video_path)
+            logger.info(f"Video dimensions: {width}x{height}, {frame_count} frames, {fps} fps")
+            num_frames = min(len(json_files), frame_count)
+        except Exception as e:
+            logger.warning(f"Warning: Could not get video information: {e}")
+            # Use a default size if we can't get the video info
+            width, height = 640, 480
+            num_frames = len(json_files)
+            logger.info(f"Using default dimensions: {width}x{height}, {num_frames} frames")
+    # Initialize weight tensors
+    depth_weights = torch.zeros((num_frames, height, width))
+    vis_weights = torch.zeros((num_frames, height, width))
+    edge_weights = torch.zeros((num_frames, height, width))
+    seg_weights = torch.zeros((num_frames, height, width))
+    # Process frames
+    if png_files:
+        # Process PNG files directly
+        for i, (json_file, png_file) in enumerate(zip(json_files, png_files)):
+            # Get frame number from filename
+            frame_num = int(os.path.basename(json_file).split("_")[-1].split(".")[0])
+            # Read the corresponding PNG file
+            frame = cv2.imread(png_file)
+            if frame is None:
+                logger.warning(f"Warning: Could not read frame {i} from PNG. Using blank frame.")
+                frame = np.zeros((height, width, 3), dtype=np.uint8)
+            # Convert frame to RGB
+            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            # Calculate total pixels
+            total_pixels = height * width
+            # Vectorized approach for finding nearest colors
+            # Convert frame_rgb to a 2D array of shape (height*width, 3)
+            pixels = frame_rgb.reshape(-1, 3)
+            # Calculate distances between each pixel and each color (vectorized)
+            # This creates a matrix of shape (height*width, num_colors)
+            distances = np.sqrt(np.sum((pixels[:, np.newaxis, :] - color_array[np.newaxis, :, :]) ** 2, axis=2))
+            # Find the index of the nearest color for each pixel
+            nearest_color_indices = np.argmin(distances, axis=1)
+            # Get the is_robot value for each pixel based on its nearest color
+            pixel_is_robot = is_robot_array[nearest_color_indices]
+            # Reshape back to image dimensions
+            pixel_is_robot_2d = pixel_is_robot.reshape(height, width)
+            # Count robot and matched pixels
+            robot_pixel_count = np.sum(pixel_is_robot)
+            matched_pixel_count = pixels.shape[0]  # All pixels are matched now
+            # Create masks based on the is_robot classification
+            depth_mask = np.where(
+                pixel_is_robot_2d, weights_dict["depth"]["foreground"], weights_dict["depth"]["background"]
+            )
+            vis_mask = np.where(pixel_is_robot_2d, weights_dict["vis"]["foreground"], weights_dict["vis"]["background"])
+            edge_mask = np.where(
+                pixel_is_robot_2d, weights_dict["edge"]["foreground"], weights_dict["edge"]["background"]
+            )
+            seg_mask = np.where(pixel_is_robot_2d, weights_dict["seg"]["foreground"], weights_dict["seg"]["background"])
+            # Create visualization mask
+            visualization_mask = np.zeros((height, width), dtype=np.uint8)
+            visualization_mask[pixel_is_robot_2d] = 255
+            # Log statistics
+            robot_percentage = (robot_pixel_count / total_pixels) * 100
+            matched_percentage = (matched_pixel_count / total_pixels) * 100
+            logger.info(f"Frame {frame_num}: {robot_pixel_count} robot pixels ({robot_percentage:.2f}%)")
+            logger.info(f"Frame {frame_num}: {matched_pixel_count} matched pixels ({matched_percentage:.2f}%)")
+            # Save visualizations for this frame
+            save_visualization(visualization_mask, frame_num, "segmentation", viz_dir)
+            # Store the masks in the weight tensors
+            depth_weights[i] = torch.from_numpy(depth_mask)
+            vis_weights[i] = torch.from_numpy(vis_mask)
+            edge_weights[i] = torch.from_numpy(edge_mask)
+            seg_weights[i] = torch.from_numpy(seg_mask)
+    else:
+        # Use video frames if available
+        try:
+            # Open the segmentation video
+            cap = cv2.VideoCapture(video_path)
+            if not cap.isOpened():
+                raise ValueError(f"Could not open video file: {video_path}")
+            # Process each frame using the unified color mapping
+            for i, json_file in enumerate(tqdm(json_files[:num_frames], desc="Processing frames")):
+                # Get frame number from filename
+                frame_num = int(os.path.basename(json_file).split("_")[-1].split(".")[0])
+                # Read the corresponding frame from the video
+                cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+                ret, frame = cap.read()
+                if not ret:
+                    logger.warning(f"Warning: Could not read frame {i} from video. Using blank frame.")
+                    frame = np.zeros((height, width, 3), dtype=np.uint8)
+                # Convert frame to RGB
+                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                # Calculate total pixels
+                total_pixels = height * width
+                # Vectorized approach for finding nearest colors
+                pixels = frame_rgb.reshape(-1, 3)
+                distances = np.sqrt(np.sum((pixels[:, np.newaxis, :] - color_array[np.newaxis, :, :]) ** 2, axis=2))
+                nearest_color_indices = np.argmin(distances, axis=1)
+                pixel_is_robot = is_robot_array[nearest_color_indices]
+                pixel_is_robot_2d = pixel_is_robot.reshape(height, width)
+                # Count robot and matched pixels
+                robot_pixel_count = np.sum(pixel_is_robot)
+                matched_pixel_count = pixels.shape[0]
+                # Create masks based on the is_robot classification
+                depth_mask = np.where(
+                    pixel_is_robot_2d, weights_dict["depth"]["foreground"], weights_dict["depth"]["background"]
+                )
+                vis_mask = np.where(
+                    pixel_is_robot_2d, weights_dict["vis"]["foreground"], weights_dict["vis"]["background"]
+                )
+                edge_mask = np.where(
+                    pixel_is_robot_2d, weights_dict["edge"]["foreground"], weights_dict["edge"]["background"]
+                )
+                seg_mask = np.where(
+                    pixel_is_robot_2d, weights_dict["seg"]["foreground"], weights_dict["seg"]["background"]
+                )
+                # Create visualization mask
+                visualization_mask = np.zeros((height, width), dtype=np.uint8)
+                visualization_mask[pixel_is_robot_2d] = 255
+                # Log statistics
+                robot_percentage = (robot_pixel_count / total_pixels) * 100
+                matched_percentage = (matched_pixel_count / total_pixels) * 100
+                logger.info(f"Frame {frame_num}: {robot_pixel_count} robot pixels ({robot_percentage:.2f}%)")
+                logger.info(f"Frame {frame_num}: {matched_pixel_count} matched pixels ({matched_percentage:.2f}%)")
+                # Save visualizations for this frame
+                save_visualization(visualization_mask, frame_num, "segmentation", viz_dir)
+                # Store the masks in the weight tensors
+                depth_weights[i] = torch.from_numpy(depth_mask)
+                vis_weights[i] = torch.from_numpy(vis_mask)
+                edge_weights[i] = torch.from_numpy(edge_mask)
+                seg_weights[i] = torch.from_numpy(seg_mask)
+            # Close the video capture
+            cap.release()
+        except Exception as e:
+            logger.warning(f"Warning: Error processing video: {e}")
+            logger.warning("Cannot process this example without proper frame data.")
+            raise ValueError(f"Cannot process example without frame data: {e}")
+    # Save weight tensors
+    # Convert weights to half precision (float16) to reduce file size
+    depth_weights_half = depth_weights.to(torch.float16)
+    vis_weights_half = vis_weights.to(torch.float16)
+    edge_weights_half = edge_weights.to(torch.float16)
+    seg_weights_half = seg_weights.to(torch.float16)
+    # Save the half precision tensors
+    torch.save(depth_weights_half, os.path.join(output_dir, "depth_weights.pt"))
+    torch.save(vis_weights_half, os.path.join(output_dir, "vis_weights.pt"))
+    torch.save(edge_weights_half, os.path.join(output_dir, "edge_weights.pt"))
+    torch.save(seg_weights_half, os.path.join(output_dir, "seg_weights.pt"))
+    logger.info(f"Saved weight matrices to {output_dir}")
+    logger.info(f"Weight matrix shape: {depth_weights_half.shape}, dtype: {depth_weights_half.dtype}")
+    logger.info(f"Saved visualizations to {viz_dir}")
+    return output_dir, viz_dir
+def process_all_examples(input_dir, output_dir, setting_name="fg_vis_edge_bg_seg", robot_keywords=None):
+    """Process all example directories in the provided input directory
+    Args:
+        input_dir (str): Input directory containing example folders
+        output_dir (str): Output directory for weight matrices
+        setting_name (str, optional): Weight setting name. Defaults to 'fg_vis_edge_bg_seg'.
+        robot_keywords (list, optional): List of keywords to identify robot classes. Defaults to None.
+    """
+    # Find all example directories
+    if not os.path.exists(input_dir):
+        logger.error(f"Input directory not found: {input_dir}")
+        return []
+    # List example directories
+    examples = [d for d in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, d))]
+    examples = sorted(examples)
+    if not examples:
+        logger.warning("No example directories found.")
+        return []
+    # Print found examples
+    logger.info(f"Found {len(examples)} example directories:")
+    for example in examples:
+        logger.info(f"  - {example}")
+    # Store processing results
+    results = []
+    # Process each example
+    for example in examples:
+        try:
+            logger.info(f"\nProcessing {example}...")
+            # Process this example with custom directories
+            out_dir, viz_dir = process_example_with_dirs(example, input_dir, output_dir, setting_name, robot_keywords)
+            results.append((example, out_dir, viz_dir))
+            logger.info(f"Results for {example} saved to:")
+            logger.info(f"  Weight matrices: {out_dir}")
+            logger.info(f"  Visualizations: {viz_dir}")
+        except Exception as e:
+            logger.error(f"Error processing {example}: {e}")
+    logger.info("\nAll examples processed.")
+    return results
+# Process a specific example with custom input and output directories
+def process_example_with_dirs(
+    example_name, input_dir, output_dir, setting_name="fg_vis_edge_bg_seg", robot_keywords=None
+):
+    """Process a specific example with custom input and output directories
+    Args:
+        example_name (str): Name of the example directory
+        input_dir (str): Path to input directory containing example folders
+        output_dir (str): Path to output directory for weight matrices
+        setting_name (str, optional): Weight setting name. Defaults to 'fg_vis_edge_bg_seg'.
+        robot_keywords (list, optional): List of keywords to identify robot classes. Defaults to None.
+    """
+    # Create paths for this example
+    example_dir = os.path.join(input_dir, example_name)
+    segmentation_dir = os.path.join(example_dir, "segmentation_label")
+    video_path = os.path.join(example_dir, "segmentation.mp4")
+    # Create output directories
+    example_output_dir = os.path.join(output_dir, example_name)
+    viz_dir = os.path.join(example_output_dir, "visualizations")
+    # Check if weight files already exist
+    depth_weights_path = os.path.join(example_output_dir, "depth_weights.pt")
+    if os.path.exists(depth_weights_path):
+        logger.info(f"Weight files already exist for {example_name}, skipping processing")
+        return example_output_dir, viz_dir
+    # Create output directories if they don't exist
+    os.makedirs(example_output_dir, exist_ok=True)
+    os.makedirs(viz_dir, exist_ok=True)
+    # Get weight settings
+    weights_dict = WeightSettings.get_settings(setting_name)
+    # Process this example directly with paths
+    return process_segmentation_files(
+        segmentation_dir=segmentation_dir,
+        output_dir=example_output_dir,
+        viz_dir=viz_dir,
+        video_path=video_path,
+        weights_dict=weights_dict,
+        setting_name=setting_name,
+        robot_keywords=robot_keywords,
+    )
+if __name__ == "__main__":
+    # Command-line entry point: parse the options below, configure the logger,
+    # and hand the resolved settings to process_all_examples().
+    # Parse command-line arguments
+    parser = argparse.ArgumentParser(
+        description="Process segmentation files to generate spatial-temporal weight matrices"
+    )
+    # Only the setting names reported by WeightSettings.list_settings() are accepted.
+    parser.add_argument(
+        "--setting",
+        type=str,
+        default="fg_vis_edge_bg_seg",
+        choices=WeightSettings.list_settings(),
+        help="Weight setting to use (default: fg_vis_edge_bg_seg (setting1), fg_edge_bg_seg (setting2))",
+    )
+    parser.add_argument(
+        "--input-dir",
+        type=str,
+        default="assets/robot_augmentation_example",
+        help="Input directory containing example folders",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="outputs/robot_augmentation_example",
+        help="Output directory for weight matrices",
+    )
+    # nargs="+" collects one or more space-separated keywords into a list.
+    parser.add_argument(
+        "--robot-keywords",
+        type=str,
+        nargs="+",
+        default=["world_robot", "gripper", "robot"],
+        help="Keywords used to identify robot classes (default: world_robot gripper robot)",
+    )
+    parser.add_argument(
+        "--log-level",
+        type=str,
+        default="INFO",
+        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
+        help="Set the logging level",
+    )
+    args = parser.parse_args()
+    # Set logging level from command line argument
+    # (the level name is looked up as an attribute of `logging`; this assumes
+    # `logging` is the standard-library module -- TODO confirm against the imports)
+    logger.setLevel(getattr(logging, args.log_level))
+    # Get directories from arguments
+    input_dir = args.input_dir
+    output_dir = args.output_dir
+    setting_name = args.setting
+    robot_keywords = args.robot_keywords
+    # Echo the effective configuration before starting the run.
+    logger.info(f"Using input directory: {input_dir}")
+    logger.info(f"Using output directory: {output_dir}")
+    logger.info(f"Using weight setting: {setting_name}")
+    logger.info(f"Using robot keywords: {robot_keywords}")
+    # Process all examples with the provided input and output directories
+    process_all_examples(input_dir, output_dir, setting_name, robot_keywords)

cosmos_transfer1/auxiliary/sam2/sam2_model.py ADDED Viewed

	@@ -0,0 +1,392 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+import numpy as np
+import pycocotools.mask as mask_util
+import torch
+from cosmos_transfer1.utils import log
+sys.path.append("cosmos_transfer1/auxiliary")
+import tempfile
+from PIL import Image
+from sam2.sam2_video_predictor import SAM2VideoPredictor
+from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
+from cosmos_transfer1.auxiliary.sam2.sam2_utils import (
+    capture_fps,
+    convert_masks_to_frames,
+    generate_tensor_from_images,
+    video_to_frames,
+    write_video,
+)
+from cosmos_transfer1.checkpoints import GROUNDING_DINO_MODEL_CHECKPOINT, SAM2_MODEL_CHECKPOINT
+def rle_encode(mask: np.ndarray) -> dict:
+    """
+    Encode a boolean mask (of shape (T, H, W)) using the pycocotools RLE format,
+    matching the format of eff_segmentation.RleMaskSAMv2 (from Yotta).
+    The procedure is:
+      1. Convert the mask to a numpy array in Fortran order.
+      2. Reshape the array to (-1, 1) (i.e. flatten in Fortran order).
+      3. Call pycocotools.mask.encode on the reshaped array.
+      4. Return a dictionary with the encoded data and the original mask shape.
+    """
+    mask = np.array(mask, order="F")
+    # Reshape the mask to (-1, 1) in Fortran order and encode it.
+    encoded = mask_util.encode(np.array(mask.reshape(-1, 1), order="F"))
+    return {"data": encoded, "mask_shape": mask.shape}
+class VideoSegmentationModel:
+    def __init__(self, **kwargs):
+        """Initialize the model and load all required components."""
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Initialize SAM2 predictor
+        self.sam2_predictor = SAM2VideoPredictor.from_pretrained(SAM2_MODEL_CHECKPOINT).to(self.device)
+        # Initialize GroundingDINO for text-based detection
+        self.grounding_model_name = kwargs.get("grounding_model", GROUNDING_DINO_MODEL_CHECKPOINT)
+        self.processor = AutoProcessor.from_pretrained(self.grounding_model_name)
+        self.grounding_model = AutoModelForZeroShotObjectDetection.from_pretrained(self.grounding_model_name).to(
+            self.device
+        )
+    def get_boxes_from_text(self, image_path, text_prompt):
+        """Get bounding boxes (and labels) from a text prompt using GroundingDINO."""
+        image = Image.open(image_path).convert("RGB")
+        inputs = self.processor(images=image, text=text_prompt, return_tensors="pt").to(self.device)
+        with torch.no_grad():
+            outputs = self.grounding_model(**inputs)
+        # Try with initial thresholds.
+        results = self.processor.post_process_grounded_object_detection(
+            outputs,
+            inputs.input_ids,
+            box_threshold=0.15,
+            text_threshold=0.25,
+            target_sizes=[image.size[::-1]],
+        )
+        boxes = results[0]["boxes"].cpu().numpy()
+        scores = results[0]["scores"].cpu().numpy()
+        labels = results[0].get("labels", None)
+        if len(boxes) == 0:
+            print(f"No boxes detected for prompt: '{text_prompt}'. Trying with lower thresholds...")
+            results = self.processor.post_process_grounded_object_detection(
+                outputs,
+                inputs.input_ids,
+                box_threshold=0.1,
+                text_threshold=0.1,
+                target_sizes=[image.size[::-1]],
+            )
+            boxes = results[0]["boxes"].cpu().numpy()
+            scores = results[0]["scores"].cpu().numpy()
+            labels = results[0].get("labels", None)
+        if len(boxes) > 0:
+            print(f"Found {len(boxes)} boxes with scores: {scores}")
+            # Sort boxes by confidence score in descending order
+            sorted_indices = np.argsort(scores)[::-1]
+            boxes = boxes[sorted_indices]
+            scores = scores[sorted_indices]
+            if labels is not None:
+                labels = np.array(labels)[sorted_indices]
+        else:
+            print("Still no boxes detected. Consider adjusting the prompt or using box/points mode.")
+        return {"boxes": boxes, "labels": labels, "scores": scores}
+    def visualize_frame(self, frame_idx, obj_ids, masks, video_dir, frame_names, visualization_data, save_dir=None):
+        """
+        Process a single frame: load the image, apply the segmentation mask to black out the
+        detected object(s), and save both the masked frame and the binary mask image.
+        """
+        # Load the frame.
+        frame_path = os.path.join(video_dir, frame_names[frame_idx])
+        img = Image.open(frame_path).convert("RGB")
+        image_np = np.array(img)
+        # Combine masks from the detection output.
+        if isinstance(masks, torch.Tensor):
+            mask_np = (masks[0] > 0.0).cpu().numpy().astype(bool)
+            combined_mask = mask_np
+        elif isinstance(masks, dict):
+            first_mask = next(iter(masks.values()))
+            combined_mask = np.zeros_like(first_mask, dtype=bool)
+            for m in masks.values():
+                combined_mask |= m
+        else:
+            combined_mask = None
+        if combined_mask is not None:
+            combined_mask = np.squeeze(combined_mask)
+            # If the mask shape doesn't match the image, resize it.
+            if combined_mask.shape != image_np.shape[:2]:
+                mask_img = Image.fromarray((combined_mask.astype(np.uint8)) * 255)
+                mask_img = mask_img.resize((image_np.shape[1], image_np.shape[0]), resample=Image.NEAREST)
+                combined_mask = np.array(mask_img) > 127
+            # Black out the detected region.
+            image_np[combined_mask] = 0
+            mask_image = (combined_mask.astype(np.uint8)) * 255
+            mask_pil = Image.fromarray(mask_image)
+        if save_dir:
+            seg_frame_path = os.path.join(save_dir, f"frame_{frame_idx}_segmented.png")
+            seg_pil = Image.fromarray(image_np)
+            seg_pil.save(seg_frame_path)
+            if combined_mask is not None:
+                mask_save_path = os.path.join(save_dir, f"frame_{frame_idx}_mask.png")
+                mask_pil.save(mask_save_path)
+    def sample(self, **kwargs):
+        """
+        Main sampling function for video segmentation.
+        Returns a list of detections in which each detection contains a phrase and
+        an RLE-encoded segmentation mask (matching the output of the Grounded SAM model).
+        """
+        video_dir = kwargs.get("video_dir", "")
+        mode = kwargs.get("mode", "points")
+        input_data = kwargs.get("input_data", None)
+        save_dir = kwargs.get("save_dir", None)
+        visualize = kwargs.get("visualize", False)
+        # Get frame names (expecting frames named as numbers with .jpg/.jpeg extension).
+        frame_names = [p for p in os.listdir(video_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]]
+        frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
+        with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
+            state = self.sam2_predictor.init_state(video_path=video_dir)
+            ann_frame_idx = 0
+            ann_obj_id = 1
+            boxes = None
+            points = None
+            labels = None
+            box = None
+            visualization_data = {"mode": mode, "points": None, "labels": None, "box": None, "boxes": None}
+            if input_data is not None:
+                if mode == "points":
+                    points = input_data.get("points")
+                    labels = input_data.get("labels")
+                    frame_idx, obj_ids, masks = self.sam2_predictor.add_new_points_or_box(
+                        inference_state=state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, points=points, labels=labels
+                    )
+                    visualization_data["points"] = points
+                    visualization_data["labels"] = labels
+                elif mode == "box":
+                    box = input_data.get("box")
+                    frame_idx, obj_ids, masks = self.sam2_predictor.add_new_points_or_box(
+                        inference_state=state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, box=box
+                    )
+                    visualization_data["box"] = box
+                elif mode == "prompt":
+                    text = input_data.get("text")
+                    first_frame_path = os.path.join(video_dir, frame_names[0])
+                    gd_results = self.get_boxes_from_text(first_frame_path, text)
+                    boxes = gd_results["boxes"]
+                    labels_out = gd_results["labels"]
+                    scores = gd_results["scores"]
+                    log.info(f"scores: {scores}")
+                    if len(boxes) > 0:
+                        legacy_mask = kwargs.get("legacy_mask", False)
+                        if legacy_mask:
+                            # Use only the highest confidence box for legacy mask
+                            log.info(f"using legacy_mask: {legacy_mask}")
+                            frame_idx, obj_ids, masks = self.sam2_predictor.add_new_points_or_box(
+                                inference_state=state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, box=boxes[0]
+                            )
+                            # Update boxes and labels after processing
+                            boxes = boxes[:1]
+                            if labels_out is not None:
+                                labels_out = labels_out[:1]
+                        else:
+                            log.info(f"using new_mask: {legacy_mask}")
+                            for object_id, (box, label) in enumerate(zip(boxes, labels_out)):
+                                frame_idx, obj_ids, masks = self.sam2_predictor.add_new_points_or_box(
+                                    inference_state=state, frame_idx=ann_frame_idx, obj_id=object_id, box=box
+                                )
+                        visualization_data["boxes"] = boxes
+                        self.grounding_labels = [str(lbl) for lbl in labels_out] if labels_out is not None else [text]
+                    else:
+                        print("No boxes detected. Exiting.")
+                        return []  # Return empty list if no detections
+                if visualize:
+                    self.visualize_frame(
+                        frame_idx=ann_frame_idx,
+                        obj_ids=obj_ids,
+                        masks=masks,
+                        video_dir=video_dir,
+                        frame_names=frame_names,
+                        visualization_data=visualization_data,
+                        save_dir=save_dir,
+                    )
+            video_segments = {}  # keys: frame index, values: {obj_id: mask}
+            for out_frame_idx, out_obj_ids, out_mask_logits in self.sam2_predictor.propagate_in_video(state):
+                video_segments[out_frame_idx] = {
+                    out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)
+                }
+                # For propagated frames, visualization_data is not used.
+                if visualize:
+                    propagate_visualization_data = {
+                        "mode": mode,
+                        "points": None,
+                        "labels": None,
+                        "box": None,
+                        "boxes": None,
+                    }
+                    self.visualize_frame(
+                        frame_idx=out_frame_idx,
+                        obj_ids=out_obj_ids,
+                        masks=video_segments[out_frame_idx],
+                        video_dir=video_dir,
+                        frame_names=frame_names,
+                        visualization_data=propagate_visualization_data,
+                        save_dir=save_dir,
+                    )
+        # --- Post-process video_segments to produce a list of detections ---
+        if len(video_segments) == 0:
+            return []
+        first_frame_path = os.path.join(video_dir, frame_names[0])
+        first_frame = np.array(Image.open(first_frame_path).convert("RGB"))
+        original_shape = first_frame.shape[:2]  # (height, width)
+        object_masks = {}  # key: obj_id, value: list of 2D boolean masks
+        sorted_frame_indices = sorted(video_segments.keys())
+        for frame_idx in sorted_frame_indices:
+            segments = video_segments[frame_idx]
+            for obj_id, mask in segments.items():
+                mask = np.squeeze(mask)
+                if mask.ndim != 2:
+                    print(f"Warning: Unexpected mask shape {mask.shape} for object {obj_id} in frame {frame_idx}.")
+                    continue
+                if mask.shape != original_shape:
+                    mask_img = Image.fromarray(mask.astype(np.uint8) * 255)
+                    mask_img = mask_img.resize((original_shape[1], original_shape[0]), resample=Image.NEAREST)
+                    mask = np.array(mask_img) > 127
+                if obj_id not in object_masks:
+                    object_masks[obj_id] = []
+                object_masks[obj_id].append(mask)
+        detections = []
+        for obj_id, mask_list in object_masks.items():
+            mask_stack = np.stack(mask_list, axis=0)  # shape: (T, H, W)
+            # Use our new rle_encode (which now follows the eff_segmentation.RleMaskSAMv2 format)
+            rle = rle_encode(mask_stack)
+            if mode == "prompt" and hasattr(self, "grounding_labels"):
+                phrase = self.grounding_labels[0]
+            else:
+                phrase = input_data.get("text", "")
+            detection = {"phrase": phrase, "segmentation_mask_rle": rle}
+            detections.append(detection)
+        return detections
+    @staticmethod
+    def parse_points(points_str):
+        """Parse a string of points into a numpy array.
+        Supports a single point ('200,300') or multiple points separated by ';' (e.g., '200,300;100,150').
+        """
+        points = []
+        for point in points_str.split(";"):
+            coords = point.split(",")
+            if len(coords) != 2:
+                continue
+            points.append([float(coords[0]), float(coords[1])])
+        return np.array(points, dtype=np.float32)
+    @staticmethod
+    def parse_labels(labels_str):
+        """Parse a comma-separated string of labels into a numpy array."""
+        return np.array([int(x) for x in labels_str.split(",")], dtype=np.int32)
+    @staticmethod
+    def parse_box(box_str):
+        """Parse a comma-separated string of 4 box coordinates into a numpy array."""
+        return np.array([float(x) for x in box_str.split(",")], dtype=np.float32)
+    def __call__(
+        self,
+        input_video,
+        output_video=None,
+        output_tensor=None,
+        prompt=None,
+        box=None,
+        points=None,
+        labels=None,
+        weight_scaler=None,
+        binarize_video=False,
+        legacy_mask=False,
+    ):
+        log.info(
+            f"Processing video: {input_video} to generate segmentation video: {output_video} segmentation tensor: {output_tensor}"
+        )
+        assert os.path.exists(input_video)
+        # Prepare input data based on the selected mode.
+        if points is not None:
+            mode = "points"
+            input_data = {"points": self.parse_points(points), "labels": self.parse_labels(labels)}
+        elif box is not None:
+            mode = "box"
+            input_data = {"box": self.parse_box(box)}
+        elif prompt is not None:
+            mode = "prompt"
+            input_data = {"text": prompt}
+        with tempfile.TemporaryDirectory() as temp_input_dir:
+            fps = capture_fps(input_video)
+            video_to_frames(input_video, temp_input_dir)
+            with tempfile.TemporaryDirectory() as temp_output_dir:
+                masks = self.sample(
+                    video_dir=temp_input_dir,
+                    mode=mode,
+                    input_data=input_data,
+                    save_dir=str(temp_output_dir),
+                    visualize=True,
+                    legacy_mask=legacy_mask,
+                )
+                if output_video:
+                    os.makedirs(os.path.dirname(output_video), exist_ok=True)
+                    frames = convert_masks_to_frames(masks)
+                    if binarize_video:
+                        frames = np.any(frames > 0, axis=-1).astype(np.uint8) * 255
+                    write_video(frames, output_video, fps)
+                if output_tensor:
+                    generate_tensor_from_images(
+                        temp_output_dir, output_tensor, fps, "mask", weight_scaler=weight_scaler
+                    )

cosmos_transfer1/auxiliary/sam2/sam2_pipeline.py ADDED Viewed

	@@ -0,0 +1,126 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import tempfile
+import numpy as np
+from cosmos_transfer1.auxiliary.sam2.sam2_model import VideoSegmentationModel
+from cosmos_transfer1.auxiliary.sam2.sam2_utils import (
+    capture_fps,
+    generate_tensor_from_images,
+    generate_video_from_images,
+    video_to_frames,
+)
+def parse_args():
+    parser = argparse.ArgumentParser(description="Video Segmentation using SAM2")
+    parser.add_argument("--input_video", type=str, required=True, help="Path to input video file")
+    parser.add_argument(
+        "--output_video", type=str, default="./outputs/output_video.mp4", help="Path to save the output video"
+    )
+    parser.add_argument(
+        "--output_tensor", type=str, default="./outputs/output_tensor.pt", help="Path to save the output tensor"
+    )
+    parser.add_argument(
+        "--mode", type=str, choices=["points", "box", "prompt"], default="points", help="Segmentation mode"
+    )
+    parser.add_argument("--prompt", type=str, help="Text prompt for prompt mode")
+    parser.add_argument(
+        "--grounding_model_path",
+        type=str,
+        default="IDEA-Research/grounding-dino-tiny",
+        help="Local directory for GroundingDINO model files",
+    )
+    parser.add_argument(
+        "--points",
+        type=str,
+        default="200,300",
+        help="Comma-separated point coordinates for points mode (e.g., '200,300' or for multiple points use ';' as a separator, e.g., '200,300;100,150').",
+    )
+    parser.add_argument(
+        "--labels",
+        type=str,
+        default="1",
+        help="Comma-separated labels for points mode (e.g., '1' or '1,0' for multiple points).",
+    )
+    parser.add_argument(
+        "--box",
+        type=str,
+        default="300,0,500,400",
+        help="Comma-separated box coordinates for box mode (e.g., '300,0,500,400').",
+    )
+    # New flag to control visualization.
+    parser.add_argument("--visualize", action="store_true", help="If set, visualize segmentation frames (save images)")
+    return parser.parse_args()
+def parse_points(points_str):
+    """Parse a string of points into a numpy array.
+    Supports a single point ('200,300') or multiple points separated by ';' (e.g., '200,300;100,150').
+    """
+    points = []
+    for point in points_str.split(";"):
+        coords = point.split(",")
+        if len(coords) != 2:
+            continue
+        points.append([float(coords[0]), float(coords[1])])
+    return np.array(points, dtype=np.float32)
+def parse_labels(labels_str):
+    """Parse a comma-separated string of labels into a numpy array."""
+    return np.array([int(x) for x in labels_str.split(",")], dtype=np.int32)
+def parse_box(box_str):
+    """Parse a comma-separated string of 4 box coordinates into a numpy array."""
+    return np.array([float(x) for x in box_str.split(",")], dtype=np.float32)
+def main():
+    args = parse_args()
+    # Initialize the segmentation model.
+    model = VideoSegmentationModel(**vars(args))
+    # Prepare input data based on the selected mode.
+    if args.mode == "points":
+        input_data = {"points": parse_points(args.points), "labels": parse_labels(args.labels)}
+    elif args.mode == "box":
+        input_data = {"box": parse_box(args.box)}
+    elif args.mode == "prompt":
+        input_data = {"text": args.prompt}
+    with tempfile.TemporaryDirectory() as temp_input_dir:
+        fps = capture_fps(args.input_video)
+        video_to_frames(args.input_video, temp_input_dir)
+        with tempfile.TemporaryDirectory() as temp_output_dir:
+            model.sample(
+                video_dir=temp_input_dir,
+                mode=args.mode,
+                input_data=input_data,
+                save_dir=str(temp_output_dir),
+                visualize=True,
+            )
+            generate_video_from_images(temp_output_dir, args.output_video, fps)
+            generate_tensor_from_images(temp_output_dir, args.output_tensor, fps, "mask")
+if __name__ == "__main__":
+    # Run the pipeline only when this file is executed as a script.
+    print("Starting video segmentation...")
+    main()

cosmos_transfer1/auxiliary/sam2/sam2_utils.py ADDED Viewed

	@@ -0,0 +1,168 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import tempfile
+import time
+import cv2
+import imageio
+import numpy as np
+import pycocotools.mask
+import torch
+from natsort import natsorted
+from PIL import Image
+from torchvision import transforms
+from cosmos_transfer1.diffusion.datasets.augmentors.control_input import (
+    decode_partial_rle_width1,
+    segmentation_color_mask,
+)
+from cosmos_transfer1.utils import log
+def write_video(frames, output_path, fps=30):
+    """
+    expects a sequence of [H, W, 3] or [H, W] frames
+    """
+    with imageio.get_writer(output_path, fps=fps, macro_block_size=8) as writer:
+        for frame in frames:
+            if len(frame.shape) == 2:  # single channel
+                frame = frame[:, :, None].repeat(3, axis=2)
+            writer.append_data(frame)
+def capture_fps(input_video_path: str):
+    cap = cv2.VideoCapture(input_video_path)
+    fps = cap.get(cv2.CAP_PROP_FPS)
+    return fps
+def video_to_frames(input_loc, output_loc):
+    """Function to extract frames from input video file
+    and save them as separate frames in an output directory.
+    Args:
+        input_loc: Input video file.
+        output_loc: Output directory to save the frames.
+    Returns:
+        None
+    """
+    try:
+        os.mkdir(output_loc)
+    except OSError:
+        pass
+    # Log the time
+    time_start = time.time()
+    # Start capturing the feed
+    cap = cv2.VideoCapture(input_loc)
+    # Find the number of frames
+    video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    print(f"Number of frames: {video_length}")
+    count = 0
+    print("Converting video..\n")
+    # Start converting the video
+    while cap.isOpened():
+        # Extract the frame
+        ret, frame = cap.read()
+        if not ret:
+            continue
+        # Write the results back to output location.
+        cv2.imwrite(output_loc + "/%#05d.jpg" % (count + 1), frame)
+        count = count + 1
+        # If there are no more frames left
+        if count > (video_length - 1):
+            # Log the time again
+            time_end = time.time()
+            # Release the feed
+            cap.release()
+            # Print stats
+            print("Done extracting frames.\n%d frames extracted" % count)
+            print("It took %d seconds forconversion." % (time_end - time_start))
+            break
+# Helpers that turn RLE-encoded masks into frames and videos
+def convert_masks_to_frames(masks: list, num_masks_max: int = 100):
+    T, H, W = shape = masks[0]["segmentation_mask_rle"]["mask_shape"]
+    frame_start, frame_end = 0, T
+    num_masks = min(num_masks_max, len(masks))
+    mask_ids_select = np.arange(num_masks).tolist()
+    all_masks = np.zeros((num_masks, T, H, W), dtype=np.uint8)
+    for idx, mid in enumerate(mask_ids_select):
+        mask = masks[mid]
+        num_byte_per_mb = 1024 * 1024
+        # total number of elements in uint8 (1 byte) / num_byte_per_mb
+        if shape[0] * shape[1] * shape[2] / num_byte_per_mb > 256:
+            rle = decode_partial_rle_width1(
+                mask["segmentation_mask_rle"]["data"],
+                frame_start * shape[1] * shape[2],
+                frame_end * shape[1] * shape[2],
+            )
+            partial_shape = (frame_end - frame_start, shape[1], shape[2])
+            rle = rle.reshape(partial_shape) * 255
+        else:
+            rle = pycocotools.mask.decode(mask["segmentation_mask_rle"]["data"])
+            rle = rle.reshape(shape) * 255
+            # Select the frames that are in the video
+            frame_indices = np.arange(frame_start, frame_end).tolist()
+            rle = np.stack([rle[i] for i in frame_indices])
+        all_masks[idx] = rle
+        del rle
+    all_masks = segmentation_color_mask(all_masks)  # NTHW -> 3THW
+    all_masks = all_masks.transpose(1, 2, 3, 0)
+    return all_masks
+def generate_video_from_images(masks: list, output_file_path: str, fps, num_masks_max: int = 100):
+    all_masks = convert_masks_to_frames(masks, num_masks_max)
+    write_video(all_masks, output_file_path, fps)
+    print("Video generated successfully!")
+def generate_tensor_from_images(
+    image_path_str: str, output_file_path: str, fps, search_pattern: str = None, weight_scaler: float = None
+):
+    images = list()
+    image_path = os.path.abspath(image_path_str)
+    if search_pattern is None:
+        images = [img for img in natsorted(os.listdir(image_path))]
+    else:
+        for img in natsorted(os.listdir(image_path)):
+            if img.__contains__(search_pattern):
+                images.append(img)
+    transform = transforms.ToTensor()
+    image_tensors = list()
+    for image in images:
+        img_tensor = transform(Image.open(os.path.join(image_path, image)))
+        image_tensors.append(img_tensor.squeeze(0))
+    tensor = torch.stack(image_tensors)  # [T, H, W], binary values, float
+    if weight_scaler is not None:
+        log.info(f"scaling the tensor by the specified scale: {weight_scaler}")
+        tensor = tensor * weight_scaler
+    log.info(f"saving tensor shape: {tensor.shape} to {output_file_path}")
+    torch.save(tensor, output_file_path)
+if __name__ == "__main__":
+    input_loc = "cosmos_transfer1/models/sam2/assets/input_video.mp4"
+    output_loc = os.path.abspath(tempfile.TemporaryDirectory().name)
+    print(f"output_loc --- {output_loc}")
+    video_to_frames(input_loc, output_loc)

cosmos_transfer1/auxiliary/tokenizer/inference/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

cosmos_transfer1/auxiliary/tokenizer/inference/image_cli.py ADDED Viewed

	@@ -0,0 +1,188 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A CLI to run ImageTokenizer on plain images based on torch.jit.
+Usage:
+    python3 -m cosmos_transfer1.auxiliary.tokenizer.inference.image_cli \
+        --image_pattern 'path/to/input/folder/*.jpg' \
+        --output_dir ./reconstructions \
+        --checkpoint_enc ./checkpoints/<model-name>/encoder.jit \
+        --checkpoint_dec ./checkpoints/<model-name>/decoder.jit
+    Optionally, you can run the model in pure PyTorch mode:
+    python3 -m cosmos_transfer1.auxiliary.tokenizer.inference.image_cli \
+        --image_pattern 'path/to/input/folder/*.jpg' \
+        --mode torch \
+        --tokenizer_type CI \
+        --spatial_compression 8 \
+        --checkpoint_enc ./checkpoints/<model-name>/encoder.jit \
+        --checkpoint_dec ./checkpoints/<model-name>/decoder.jit
+"""
+import os
+import sys
+from argparse import ArgumentParser, Namespace
+from typing import Any
+import numpy as np
+from loguru import logger as logging
+from cosmos_transfer1.auxiliary.tokenizer.inference.image_lib import ImageTokenizer
+from cosmos_transfer1.auxiliary.tokenizer.inference.utils import (
+    get_filepaths,
+    get_output_filepath,
+    read_image,
+    resize_image,
+    write_image,
+)
+from cosmos_transfer1.auxiliary.tokenizer.networks import TokenizerConfigs
+def _parse_args() -> tuple[Namespace, dict[str, Any]]:
+    parser = ArgumentParser(description="A CLI for running ImageTokenizer on plain images.")
+    parser.add_argument(
+        "--image_pattern",
+        type=str,
+        default="path/to/images/*.jpg",
+        help="Glob pattern.",
+    )
+    parser.add_argument(
+        "--checkpoint",
+        type=str,
+        default=None,
+        help="JIT full Autoencoder model filepath.",
+    )
+    parser.add_argument(
+        "--checkpoint_enc",
+        type=str,
+        default=None,
+        help="JIT Encoder model filepath.",
+    )
+    parser.add_argument(
+        "--checkpoint_dec",
+        type=str,
+        default=None,
+        help="JIT Decoder model filepath.",
+    )
+    parser.add_argument(
+        "--tokenizer_type",
+        type=str,
+        choices=["CI", "DI"],
+        help="Specifies the tokenizer type.",
+    )
+    parser.add_argument(
+        "--spatial_compression",
+        type=int,
+        choices=[8, 16],
+        default=8,
+        help="The spatial compression factor.",
+    )
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["torch", "jit"],
+        default="jit",
+        help="Specify the backend: native 'torch' or 'jit' (default: 'jit')",
+    )
+    parser.add_argument(
+        "--short_size",
+        type=int,
+        default=None,
+        help="The size to resample inputs. None, by default.",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="bfloat16",
+        help="Sets the precision. Default bfloat16.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Device for invoking the model.",
+    )
+    parser.add_argument("--output_dir", type=str, default=None, help="Output directory.")
+    parser.add_argument(
+        "--save_input",
+        action="store_true",
+        help="If on, the input image will be be outputed too.",
+    )
+    args = parser.parse_args()
+    return args
+# NOTE: the arguments are parsed at module level, so this runs (and reads
+# sys.argv) as soon as the module is imported; `_run_eval` reads the
+# resulting module-level `args`.
+logging.info("Initializes args ...")
+args = _parse_args()
+# `--tokenizer_type` has no default, so it is None unless given; the 'torch'
+# backend needs it to look up a tokenizer config.
+if args.mode == "torch" and args.tokenizer_type not in ["CI", "DI"]:
+    logging.error("'torch' backend requires the tokenizer_type of 'CI' or 'DI'.")
+    sys.exit(1)
+def _run_eval() -> None:
+    """Invokes the evaluation pipeline."""
+    if args.checkpoint_enc is None and args.checkpoint_dec is None and args.checkpoint is None:
+        logging.warning("Aborting. Both encoder or decoder JIT required. Or provide the full autoencoder JIT model.")
+        return
+    if args.mode == "torch":
+        tokenizer_config = TokenizerConfigs[args.tokenizer_type].value
+        tokenizer_config.update(dict(spatial_compression=args.spatial_compression))
+    else:
+        tokenizer_config = None
+    logging.info(
+        f"Loading a torch.jit model `{os.path.dirname(args.checkpoint or args.checkpoint_enc or args.checkpoint_dec)}` ..."
+    )
+    autoencoder = ImageTokenizer(
+        checkpoint=args.checkpoint,
+        checkpoint_enc=args.checkpoint_enc,
+        checkpoint_dec=args.checkpoint_dec,
+        tokenizer_config=tokenizer_config,
+        device=args.device,
+        dtype=args.dtype,
+    )
+    filepaths = get_filepaths(args.image_pattern)
+    logging.info(f"Found {len(filepaths)} images from {args.image_pattern}.")
+    for filepath in filepaths:
+        logging.info(f"Reading image {filepath} ...")
+        image = read_image(filepath)
+        image = resize_image(image, short_size=args.short_size)
+        batch_image = np.expand_dims(image, axis=0)
+        logging.info("Invoking the autoencoder model in ... ")
+        output_image = autoencoder(batch_image)[0]
+        output_filepath = get_output_filepath(filepath, output_dir=args.output_dir)
+        logging.info(f"Outputing {output_filepath} ...")
+        write_image(output_filepath, output_image)
+        if args.save_input:
+            ext = os.path.splitext(output_filepath)[-1]
+            input_filepath = output_filepath.replace(ext, "_input" + ext)
+            write_image(input_filepath, image)
+# `logging` is the loguru logger here; `catch(reraise=True)` is expected to
+# log any exception escaping main() and then re-raise it.
+@logging.catch(reraise=True)
+def main() -> None:
+    """Entry point of the image CLI: runs the reconstruction pipeline."""
+    _run_eval()
+# Invoke main() only when this file is executed as a script/module.
+if __name__ == "__main__":
+    main()

cosmos_transfer1/auxiliary/tokenizer/inference/image_lib.py ADDED Viewed

	@@ -0,0 +1,124 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A library for image tokenizers inference."""
+from typing import Any
+import numpy as np
+import torch
+from cosmos_transfer1.auxiliary.tokenizer.inference.utils import (
+    load_decoder_model,
+    load_encoder_model,
+    load_model,
+    numpy2tensor,
+    pad_image_batch,
+    tensor2numpy,
+    unpad_image_batch,
+)
+class ImageTokenizer(torch.nn.Module):
+    def __init__(
+        self,
+        checkpoint: str = None,
+        checkpoint_enc: str = None,
+        checkpoint_dec: str = None,
+        tokenizer_config: dict[str, Any] = None,
+        device: str = "cuda",
+        dtype: str = "bfloat16",
+    ) -> None:
+        super().__init__()
+        self._device = device
+        self._dtype = getattr(torch, dtype)
+        self._full_model = (
+            load_model(checkpoint, tokenizer_config, device).to(self._dtype) if checkpoint is not None else None
+        )
+        self._enc_model = (
+            load_encoder_model(checkpoint_enc, tokenizer_config, device).to(self._dtype)
+            if checkpoint_enc is not None
+            else None
+        )
+        self._dec_model = (
+            load_decoder_model(checkpoint_dec, tokenizer_config, device).to(self._dtype)
+            if checkpoint_dec is not None
+            else None
+        )
+    @torch.no_grad()
+    def autoencode(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Reconstrcuts a batch of image tensors after embedding into a latent.
+        Args:
+            input_tensor: The input image Bx3xHxW layout, range [-1..1].
+        Returns:
+            The reconstructed tensor, layout Bx3xHxW, range [-1..1].
+        """
+        if self._full_model is not None:
+            output_tensor = self._full_model(input_tensor)
+            output_tensor = output_tensor[0] if isinstance(output_tensor, tuple) else output_tensor
+        else:
+            output_latent = self.encode(input_tensor)[0]
+            output_tensor = self.decode(output_latent)
+        return output_tensor
+    @torch.no_grad()
+    def decode(self, input_latent: torch.Tensor) -> torch.Tensor:
+        """Decodes an image from a provided latent embedding.
+        Args:
+            input_latent: The continuous latent Bx16xhxw for CI,
+                    or the discrete indices Bxhxw for DI.
+        Returns:
+            The output tensor in Bx3xHxW, range [-1..1].
+        """
+        return self._dec_model(input_latent)
+    @torch.no_grad()
+    def encode(self, input_tensor: torch.Tensor) -> tuple[torch.Tensor]:
+        """Encodes an image into a latent embedding or code.
+        Args:
+            input_tensor: The input tensor Bx3xHxW layout, range [-1..1].
+        Returns:
+            For continuous image (CI) tokenizer, the tuple contains:
+                - The latent embedding, Bx16x(h)x(w), where the compression
+                rate is (H/h x W/w), and channel dimension of 16.
+            For discrete image (DI) tokenizer, the tuple contains:
+                - The indices, Bx(h)x(w), from a codebook of size 64K, which
+                corresponds to FSQ levels of (8,8,8,5,5,5).
+               - The discrete code, Bx6x(h)x(w), where the compression rate is
+                again (H/h x W/w), and channel dimension of 6.
+        """
+        output_latent = self._enc_model(input_tensor)
+        if isinstance(output_latent, torch.Tensor):
+            return output_latent
+        return output_latent[:-1]
+    @torch.no_grad()
+    def forward(self, image: np.ndarray) -> np.ndarray:
+        """Reconstructs an image using a pre-trained tokenizer.
+        Args:
+            image: The input image BxHxWxC layout, range [0..255].
+        Returns:
+            The reconstructed image in range [0..255], layout BxHxWxC.
+        """
+        padded_input_image, crop_region = pad_image_batch(image)
+        input_tensor = numpy2tensor(padded_input_image, dtype=self._dtype, device=self._device)
+        output_tensor = self.autoencode(input_tensor)
+        padded_output_image = tensor2numpy(output_tensor)
+        return unpad_image_batch(padded_output_image, crop_region)

cosmos_transfer1/auxiliary/tokenizer/inference/utils.py ADDED Viewed

	@@ -0,0 +1,402 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utility functions for the inference libraries."""
+import os
+from glob import glob
+from typing import Any
+import mediapy as media
+import numpy as np
+import torch
+from cosmos_transfer1.auxiliary.tokenizer.networks import TokenizerModels
+_DTYPE, _DEVICE = torch.bfloat16, "cuda"
+_UINT8_MAX_F = float(torch.iinfo(torch.uint8).max)
+_SPATIAL_ALIGN = 16
+_TEMPORAL_ALIGN = 8
+def load_model(
+    jit_filepath: str = None,
+    tokenizer_config: dict[str, Any] = None,
+    device: str = "cuda",
+) -> torch.nn.Module | torch.jit.ScriptModule:
+    """Loads a torch.nn.Module from a filepath.
+    Args:
+        jit_filepath: The filepath to the JIT-compiled model.
+        device: The device to load the model onto, default=cuda.
+    Returns:
+        The JIT compiled model loaded to device and on eval mode.
+    """
+    if tokenizer_config is None:
+        return load_jit_model(jit_filepath, device)
+    full_model, ckpts = _load_pytorch_model(jit_filepath, tokenizer_config, device)
+    full_model.load_state_dict(ckpts.state_dict(), strict=False)
+    return full_model.eval().to(device)
+def load_encoder_model(
+    jit_filepath: str = None,
+    tokenizer_config: dict[str, Any] = None,
+    device: str = "cuda",
+) -> torch.nn.Module | torch.jit.ScriptModule:
+    """Loads a torch.nn.Module from a filepath.
+    Args:
+        jit_filepath: The filepath to the JIT-compiled model.
+        device: The device to load the model onto, default=cuda.
+    Returns:
+        The JIT compiled model loaded to device and on eval mode.
+    """
+    if tokenizer_config is None:
+        return load_jit_model(jit_filepath, device)
+    full_model, ckpts = _load_pytorch_model(jit_filepath, tokenizer_config, device)
+    encoder_model = full_model.encoder_jit()
+    encoder_model.load_state_dict(ckpts.state_dict(), strict=False)
+    return encoder_model.eval().to(device)
+def load_decoder_model(
+    jit_filepath: str = None,
+    tokenizer_config: dict[str, Any] = None,
+    device: str = "cuda",
+) -> torch.nn.Module | torch.jit.ScriptModule:
+    """Loads a torch.nn.Module from a filepath.
+    Args:
+        jit_filepath: The filepath to the JIT-compiled model.
+        device: The device to load the model onto, default=cuda.
+    Returns:
+        The JIT compiled model loaded to device and on eval mode.
+    """
+    if tokenizer_config is None:
+        return load_jit_model(jit_filepath, device)
+    full_model, ckpts = _load_pytorch_model(jit_filepath, tokenizer_config, device)
+    decoder_model = full_model.decoder_jit()
+    decoder_model.load_state_dict(ckpts.state_dict(), strict=False)
+    return decoder_model.eval().to(device)
+def _load_pytorch_model(
+    jit_filepath: str = None, tokenizer_config: str = None, device: str = "cuda"
+) -> torch.nn.Module:
+    """Loads a torch.nn.Module from a filepath.
+    Args:
+        jit_filepath: The filepath to the JIT-compiled model.
+        device: The device to load the model onto, default=cuda.
+    Returns:
+        The JIT compiled model loaded to device and on eval mode.
+    """
+    tokenizer_name = tokenizer_config["name"]
+    model = TokenizerModels[tokenizer_name].value(**tokenizer_config)
+    ckpts = torch.jit.load(jit_filepath, map_location=device)
+    return model, ckpts
+def load_jit_model(jit_filepath: str = None, device: str = "cuda") -> torch.jit.ScriptModule:
+    """Loads a torch.jit.ScriptModule from a filepath.
+    Args:
+        jit_filepath: The filepath to the JIT-compiled model.
+        device: The device to load the model onto, default=cuda.
+    Returns:
+        The JIT compiled model loaded to device and on eval mode.
+    """
+    model = torch.jit.load(jit_filepath, map_location=device)
+    return model.eval().to(device)
+def save_jit_model(
+    model: torch.jit.ScriptModule | torch.jit.RecursiveScriptModule = None,
+    jit_filepath: str = None,
+) -> None:
+    """Saves a torch.jit.ScriptModule or torch.jit.RecursiveScriptModule to file.
+    Args:
+        model: The JIT-compiled model to serialize.
+        jit_filepath: The destination filepath for the JIT-compiled model.
+    """
+    # NOTE(review): both parameters default to None but are passed straight to
+    # torch.jit.save, so callers presumably always supply them.
+    torch.jit.save(model, jit_filepath)
+def get_filepaths(input_pattern) -> list[str]:
+    """Returns a list of filepaths from a pattern."""
+    filepaths = sorted(glob(str(input_pattern)))
+    return list(set(filepaths))
+def get_output_filepath(filepath: str, output_dir: str = None) -> str:
+    """Returns the output filepath for the given input filepath."""
+    output_dir = output_dir or f"{os.path.dirname(filepath)}/reconstructions"
+    output_filepath = f"{output_dir}/{os.path.basename(filepath)}"
+    os.makedirs(output_dir, exist_ok=True)
+    return output_filepath
+def read_image(filepath: str) -> np.ndarray:
+    """Reads an image from a filepath.
+    Args:
+        filepath: The filepath to the image.
+    Returns:
+        The image as a numpy array, layout HxWxC, range [0..255], uint8 dtype.
+    """
+    image = media.read_image(filepath)
+    # convert the grey scale image to RGB
+    # since our tokenizers always assume 3-channel RGB image
+    if image.ndim == 2:
+        image = np.stack([image] * 3, axis=-1)
+    # convert RGBA to RGB
+    if image.shape[-1] == 4:
+        image = image[..., :3]
+    return image
+def read_video(filepath: str) -> np.ndarray:
+    """Reads a video from a filepath.
+    Args:
+        filepath: The filepath to the video.
+    Returns:
+        The video as a numpy array, layout TxHxWxC, range [0..255], uint8 dtype.
+    """
+    video = media.read_video(filepath)
+    # convert the grey scale frame to RGB
+    # since our tokenizers always assume 3-channel video
+    if video.ndim == 3:
+        video = np.stack([video] * 3, axis=-1)
+    # convert RGBA to RGB
+    if video.shape[-1] == 4:
+        video = video[..., :3]
+    return video
+def resize_image(image: np.ndarray, short_size: int = None) -> np.ndarray:
+    """Resizes an image to have the short side of `short_size`.
+    Args:
+        image: The image to resize, layout HxWxC, of any range.
+        short_size: The size of the short side.
+    Returns:
+        The resized image.
+    """
+    if short_size is None:
+        return image
+    height, width = image.shape[-3:-1]
+    if height <= width:
+        height_new, width_new = short_size, int(width * short_size / height + 0.5)
+        width_new = width_new if width_new % 2 == 0 else width_new + 1
+    else:
+        height_new, width_new = (
+            int(height * short_size / width + 0.5),
+            short_size,
+        )
+        height_new = height_new if height_new % 2 == 0 else height_new + 1
+    return media.resize_image(image, shape=(height_new, width_new))
+def resize_video(video: np.ndarray, short_size: int = None) -> np.ndarray:
+    """Resizes a video to have the short side of `short_size`.
+    Args:
+        video: The video to resize, layout TxHxWxC, of any range.
+        short_size: The size of the short side.
+    Returns:
+        The resized video.
+    """
+    if short_size is None:
+        return video
+    height, width = video.shape[-3:-1]
+    if height <= width:
+        height_new, width_new = short_size, int(width * short_size / height + 0.5)
+        width_new = width_new if width_new % 2 == 0 else width_new + 1
+    else:
+        height_new, width_new = (
+            int(height * short_size / width + 0.5),
+            short_size,
+        )
+        height_new = height_new if height_new % 2 == 0 else height_new + 1
+    return media.resize_video(video, shape=(height_new, width_new))
+def write_image(filepath: str, image: np.ndarray):
+    """Writes an image to a filepath.
+    Args:
+        filepath: The destination filepath.
+        image: The image array to write.
+    Returns:
+        Whatever `media.write_image` returns.
+    """
+    return media.write_image(filepath, image)
+def write_video(filepath: str, video: np.ndarray, fps: int = 24) -> None:
+    """Writes a video to a filepath.
+    Args:
+        filepath: The destination filepath.
+        video: The video array to write.
+        fps: The frames-per-second forwarded to `media.write_video`.
+    """
+    return media.write_video(filepath, video, fps=fps)
+def numpy2tensor(
+    input_image: np.ndarray,
+    dtype: torch.dtype = _DTYPE,
+    device: str = _DEVICE,
+    range_min: int = -1,
+) -> torch.Tensor:
+    """Converts image(dtype=np.uint8) to `dtype` in range [0..255].
+    Args:
+        input_image: A batch of images in range [0..255], BxHxWx3 layout.
+    Returns:
+        A torch.Tensor of layout Bx3xHxW in range [-1..1], dtype.
+    """
+    ndim = input_image.ndim
+    indices = list(range(1, ndim))[-1:] + list(range(1, ndim))[:-1]
+    image = input_image.transpose((0,) + tuple(indices)) / _UINT8_MAX_F
+    if range_min == -1:
+        image = 2.0 * image - 1.0
+    return torch.from_numpy(image).to(dtype).to(device)
+def tensor2numpy(input_tensor: torch.Tensor, range_min: int = -1) -> np.ndarray:
+    """Converts tensor in [-1,1] to image(dtype=np.uint8) in range [0..255].
+    Args:
+        input_tensor: Input image tensor of Bx3xHxW layout, range [-1..1].
+    Returns:
+        A numpy image of layout BxHxWx3, range [0..255], uint8 dtype.
+    """
+    if range_min == -1:
+        input_tensor = (input_tensor.float() + 1.0) / 2.0
+    ndim = input_tensor.ndim
+    output_image = input_tensor.clamp(0, 1).cpu().numpy()
+    output_image = output_image.transpose((0,) + tuple(range(2, ndim)) + (1,))
+    return (output_image * _UINT8_MAX_F + 0.5).astype(np.uint8)
+def pad_image_batch(batch: np.ndarray, spatial_align: int = _SPATIAL_ALIGN) -> tuple[np.ndarray, list[int]]:
+    """Pads a batch of images to be divisible by `spatial_align`.
+    Args:
+        batch: The batch of images to pad, layout BxHxWx3, in any range.
+        align: The alignment to pad to.
+    Returns:
+        The padded batch and the crop region.
+    """
+    height, width = batch.shape[1:3]
+    align = spatial_align
+    height_to_pad = (align - height % align) if height % align != 0 else 0
+    width_to_pad = (align - width % align) if width % align != 0 else 0
+    crop_region = [
+        height_to_pad >> 1,
+        width_to_pad >> 1,
+        height + (height_to_pad >> 1),
+        width + (width_to_pad >> 1),
+    ]
+    batch = np.pad(
+        batch,
+        (
+            (0, 0),
+            (height_to_pad >> 1, height_to_pad - (height_to_pad >> 1)),
+            (width_to_pad >> 1, width_to_pad - (width_to_pad >> 1)),
+            (0, 0),
+        ),
+        mode="constant",
+    )
+    return batch, crop_region
+def pad_video_batch(
+    batch: np.ndarray,
+    temporal_align: int = _TEMPORAL_ALIGN,
+    spatial_align: int = _SPATIAL_ALIGN,
+) -> tuple[np.ndarray, list[int]]:
+    """Pads a batch of videos to be divisible by `temporal_align` or `spatial_align`.
+    Zero pad spatially. Reflection pad temporally to handle causality better.
+    Args:
+        batch: The batch of videos to pad., layout BxFxHxWx3, in any range.
+        align: The alignment to pad to.
+    Returns:
+        The padded batch and the crop region.
+    """
+    num_frames, height, width = batch.shape[-4:-1]
+    align = spatial_align
+    height_to_pad = (align - height % align) if height % align != 0 else 0
+    width_to_pad = (align - width % align) if width % align != 0 else 0
+    align = temporal_align
+    frames_to_pad = (align - (num_frames - 1) % align) if (num_frames - 1) % align != 0 else 0
+    crop_region = [
+        frames_to_pad >> 1,
+        height_to_pad >> 1,
+        width_to_pad >> 1,
+        num_frames + (frames_to_pad >> 1),
+        height + (height_to_pad >> 1),
+        width + (width_to_pad >> 1),
+    ]
+    batch = np.pad(
+        batch,
+        (
+            (0, 0),
+            (0, 0),
+            (height_to_pad >> 1, height_to_pad - (height_to_pad >> 1)),
+            (width_to_pad >> 1, width_to_pad - (width_to_pad >> 1)),
+            (0, 0),
+        ),
+        mode="constant",
+    )
+    batch = np.pad(
+        batch,
+        (
+            (0, 0),
+            (frames_to_pad >> 1, frames_to_pad - (frames_to_pad >> 1)),
+            (0, 0),
+            (0, 0),
+            (0, 0),
+        ),
+        mode="edge",
+    )
+    return batch, crop_region
+def unpad_video_batch(batch: np.ndarray, crop_region: list[int]) -> np.ndarray:
+    """Unpads video with `crop_region`.
+    Args:
+        batch: A batch of numpy videos, layout BxFxHxWxC.
+        crop_region: [f1,y1,x1,f2,y2,x2] first, top, left, last, bot, right crop indices.
+    Returns:
+        np.ndarray: Cropped numpy video, layout BxFxHxWxC.
+    """
+    assert len(crop_region) == 6, "crop_region should be len of 6."
+    f1, y1, x1, f2, y2, x2 = crop_region
+    return batch[..., f1:f2, y1:y2, x1:x2, :]
+def unpad_image_batch(batch: np.ndarray, crop_region: list[int]) -> np.ndarray:
+    """Unpads image with `crop_region`.
+    Args:
+        batch: A batch of numpy images, layout BxHxWxC.
+        crop_region: [y1,x1,y2,x2] top, left, bot, right crop indices.
+    Returns:
+        np.ndarray: Cropped numpy image, layout BxHxWxC.
+    """
+    assert len(crop_region) == 4, "crop_region should be len of 4."
+    y1, x1, y2, x2 = crop_region
+    return batch[..., y1:y2, x1:x2, :]

cosmos_transfer1/auxiliary/tokenizer/inference/video_cli.py ADDED Viewed

	@@ -0,0 +1,210 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A CLI to run CausalVideoTokenizer on plain videos based on torch.jit.
+Usage:
+    python3 -m cosmos_transfer1.auxiliary.tokenizer.inference.video_cli \
+        --video_pattern 'path/to/video/samples/*.mp4' \
+        --output_dir ./reconstructions \
+        --checkpoint_enc ./checkpoints/<model-name>/encoder.jit \
+        --checkpoint_dec ./checkpoints/<model-name>/decoder.jit
+    Optionally, you can run the model in pure PyTorch mode:
+    python3 -m cosmos_transfer1.auxiliary.tokenizer.inference.video_cli \
+        --video_pattern 'path/to/video/samples/*.mp4' \
+        --mode=torch \
+        --tokenizer_type=CV \
+        --temporal_compression=4 \
+        --spatial_compression=8 \
+        --checkpoint_enc ./checkpoints/<model-name>/encoder.jit \
+        --checkpoint_dec ./checkpoints/<model-name>/decoder.jit
+"""
+import os
+import sys
+from argparse import ArgumentParser, Namespace
+from typing import Any
+import numpy as np
+from loguru import logger as logging
+from cosmos_transfer1.auxiliary.tokenizer.inference.utils import (
+    get_filepaths,
+    get_output_filepath,
+    read_video,
+    resize_video,
+    write_video,
+)
+from cosmos_transfer1.auxiliary.tokenizer.inference.video_lib import CausalVideoTokenizer
+from cosmos_transfer1.auxiliary.tokenizer.networks import TokenizerConfigs
+def _parse_args() -> tuple[Namespace, dict[str, Any]]:
+    parser = ArgumentParser(description="A CLI for CausalVideoTokenizer.")
+    parser.add_argument(
+        "--video_pattern",
+        type=str,
+        default="path/to/videos/*.mp4",
+        help="Glob pattern.",
+    )
+    parser.add_argument(
+        "--checkpoint",
+        type=str,
+        default=None,
+        help="JIT full Autoencoder model filepath.",
+    )
+    parser.add_argument(
+        "--checkpoint_enc",
+        type=str,
+        default=None,
+        help="JIT Encoder model filepath.",
+    )
+    parser.add_argument(
+        "--checkpoint_dec",
+        type=str,
+        default=None,
+        help="JIT Decoder model filepath.",
+    )
+    parser.add_argument(
+        "--tokenizer_type",
+        type=str,
+        choices=["CV", "DV"],
+        help="Specifies the tokenizer type.",
+    )
+    parser.add_argument(
+        "--spatial_compression",
+        type=int,
+        choices=[8, 16],
+        default=8,
+        help="The spatial compression factor.",
+    )
+    parser.add_argument(
+        "--temporal_compression",
+        type=int,
+        choices=[4, 8],
+        default=4,
+        help="The temporal compression factor.",
+    )
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["torch", "jit"],
+        default="jit",
+        help="Specify the backend: native 'torch' or 'jit' (default: 'jit')",
+    )
+    parser.add_argument(
+        "--short_size",
+        type=int,
+        default=None,
+        help="The size to resample inputs. None, by default.",
+    )
+    parser.add_argument(
+        "--temporal_window",
+        type=int,
+        default=17,
+        help="The temporal window to operate at a time.",
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="bfloat16",
+        help="Sets the precision, default bfloat16.",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        help="Device for invoking the model.",
+    )
+    parser.add_argument("--output_dir", type=str, default=None, help="Output directory.")
+    parser.add_argument(
+        "--output_fps",
+        type=float,
+        default=24.0,
+        help="Output frames-per-second (FPS).",
+    )
+    parser.add_argument(
+        "--save_input",
+        action="store_true",
+        help="If on, the input video will be be outputted too.",
+    )
+    args = parser.parse_args()
+    return args
+# NOTE: the arguments are parsed at module level, so this runs (and reads
+# sys.argv) as soon as the module is imported; `_run_eval` reads the
+# resulting module-level `args`.
+logging.info("Initializes args ...")
+args = _parse_args()
+# `--tokenizer_type` has no default, so it is None unless given; the 'torch'
+# backend needs it to look up a tokenizer config.
+if args.mode == "torch" and args.tokenizer_type not in ["CV", "DV"]:
+    logging.error("'torch' backend requires the tokenizer_type of 'CV' or 'DV'.")
+    sys.exit(1)
+def _run_eval() -> None:
+    """Invokes JIT-compiled CausalVideoTokenizer on an input video."""
+    if args.checkpoint_enc is None and args.checkpoint_dec is None and args.checkpoint is None:
+        logging.warning("Aborting. Both encoder or decoder JIT required. Or provide the full autoencoder JIT model.")
+        return
+    if args.mode == "torch":
+        tokenizer_config = TokenizerConfigs[args.tokenizer_type].value
+        tokenizer_config.update(dict(spatial_compression=args.spatial_compression))
+        tokenizer_config.update(dict(temporal_compression=args.temporal_compression))
+    else:
+        tokenizer_config = None
+    logging.info(
+        f"Loading a torch.jit model `{os.path.dirname(args.checkpoint or args.checkpoint_enc or args.checkpoint_dec)}` ..."
+    )
+    autoencoder = CausalVideoTokenizer(
+        checkpoint=args.checkpoint,
+        checkpoint_enc=args.checkpoint_enc,
+        checkpoint_dec=args.checkpoint_dec,
+        tokenizer_config=tokenizer_config,
+        device=args.device,
+        dtype=args.dtype,
+    )
+    logging.info(f"Looking for files matching video_pattern={args.video_pattern} ...")
+    filepaths = get_filepaths(args.video_pattern)
+    logging.info(f"Found {len(filepaths)} videos from {args.video_pattern}.")
+    for filepath in filepaths:
+        logging.info(f"Reading video {filepath} ...")
+        video = read_video(filepath)
+        video = resize_video(video, short_size=args.short_size)
+        logging.info("Invoking the autoencoder model in ... ")
+        batch_video = video[np.newaxis, ...]
+        output_video = autoencoder(batch_video, temporal_window=args.temporal_window)[0]
+        logging.info("Constructing output filepath ...")
+        output_filepath = get_output_filepath(filepath, output_dir=args.output_dir)
+        logging.info(f"Outputing {output_filepath} ...")
+        write_video(output_filepath, output_video, fps=args.output_fps)
+        if args.save_input:
+            ext = os.path.splitext(output_filepath)[-1]
+            input_filepath = output_filepath.replace(ext, "_input" + ext)
+            write_video(input_filepath, video, fps=args.output_fps)
+# `logging` is the loguru logger here; `catch(reraise=True)` is expected to
+# log any exception escaping main() and then re-raise it.
+@logging.catch(reraise=True)
+def main() -> None:
+    """Entry point of the video CLI: runs the reconstruction pipeline."""
+    _run_eval()
+# Invoke main() only when this file is executed as a script/module.
+if __name__ == "__main__":
+    main()

cosmos_transfer1/auxiliary/tokenizer/inference/video_lib.py ADDED Viewed

	@@ -0,0 +1,146 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A library for Causal Video Tokenizer inference."""
+from typing import Any
+import numpy as np
+import torch
+from tqdm import tqdm
+from cosmos_transfer1.auxiliary.tokenizer.inference.utils import (
+    load_decoder_model,
+    load_encoder_model,
+    load_model,
+    numpy2tensor,
+    pad_video_batch,
+    tensor2numpy,
+    unpad_video_batch,
+)
+class CausalVideoTokenizer(torch.nn.Module):
+    """Inference wrapper around a causal video tokenizer (autoencoder).
+    Holds up to three sub-models, each loaded only when its checkpoint argument
+    is not None: a full autoencoder, a standalone encoder and a standalone
+    decoder. `forward` works on numpy videos; `autoencode`/`encode`/`decode`
+    work on torch tensors and run under `torch.no_grad()`.
+    """
+    def __init__(
+        self,
+        checkpoint: str = None,
+        checkpoint_enc: str = None,
+        checkpoint_dec: str = None,
+        tokenizer_config: dict[str, Any] = None,
+        device: str = "cuda",
+        dtype: str = "bfloat16",
+    ) -> None:
+        """Loads whichever of the full/encoder/decoder models have a checkpoint.
+        Args:
+            checkpoint: Checkpoint of the full autoencoder, or None to skip it.
+            checkpoint_enc: Checkpoint of the encoder, or None to skip it.
+            checkpoint_dec: Checkpoint of the decoder, or None to skip it.
+            tokenizer_config: Forwarded unchanged to the `load_*` helpers.
+            device: Forwarded to the loaders and used when building input tensors.
+            dtype: Name of a `torch` attribute (e.g. "bfloat16"); every loaded
+                model is cast to it.
+        """
+        super().__init__()
+        self._device = device
+        # Resolve the dtype name to the actual torch dtype object.
+        self._dtype = getattr(torch, dtype)
+        self._full_model = (
+            load_model(checkpoint, tokenizer_config, device).to(self._dtype) if checkpoint is not None else None
+        )
+        self._enc_model = (
+            load_encoder_model(checkpoint_enc, tokenizer_config, device).to(self._dtype)
+            if checkpoint_enc is not None
+            else None
+        )
+        self._dec_model = (
+            load_decoder_model(checkpoint_dec, tokenizer_config, device).to(self._dtype)
+            if checkpoint_dec is not None
+            else None
+        )
+    @torch.no_grad()
+    def autoencode(self, input_tensor: torch.Tensor) -> torch.Tensor:
+        """Reconstructs a batch of video tensors after embedding into a latent.
+        Uses the full model when it was loaded; otherwise chains `encode` and
+        `decode`.
+        Args:
+            input_tensor: The input video Bx3xTxHxW layout, range [-1..1].
+        Returns:
+            The reconstructed video, layout Bx3xTxHxW, range [-1..1].
+        """
+        if self._full_model is not None:
+            output_tensor = self._full_model(input_tensor)
+            # The full model may return a tuple; the reconstruction is its first element.
+            output_tensor = output_tensor[0] if isinstance(output_tensor, tuple) else output_tensor
+        else:
+            # NOTE(review): `encode` returns a bare tensor when the encoder does;
+            # in that case `[0]` indexes the first (batch) dimension instead of
+            # unpacking a tuple — confirm the encoder always returns a tuple here.
+            output_latent = self.encode(input_tensor)[0]
+            output_tensor = self.decode(output_latent)
+        return output_tensor
+    @torch.no_grad()
+    def encode(self, input_tensor: torch.Tensor) -> tuple[torch.Tensor]:
+        """Encodes a video tensor into a CausalVideo latent or code.
+        Args:
+            input_tensor: The input tensor Bx3xTxHxW layout, range [-1..1].
+        Returns:
+            If the encoder returns a single tensor, that tensor is returned as-is.
+            Otherwise the encoder output with its last element dropped.
+            For causal continuous video (CV) tokenizer, the tuple contains:
+                - The latent embedding, Bx16x(t)x(h)x(w), where the compression
+                rate is (T/t x H/h x W/w), and channel dimension of 16.
+            For causal discrete video (DV) tokenizer, the tuple contains:
+              1) The indices, Bx(t)x(h)x(w), from a codebook of size 64K, which
+                is formed by FSQ levels of (8,8,8,5,5,5).
+              2) The discrete code, Bx6x(t)x(h)x(w), where the compression rate
+                is again (T/t x H/h x W/w), and channel dimension of 6.
+        """
+        assert input_tensor.ndim == 5, "input video should be of 5D."
+        output_latent = self._enc_model(input_tensor)
+        if isinstance(output_latent, torch.Tensor):
+            return output_latent
+        # Non-tensor output: keep everything except the final element.
+        return output_latent[:-1]
+    @torch.no_grad()
+    def decode(self, input_latent: torch.Tensor) -> torch.Tensor:
+        """Decodes a CausalVideo latent (or discrete indices) back into a video tensor.
+        Args:
+            input_latent: The continuous latent Bx16xtxhxw for CV,
+                        or the discrete indices Bxtxhxw for DV.
+        Returns:
+            The reconstructed tensor, layout [B,3,1+(T-1)*8,H*16,W*16] in range [-1..1].
+        """
+        assert input_latent.ndim >= 4, "input latent should be of 5D for continuous and 4D for discrete."
+        return self._dec_model(input_latent)
+    def forward(
+        self,
+        video: np.ndarray,
+        temporal_window: int = 17,
+    ) -> np.ndarray:
+        """Reconstructs video using a pre-trained CausalTokenizer autoencoder.
+        Given a video of arbitrary length, the forward invokes the CausalVideoTokenizer
+        in a sliding manner with a `temporal_window` size.
+        Args:
+            video: The input video BxTxHxWx3 layout, range [0..255].
+            temporal_window: The length of the temporal window to process, default=17.
+        Returns:
+            The reconstructed video in range [0..255], layout BxTxHxWx3.
+        """
+        assert video.ndim == 5, "input video should be of 5D."
+        num_frames = video.shape[1]  # can be of any length.
+        output_video_list = []
+        # ceil(num_frames / temporal_window) non-overlapping windows.
+        for idx in tqdm(range(0, (num_frames - 1) // temporal_window + 1)):
+            # Input video for the current window.
+            # The slice is clipped at the end, so the last window may be shorter.
+            start, end = idx * temporal_window, (idx + 1) * temporal_window
+            input_video = video[:, start:end, ...]
+            # Spatio-temporally pad input_video so it's evenly divisible.
+            padded_input_video, crop_region = pad_video_batch(input_video)
+            input_tensor = numpy2tensor(padded_input_video, dtype=self._dtype, device=self._device)
+            output_tensor = self.autoencode(input_tensor)
+            padded_output_video = tensor2numpy(output_tensor)
+            # Undo the padding using the crop region returned by pad_video_batch.
+            output_video = unpad_video_batch(padded_output_video, crop_region)
+            output_video_list.append(output_video)
+        # Stitch the per-window reconstructions back together along the time axis.
+        return np.concatenate(output_video_list, axis=1)

cosmos_transfer1/auxiliary/tokenizer/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+from cosmos_transfer1.auxiliary.tokenizer.modules.distributions import GaussianDistribution, IdentityDistribution
+from cosmos_transfer1.auxiliary.tokenizer.modules.layers2d import Decoder, Encoder
+from cosmos_transfer1.auxiliary.tokenizer.modules.layers3d import (
+    DecoderBase,
+    DecoderFactorized,
+    EncoderBase,
+    EncoderFactorized,
+)
+from cosmos_transfer1.auxiliary.tokenizer.modules.quantizers import (
+    FSQuantizer,
+    LFQuantizer,
+    ResidualFSQuantizer,
+    VectorQuantizer,
+)
+class EncoderType(Enum):
+    """Selectable 2D encoder implementations; each member's value is the class itself."""
+    Default = Encoder
+class DecoderType(Enum):
+    """Selectable 2D decoder implementations; each member's value is the class itself."""
+    Default = Decoder
+class Encoder3DType(Enum):
+    """Selectable 3D encoder implementations imported from `layers3d`."""
+    BASE = EncoderBase
+    FACTORIZED = EncoderFactorized
+class Decoder3DType(Enum):
+    """Selectable 3D decoder implementations imported from `layers3d`."""
+    BASE = DecoderBase
+    FACTORIZED = DecoderFactorized
+class ContinuousFormulation(Enum):
+    """Latent formulations for continuous tokenizers, mapped to distribution classes."""
+    # VAE samples from a Gaussian; AE passes the encoder output through unchanged.
+    VAE = GaussianDistribution
+    AE = IdentityDistribution
+class DiscreteQuantizer(Enum):
+    """Quantizer choices for discrete tokenizers, mapped to classes from `quantizers`."""
+    VQ = VectorQuantizer
+    LFQ = LFQuantizer
+    FSQ = FSQuantizer
+    RESFSQ = ResidualFSQuantizer

cosmos_transfer1/auxiliary/tokenizer/modules/distributions.py ADDED Viewed

	@@ -0,0 +1,42 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The distribution modes to use for continuous image tokenizers."""
+import torch
+class IdentityDistribution(torch.nn.Module):
+    """Pass-through "distribution": returns the encoder output unchanged."""
+    def __init__(self):
+        super().__init__()
+    def forward(self, parameters):
+        """Returns `parameters` as-is plus a placeholder pair of zero tensors.
+        The placeholder mirrors the `(mean, logvar)` pair returned by
+        `GaussianDistribution.forward`. The zero tensors are created with
+        `torch.tensor([0.0])` and are not moved to the device of `parameters`.
+        """
+        return parameters, (torch.tensor([0.0]), torch.tensor([0.0]))
+class GaussianDistribution(torch.nn.Module):
+    """Diagonal Gaussian latent: splits parameters into mean/logvar and samples."""
+    def __init__(self, min_logvar: float = -30.0, max_logvar: float = 20.0):
+        """Stores the clamp range applied to the log-variance in `forward`."""
+        super().__init__()
+        self.min_logvar = min_logvar
+        self.max_logvar = max_logvar
+    def sample(self, mean, logvar):
+        """Draws `mean + std * eps` with `eps ~ N(0, I)` and `std = exp(0.5 * logvar)`."""
+        std = torch.exp(0.5 * logvar)
+        return mean + std * torch.randn_like(mean)
+    def forward(self, parameters):
+        """Samples a latent from the Gaussian described by `parameters`.
+        Args:
+            parameters: Tensor whose dim 1 is split into two equal chunks,
+                the first used as the mean and the second as the log-variance.
+        Returns:
+            A tuple `(sample, (mean, logvar))` where `logvar` is the clamped value.
+        """
+        mean, logvar = torch.chunk(parameters, 2, dim=1)
+        # Clamp to keep exp(0.5 * logvar) numerically bounded.
+        logvar = torch.clamp(logvar, self.min_logvar, self.max_logvar)
+        return self.sample(mean, logvar), (mean, logvar)

cosmos_transfer1/auxiliary/tokenizer/modules/layers2d.py ADDED Viewed

	@@ -0,0 +1,329 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The model definition for Continuous 2D layers
+Adapted from: https://github.com/CompVis/stable-diffusion/blob/
+21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/ldm/modules/diffusionmodules/model.py
+[Copyright (c) 2022 Robin Rombach and Patrick Esser and contributors]
+https://github.com/CompVis/stable-diffusion/blob/
+21f890f9da3cfbeaba8e2ac3c425ee9e998d5229/LICENSE
+"""
+import math
+import numpy as np
+# pytorch_diffusion + derived encoder decoder
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from loguru import logger as logging
+from cosmos_transfer1.auxiliary.tokenizer.modules.patching import Patcher, UnPatcher
+from cosmos_transfer1.auxiliary.tokenizer.modules.utils import Normalize, nonlinearity
+class Upsample(nn.Module):
+    """2x spatial upsampling by element repetition, followed by a 3x3 conv."""
+    def __init__(self, in_channels: int):
+        super().__init__()
+        # Channel count is preserved; padding=1 keeps the spatial size.
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Doubles dims 2 and 3 of `x` by repeating each element, then convolves."""
+        x = x.repeat_interleave(2, dim=2).repeat_interleave(2, dim=3)
+        return self.conv(x)
+class Downsample(nn.Module):
+    """Spatial downsampling with a stride-2 3x3 conv after asymmetric zero padding."""
+    def __init__(self, in_channels: int):
+        super().__init__()
+        # padding=0 here: the padding is applied manually in forward.
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Zero-pads one element at the end of each of the last two dims, then convolves."""
+        # F.pad order is (last-dim left, last-dim right, second-last left, second-last right).
+        pad = (0, 1, 0, 1)
+        x = F.pad(x, pad, mode="constant", value=0)
+        return self.conv(x)
+class ResnetBlock(nn.Module):
+    """Pre-activation 2D residual block: (norm -> nonlinearity -> conv) twice plus a skip."""
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: int = None,
+        dropout: float,
+        **kwargs,
+    ):
+        """Builds the block's layers.
+        Args:
+            in_channels: Number of input channels.
+            out_channels: Number of output channels; defaults to `in_channels`.
+            dropout: Dropout probability applied before the second conv.
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = nn.Dropout(dropout)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        # A 1x1 conv matches channel counts on the skip path only when they differ.
+        self.nin_shortcut = (
+            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Returns `shortcut(x) + residual(x)`."""
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        x = self.nin_shortcut(x)
+        return x + h
+class AttnBlock(nn.Module):
+    """Single-head self-attention over the spatial positions of a 2D feature map."""
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.norm = Normalize(in_channels)
+        # 1x1 convs act as per-position linear projections for q, k, v and the output.
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Applies attention across all h*w positions and adds the result to `x`."""
+        # TODO (freda): Consider reusing implementations in Attn `imaginaire`,
+        # since that one is going to be based on TransformerEngine's attn op,
+        # which could ease CP implementations.
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        b, c, h, w = q.shape
+        q = q.reshape(b, c, h * w)
+        q = q.permute(0, 2, 1)  # (b, hw, c)
+        k = k.reshape(b, c, h * w)  # (b, c, hw)
+        w_ = torch.bmm(q, k)  # (b, hw_query, hw_key)
+        w_ = w_ * (int(c) ** (-0.5))  # scale by 1/sqrt(channels)
+        w_ = F.softmax(w_, dim=2)  # normalize over keys
+        # attend to values
+        v = v.reshape(b, c, h * w)
+        w_ = w_.permute(0, 2, 1)  # (b, hw_key, hw_query)
+        h_ = torch.bmm(v, w_)  # (b, c, hw_query)
+        h_ = h_.reshape(b, c, h, w)
+        h_ = self.proj_out(h_)
+        return x + h_
+class Encoder(nn.Module):
+    """2D convolutional encoder: patcher, residual/attention levels, middle block, output conv."""
+    def __init__(
+        self,
+        in_channels: int,
+        channels: int,
+        channels_mult: list[int],
+        num_res_blocks: int,
+        attn_resolutions: list[int],
+        dropout: float,
+        resolution: int,
+        z_channels: int,
+        spatial_compression: int,
+        **ignore_kwargs,
+    ):
+        """Builds the encoder.
+        Args:
+            in_channels: Channels of the input before patching.
+            channels: Base channel width; each level uses `channels * channels_mult[i]`.
+            channels_mult: Per-level channel multipliers; its length sets the number of levels.
+            num_res_blocks: Residual blocks per level.
+            attn_resolutions: Resolutions at which an attention block follows each residual block.
+            dropout: Dropout probability passed to every residual block.
+            resolution: Input resolution, used only to track the current resolution per level.
+            z_channels: Channels of the output.
+            spatial_compression: Total spatial compression factor, including the patcher.
+            **ignore_kwargs: Only "patch_size" (default 1) and "patch_method"
+                (default "rearrange") are read; everything else is ignored.
+        """
+        super().__init__()
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        # Patcher.
+        patch_size = ignore_kwargs.get("patch_size", 1)
+        self.patcher = Patcher(patch_size, ignore_kwargs.get("patch_method", "rearrange"))
+        # Patching folds a patch_size x patch_size window into the channel axis.
+        in_channels = in_channels * patch_size * patch_size
+        # calculate the number of downsample operations
+        # (the patcher already accounts for log2(patch_size) of the compression).
+        self.num_downsamples = int(math.log2(spatial_compression)) - int(math.log2(patch_size))
+        assert (
+            self.num_downsamples <= self.num_resolutions
+        ), f"we can only downsample {self.num_resolutions} times at most"
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels, channels, kernel_size=3, stride=1, padding=1)
+        curr_res = resolution // patch_size
+        # Prepend 1 so level i reads its input width from in_ch_mult[i].
+        in_ch_mult = (1,) + tuple(channels_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = channels * in_ch_mult[i_level]
+            block_out = channels * channels_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        dropout=dropout,
+                    )
+                )
+                block_in = block_out
+                # One attention block per residual block at the selected resolutions.
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            # Only the first num_downsamples levels end with a downsample.
+            if i_level < self.num_downsamples:
+                down.downsample = Downsample(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dropout=dropout)
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in, z_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Encodes `x` into a feature map with `z_channels` channels."""
+        x = self.patcher(x)
+        # downsampling
+        # `hs` keeps every intermediate activation, but only hs[-1] is ever read.
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1])
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level < self.num_downsamples:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class Decoder(nn.Module):
+    """2D convolutional decoder: input conv, middle block, residual/attention levels, unpatcher."""
+    def __init__(
+        self,
+        out_channels: int,
+        channels: int,
+        channels_mult: list[int],
+        num_res_blocks: int,
+        # NOTE(review): annotated as `int`, but used below with `in`
+        # (`curr_res in attn_resolutions`), so a collection of ints such as
+        # list[int] (as in Encoder) is what the code actually requires.
+        attn_resolutions: int,
+        dropout: float,
+        resolution: int,
+        z_channels: int,
+        spatial_compression: int,
+        **ignore_kwargs,
+    ):
+        """Builds the decoder.
+        Args:
+            out_channels: Channels of the output after unpatching.
+            channels: Base channel width; each level uses `channels * channels_mult[i]`.
+            channels_mult: Per-level channel multipliers; its length sets the number of levels.
+            num_res_blocks: Each level holds `num_res_blocks + 1` residual blocks.
+            attn_resolutions: Resolutions at which an attention block follows each residual block.
+            dropout: Dropout probability passed to every residual block.
+            resolution: Output resolution, used to derive the latent resolution.
+            z_channels: Channels of the input latent.
+            spatial_compression: Total spatial compression factor, including the unpatcher.
+            **ignore_kwargs: Only "patch_size" (default 1) and "patch_method"
+                (default "rearrange") are read; everything else is ignored.
+        """
+        super().__init__()
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        # UnPatcher.
+        patch_size = ignore_kwargs.get("patch_size", 1)
+        self.unpatcher = UnPatcher(patch_size, ignore_kwargs.get("patch_method", "rearrange"))
+        # The final conv emits patch_size**2 times more channels for the unpatcher to unfold.
+        out_ch = out_channels * patch_size * patch_size
+        # calculate the number of upsample operations
+        self.num_upsamples = int(math.log2(spatial_compression)) - int(math.log2(patch_size))
+        assert self.num_upsamples <= self.num_resolutions, f"we can only upsample {self.num_resolutions} times at most"
+        block_in = channels * channels_mult[self.num_resolutions - 1]
+        curr_res = (resolution // patch_size) // 2 ** (self.num_resolutions - 1)
+        # z_shape is only stored and logged here; it is not used by forward.
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        logging.info("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in, dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in, dropout=dropout)
+        # upsampling
+        self.up = nn.ModuleList()
+        # Levels are built from the deepest one down to level 0.
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = channels * channels_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(
+                    ResnetBlock(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        dropout=dropout,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            # Only the last num_upsamples levels (by index) end with an upsample.
+            if i_level >= (self.num_resolutions - self.num_upsamples):
+                up.upsample = Upsample(block_in)
+                curr_res = curr_res * 2
+            # Insert at the front so that self.up[i] corresponds to level i.
+            self.up.insert(0, up)
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+    def forward(self, z: torch.Tensor) -> torch.Tensor:
+        """Decodes the latent `z` and returns the unpatched output."""
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level >= (self.num_resolutions - self.num_upsamples):
+                h = self.up[i_level].upsample(h)
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        h = self.unpatcher(h)
+        return h

cosmos_transfer1/auxiliary/tokenizer/modules/layers3d.py ADDED Viewed

	@@ -0,0 +1,969 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The model definition for 3D layers
+Adapted from: https://github.com/lucidrains/magvit2-pytorch/blob/
+9f49074179c912736e617d61b32be367eb5f993a/magvit2_pytorch/magvit2_pytorch.py#L889
+[MIT License Copyright (c) 2023 Phil Wang]
+https://github.com/lucidrains/magvit2-pytorch/blob/
+9f49074179c912736e617d61b32be367eb5f993a/LICENSE
+"""
+import math
+from typing import Tuple, Union
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from loguru import logger as logging
+from cosmos_transfer1.auxiliary.tokenizer.modules.patching import Patcher, Patcher3D, UnPatcher, UnPatcher3D
+from cosmos_transfer1.auxiliary.tokenizer.modules.utils import (
+    CausalNormalize,
+    batch2space,
+    batch2time,
+    cast_tuple,
+    is_odd,
+    nonlinearity,
+    replication_pad,
+    space2batch,
+    time2batch,
+)
+_LEGACY_NUM_GROUPS = 32
+class CausalConv3d(nn.Module):
+    """3D convolution that pads time only on the past side, by repeating the first frame."""
+    def __init__(
+        self,
+        chan_in: int = 1,
+        chan_out: int = 1,
+        kernel_size: Union[int, Tuple[int, int, int]] = 3,
+        pad_mode: str = "constant",
+        **kwargs,
+    ):
+        """Builds the convolution.
+        Args:
+            chan_in: Number of input channels.
+            chan_out: Number of output channels.
+            kernel_size: Int or (time, height, width) kernel size; the height and
+                width sizes must be odd.
+            pad_mode: `F.pad` mode used for the spatial padding.
+            **kwargs: "dilation", "stride", "time_stride", "time_dilation" (each
+                default 1) and "padding" (default 1) are consumed here; anything
+                left over is forwarded to `nn.Conv3d`.
+        """
+        super().__init__()
+        kernel_size = cast_tuple(kernel_size, 3)
+        time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
+        assert is_odd(height_kernel_size) and is_odd(width_kernel_size)
+        dilation = kwargs.pop("dilation", 1)
+        stride = kwargs.pop("stride", 1)
+        time_stride = kwargs.pop("time_stride", 1)
+        time_dilation = kwargs.pop("time_dilation", 1)
+        # "padding" is popped so nn.Conv3d keeps its own default; padding is applied manually.
+        padding = kwargs.pop("padding", 1)
+        self.pad_mode = pad_mode
+        # Number of frames prepended in front of the first frame.
+        time_pad = time_dilation * (time_kernel_size - 1) + (1 - time_stride)
+        self.time_pad = time_pad
+        # F.pad order for the last two dims: (left, right, top, bottom).
+        self.spatial_pad = (padding, padding, padding, padding)
+        stride = (time_stride, stride, stride)
+        dilation = (time_dilation, dilation, dilation)
+        self.conv3d = nn.Conv3d(
+            chan_in,
+            chan_out,
+            kernel_size,
+            stride=stride,
+            dilation=dilation,
+            **kwargs,
+        )
+    def _replication_pad(self, x: torch.Tensor) -> torch.Tensor:
+        """Prepends `time_pad` copies of the first frame, then pads spatially with `pad_mode`."""
+        x_prev = x[:, :, :1, ...].repeat(1, 1, self.time_pad, 1, 1)
+        x = torch.cat([x_prev, x], dim=2)
+        # (0, 0) adds no further padding on the time axis.
+        padding = self.spatial_pad + (0, 0)
+        return F.pad(x, padding, mode=self.pad_mode, value=0.0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Pads `x` causally and applies the 3D convolution."""
+        x = self._replication_pad(x)
+        return self.conv3d(x)
+class CausalUpsample3d(nn.Module):
+    """2x spatial and (when there is more than one frame) 2x temporal upsampling plus a causal conv."""
+    def __init__(self, in_channels: int) -> None:
+        super().__init__()
+        self.conv = CausalConv3d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Upsamples `x` by element repetition, convolves, and drops the duplicated leading frame."""
+        x = x.repeat_interleave(2, dim=3).repeat_interleave(2, dim=4)
+        # 2.0 when the input has more than one frame, otherwise 1.0 (no temporal upsampling).
+        time_factor = 1.0 + 1.0 * (x.shape[2] > 1)
+        if isinstance(time_factor, torch.Tensor):
+            time_factor = time_factor.item()
+        x = x.repeat_interleave(int(time_factor), dim=2)
+        # TODO(freda): Check if this causes temporal inconsistency.
+        # Should reverse the order of the following two ops,
+        # better perf and better temporal smoothness.
+        x = self.conv(x)
+        # Drop the first (time_factor - 1) frames on the time axis (third from last).
+        return x[..., int(time_factor - 1) :, :, :]
+class CausalDownsample3d(nn.Module):
+    """Downsampling with a causal 3x3x3 conv using stride 2 in time, height and width."""
+    def __init__(self, in_channels: int) -> None:
+        super().__init__()
+        # padding=0: the spatial padding is applied manually in forward.
+        self.conv = CausalConv3d(
+            in_channels,
+            in_channels,
+            kernel_size=3,
+            stride=2,
+            time_stride=2,
+            padding=0,
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Pads `x` and applies the strided causal convolution."""
+        # Zero-pad one element at the end of each of the last two dims; none on the third-last.
+        pad = (0, 1, 0, 1, 0, 0)
+        x = F.pad(x, pad, mode="constant", value=0)
+        # `replication_pad` is an imported helper; presumably it pads the time axis — confirm.
+        x = replication_pad(x)
+        x = self.conv(x)
+        return x
+class CausalHybridUpsample3d(nn.Module):
+    """Optional temporal and/or spatial 2x upsampling, each followed by a residual causal conv."""
+    def __init__(
+        self,
+        in_channels: int,
+        spatial_up: bool = True,
+        temporal_up: bool = True,
+        **kwargs,
+    ) -> None:
+        """Builds the three convs.
+        Args:
+            in_channels: Number of channels, preserved by every conv.
+            spatial_up: Whether forward upsamples height and width.
+            temporal_up: Whether forward upsamples time.
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__()
+        # conv1: temporal-only kernel, used after the temporal upsampling.
+        self.conv1 = CausalConv3d(
+            in_channels,
+            in_channels,
+            kernel_size=(3, 1, 1),
+            stride=1,
+            time_stride=1,
+            padding=0,
+        )
+        # conv2: spatial-only kernel, used after the spatial upsampling.
+        self.conv2 = CausalConv3d(
+            in_channels,
+            in_channels,
+            kernel_size=(1, 3, 3),
+            stride=1,
+            time_stride=1,
+            padding=1,
+        )
+        # conv3: final pointwise conv.
+        self.conv3 = CausalConv3d(
+            in_channels,
+            in_channels,
+            kernel_size=1,
+            stride=1,
+            time_stride=1,
+            padding=0,
+        )
+        self.spatial_up = spatial_up
+        self.temporal_up = temporal_up
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Upsamples `x`; returns it untouched (no conv at all) when both flags are off."""
+        if not self.spatial_up and not self.temporal_up:
+            return x
+        # hybrid upsample temporally.
+        if self.temporal_up:
+            # 2.0 when the input has more than one frame, otherwise 1.0.
+            time_factor = 1.0 + 1.0 * (x.shape[2] > 1)
+            if isinstance(time_factor, torch.Tensor):
+                time_factor = time_factor.item()
+            x = x.repeat_interleave(int(time_factor), dim=2)
+            # Drop the first (time_factor - 1) frames on the time axis.
+            x = x[..., int(time_factor - 1) :, :, :]
+            x = self.conv1(x) + x
+        # hybrid upsample spatially.
+        if self.spatial_up:
+            x = x.repeat_interleave(2, dim=3).repeat_interleave(2, dim=4)
+            x = self.conv2(x) + x
+        # final 1x1x1 conv.
+        x = self.conv3(x)
+        return x
+class CausalHybridDownsample3d(nn.Module):
+    """Optional spatial and/or temporal 2x downsampling, each summing a strided conv and an average pool."""
+    def __init__(
+        self,
+        in_channels: int,
+        spatial_down: bool = True,
+        temporal_down: bool = True,
+        **kwargs,
+    ) -> None:
+        """Builds the three convs.
+        Args:
+            in_channels: Number of channels, preserved by every conv.
+            spatial_down: Whether forward downsamples height and width.
+            temporal_down: Whether forward downsamples time.
+            **kwargs: Accepted and ignored.
+        """
+        super().__init__()
+        # conv1: spatial-only kernel with spatial stride 2.
+        self.conv1 = CausalConv3d(
+            in_channels,
+            in_channels,
+            kernel_size=(1, 3, 3),
+            stride=2,
+            time_stride=1,
+            padding=0,
+        )
+        # conv2: temporal-only kernel with temporal stride 2.
+        self.conv2 = CausalConv3d(
+            in_channels,
+            in_channels,
+            kernel_size=(3, 1, 1),
+            stride=1,
+            time_stride=2,
+            padding=0,
+        )
+        # conv3: final pointwise conv.
+        self.conv3 = CausalConv3d(
+            in_channels,
+            in_channels,
+            kernel_size=1,
+            stride=1,
+            time_stride=1,
+            padding=0,
+        )
+        self.spatial_down = spatial_down
+        self.temporal_down = temporal_down
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Downsamples `x`; returns it untouched (no conv at all) when both flags are off."""
+        if not self.spatial_down and not self.temporal_down:
+            return x
+        # hybrid downsample spatially.
+        if self.spatial_down:
+            # Zero-pad one element at the end of each of the last two dims.
+            pad = (0, 1, 0, 1, 0, 0)
+            x = F.pad(x, pad, mode="constant", value=0)
+            x1 = self.conv1(x)
+            x2 = F.avg_pool3d(x, kernel_size=(1, 2, 2), stride=(1, 2, 2))
+            x = x1 + x2
+        # hybrid downsample temporally.
+        if self.temporal_down:
+            # `replication_pad` is an imported helper; presumably it pads the time axis — confirm.
+            x = replication_pad(x)
+            x1 = self.conv2(x)
+            x2 = F.avg_pool3d(x, kernel_size=(2, 1, 1), stride=(2, 1, 1))
+            x = x1 + x2
+        # final 1x1x1 conv.
+        x = self.conv3(x)
+        return x
+class CausalResnetBlock3d(nn.Module):
+    """Pre-activation 3D residual block built from causal 3x3x3 convolutions."""
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: int = None,
+        dropout: float,
+        num_groups: int,
+    ) -> None:
+        """Builds the block's layers.
+        Args:
+            in_channels: Number of input channels.
+            out_channels: Number of output channels; defaults to `in_channels`.
+            dropout: Dropout probability applied before the second conv.
+            num_groups: Passed to both `CausalNormalize` layers.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.norm1 = CausalNormalize(in_channels, num_groups=num_groups)
+        self.conv1 = CausalConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = CausalNormalize(out_channels, num_groups=num_groups)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = CausalConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        # A 1x1x1 conv matches channel counts on the skip path only when they differ.
+        self.nin_shortcut = (
+            CausalConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Returns `shortcut(x) + residual(x)`."""
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        x = self.nin_shortcut(x)
+        return x + h
+class CausalResnetBlockFactorized3d(nn.Module):
+    """3D residual block whose convs are factorized into a spatial (1,3,3) and a temporal (3,1,1) conv."""
+    def __init__(
+        self,
+        *,
+        in_channels: int,
+        out_channels: int = None,
+        dropout: float,
+        num_groups: int,
+    ) -> None:
+        """Builds the block's layers.
+        Args:
+            in_channels: Number of input channels.
+            out_channels: Number of output channels; defaults to `in_channels`.
+            dropout: Dropout probability applied before the second conv pair.
+            num_groups: Passed to the second `CausalNormalize` only.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        # NOTE(review): norm1 hard-codes num_groups=1 while norm2 uses the
+        # `num_groups` argument — confirm this asymmetry is intentional.
+        self.norm1 = CausalNormalize(in_channels, num_groups=1)
+        # Spatial conv followed by temporal conv.
+        self.conv1 = nn.Sequential(
+            CausalConv3d(
+                in_channels,
+                out_channels,
+                kernel_size=(1, 3, 3),
+                stride=1,
+                padding=1,
+            ),
+            CausalConv3d(
+                out_channels,
+                out_channels,
+                kernel_size=(3, 1, 1),
+                stride=1,
+                padding=0,
+            ),
+        )
+        self.norm2 = CausalNormalize(out_channels, num_groups=num_groups)
+        self.dropout = torch.nn.Dropout(dropout)
+        # Same spatial-then-temporal factorization for the second conv.
+        self.conv2 = nn.Sequential(
+            CausalConv3d(
+                out_channels,
+                out_channels,
+                kernel_size=(1, 3, 3),
+                stride=1,
+                padding=1,
+            ),
+            CausalConv3d(
+                out_channels,
+                out_channels,
+                kernel_size=(3, 1, 1),
+                stride=1,
+                padding=0,
+            ),
+        )
+        # A 1x1x1 conv matches channel counts on the skip path only when they differ.
+        self.nin_shortcut = (
+            CausalConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+            if in_channels != out_channels
+            else nn.Identity()
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Returns `shortcut(x) + residual(x)`."""
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+        x = self.nin_shortcut(x)
+        return x + h
+class CausalAttnBlock(nn.Module):
+    """Single-head spatial self-attention for video features, with a residual connection."""
+    def __init__(self, in_channels: int, num_groups: int) -> None:
+        super().__init__()
+        self.norm = CausalNormalize(in_channels, num_groups=num_groups)
+        # Kernel-size-1 convs act as per-position linear projections.
+        self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Applies attention over the h*w positions and adds the result to `x`."""
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        # `time2batch` is an imported helper returning (tensor, batch_size); the
+        # result is unpacked as 4D below, so it presumably folds time into the
+        # batch axis — confirm against the helper.
+        q, batch_size = time2batch(q)
+        k, batch_size = time2batch(k)
+        v, batch_size = time2batch(v)
+        b, c, h, w = q.shape
+        q = q.reshape(b, c, h * w)
+        q = q.permute(0, 2, 1)  # (b, hw, c)
+        k = k.reshape(b, c, h * w)  # (b, c, hw)
+        w_ = torch.bmm(q, k)  # (b, hw_query, hw_key)
+        w_ = w_ * (int(c) ** (-0.5))  # scale by 1/sqrt(channels)
+        w_ = F.softmax(w_, dim=2)  # normalize over keys
+        # attend to values
+        v = v.reshape(b, c, h * w)
+        w_ = w_.permute(0, 2, 1)  # (b, hw_key, hw_query)
+        h_ = torch.bmm(v, w_)  # (b, c, hw_query)
+        h_ = h_.reshape(b, c, h, w)
+        # Undo time2batch using the batch size it reported.
+        h_ = batch2time(h_, batch_size)
+        h_ = self.proj_out(h_)
+        return x + h_
+class CausalTemporalAttnBlock(nn.Module):
+    def __init__(self, in_channels: int, num_groups: int) -> None:
+        super().__init__()
+        self.norm = CausalNormalize(in_channels, num_groups=num_groups)
+        self.q = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = CausalConv3d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+        # compute attention
+        q, batch_size, height = space2batch(q)
+        k, _, _ = space2batch(k)
+        v, _, _ = space2batch(v)
+        bhw, c, t = q.shape
+        q = q.permute(0, 2, 1)  # (bhw, t, c)
+        k = k.permute(0, 2, 1)  # (bhw, t, c)
+        v = v.permute(0, 2, 1)  # (bhw, t, c)
+        w_ = torch.bmm(q, k.permute(0, 2, 1))  # (bhw, t, t)
+        w_ = w_ * (int(c) ** (-0.5))
+        # Apply causal mask
+        mask = torch.tril(torch.ones_like(w_))
+        w_ = w_.masked_fill(mask == 0, float("-inf"))
+        w_ = F.softmax(w_, dim=2)
+        # attend to values
+        h_ = torch.bmm(w_, v)  # (bhw, t, c)
+        h_ = h_.permute(0, 2, 1).reshape(bhw, c, t)  # (bhw, c, t)
+        h_ = batch2space(h_, batch_size, height)
+        h_ = self.proj_out(h_)
+        return x + h_
+class EncoderBase(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        channels: int,
+        channels_mult: list[int],
+        num_res_blocks: int,
+        attn_resolutions: list[int],
+        dropout: float,
+        resolution: int,
+        z_channels: int,
+        **ignore_kwargs,
+    ) -> None:
+        super().__init__()
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        # Patcher.
+        patch_size = ignore_kwargs.get("patch_size", 1)
+        self.patcher = Patcher(patch_size, ignore_kwargs.get("patch_method", "rearrange"))
+        in_channels = in_channels * patch_size * patch_size
+        # downsampling
+        self.conv_in = CausalConv3d(in_channels, channels, kernel_size=3, stride=1, padding=1)
+        # num of groups for GroupNorm, num_groups=1 for LayerNorm.
+        num_groups = ignore_kwargs.get("num_groups", _LEGACY_NUM_GROUPS)
+        curr_res = resolution // patch_size
+        in_ch_mult = (1,) + tuple(channels_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = channels * in_ch_mult[i_level]
+            block_out = channels * channels_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(
+                    CausalResnetBlock3d(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        dropout=dropout,
+                        num_groups=num_groups,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(CausalAttnBlock(block_in, num_groups=num_groups))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = CausalDownsample3d(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = CausalResnetBlock3d(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=dropout,
+            num_groups=num_groups,
+        )
+        self.mid.attn_1 = CausalAttnBlock(block_in, num_groups=num_groups)
+        self.mid.block_2 = CausalResnetBlock3d(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=dropout,
+            num_groups=num_groups,
+        )
+        # end
+        self.norm_out = CausalNormalize(block_in, num_groups=num_groups)
+        self.conv_out = CausalConv3d(block_in, z_channels, kernel_size=3, stride=1, padding=1)
+    def patcher3d(self, x: torch.Tensor) -> torch.Tensor:
+        x, batch_size = time2batch(x)
+        x = self.patcher(x)
+        x = batch2time(x, batch_size)
+        return x
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patcher3d(x)
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1])
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+            else:
+                # temporal downsample (last level)
+                time_factor = 1 + 1 * (hs[-1].shape[2] > 1)
+                if isinstance(time_factor, torch.Tensor):
+                    time_factor = time_factor.item()
+                hs[-1] = replication_pad(hs[-1])
+                hs.append(
+                    F.avg_pool3d(
+                        hs[-1],
+                        kernel_size=[time_factor, 1, 1],
+                        stride=[2, 1, 1],
+                    )
+                )
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class DecoderBase(nn.Module):
+    def __init__(
+        self,
+        out_channels: int,
+        channels: int,
+        channels_mult: list[int],
+        num_res_blocks: int,
+        attn_resolutions: list[int],
+        dropout: float,
+        resolution: int,
+        z_channels: int,
+        **ignore_kwargs,
+    ):
+        super().__init__()
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        # UnPatcher.
+        patch_size = ignore_kwargs.get("patch_size", 1)
+        self.unpatcher = UnPatcher(patch_size, ignore_kwargs.get("patch_method", "rearrange"))
+        out_ch = out_channels * patch_size * patch_size
+        block_in = channels * channels_mult[self.num_resolutions - 1]
+        curr_res = (resolution // patch_size) // 2 ** (self.num_resolutions - 1)
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        logging.info("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
+        # z to block_in
+        self.conv_in = CausalConv3d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+        # num of groups for GroupNorm, num_groups=1 for LayerNorm.
+        num_groups = ignore_kwargs.get("num_groups", _LEGACY_NUM_GROUPS)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = CausalResnetBlock3d(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=dropout,
+            num_groups=num_groups,
+        )
+        self.mid.attn_1 = CausalAttnBlock(block_in, num_groups=num_groups)
+        self.mid.block_2 = CausalResnetBlock3d(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=dropout,
+            num_groups=num_groups,
+        )
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = channels * channels_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(
+                    CausalResnetBlock3d(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        dropout=dropout,
+                        num_groups=num_groups,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(CausalAttnBlock(block_in, num_groups=num_groups))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = CausalUpsample3d(block_in)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = CausalNormalize(block_in, num_groups=num_groups)
+        self.conv_out = CausalConv3d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+    def unpatcher3d(self, x: torch.Tensor) -> torch.Tensor:
+        x, batch_size = time2batch(x)
+        x = self.unpatcher(x)
+        x = batch2time(x, batch_size)
+        return x
+    def forward(self, z):
+        h = self.conv_in(z)
+        # middle block.
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # decoder blocks.
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+            else:
+                # temporal upsample (last level)
+                time_factor = 1.0 + 1.0 * (h.shape[2] > 1)
+                if isinstance(time_factor, torch.Tensor):
+                    time_factor = time_factor.item()
+                h = h.repeat_interleave(int(time_factor), dim=2)
+                h = h[..., int(time_factor - 1) :, :, :]
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        h = self.unpatcher3d(h)
+        return h
+class EncoderFactorized(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        channels: int,
+        channels_mult: list[int],
+        num_res_blocks: int,
+        attn_resolutions: list[int],
+        dropout: float,
+        resolution: int,
+        z_channels: int,
+        spatial_compression: int = 16,
+        temporal_compression: int = 8,
+        **ignore_kwargs,
+    ) -> None:
+        super().__init__()
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        # Patcher.
+        patch_size = ignore_kwargs.get("patch_size", 1)
+        self.patcher3d = Patcher3D(patch_size, ignore_kwargs.get("patch_method", "rearrange"))
+        in_channels = in_channels * patch_size * patch_size * patch_size
+        # calculate the number of downsample operations
+        self.num_spatial_downs = int(math.log2(spatial_compression)) - int(math.log2(patch_size))
+        assert (
+            self.num_spatial_downs <= self.num_resolutions
+        ), f"Spatially downsample {self.num_resolutions} times at most"
+        self.num_temporal_downs = int(math.log2(temporal_compression)) - int(math.log2(patch_size))
+        assert (
+            self.num_temporal_downs <= self.num_resolutions
+        ), f"Temporally downsample {self.num_resolutions} times at most"
+        # downsampling
+        self.conv_in = nn.Sequential(
+            CausalConv3d(
+                in_channels,
+                channels,
+                kernel_size=(1, 3, 3),
+                stride=1,
+                padding=1,
+            ),
+            CausalConv3d(channels, channels, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+        curr_res = resolution // patch_size
+        in_ch_mult = (1,) + tuple(channels_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = channels * in_ch_mult[i_level]
+            block_out = channels * channels_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(
+                    CausalResnetBlockFactorized3d(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        dropout=dropout,
+                        num_groups=1,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(
+                        nn.Sequential(
+                            CausalAttnBlock(block_in, num_groups=1),
+                            CausalTemporalAttnBlock(block_in, num_groups=1),
+                        )
+                    )
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                spatial_down = i_level < self.num_spatial_downs
+                temporal_down = i_level < self.num_temporal_downs
+                down.downsample = CausalHybridDownsample3d(
+                    block_in,
+                    spatial_down=spatial_down,
+                    temporal_down=temporal_down,
+                )
+                curr_res = curr_res // 2
+            self.down.append(down)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = CausalResnetBlockFactorized3d(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=dropout,
+            num_groups=1,
+        )
+        self.mid.attn_1 = nn.Sequential(
+            CausalAttnBlock(block_in, num_groups=1),
+            CausalTemporalAttnBlock(block_in, num_groups=1),
+        )
+        self.mid.block_2 = CausalResnetBlockFactorized3d(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=dropout,
+            num_groups=1,
+        )
+        # end
+        self.norm_out = CausalNormalize(block_in, num_groups=1)
+        self.conv_out = nn.Sequential(
+            CausalConv3d(block_in, z_channels, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(
+                z_channels,
+                z_channels,
+                kernel_size=(3, 1, 1),
+                stride=1,
+                padding=0,
+            ),
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patcher3d(x)
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1])
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+class DecoderFactorized(nn.Module):
+    """Causal decoder built from factorized (spatial then temporal) convolutions and attention.
+    Args:
+        out_channels: Channels of the decoded output after unpatching.
+        channels: Base channel width; stage `i` outputs `channels * channels_mult[i]` channels.
+        channels_mult: Per-stage channel multipliers; its length sets the number of stages.
+        num_res_blocks: Each stage holds `num_res_blocks + 1` residual blocks.
+        attn_resolutions: A stage whose tracked resolution is in this list gets one
+            spatial + temporal attention pair after each of its residual blocks.
+        dropout: Value forwarded as `dropout` to every residual block.
+        resolution: Nominal output resolution; only used to track the per-stage resolution.
+        z_channels: Channels of the latent passed to `forward`.
+        spatial_compression: Total spatial compression factor, patching included.
+        temporal_compression: Total temporal compression factor, patching included.
+        **ignore_kwargs: Optional `patch_size` (default 1), `patch_method` (default "rearrange")
+            and `legacy_mode` (default False); any other key is ignored.
+    """
+    def __init__(
+        self,
+        out_channels: int,
+        channels: int,
+        channels_mult: list[int],
+        num_res_blocks: int,
+        attn_resolutions: list[int],
+        dropout: float,
+        resolution: int,
+        z_channels: int,
+        spatial_compression: int = 16,
+        temporal_compression: int = 8,
+        **ignore_kwargs,
+    ):
+        super().__init__()
+        self.num_resolutions = len(channels_mult)
+        self.num_res_blocks = num_res_blocks
+        # UnPatcher.
+        patch_size = ignore_kwargs.get("patch_size", 1)
+        self.unpatcher3d = UnPatcher3D(patch_size, ignore_kwargs.get("patch_method", "rearrange"))
+        # The final conv emits patch_size ** 3 times more channels, which the unpatcher consumes.
+        out_ch = out_channels * patch_size * patch_size * patch_size
+        # calculate the number of upsample operations
+        # The unpatcher already accounts for log2(patch_size) doublings on each axis.
+        self.num_spatial_ups = int(math.log2(spatial_compression)) - int(math.log2(patch_size))
+        assert self.num_spatial_ups <= self.num_resolutions, f"Spatially upsample {self.num_resolutions} times at most"
+        self.num_temporal_ups = int(math.log2(temporal_compression)) - int(math.log2(patch_size))
+        assert (
+            self.num_temporal_ups <= self.num_resolutions
+        ), f"Temporally upsample {self.num_resolutions} times at most"
+        # Width and tracked resolution at the deepest stage, where decoding starts.
+        block_in = channels * channels_mult[self.num_resolutions - 1]
+        curr_res = (resolution // patch_size) // 2 ** (self.num_resolutions - 1)
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+        logging.info("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
+        # z to block_in
+        # Factorized input conv: a (1, 3, 3) kernel followed by a (3, 1, 1) kernel.
+        self.conv_in = nn.Sequential(
+            CausalConv3d(z_channels, block_in, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(block_in, block_in, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = CausalResnetBlockFactorized3d(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=dropout,
+            num_groups=1,
+        )
+        self.mid.attn_1 = nn.Sequential(
+            CausalAttnBlock(block_in, num_groups=1),
+            CausalTemporalAttnBlock(block_in, num_groups=1),
+        )
+        self.mid.block_2 = CausalResnetBlockFactorized3d(
+            in_channels=block_in,
+            out_channels=block_in,
+            dropout=dropout,
+            num_groups=1,
+        )
+        # legacy_mode selects which decoder levels get temporal up-sampling (see below).
+        legacy_mode = ignore_kwargs.get("legacy_mode", False)
+        # upsampling
+        # Stages are built from the deepest level down to level 0.
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = channels * channels_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(
+                    CausalResnetBlockFactorized3d(
+                        in_channels=block_in,
+                        out_channels=block_out,
+                        dropout=dropout,
+                        num_groups=1,
+                    )
+                )
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(
+                        nn.Sequential(
+                            CausalAttnBlock(block_in, num_groups=1),
+                            CausalTemporalAttnBlock(block_in, num_groups=1),
+                        )
+                    )
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            # Every stage except level 0 ends with an up-sampling layer.
+            if i_level != 0:
+                # The layer index for temporal/spatial downsampling performed
+                # in the encoder should correspond to the layer index in
+                # reverse order where upsampling is performed in the decoder.
+                # If you've a pre-trained model, you can simply finetune.
+                # i_level_reverse is 0 for the first stage the decoder runs (the deepest level).
+                i_level_reverse = self.num_resolutions - i_level - 1
+                if legacy_mode:
+                    # Legacy: temporal up-sampling on reverse indices 0 .. num_temporal_ups - 1.
+                    temporal_up = i_level_reverse < self.num_temporal_ups
+                else:
+                    # Default: same count, shifted by one, i.e. reverse indices 1 .. num_temporal_ups.
+                    temporal_up = 0 < i_level_reverse < self.num_temporal_ups + 1
+                # A stage that up-samples in time always up-samples in space too. The remaining
+                # spatial-only stages are enabled only when more spatial than temporal
+                # up-samplings are required.
+                spatial_up = temporal_up or (
+                    i_level_reverse < self.num_spatial_ups and self.num_spatial_ups > self.num_temporal_ups
+                )
+                up.upsample = CausalHybridUpsample3d(block_in, spatial_up=spatial_up, temporal_up=temporal_up)
+                # The tracked resolution is doubled here whether or not spatial_up is set.
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+        # end
+        self.norm_out = CausalNormalize(block_in, num_groups=1)
+        self.conv_out = nn.Sequential(
+            CausalConv3d(block_in, out_ch, kernel_size=(1, 3, 3), stride=1, padding=1),
+            CausalConv3d(out_ch, out_ch, kernel_size=(3, 1, 1), stride=1, padding=0),
+        )
+    def forward(self, z):
+        """Decode the latent `z` and return the output of `unpatcher3d`."""
+        h = self.conv_in(z)
+        # middle block.
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # decoder blocks.
+        # Stages run from the deepest level down to level 0, matching construction order.
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                # attn is either empty or holds one entry per residual block of the stage.
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        h = self.unpatcher3d(h)
+        return h

cosmos_transfer1/auxiliary/tokenizer/modules/patching.py ADDED Viewed

	@@ -0,0 +1,311 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The patcher and unpatcher implementation for 2D and 3D data.
+The idea of Haar wavelet is to compute LL, LH, HL, HH component as two 1D convolutions.
+One on the rows and one on the columns.
+For example, for a 1D signal [a, b], the low-frequency component is (a + b) / 2 and the high-frequency component is (a - b) / 2.
+We can use a 1D convolution with kernel [1, 1] and stride 2 to represent the L component.
+For H component, we can use a 1D convolution with kernel [1, -1] and stride 2.
+In principle, a further Haar decomposition is typically applied only to the LL component. Here we apply it to all
+   components, because we need to support downsampling by more than 2x.
+For example, 4x downsampling can be done by applying a 2x Haar transform twice, and the shapes would be:
+   [3, 256, 256] -> [12, 128, 128] -> [48, 64, 64]
+"""
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+# 1D filter taps per patch method; the selected entry is registered as the `wavelets` buffer.
+# The "haar" taps are both 1 / sqrt(2).
+_WAVELETS = {
+    "haar": torch.tensor([0.7071067811865476, 0.7071067811865476]),
+    "rearrange": torch.tensor([1.0, 1.0]),
+}
+# Passed as `persistent=` to every `register_buffer` call in this module.
+_PERSISTENT = False
+class Patcher(torch.nn.Module):
+    """A module to convert image tensors into patches using torch operations.
+    The main difference from `class Patching` is that this module implements
+    all operations using torch, rather than python or numpy, for efficiency purpose.
+    It's bit-wise identical to the Patching module outputs, with the added
+    benefit of being torch.jit scriptable.
+    """
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_method = patch_method
+        self.register_buffer("wavelets", _WAVELETS[patch_method], persistent=_PERSISTENT)
+        self.range = range(int(torch.log2(torch.tensor(self.patch_size)).item()))
+        self.register_buffer(
+            "_arange",
+            torch.arange(_WAVELETS[patch_method].shape[0]),
+            persistent=_PERSISTENT,
+        )
+        for param in self.parameters():
+            param.requires_grad = False
+    def forward(self, x):
+        if self.patch_method == "haar":
+            return self._haar(x)
+        elif self.patch_method == "rearrange":
+            return self._arrange(x)
+        else:
+            raise ValueError("Unknown patch method: " + self.patch_method)
+    def _dwt(self, x, mode="reflect", rescale=False):
+        dtype = x.dtype
+        h = self.wavelets
+        n = h.shape[0]
+        g = x.shape[1]
+        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = (h * ((-1) ** self._arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+        x = F.pad(x, pad=(n - 2, n - 1, n - 2, n - 1), mode=mode).to(dtype)
+        xl = F.conv2d(x, hl.unsqueeze(2), groups=g, stride=(1, 2))
+        xh = F.conv2d(x, hh.unsqueeze(2), groups=g, stride=(1, 2))
+        xll = F.conv2d(xl, hl.unsqueeze(3), groups=g, stride=(2, 1))
+        xlh = F.conv2d(xl, hh.unsqueeze(3), groups=g, stride=(2, 1))
+        xhl = F.conv2d(xh, hl.unsqueeze(3), groups=g, stride=(2, 1))
+        xhh = F.conv2d(xh, hh.unsqueeze(3), groups=g, stride=(2, 1))
+        out = torch.cat([xll, xlh, xhl, xhh], dim=1)
+        if rescale:
+            out = out / 2
+        return out
+    def _haar(self, x):
+        for _ in self.range:
+            x = self._dwt(x, rescale=True)
+        return x
+    def _arrange(self, x):
+        x = rearrange(
+            x,
+            "b c (h p1) (w p2) -> b (c p1 p2) h w",
+            p1=self.patch_size,
+            p2=self.patch_size,
+        ).contiguous()
+        return x
+class Patcher3D(Patcher):
+    """A 3D discrete wavelet transform for video data, expects 5D tensor, i.e. a batch of videos.
+    `forward` is inherited from `Patcher`; this class overrides `_dwt`, `_haar` and `_arrange`
+    so that the time axis (dim 2) is patched together with the two spatial axes.
+    """
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__(patch_method=patch_method, patch_size=patch_size)
+        # Patch size kept as a one-element int32 buffer; it is not read inside this class.
+        self.register_buffer(
+            "patch_size_buffer",
+            patch_size * torch.ones([1], dtype=torch.int32),
+            persistent=_PERSISTENT,
+        )
+    def _dwt(self, x, wavelet, mode="reflect", rescale=False):
+        """Apply one 3D wavelet level: halve dims 2, 3 and 4 and multiply the channels by 8.
+        `wavelet` is accepted but not used; the taps always come from `self.wavelets`.
+        """
+        dtype = x.dtype
+        h = self.wavelets
+        n = h.shape[0]
+        g = x.shape[1]
+        # One low-pass (hl) and one high-pass (hh) 1D filter per input channel (depthwise).
+        hl = h.flip(0).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = (h * ((-1) ** self._arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+        # Handles temporal axis.
+        # F.pad takes (left, right) pairs starting from the last dim, so all three trailing dims are padded.
+        x = F.pad(x, pad=(max(0, n - 2), n - 1, n - 2, n - 1, n - 2, n - 1), mode=mode).to(dtype)
+        xl = F.conv3d(x, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        xh = F.conv3d(x, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        # Handles spatial axes.
+        # First dim 3 with stride 2 ...
+        xll = F.conv3d(xl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xlh = F.conv3d(xl, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xhl = F.conv3d(xh, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xhh = F.conv3d(xh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        # ... then dim 4 with stride 2, giving the eight sub-bands.
+        xlll = F.conv3d(xll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xllh = F.conv3d(xll, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlhl = F.conv3d(xlh, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlhh = F.conv3d(xlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhll = F.conv3d(xhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhlh = F.conv3d(xhl, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhhl = F.conv3d(xhh, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhhh = F.conv3d(xhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        # Sub-bands are stacked on the channel axis; the letters follow the dim 2, dim 3, dim 4 order.
+        out = torch.cat([xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh], dim=1)
+        if rescale:
+            # Divide by 2 * sqrt(2).
+            out = out / (2 * torch.sqrt(torch.tensor(2.0)))
+        return out
+    def _haar(self, x):
+        """Repeat the first step on dim 2 `patch_size` times, then apply `_dwt` once per level."""
+        # Split off the first step on dim 2 and repeat it patch_size times before the rest.
+        xi, xv = torch.split(x, [1, x.shape[2] - 1], dim=2)
+        x = torch.cat([xi.repeat_interleave(self.patch_size, dim=2), xv], dim=2)
+        for _ in self.range:
+            x = self._dwt(x, "haar", rescale=True)
+        return x
+    def _arrange(self, x):
+        """Repeat the first step on dim 2 `patch_size` times, then fold each 3D patch into the channels."""
+        xi, xv = torch.split(x, [1, x.shape[2] - 1], dim=2)
+        x = torch.cat([xi.repeat_interleave(self.patch_size, dim=2), xv], dim=2)
+        x = rearrange(
+            x,
+            "b c (t p1) (h p2) (w p3) -> b (c p1 p2 p3) t h w",
+            p1=self.patch_size,
+            p2=self.patch_size,
+            p3=self.patch_size,
+        ).contiguous()
+        return x
+class UnPatcher(torch.nn.Module):
+    """A module to convert patches into image tensorsusing torch operations.
+    The main difference from `class Unpatching` is that this module implements
+    all operations using torch, rather than python or numpy, for efficiency purpose.
+    It's bit-wise identical to the Unpatching module outputs, with the added
+    benefit of being torch.jit scriptable.
+    """
+    def __init__(self, patch_size=1, patch_method="haar"):
+        super().__init__()
+        self.patch_size = patch_size
+        self.patch_method = patch_method
+        self.register_buffer("wavelets", _WAVELETS[patch_method], persistent=_PERSISTENT)
+        self.range = range(int(torch.log2(torch.tensor(self.patch_size)).item()))
+        self.register_buffer(
+            "_arange",
+            torch.arange(_WAVELETS[patch_method].shape[0]),
+            persistent=_PERSISTENT,
+        )
+        for param in self.parameters():
+            param.requires_grad = False
+    def forward(self, x):
+        if self.patch_method == "haar":
+            return self._ihaar(x)
+        elif self.patch_method == "rearrange":
+            return self._iarrange(x)
+        else:
+            raise ValueError("Unknown patch method: " + self.patch_method)
+    def _idwt(self, x, wavelet="haar", mode="reflect", rescale=False):
+        """Apply one level of the 2D inverse wavelet transform.
+        The channel axis of ``x`` is split into four equal sub-band chunks which
+        are recombined with stride-2 grouped transposed convolutions, first along
+        dim 2 and then along dim 3. ``wavelet`` and ``mode`` are accepted but not
+        read in this body; the filter always comes from ``self.wavelets``.
+        If ``rescale`` is true the result is multiplied by 2.
+        """
+        dtype = x.dtype
+        h = self.wavelets
+        n = h.shape[0]
+        # Channels per sub-band; also the group count of the grouped convolutions.
+        g = x.shape[1] // 4
+        # hl: flipped filter; hh: filter with alternating signs. Both have shape (g, 1, n).
+        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
+        hh = (h * ((-1) ** self._arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hh = hh.to(dtype=dtype)
+        hl = hl.to(dtype=dtype)
+        xll, xlh, xhl, xhh = torch.chunk(x.to(dtype), 4, dim=1)
+        # Inverse transform.
+        # Kernels of shape (g, 1, n, 1) with stride (2, 1): upsample along dim 2.
+        yl = torch.nn.functional.conv_transpose2d(xll, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0))
+        yl += torch.nn.functional.conv_transpose2d(xlh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0))
+        yh = torch.nn.functional.conv_transpose2d(xhl, hl.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0))
+        yh += torch.nn.functional.conv_transpose2d(xhh, hh.unsqueeze(3), groups=g, stride=(2, 1), padding=(n - 2, 0))
+        # Kernels of shape (g, 1, 1, n) with stride (1, 2): upsample along dim 3.
+        y = torch.nn.functional.conv_transpose2d(yl, hl.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2))
+        y += torch.nn.functional.conv_transpose2d(yh, hh.unsqueeze(2), groups=g, stride=(1, 2), padding=(0, n - 2))
+        if rescale:
+            y = y * 2
+        return y
+    def _ihaar(self, x):
+        """Apply ``_idwt`` (with rescaling) once per level in ``self.range``."""
+        for _ in self.range:
+            x = self._idwt(x, "haar", rescale=True)
+        return x
+    def _iarrange(self, x):
+        """Move ``patch_size * patch_size`` channel factors back into the two trailing spatial axes."""
+        x = rearrange(
+            x,
+            "b (c p1 p2) h w -> b c (h p1) (w p2)",
+            p1=self.patch_size,
+            p2=self.patch_size,
+        )
+        return x
+class UnPatcher3D(UnPatcher):
+    """A 3D inverse discrete wavelet transform for video wavelet decompositions."""
+    def __init__(self, patch_size=1, patch_method="haar"):
+        """Forward the patch configuration to ``UnPatcher.__init__``."""
+        super().__init__(patch_method=patch_method, patch_size=patch_size)
+    def _idwt(self, x, wavelet="haar", mode="reflect", rescale=False):
+        """Apply one level of the 3D inverse wavelet transform.
+        The channel axis is split into eight sub-band chunks that are merged
+        pairwise with stride-2 grouped transposed convolutions along dim 4, then
+        dim 3, then dim 2. With the ``b c t h w`` layout used by ``_iarrange``
+        these are width, height and time respectively. ``wavelet`` and ``mode``
+        are accepted but not read in this body. If ``rescale`` is true the result
+        is multiplied by ``2 * sqrt(2)``.
+        """
+        dtype = x.dtype
+        h = self.wavelets
+        n = h.shape[0]
+        g = x.shape[1] // 8  # split into 8 spatio-temporal filtered tensors.
+        # hl: flipped filter; hh: filter with alternating signs. Both have shape (g, 1, n).
+        hl = h.flip([0]).reshape(1, 1, -1).repeat([g, 1, 1])
+        hh = (h * ((-1) ** self._arange)).reshape(1, 1, -1).repeat(g, 1, 1)
+        hl = hl.to(dtype=dtype)
+        hh = hh.to(dtype=dtype)
+        xlll, xllh, xlhl, xlhh, xhll, xhlh, xhhl, xhhh = torch.chunk(x, 8, dim=1)
+        # Transposed convolutions along the last axis (kernel (g, 1, 1, 1, n), stride (1, 1, 2)).
+        xll = F.conv_transpose3d(xlll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xll += F.conv_transpose3d(xllh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlh = F.conv_transpose3d(xlhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xlh += F.conv_transpose3d(xlhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhl = F.conv_transpose3d(xhll, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhl += F.conv_transpose3d(xhlh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhh = F.conv_transpose3d(xhhl, hl.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        xhh += F.conv_transpose3d(xhhh, hh.unsqueeze(2).unsqueeze(3), groups=g, stride=(1, 1, 2))
+        # Transposed convolutions along dim 3 (kernel (g, 1, 1, n, 1), stride (1, 2, 1)).
+        xl = F.conv_transpose3d(xll, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xl += F.conv_transpose3d(xlh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xh = F.conv_transpose3d(xhl, hl.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        xh += F.conv_transpose3d(xhh, hh.unsqueeze(2).unsqueeze(4), groups=g, stride=(1, 2, 1))
+        # Transposed convolutions along dim 2, the time axis (kernel (g, 1, n, 1, 1), stride (2, 1, 1)).
+        x = F.conv_transpose3d(xl, hl.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        x += F.conv_transpose3d(xh, hh.unsqueeze(3).unsqueeze(4), groups=g, stride=(2, 1, 1))
+        if rescale:
+            x = x * (2 * torch.sqrt(torch.tensor(2.0)))
+        return x
+    def _ihaar(self, x):
+        """Apply ``_idwt`` once per level, then trim the leading entries of dim 2."""
+        for _ in self.range:
+            x = self._idwt(x, "haar", rescale=True)
+        # Drop the first patch_size - 1 entries along dim 2.
+        # NOTE(review): presumably these undo padding added by the forward patcher - confirm.
+        x = x[:, :, self.patch_size - 1 :, ...]
+        return x
+    def _iarrange(self, x):
+        """Move ``patch_size ** 3`` channel factors back into t, h and w, then trim dim 2 as in ``_ihaar``."""
+        x = rearrange(
+            x,
+            "b (c p1 p2 p3) t h w -> b c (t p1) (h p2) (w p3)",
+            p1=self.patch_size,
+            p2=self.patch_size,
+            p3=self.patch_size,
+        )
+        x = x[:, :, self.patch_size - 1 :, ...]
+        return x

cosmos_transfer1/auxiliary/tokenizer/modules/quantizers.py ADDED Viewed

	@@ -0,0 +1,513 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Quantizers for discrete image and video tokenization."""
+from typing import Optional
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import reduce
+from loguru import logger as logging
+from cosmos_transfer1.auxiliary.tokenizer.modules.utils import (
+    default,
+    entropy,
+    pack_one,
+    rearrange,
+    round_ste,
+    unpack_one,
+)
+class ResidualFSQuantizer(nn.Module):
+    """Residual Finite Scalar Quantization
+    Follows Algorithm 1. in https://arxiv.org/pdf/2107.03312.pdf
+    """
+    def __init__(self, levels: list[int], num_quantizers: int, **ignore_kwargs):
+        super().__init__()
+        self.dtype = ignore_kwargs.get("dtype", torch.float32)
+        self.layers = nn.ModuleList([FSQuantizer(levels=levels) for _ in range(num_quantizers)])
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        indices_stack = []
+        residual = x
+        quantized_out = 0
+        loss_out = 0
+        for i, layer in enumerate(self.layers):
+            quant_indices, z, loss = layer(residual)
+            indices_stack.append(quant_indices)
+            residual = residual - z.detach()
+            quantized_out = quantized_out + z
+            loss_out = loss_out + loss
+        self.residual = residual
+        indices = torch.stack(indices_stack, dim=1)
+        return indices, quantized_out.to(self.dtype), loss_out.to(self.dtype)
+    def indices_to_codes(self, indices_stack: torch.Tensor) -> torch.Tensor:
+        quantized_out = 0
+        for layer, indices in zip(self.layers, indices_stack.transpose(0, 1)):
+            quantized_out += layer.indices_to_codes(indices)
+        return quantized_out
+class FSQuantizer(nn.Module):
+    """Finite Scalar Quantization: VQ-VAE Made Simple - https://arxiv.org/abs/2309.15505
+    Code adapted from Jax version in Appendix A.1.
+    Adapted from: https://github.com/lucidrains/vector-quantize-pytorch/blob/9502a1f447876d53fd37685b226bf28f250dc4a3/
+    vector_quantize_pytorch/finite_scalar_quantization.py
+    [Copyright (c) 2020 Phil Wang]
+    https://github.com/lucidrains/vector-quantize-pytorch/blob/9502a1f447876d53fd37685b226bf28f250dc4a3/LICENSE
+    """
+    def __init__(
+        self,
+        levels: list[int],
+        dim: Optional[int] = None,
+        num_codebooks=1,
+        keep_num_codebooks_dim: Optional[bool] = None,
+        scale: Optional[float] = None,
+        **ignore_kwargs,
+    ):
+        super().__init__()
+        self.dtype = ignore_kwargs.get("dtype", torch.bfloat16)
+        _levels = torch.tensor(levels, dtype=torch.int32)
+        self.register_buffer("_levels", _levels, persistent=False)
+        _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=torch.int32)
+        self.register_buffer("_basis", _basis, persistent=False)
+        self.scale = scale
+        codebook_dim = len(levels)
+        self.codebook_dim = codebook_dim
+        effective_codebook_dim = codebook_dim * num_codebooks
+        self.num_codebooks = num_codebooks
+        self.effective_codebook_dim = effective_codebook_dim
+        keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
+        assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
+        self.keep_num_codebooks_dim = keep_num_codebooks_dim
+        self.dim = default(dim, len(_levels) * num_codebooks)
+        has_projections = self.dim != effective_codebook_dim
+        self.project_in = nn.Linear(self.dim, effective_codebook_dim) if has_projections else nn.Identity()
+        self.project_out = nn.Linear(effective_codebook_dim, self.dim) if has_projections else nn.Identity()
+        self.has_projections = has_projections
+        self.codebook_size = self._levels.prod().item()
+        implicit_codebook = self.indices_to_codes(torch.arange(self.codebook_size), project_out=False)
+        self.register_buffer("implicit_codebook", implicit_codebook, persistent=False)
+    def bound(self, z: torch.Tensor, eps: float = 1e-3) -> torch.Tensor:
+        """Bound `z`, an array of shape (..., d)."""
+        half_l = (self._levels - 1) * (1 + eps) / 2
+        offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
+        shift = (offset / half_l).atanh()
+        return (z + shift).tanh() * half_l - offset
+    def quantize(self, z: torch.Tensor) -> torch.Tensor:
+        """Quantizes z, returns quantized zhat, same shape as z."""
+        quantized = round_ste(self.bound(z))
+        half_width = self._levels // 2  # Renormalize to [-1, 1].
+        return quantized / half_width
+    def _scale_and_shift(self, zhat_normalized: torch.Tensor) -> torch.Tensor:
+        half_width = self._levels // 2
+        return (zhat_normalized * half_width) + half_width
+    def _scale_and_shift_inverse(self, zhat: torch.Tensor) -> torch.Tensor:
+        half_width = self._levels // 2
+        return (zhat - half_width) / half_width
+    def codes_to_indices(self, zhat: torch.Tensor) -> torch.Tensor:
+        """Converts a `code` to an index in the codebook."""
+        assert zhat.shape[-1] == self.codebook_dim
+        zhat = self._scale_and_shift(zhat).float()
+        return (zhat * self._basis).sum(dim=-1).to(torch.int32)
+    def indices_to_codes(self, indices: torch.Tensor, project_out=True) -> torch.Tensor:
+        """Inverse of `codes_to_indices`."""
+        is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))
+        indices = rearrange(indices, "... -> ... 1")
+        codes_non_centered = (indices // self._basis) % self._levels
+        codes = self._scale_and_shift_inverse(codes_non_centered)
+        if self.keep_num_codebooks_dim:
+            codes = rearrange(codes, "... c d -> ... (c d)")
+        if project_out:
+            codes = self.project_out(codes)
+        if is_img_or_video:
+            codes = rearrange(codes, "b ... d -> b d ...")
+        return codes.to(self.dtype)
+    def forward(self, z: torch.Tensor) -> torch.Tensor:
+        """
+        einstein notation
+        b - batch
+        n - sequence (or flattened spatial dimensions)
+        d - feature dimension, which is also log2(codebook size)
+        c - number of codebook dim
+        """
+        is_img_or_video = z.ndim >= 4
+        # standardize image or video into (batch, seq, dimension)
+        if is_img_or_video:
+            z = rearrange(z, "b d ... -> b ... d")
+            z, ps = pack_one(z, "b * d")
+        assert z.shape[-1] == self.dim, f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}"
+        z = self.project_in(z)
+        z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks)
+        codes = self.quantize(z)
+        indices = self.codes_to_indices(codes)
+        codes = rearrange(codes, "b n c d -> b n (c d)")
+        out = self.project_out(codes)
+        # reconstitute image or video dimensions
+        if is_img_or_video:
+            out = unpack_one(out, ps, "b * d")
+            out = rearrange(out, "b ... d -> b d ...")
+            indices = unpack_one(indices, ps, "b * c")
+            dummy_loss = torch.zeros_like(out.mean(dim=[1, 2, 3], keepdim=True))
+        else:
+            dummy_loss = torch.zeros_like(out.mean(dim=[1, 2], keepdim=True)).unsqueeze(1)
+        if not self.keep_num_codebooks_dim:
+            indices = rearrange(indices, "... 1 -> ...")
+        return (indices, out.to(self.dtype), dummy_loss)
+class VectorQuantizer(nn.Module):
+    """Improved version over VectorQuantizer. Mostly
+    avoids costly matrix multiplications and allows for post-hoc remapping of indices.
+    Adapted from: https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/
+    taming/modules/vqvae/quantize.py
+    [Copyright (c) 2020 Patrick Esser and Robin Rombach and Björn Ommer]
+    https://github.com/CompVis/taming-transformers/blob/3ba01b241669f5ade541ce990f7650a3b8f65318/License.txt
+    """
+    def __init__(
+        self,
+        num_embeddings: int,
+        embedding_dim: int,
+        beta: float = 0.25,
+        remap: str = None,
+        unknown_index: str = "random",
+        sane_index_shape: bool = False,
+        legacy: bool = True,
+        use_norm=False,
+        **ignore_kwargs,
+    ):
+        super().__init__()
+        self.n_e = num_embeddings
+        self.e_dim = embedding_dim
+        self.beta = beta
+        self.legacy = legacy
+        self.norm = lambda x: F.normalize(x, dim=-1) if use_norm else x
+        self.embedding = nn.Embedding(self.n_e, self.e_dim)
+        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
+        self.remap = remap
+        if self.remap is not None:
+            self.register_buffer("used", torch.tensor(np.load(self.remap)))
+            self.re_embed = self.used.shape[0]
+            self.unknown_index = unknown_index
+            if self.unknown_index == "extra":
+                self.unknown_index = self.re_embed
+                self.re_embed = self.re_embed + 1
+            print(
+                f"Remapping {self.n_e} indices to {self.re_embed} indices. "
+                f"Using {self.unknown_index} for unknown indices."
+            )
+        else:
+            self.re_embed = num_embeddings
+        self.sane_index_shape = sane_index_shape
+        self.dtype = ignore_kwargs.get("dtype", torch.float32)
+    def remap_to_used(self, inds):
+        ishape = inds.shape
+        assert len(ishape) > 1
+        inds = inds.reshape(ishape[0], -1)
+        used = self.used.to(inds)
+        match = (inds[:, :, None] == used[None, None, ...]).long()
+        new = match.argmax(-1)
+        unknown = match.sum(2) < 1
+        if self.unknown_index == "random":
+            new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
+        else:
+            new[unknown] = self.unknown_index
+        return new.reshape(ishape)
+    def unmap_to_all(self, inds):
+        ishape = inds.shape
+        assert len(ishape) > 1
+        inds = inds.reshape(ishape[0], -1)
+        used = self.used.to(inds)
+        if self.re_embed > self.used.shape[0]:  # extra token
+            inds[inds >= self.used.shape[0]] = 0  # simply set to zero
+        back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
+        return back.reshape(ishape)
+    def forward(self, z, temp=None, rescale_logits=False, return_logits=False):
+        assert temp is None or temp == 1.0, "Only for interface compatible with Gumbel"
+        assert rescale_logits is False, "Only for interface compatible with Gumbel"
+        assert return_logits is False, "Only for interface compatible with Gumbel"
+        z = rearrange(z, "b c h w -> b h w c").contiguous()
+        z_flattened = z.view(-1, self.e_dim)
+        d = (
+            torch.sum(z_flattened**2, dim=1, keepdim=True)
+            + torch.sum(self.embedding.weight**2, dim=1)
+            - 2
+            * torch.einsum(
+                "bd,dn->bn",
+                z_flattened,
+                rearrange(self.embedding.weight, "n d -> d n"),
+            )
+        )
+        encoding_indices = torch.argmin(d, dim=1).unsqueeze(1)
+        encodings = torch.zeros(encoding_indices.shape[0], self.n_e, device=z.device)
+        encodings.scatter_(1, encoding_indices, 1)
+        z_q = torch.matmul(encodings, self.embedding.weight).view(z.shape)
+        min_encodings = None
+        z_q, z = self.norm(z_q), self.norm(z)
+        # compute loss for embedding
+        commit_loss = torch.mean((z_q - z.detach()) ** 2, dim=[1, 2, 3], keepdim=True)
+        emb_loss = torch.mean((z_q.detach() - z) ** 2, dim=[1, 2, 3], keepdim=True)
+        if not self.legacy:
+            loss = self.beta * emb_loss + commit_loss
+        else:
+            loss = emb_loss + self.beta * commit_loss
+        # preserve gradients
+        z_q = z + (z_q - z).detach()
+        avg_probs = torch.mean(encodings, dim=0)
+        perplexity = torch.exp(-torch.sum(avg_probs * torch.log(avg_probs + 1e-10)))
+        # reshape back to match original input shape
+        z_q = rearrange(z_q, "b h w c -> b c h w").contiguous()
+        if self.remap is not None:
+            min_encoding_indices = encoding_indices.squeeze(1).reshape(z.shape[0], -1)  # add batch axis
+            min_encoding_indices = self.remap_to_used(encoding_indices.squeeze(1))
+            min_encoding_indices = min_encoding_indices.reshape(-1, 1)  # flatten
+        if self.sane_index_shape:
+            min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3])
+        # TODO: return (indices, z_q, loss)
+        return (
+            z_q,
+            loss,
+            (
+                encoding_indices.squeeze(1),
+                min_encodings,
+                commit_loss.mean().detach(),
+                self.beta * emb_loss.mean().detach(),
+                perplexity.mean().detach(),
+            ),
+        )
+    def get_codebook_entry(self, indices, shape):
+        # shape specifying (batch, height, width, channel)
+        if self.remap is not None:
+            indices = indices.reshape(shape[0], -1)  # add batch axis
+            indices = self.unmap_to_all(indices)
+            indices = indices.reshape(-1)  # flatten again
+        # get quantized latent vectors
+        z_q = self.embedding(indices)
+        if shape is not None:
+            z_q = z_q.view(shape)
+            # reshape back to match original input shape
+            z_q = z_q.permute(0, 3, 1, 2).contiguous()
+        return z_q
+class LFQuantizer(nn.Module):
+    """Lookup-Free Quantization
+    Adapted from: https://github.com/lucidrains/vector-quantize-pytorch/blob/9502a1f447876d53fd37685b226bf28f250dc4a3/
+    vector_quantize_pytorch/lookup_free_quantization.py
+    [Copyright (c) 2020 Phil Wang]
+    https://github.com/lucidrains/vector-quantize-pytorch/blob/9502a1f447876d53fd37685b226bf28f250dc4a3/LICENSE
+    """
+    def __init__(
+        self,
+        *,
+        codebook_size: int,
+        codebook_dim: int,
+        embed_dim: Optional[int] = None,  # if None, use codebook_dim
+        entropy_loss_weight=0.1,
+        commitment_loss_weight=0.25,
+        default_temp: float = 0.01,
+        entropy_loss: bool = False,
+        **ignore_kwargs,
+    ):
+        """Lookup-Free Quantization
+        Args:
+            codebook_size (int): The number of entries in the codebook.
+            codebook_dim (int): The number of bits in each code.
+            embed_dim (Optional[int], optional): The dimension of the input embedding. Defaults to None.
+            entropy_loss_weight (float, optional): Whether to use entropy loss. Defaults to 0.1.
+            commitment_loss_weight (float, optional): Weight for commitment loss. Defaults to 0.25.
+            default_temp (float, optional): The temprature to use. Defaults to 0.01.
+            entropy_loss (bool, optional): Flag for entropy loss. Defaults to False.
+        """
+        super().__init__()
+        self.entropy_loss = entropy_loss
+        self.codebook_dim = codebook_dim
+        self.default_temp = default_temp
+        self.entrop_loss_weight = entropy_loss_weight
+        self.commitment_loss_weight = commitment_loss_weight
+        embed_dim = embed_dim or codebook_dim
+        has_projections = embed_dim != codebook_dim
+        self.project_in = nn.Linear(embed_dim, codebook_dim) if has_projections else nn.Identity()
+        self.project_out = nn.Linear(codebook_dim, embed_dim) if has_projections else nn.Identity()
+        logging.info(f"LFQ: has_projections={has_projections}, dim_in={embed_dim}, codebook_dim={codebook_dim}")
+        self.dtype = ignore_kwargs.get("dtype", torch.float32)
+        if entropy_loss:
+            assert 2**codebook_dim == codebook_size, "codebook size must be 2 ** codebook_dim"
+            self.codebook_size = codebook_size
+            self.register_buffer(
+                "mask",
+                2 ** torch.arange(codebook_dim - 1, -1, -1),
+                persistent=False,
+            )
+            self.register_buffer("zero", torch.tensor(0.0), persistent=False)
+            all_codes = torch.arange(codebook_size)
+            bits = ((all_codes[..., None].int() & self.mask) != 0).float()
+            codebook = 2 * bits - 1.0
+            self.register_buffer("codebook", codebook, persistent=False)  # [codebook_size, codebook_dim]
+    def forward(self, z: torch.Tensor, temp: float = None) -> torch.Tensor:
+        temp = temp or self.default_temp
+        z = rearrange(z, "b d ... -> b ... d")
+        z, ps = pack_one(z, "b * d")
+        z = self.project_in(z)
+        # split out number of codebooks
+        z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks)
+        # quantization
+        original_input = z
+        codebook_value = torch.ones_like(z)
+        z_q = torch.where(z > 0, codebook_value, -codebook_value)
+        # preserve gradients
+        z_q = z + (z_q - z).detach()
+        # commit loss
+        commit_loss = ((original_input - z_q.detach()) ** 2).mean(dim=[1, 2, 3])
+        z_q = rearrange(z_q, "b n c d -> b n (c d)")
+        z_q = self.project_out(z_q)
+        # reshape
+        z_q = unpack_one(z_q, ps, "b * d")
+        z_q = rearrange(z_q, "b ... d -> b d ...")
+        loss = self.commitment_loss_weight * commit_loss
+        # entropy loss (eq-5)
+        if self.entropy_loss:
+            # indices
+            indices = reduce((z > 0).int() * self.mask.int(), "b n c d -> b n c", "sum")
+            indices = unpack_one(indices, ps, "b * c")
+            indices = rearrange(indices, "... 1 -> ...")
+            distance = -2 * torch.einsum(
+                "... i d, j d -> ... i j",
+                original_input,
+                self.codebook.to(original_input.dtype),
+            )
+            prob = (-distance / temp).softmax(dim=-1)
+            per_sample_entropy = entropy(prob).mean(dim=[1, 2])
+            avg_prob = reduce(prob, "... c d -> c d", "mean")
+            codebook_entropy = entropy(avg_prob).mean()
+            entropy_aux_loss = per_sample_entropy - codebook_entropy
+            loss += self.entrop_loss_weight * entropy_aux_loss
+            # TODO: return (indices, z_q, loss)
+            return (
+                z_q,
+                loss.unsqueeze(1).unsqueeze(1).unsqueeze(1),
+                (
+                    indices,
+                    self.commitment_loss_weight * commit_loss.mean().detach(),
+                    self.entrop_loss_weight * entropy_aux_loss.mean().detach(),
+                    self.entrop_loss_weight * per_sample_entropy.mean().detach(),
+                    self.entrop_loss_weight * codebook_entropy.mean().detach(),
+                ),
+            )
+        else:
+            return (
+                z_q,
+                loss.unsqueeze(1).unsqueeze(1).unsqueeze(1),
+                self.commitment_loss_weight * commit_loss.mean().detach(),
+            )
+class InvQuantizerJit(nn.Module):
+    """Use for decoder_jit to trace quantizer in discrete tokenizer"""
+    def __init__(self, quantizer):
+        super().__init__()
+        self.quantizer = quantizer
+    def forward(self, indices: torch.Tensor):
+        codes = self.quantizer.indices_to_codes(indices)
+        return codes.to(self.quantizer.dtype)

cosmos_transfer1/auxiliary/tokenizer/modules/utils.py ADDED Viewed

	@@ -0,0 +1,116 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Shared utilities for the networks module."""
+from typing import Any
+import torch
+from einops import pack, rearrange, unpack
+def time2batch(x: torch.Tensor) -> tuple[torch.Tensor, int]:
+    batch_size = x.shape[0]
+    return rearrange(x, "b c t h w -> (b t) c h w"), batch_size
+def batch2time(x: torch.Tensor, batch_size: int) -> torch.Tensor:
+    return rearrange(x, "(b t) c h w -> b c t h w", b=batch_size)
+def space2batch(x: torch.Tensor) -> tuple[torch.Tensor, int]:
+    batch_size, height = x.shape[0], x.shape[-2]
+    return rearrange(x, "b c t h w -> (b h w) c t"), batch_size, height
+def batch2space(x: torch.Tensor, batch_size: int, height: int) -> torch.Tensor:
+    return rearrange(x, "(b h w) c t -> b c t h w", b=batch_size, h=height)
+def cast_tuple(t: Any, length: int = 1) -> Any:
+    return t if isinstance(t, tuple) else ((t,) * length)
+def replication_pad(x):
+    return torch.cat([x[:, :, :1, ...], x], dim=2)
+def divisible_by(num: int, den: int) -> bool:
+    return (num % den) == 0
+def is_odd(n: int) -> bool:
+    return not divisible_by(n, 2)
+def nonlinearity(x):
+    return x * torch.sigmoid(x)
+def Normalize(in_channels, num_groups=32):
+    return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+class CausalNormalize(torch.nn.Module):
+    def __init__(self, in_channels, num_groups=1):
+        super().__init__()
+        self.norm = torch.nn.GroupNorm(
+            num_groups=num_groups,
+            num_channels=in_channels,
+            eps=1e-6,
+            affine=True,
+        )
+        self.num_groups = num_groups
+    def forward(self, x):
+        # if num_groups !=1, we apply a spatio-temporal groupnorm for backward compatibility purpose.
+        # All new models should use num_groups=1, otherwise causality is not guaranteed.
+        if self.num_groups == 1:
+            x, batch_size = time2batch(x)
+            return batch2time(self.norm(x), batch_size)
+        return self.norm(x)
+def exists(v):
+    return v is not None
+def default(*args):
+    for arg in args:
+        if exists(arg):
+            return arg
+    return None
+def pack_one(t, pattern):
+    return pack([t], pattern)
+def unpack_one(t, ps, pattern):
+    return unpack(t, ps, pattern)[0]
+def round_ste(z: torch.Tensor) -> torch.Tensor:
+    """Round with straight through gradients."""
+    zhat = z.round()
+    return z + (zhat - z).detach()
+def log(t, eps=1e-5):
+    return t.clamp(min=eps).log()
+def entropy(prob):
+    return (-prob * log(prob)).sum(dim=-1)

cosmos_transfer1/auxiliary/tokenizer/networks/__init__.py ADDED Viewed

	@@ -0,0 +1,39 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from enum import Enum
+from cosmos_transfer1.auxiliary.tokenizer.networks.configs import continuous_image as continuous_image_dict
+from cosmos_transfer1.auxiliary.tokenizer.networks.configs import continuous_video as continuous_video_dict
+from cosmos_transfer1.auxiliary.tokenizer.networks.configs import discrete_image as discrete_image_dict
+from cosmos_transfer1.auxiliary.tokenizer.networks.configs import discrete_video as discrete_video_dict
+from cosmos_transfer1.auxiliary.tokenizer.networks.continuous_image import ContinuousImageTokenizer
+from cosmos_transfer1.auxiliary.tokenizer.networks.continuous_video import CausalContinuousVideoTokenizer
+from cosmos_transfer1.auxiliary.tokenizer.networks.discrete_image import DiscreteImageTokenizer
+from cosmos_transfer1.auxiliary.tokenizer.networks.discrete_video import CausalDiscreteVideoTokenizer
+class TokenizerConfigs(Enum):
+    """Default config dict for each tokenizer kind (C/D = continuous/discrete, I/V = image/video)."""
+    CI = continuous_image_dict
+    DI = discrete_image_dict
+    CV = continuous_video_dict
+    DV = discrete_video_dict
+class TokenizerModels(Enum):
+    """Tokenizer class for each tokenizer kind (C/D = continuous/discrete, I/V = image/video)."""
+    CI = ContinuousImageTokenizer
+    DI = DiscreteImageTokenizer
+    CV = CausalContinuousVideoTokenizer
+    DV = CausalDiscreteVideoTokenizer

cosmos_transfer1/auxiliary/tokenizer/networks/configs.py ADDED Viewed

	@@ -0,0 +1,147 @@

+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The default image and video tokenizer configs."""
+from cosmos_transfer1.auxiliary.tokenizer.modules import (
+    ContinuousFormulation,
+    Decoder3DType,
+    DecoderType,
+    DiscreteQuantizer,
+    Encoder3DType,
+    EncoderType,
+)
+# Default config for the continuous image tokenizer ("CI").
+continuous_image = dict(
+    # The attention resolution for res blocks.
+    attn_resolutions=[32],
+    # The base number of channels.
+    channels=128,
+    # The channel multiplier for each resolution.
+    channels_mult=[2, 4, 4],
+    dropout=0.0,
+    in_channels=3,
+    # The spatial compression ratio.
+    spatial_compression=16,
+    # The number of layers in each res block.
+    num_res_blocks=2,
+    out_channels=3,
+    resolution=1024,
+    patch_size=4,
+    patch_method="haar",
+    # The output latent dimension (channels).
+    latent_channels=16,
+    # The encoder output channels just before sampling.
+    # Which is also the decoder's input channels.
+    z_channels=16,
+    # A factor over the z_channels, to get the total channels the encoder should output.
+    # For a VAE for instance, we want to output the mean and variance, so we need 2 * z_channels.
+    z_factor=1,
+    name="CI",
+    # What formulation to use, either "AE" or "VAE".
+    # NOTE(review): this comment previously said VAE was chosen because the pre-trained
+    # checkpoints used a VAE formulation, but the value set below is AE - confirm which is intended.
+    formulation=ContinuousFormulation.AE.name,
+    # Specify type of encoder ["Default", "LiteVAE"]
+    encoder=EncoderType.Default.name,
+    # Specify type of decoder ["Default"]
+    decoder=DecoderType.Default.name,
+)
+# Default config for the discrete image tokenizer ("DI").
+discrete_image = dict(
+    # The attention resolution for res blocks.
+    attn_resolutions=[32],
+    # The base number of channels.
+    channels=128,
+    # The channel multiplier for each resolution.
+    channels_mult=[2, 4, 4],
+    dropout=0.0,
+    in_channels=3,
+    # The spatial compression ratio.
+    spatial_compression=16,
+    # The number of layers in each res block.
+    num_res_blocks=2,
+    out_channels=3,
+    resolution=1024,
+    patch_size=4,
+    patch_method="haar",
+    # The encoder output channels just before sampling.
+    z_channels=256,
+    # A factor over the z_channels, to get the total channels the encoder should output.
+    # for discrete tokenization, often we directly use the vector, so z_factor=1.
+    z_factor=1,
+    # The quantizer of choice, VQ, LFQ, FSQ, or ResFSQ.
+    quantizer=DiscreteQuantizer.FSQ.name,
+    # The embedding dimension post-quantization, which is also the input channels of the decoder.
+    # NOTE(review): the original comment continued "Which is also the output" and was left unfinished.
+    embedding_dim=6,
+    # The number of levels to use for finite scalar quantization (one entry per code dimension).
+    levels=[8, 8, 8, 5, 5, 5],
+    # The number of quantizers to use for residual finite scalar quantization.
+    num_quantizers=4,
+    name="DI",
+    # Specify type of encoder ["Default", "LiteVAE"]
+    encoder=EncoderType.Default.name,
+    # Specify type of decoder ["Default"]
+    decoder=DecoderType.Default.name,
+)
+# Default config for the continuous video tokenizer ("CV").
+# Fields shared with the image configs above carry the same meaning.
+continuous_video = dict(
+    attn_resolutions=[32],
+    channels=128,
+    channels_mult=[2, 4, 4],
+    dropout=0.0,
+    in_channels=3,
+    num_res_blocks=2,
+    out_channels=3,
+    resolution=1024,
+    patch_size=4,
+    patch_method="haar",
+    latent_channels=16,
+    z_channels=16,
+    z_factor=1,
+    num_groups=1,
+    legacy_mode=False,
+    # Compression ratios along the spatial and temporal axes.
+    spatial_compression=8,
+    temporal_compression=8,
+    formulation=ContinuousFormulation.AE.name,
+    encoder=Encoder3DType.FACTORIZED.name,
+    decoder=Decoder3DType.FACTORIZED.name,
+    name="CV",
+)
+# Default config for the discrete video tokenizer ("DV").
+# Fields shared with the image configs above carry the same meaning.
+discrete_video = dict(
+    attn_resolutions=[32],
+    channels=128,
+    channels_mult=[2, 4, 4],
+    dropout=0.0,
+    in_channels=3,
+    num_res_blocks=2,
+    out_channels=3,
+    resolution=1024,
+    patch_size=4,
+    patch_method="haar",
+    z_channels=16,
+    z_factor=1,
+    num_groups=1,
+    legacy_mode=False,
+    # Compression ratios along the spatial and temporal axes.
+    spatial_compression=16,
+    temporal_compression=8,
+    quantizer=DiscreteQuantizer.FSQ.name,
+    embedding_dim=6,
+    # Levels per code dimension for finite scalar quantization.
+    levels=[8, 8, 8, 5, 5, 5],
+    encoder=Encoder3DType.FACTORIZED.name,
+    decoder=Decoder3DType.FACTORIZED.name,
+    name="DV",
+)