Commit ee8cb8c by harry900000 · 1 parent: e875314

pip install cosmos-transfer1

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. app.py +4 -20
  2. cosmos-transfer1/.flake8 +0 -10
  3. cosmos-transfer1/.gitignore +0 -242
  4. cosmos-transfer1/.pre-commit-config.yaml +0 -55
  5. cosmos-transfer1/ATTRIBUTIONS.md +0 -1661
  6. cosmos-transfer1/CONTRIBUTING.md +0 -51
  7. cosmos-transfer1/Dockerfile +0 -47
  8. cosmos-transfer1/INSTALL.md +0 -88
  9. cosmos-transfer1/LICENSE +0 -201
  10. cosmos-transfer1/README.md +0 -102
  11. cosmos-transfer1/checkpoints/README.md +0 -3
  12. cosmos-transfer1/cosmos-transfer1.yaml +0 -30
  13. cosmos-transfer1/cosmos_transfer1/auxiliary/depth_anything/inference/__init__.py +0 -0
  14. cosmos-transfer1/cosmos_transfer1/auxiliary/depth_anything/inference/depth_anything_pipeline.py +0 -55
  15. cosmos-transfer1/cosmos_transfer1/auxiliary/depth_anything/model/__init__.py +0 -0
  16. cosmos-transfer1/cosmos_transfer1/auxiliary/depth_anything/model/depth_anything.py +0 -151
  17. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/README.md +0 -17
  18. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/__init__.py +0 -14
  19. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/aegis/__init__.py +0 -14
  20. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/aegis/aegis.py +0 -135
  21. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/aegis/categories.py +0 -192
  22. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/blocklist/__init__.py +0 -14
  23. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/blocklist/blocklist.py +0 -216
  24. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/blocklist/utils.py +0 -45
  25. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/common/__init__.py +0 -0
  26. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/common/core.py +0 -71
  27. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/common/io_utils.py +0 -78
  28. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/common/presets.py +0 -75
  29. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/face_blur_filter/__init__.py +0 -14
  30. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/face_blur_filter/blur_utils.py +0 -35
  31. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/face_blur_filter/face_blur_filter.py +0 -225
  32. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/face_blur_filter/retinaface_utils.py +0 -117
  33. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/llamaGuard3/__init__.py +0 -14
  34. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/llamaGuard3/categories.py +0 -31
  35. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/llamaGuard3/llamaGuard3.py +0 -122
  36. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/__init__.py +0 -14
  37. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/model.py +0 -60
  38. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/video_content_safety_filter.py +0 -185
  39. cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/vision_encoder.py +0 -46
  40. cosmos-transfer1/cosmos_transfer1/auxiliary/human_keypoint/human_keypoint.py +0 -155
  41. cosmos-transfer1/cosmos_transfer1/auxiliary/robot_augmentation/README.md +0 -112
  42. cosmos-transfer1/cosmos_transfer1/auxiliary/robot_augmentation/spatial_temporal_weight.py +0 -577
  43. cosmos-transfer1/cosmos_transfer1/auxiliary/sam2/sam2_model.py +0 -392
  44. cosmos-transfer1/cosmos_transfer1/auxiliary/sam2/sam2_pipeline.py +0 -126
  45. cosmos-transfer1/cosmos_transfer1/auxiliary/sam2/sam2_utils.py +0 -168
  46. cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/__init__.py +0 -14
  47. cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/image_cli.py +0 -188
  48. cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/image_lib.py +0 -124
  49. cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/utils.py +0 -402
  50. cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/video_cli.py +0 -210
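
Apart from `app.py`, every path in the list above is a pure deletion (`+0`): the commit removes the vendored `cosmos-transfer1/` checkout and, per the commit title, relies on installing the project as a package instead. Below is a minimal sketch of that switch from the app's point of view; the `pip install cosmos-transfer1` spelling is an assumption taken from the commit title, and the actual install step (e.g. a `requirements.txt` entry) is not visible in this 50-file view:

```python
import os
import sys

PWD = os.path.dirname(__file__)

# Before this commit: make the vendored checkout importable at runtime.
# sys.path.append(os.path.join(PWD, "cosmos-transfer1"))

# After this commit: the package is assumed to be installed ahead of time
# (e.g. `pip install cosmos-transfer1`), so the import resolves from
# site-packages without any path manipulation.
import cosmos_transfer1  # package name taken from the cosmos_transfer1/ source tree
```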
app.py CHANGED
@@ -1,11 +1,8 @@
 import os
-import sys
 from typing import List, Tuple

 PWD = os.path.dirname(__file__)

-sys.path.append(os.path.join(PWD, "cosmos-transfer1"))
-
 import subprocess

 subprocess.run("pip install flash-attn --no-build-isolation", env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, shell=True)
@@ -26,23 +23,10 @@ except Exception as e:
     print(f"Authentication failed: {e}")

 # download checkpoints
-subprocess.run(
-    [
-        "python",
-        os.path.join(PWD, "cosmos-transfer1", "scripts", "download_checkpoints.py"),
-        "--output_dir",
-        os.path.join(PWD, "cosmos-transfer1", "checkpoints"),
-        "--model",
-        "7b_av",
-    ],
-    shell=True,
-)
-# subprocess.run(
-#     f"python cosmos-transfer1/scripts/download_checkpoints.py \
-#     --hf_token {hf_token} \
-#     --output_dir cosmos-transfer1/checkpoints/ \
-#     --model 7b_av"
-# )
+from download_checkpoints import main as download_checkpoints
+
+os.makedirs("./checkpoints", exist_ok=True)
+download_checkpoints(hf_token="", output_dir="./checkpoints", model="7b_av")

 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Workaround to suppress MP warning
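
The replacement imports `main` from a top-level `download_checkpoints` module instead of shelling out to the vendored script. The removed call was also subtly broken: passing an argument list together with `shell=True` makes POSIX shells execute only the first element (`"python"`) and drop the remaining arguments, so the script never actually ran with its flags. The new module itself is not among the 50 files shown here, so the sketch below is only a plausible shape for it, assuming it wraps `huggingface_hub.snapshot_download` and keeps a model-name-to-repo table; the repo id used is an assumption, not taken from the commit:

```python
# Hypothetical sketch of download_checkpoints.py; the real module is not shown in this view.
from huggingface_hub import snapshot_download

# Assumed mapping from the model name app.py passes to a Hub repo id.
MODEL_REPOS = {
    "7b_av": "nvidia/Cosmos-Transfer1-7B-Sample-AV",  # assumed repo id
}


def main(hf_token: str = "", output_dir: str = "./checkpoints", model: str = "7b_av") -> None:
    """Download the checkpoint snapshot for `model` into `output_dir`."""
    snapshot_download(
        repo_id=MODEL_REPOS[model],
        local_dir=output_dir,
        token=hf_token or None,  # empty string -> fall back to cached credentials
    )
```

Called with `hf_token=""` as app.py does, such a helper would rely on the authentication attempted earlier in the file (the `Authentication failed` handler visible in the hunk context).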
 
 
cosmos-transfer1/.flake8 DELETED
@@ -1,10 +0,0 @@
-[flake8]
-enable-extensions = G
-select = B,C,E,F,G,P,SIM1,T4,W,B9
-max-line-length = 120
-# C408 ignored because we like the dict keyword argument syntax
-# E501 is not flexible enough, we're using B950 instead
-ignore =
-    E203,E305,E402,E501,E721,E741,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,E226,E265
-exclude =
-    third_party
cosmos-transfer1/.gitignore DELETED
@@ -1,242 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Misc
-outputs/
-checkpoints/*
-!checkpoints/README.md
-
-# Data types
-*.jit
-*.pt
-*.hdr
-*.webp
-*.pgm
-*.tiff
-*.tif
-*.tar
-*.tar.gz
-*.gz
-*.pkl
-*.pt
-*.bin
-
-# Other uncheckable file types
-*.zip
-*.exe
-*.dll
-*.swp
-*.vscode
-*.DS_Store
-*.pyc
-*Thumbs.db
-*.patch
-
-# Credential information that should never be checked in
-credentials
-*.secret
-
-# ------------------------ BELOW IS AUTO-GENERATED FOR PYTHON REPOS ------------------------
-
-# Byte-compiled / optimized / DLL files
-**/__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-results/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.config
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Third party
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-# in version control.
-# https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# ruff
-.ruff_cache
-
-# PyCharm
-# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-# and can be added to the global gitignore or merged into this file. For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-CLIP
-.devcontainer/devcontainer.json
-
-# Coverage
-.coverage
-coverage.xml
-
-# JUnit Reports
-report.xml
-
-# CI-CD
-temp/
-envs.txt
-manifest.json
-
-
-# locks and t5 temp files
-*.locks*
-*.no_exist*
-*models--t5*
-
-# OneLogger
-wandb/
-onelogger.err
-onelogger.log
cosmos-transfer1/.pre-commit-config.yaml DELETED
@@ -1,55 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-default_language_version:
-  python: python3.10
-repos:
-  - repo: https://github.com/pycqa/flake8
-    rev: 6.0.0
-    hooks:
-      - id: flake8
-        args:
-          - --max-line-length=120
-          - --ignore=E501,F401,E203,E402,E265,E741,F841,F821,F811,W503,E231,E225,E702
-        exclude: ^dist/|^third_party/
-
-  - repo: https://github.com/psf/black
-    rev: 23.12.1
-    hooks:
-      - id: black
-        args: [--line-length=120]
-        exclude: ^dist/|^third_party/
-
-  - repo: https://github.com/timothycrosley/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-        args: [--line-length=120]
-
-  - repo: https://github.com/MarcoGorelli/absolufy-imports
-    rev: v0.3.1
-    hooks:
-      - id: absolufy-imports
-
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.0.1
-    hooks:
-      - id: trailing-whitespace
-        exclude: ^tests/.*/fixtures/.*
-        args: [--markdown-linebreak-ext=md]
-      - id: end-of-file-fixer
-        exclude: ^tests/.*/fixtures/.*
-      - id: check-added-large-files
-        args: ['--maxkb=2000']
cosmos-transfer1/ATTRIBUTIONS.md DELETED
@@ -1,1661 +0,0 @@
-# Open Source License Attribution
-
-Cosmos uses Open Source components. You can find the details of these open-source projects along with license information below, sorted alphabetically.
-We are grateful to the developers for their contributions to open source and acknowledge these below.
-
-## Better-Profanity - [MIT License](https://github.com/snguyenthanh/better_profanity/blob/master/LICENSE)
-
-```
-
-Copyright (c) 2018 The Python Packaging Authority
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-```
-
-## FFmpeg - [FFMPEG License](https://github.com/FFmpeg/FFmpeg/blob/master/LICENSE.md)
-
-```
-# License
-
-Most files in FFmpeg are under the GNU Lesser General Public License version 2.1
-or later (LGPL v2.1+). Read the file `COPYING.LGPLv2.1` for details. Some other
-files have MIT/X11/BSD-style licenses. In combination the LGPL v2.1+ applies to
-FFmpeg.
-
-Some optional parts of FFmpeg are licensed under the GNU General Public License
-version 2 or later (GPL v2+). See the file `COPYING.GPLv2` for details. None of
-these parts are used by default, you have to explicitly pass `--enable-gpl` to
-configure to activate them. In this case, FFmpeg's license changes to GPL v2+.
-
-Specifically, the GPL parts of FFmpeg are:
-
-- libpostproc
-- optional x86 optimization in the files
-  - `libavcodec/x86/flac_dsp_gpl.asm`
-  - `libavcodec/x86/idct_mmx.c`
-  - `libavfilter/x86/vf_removegrain.asm`
-- the following building and testing tools
-  - `compat/solaris/make_sunver.pl`
-  - `doc/t2h.pm`
-  - `doc/texi2pod.pl`
-  - `libswresample/tests/swresample.c`
-  - `tests/checkasm/*`
-  - `tests/tiny_ssim.c`
-- the following filters in libavfilter:
-  - `signature_lookup.c`
-  - `vf_blackframe.c`
-  - `vf_boxblur.c`
-  - `vf_colormatrix.c`
-  - `vf_cover_rect.c`
-  - `vf_cropdetect.c`
-  - `vf_delogo.c`
-  - `vf_eq.c`
-  - `vf_find_rect.c`
-  - `vf_fspp.c`
-  - `vf_histeq.c`
-  - `vf_hqdn3d.c`
-  - `vf_kerndeint.c`
-  - `vf_lensfun.c` (GPL version 3 or later)
-  - `vf_mcdeint.c`
-  - `vf_mpdecimate.c`
-  - `vf_nnedi.c`
-  - `vf_owdenoise.c`
-  - `vf_perspective.c`
-  - `vf_phase.c`
-  - `vf_pp.c`
-  - `vf_pp7.c`
-  - `vf_pullup.c`
-  - `vf_repeatfields.c`
-  - `vf_sab.c`
-  - `vf_signature.c`
-  - `vf_smartblur.c`
-  - `vf_spp.c`
-  - `vf_stereo3d.c`
-  - `vf_super2xsai.c`
-  - `vf_tinterlace.c`
-  - `vf_uspp.c`
-  - `vf_vaguedenoiser.c`
-  - `vsrc_mptestsrc.c`
-
-Should you, for whatever reason, prefer to use version 3 of the (L)GPL, then
-the configure parameter `--enable-version3` will activate this licensing option
-for you. Read the file `COPYING.LGPLv3` or, if you have enabled GPL parts,
-`COPYING.GPLv3` to learn the exact legal terms that apply in this case.
-
-There are a handful of files under other licensing terms, namely:
-
-* The files `libavcodec/jfdctfst.c`, `libavcodec/jfdctint_template.c` and
-  `libavcodec/jrevdct.c` are taken from libjpeg, see the top of the files for
-  licensing details. Specifically note that you must credit the IJG in the
-  documentation accompanying your program if you only distribute executables.
-  You must also indicate any changes including additions and deletions to
-  those three files in the documentation.
-* `tests/reference.pnm` is under the expat license.
-
-
-## External libraries
-
-FFmpeg can be combined with a number of external libraries, which sometimes
-affect the licensing of binaries resulting from the combination.
-
-### Compatible libraries
-
-The following libraries are under GPL version 2:
-- avisynth
-- frei0r
-- libcdio
-- libdavs2
-- librubberband
-- libvidstab
-- libx264
-- libx265
-- libxavs
-- libxavs2
-- libxvid
-
-When combining them with FFmpeg, FFmpeg needs to be licensed as GPL as well by
-passing `--enable-gpl` to configure.
-
-The following libraries are under LGPL version 3:
-- gmp
-- libaribb24
-- liblensfun
-
-When combining them with FFmpeg, use the configure option `--enable-version3` to
-upgrade FFmpeg to the LGPL v3.
-
-The VMAF, mbedTLS, RK MPI, OpenCORE and VisualOn libraries are under the Apache License
-2.0. That license is incompatible with the LGPL v2.1 and the GPL v2, but not with
-version 3 of those licenses. So to combine these libraries with FFmpeg, the
-license version needs to be upgraded by passing `--enable-version3` to configure.
-
-The smbclient library is under the GPL v3, to combine it with FFmpeg,
-the options `--enable-gpl` and `--enable-version3` have to be passed to
-configure to upgrade FFmpeg to the GPL v3.
-
-### Incompatible libraries
-
-There are certain libraries you can combine with FFmpeg whose licenses are not
-compatible with the GPL and/or the LGPL. If you wish to enable these
-libraries, even in circumstances that their license may be incompatible, pass
-`--enable-nonfree` to configure. This will cause the resulting binary to be
-unredistributable.
-
-The Fraunhofer FDK AAC and OpenSSL libraries are under licenses which are
-incompatible with the GPLv2 and v3. To the best of our knowledge, they are
-compatible with the LGPL.
-
-```
-
-## Hydra-core [MIT License](https://github.com/facebookresearch/hydra/blob/main/LICENSE)
-
-```
-
-MIT License
-
-Copyright (c) Facebook, Inc. and its affiliates.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-```
-
-## ImageIo - [BSD 2-Clause "Simplified" License](https://github.com/imageio/imageio/blob/master/LICENSE)
-
-```
-
-Copyright (c) 2014-2022, imageio developers
-All rights reserved.
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are met:
-
-* Redistributions of source code must retain the above copyright notice, this
-  list of conditions and the following disclaimer.
-
-* Redistributions in binary form must reproduce the above copyright notice,
-  this list of conditions and the following disclaimer in the documentation
-  and/or other materials provided with the distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-```
-
-## Iopath - [MIT License](https://github.com/facebookresearch/iopath/blob/main/LICENSE)
-
-```
-MIT License
-
-Copyright (c) Facebook, Inc. and its affiliates.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-```
-
-## Llama-Guard-3-8B [META LLAMA 3 COMMUNITY LICENSE](https://github.com/meta-llama/llama3/blob/main/LICENSE)
-
-```
-
-META LLAMA 3 COMMUNITY LICENSE AGREEMENT
-
-Meta Llama 3 Version Release Date: April 18, 2024
-
-“Agreement” means the terms and conditions for use, reproduction, distribution, and
-modification of the Llama Materials set forth herein.
-
-“Documentation” means the specifications, manuals, and documentation accompanying Meta
-Llama 3 distributed by Meta at https://llama.meta.com/get-started/.
-
-“Licensee” or “you” means you, or your employer or any other person or entity (if you are
-entering into this Agreement on such person or entity’s behalf), of the age required under
-applicable laws, rules, or regulations to provide legal consent and that has legal authority
-to bind your employer or such other person or entity if you are entering into this Agreement
-on their behalf.
-
-“Meta Llama 3” means the foundational large language models and software and algorithms,
-including machine-learning model code, trained model weights, inference-enabling code,
-training-enabling code, fine-tuning-enabling code, and other elements of the foregoing
-distributed by Meta at https://llama.meta.com/llama-downloads.
-
-“Llama Materials” means, collectively, Meta’s proprietary Meta Llama 3 and Documentation
-(and any portion thereof) made available under this Agreement.
-
-“Meta” or “we” means Meta Platforms Ireland Limited (if you are located in or, if you are
-an entity, your principal place of business is in the EEA or Switzerland) and Meta Platforms,
-Inc. (if you are located outside of the EEA or Switzerland).
-
-By clicking “I Accept” below or by using or distributing any portion or element of the Llama
-Materials, you agree to be bound by this Agreement.
-
-1. License Rights and Redistribution.
-
-a. Grant of Rights. You are granted a non-exclusive, worldwide, non-transferable and
-royalty-free limited license under Meta’s intellectual property or other rights owned by
-Meta embodied in the Llama Materials to use, reproduce, distribute, copy, create derivative
-works of, and make modifications to the Llama Materials.
-
-b. Redistribution and Use.
-i. If you distribute or make available the Llama Materials (or any derivative works
-thereof), or a product or service that uses any of them, including another AI model, you
-shall (A) provide a copy of this Agreement with any such Llama Materials; and (B)
-prominently display “Built with Meta Llama 3” on a related website, user interface,
-blogpost, about page, or product documentation. If you use the Llama Materials to create,
-train, fine tune, or otherwise improve an AI model, which is distributed or made available,
-you shall also include “Llama 3” at the beginning of any such AI model name.
-
-ii. If you receive Llama Materials, or any derivative works thereof, from a Licensee as
-part of an integrated end user product, then Section 2 of this Agreement will not apply
-to you.
-
-iii. You must retain in all copies of the Llama Materials that you distribute the
-following attribution notice within a “Notice” text file distributed as a part of such
-copies: “Meta Llama 3 is licensed under the Meta Llama 3 Community License, Copyright ©
-Meta Platforms, Inc. All Rights Reserved.”
-
-iv. Your use of the Llama Materials must comply with applicable laws and regulations
-(including trade compliance laws and regulations) and adhere to the Acceptable Use Policy
-for the Llama Materials (available at https://llama.meta.com/llama3/use-policy), which
-is hereby incorporated by reference into this Agreement.
-
-v. You will not use the Llama Materials or any output or results of the Llama Materials
-to improve any other large language model (excluding Meta Llama 3 or derivative works
-thereof).
-
-2. Additional Commercial Terms.
-
-If, on the Meta Llama 3 version release date, the monthly active users of the products or
-services made available by or for Licensee, or Licensee’s affiliates, is greater than 700
-million monthly active users in the preceding calendar month, you must request a license
-from Meta, which Meta may grant to you in its sole discretion, and you are not authorized
-to exercise any of the rights under this Agreement unless or until Meta otherwise expressly
-grants you such rights.
-
-3. Disclaimer of Warranty.
-
-UNLESS REQUIRED BY APPLICABLE LAW, THE LLAMA MATERIALS AND ANY OUTPUT AND RESULTS THEREFROM
-ARE PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OF ANY KIND, AND META DISCLAIMS ALL
-WARRANTIES OF ANY KIND, BOTH EXPRESS AND IMPLIED, INCLUDING, WITHOUT LIMITATION, ANY
-WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-YOU ARE SOLELY RESPONSIBLE FOR DETERMINING THE APPROPRIATENESS OF USING OR REDISTRIBUTING
-THE LLAMA MATERIALS AND ASSUME ANY RISKS ASSOCIATED WITH YOUR USE OF THE LLAMA MATERIALS
-AND ANY OUTPUT AND RESULTS.
-
-4. Limitation of Liability.
-
-IN NO EVENT WILL META OR ITS AFFILIATES BE LIABLE UNDER ANY THEORY OF LIABILITY, WHETHER IN
-CONTRACT, TORT, NEGLIGENCE, PRODUCTS LIABILITY, OR OTHERWISE, ARISING OUT OF THIS AGREEMENT,
-FOR ANY LOST PROFITS OR ANY INDIRECT, SPECIAL, CONSEQUENTIAL, INCIDENTAL, EXEMPLARY OR
-PUNITIVE DAMAGES, EVEN IF META OR ITS AFFILIATES HAVE BEEN ADVISED OF THE POSSIBILITY OF ANY
-OF THE FOREGOING.
-
-5. Intellectual Property.
-
-a. No trademark licenses are granted under this Agreement, and in connection with the Llama
-Materials, neither Meta nor Licensee may use any name or mark owned by or associated with
-the other or any of its affiliates, except as required for reasonable and customary use in
-describing and redistributing the Llama Materials or as set forth in this Section 5(a).
-Meta hereby grants you a license to use “Llama 3” (the “Mark”) solely as required to comply
-with the last sentence of Section 1.b.i. You will comply with Meta’s brand guidelines
-(currently accessible at https://about.meta.com/brand/resources/meta/company-brand/).
-All goodwill arising out of your use of the Mark will inure to the benefit of Meta.
-
-b. Subject to Meta’s ownership of Llama Materials and derivatives made by or for Meta, with
-respect to any derivative works and modifications of the Llama Materials that are made by
-you, as between you and Meta, you are and will be the owner of such derivative works and
-modifications.
-
-c. If you institute litigation or other proceedings against Meta or any entity (including a
-cross-claim or counterclaim in a lawsuit) alleging that the Llama Materials or Meta Llama 3
-outputs or results, or any portion of any of the foregoing, constitutes infringement of
-intellectual property or other rights owned or licensable by you, then any licenses granted
-to you under this Agreement shall terminate as of the date such litigation or claim is filed
-or instituted. You will indemnify and hold harmless Meta from and against any claim by any
-third party arising out of or related to your use or distribution of the Llama Materials.
-
-6. Term and Termination.
-
-The term of this Agreement will commence upon your acceptance of this Agreement or access
-to the Llama Materials and will continue in full force and effect until terminated in
-accordance with the terms and conditions herein. Meta may terminate this Agreement if you
-are in breach of any term or condition of this Agreement. Upon termination of this Agreement,
-you shall delete and cease use of the Llama Materials. Sections 3, 4, and 7 shall survive
-the termination of this Agreement.
-
-7. Governing Law and Jurisdiction.
-
-This Agreement will be governed and construed under the laws of the State of California
-without regard to choice of law principles, and the UN Convention on Contracts for the
-International Sale of Goods does not apply to this Agreement. The courts of California
-shall have exclusive jurisdiction of any dispute arising out of this Agreement.
-
-META LLAMA 3 ACCEPTABLE USE POLICY
-
-Meta is committed to promoting safe and fair use of its tools and features, including Meta
-Llama 3. If you access or use Meta Llama 3, you agree to this Acceptable Use Policy
-(“Policy”). The most recent copy of this policy can be found at
-https://llama.meta.com/llama3/use-policy.
-
-Prohibited Uses
-
-We want everyone to use Meta Llama 3 safely and responsibly. You agree you will not use, or
-allow others to use, Meta Llama 3 to:
-
-1. Violate the law or others’ rights, including to:
-
-a. Engage in, promote, generate, contribute to, encourage, plan, incite, or further illegal
-or unlawful activity or content, such as:
-
-i. Violence or terrorism
-ii. Exploitation or harm to children, including the solicitation, creation, acquisition,
-or dissemination of child exploitative content or failure to report Child Sexual Abuse
-Material
-iii. Human trafficking, exploitation, and sexual violence
-iv. The illegal distribution of information or materials to minors, including obscene
-materials, or failure to employ legally required age-gating in connection with such
-information or materials
-v. Sexual solicitation
-vi. Any other criminal activity
-
-b. Engage in, promote, incite, or facilitate the harassment, abuse, threatening, or
-bullying of individuals or groups of individuals
-
-c. Engage in, promote, incite, or facilitate discrimination or other unlawful or harmful
-conduct in the provision of employment, employment benefits, credit, housing, other economic
-benefits, or other essential goods and services
-
-d. Engage in the unauthorized or unlicensed practice of any profession including, but not
-limited to, financial, legal, medical/health, or related professional practices
-
-e. Collect, process, disclose, generate, or infer health, demographic, or other sensitive
-personal or private information about individuals without rights and consents required by
-applicable laws
-
-f. Engage in or facilitate any action or generate any content that infringes, misappropriates,
-or otherwise violates any third-party rights, including the outputs or results of any
-products or services using the Llama Materials
-
-g. Create, generate, or facilitate the creation of malicious code, malware, computer viruses
-or do anything else that could disable, overburden, interfere with or impair the proper
-working, integrity, operation, or appearance of a website or computer system
-
-2. Engage in, promote, incite, facilitate, or assist in the planning or development of
-activities that present a risk of death or bodily harm to individuals, including use of Meta
-Llama 3 related to the following:
-
-a. Military, warfare, nuclear industries or applications, espionage, use for materials or
-activities that are subject to the International Traffic Arms Regulations (ITAR) maintained
-by the United States Department of State
-b. Guns and illegal weapons (including weapon development)
-c. Illegal drugs and regulated/controlled substances
-d. Operation of critical infrastructure, transportation technologies, or heavy machinery
-e. Self-harm or harm to others, including suicide, cutting, and eating disorders
-f. Any content intended to incite or promote violence, abuse, or any infliction of bodily
-harm to an individual
-
-3. Intentionally deceive or mislead others, including use of Meta Llama 3 related to the
-following:
-
-a. Generating, promoting, or furthering fraud or the creation or promotion of disinformation
-b. Generating, promoting, or furthering defamatory content, including the creation of
-defamatory statements, images, or other content
-c. Generating, promoting, or further distributing spam
-d. Impersonating another individual without consent, authorization, or legal right
-e. Representing that the use of Meta Llama 3 or outputs are human-generated
-f. Generating or facilitating false online engagement, including fake reviews and other
-means of fake online engagement
-g. Fail to appropriately disclose to end users any known dangers of your AI system
-
-Please report any violation of this Policy, software “bug,” or other problems that could
-lead to a violation of this Policy through one of the following means:
-
-* Reporting issues with the model: https://github.com/meta-llama/llama3
-* Reporting risky content generated by the model: developers.facebook.com/llama_output_feedback
-* Reporting bugs and security concerns: facebook.com/whitehat/info
-* Reporting violations of the Acceptable Use Policy or unlicensed uses of Meta Llama 3:
-LlamaUseReport@meta.com
-
-```
-
-## Loguru - [MIT License](https://github.com/Delgan/loguru/blob/master/LICENSE)
-
-```
-
-MIT License
-
-Copyright (c) 2017
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-```
-
-## Mediapy - [Apache License 2.0](https://github.com/google/mediapy/blob/main/LICENSE)
-
-```
-
-Apache License
-Version 2.0, January 2004
-http://www.apache.org/licenses/
-
-TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-1. Definitions.
-
-"License" shall mean the terms and conditions for use, reproduction,
-and distribution as defined by Sections 1 through 9 of this document.
-
-"Licensor" shall mean the copyright owner or entity authorized by
-the copyright owner that is granting the License.
-
-"Legal Entity" shall mean the union of the acting entity and all
-other entities that control, are controlled by, or are under common
-control with that entity. For the purposes of this definition,
-"control" means (i) the power, direct or indirect, to cause the
-direction or management of such entity, whether by contract or
-otherwise, or (ii) ownership of fifty percent (50%) or more of the
-outstanding shares, or (iii) beneficial ownership of such entity.
-
-"You" (or "Your") shall mean an individual or Legal Entity
-exercising permissions granted by this License.
-
-"Source" form shall mean the preferred form for making modifications,
-including but not limited to software source code, documentation
-source, and configuration files.
-
-"Object" form shall mean any form resulting from mechanical
-transformation or translation of a Source form, including but
-not limited to compiled object code, generated documentation,
-and conversions to other media types.
-
-"Work" shall mean the work of authorship, whether in Source or
-Object form, made available under the License, as indicated by a
-copyright notice that is included in or attached to the work
-(an example is provided in the Appendix below).
-
-"Derivative Works" shall mean any work, whether in Source or Object
-form, that is based on (or derived from) the Work and for which the
-editorial revisions, annotations, elaborations, or other modifications
-represent, as a whole, an original work of authorship. For the purposes
-of this License, Derivative Works shall not include works that remain
-separable from, or merely link (or bind by name) to the interfaces of,
-the Work and Derivative Works thereof.
-
-"Contribution" shall mean any work of authorship, including
-the original version of the Work and any modifications or additions
-to that Work or Derivative Works thereof, that is intentionally
-submitted to Licensor for inclusion in the Work by the copyright owner
-or by an individual or Legal Entity authorized to submit on behalf of
-the copyright owner. For the purposes of this definition, "submitted"
-means any form of electronic, verbal, or written communication sent
-to the Licensor or its representatives, including but not limited to
-communication on electronic mailing lists, source code control systems,
-and issue tracking systems that are managed by, or on behalf of, the
-Licensor for the purpose of discussing and improving the Work, but
-excluding communication that is conspicuously marked or otherwise
-designated in writing by the copyright owner as "Not a Contribution."
-
-"Contributor" shall mean Licensor and any individual or Legal Entity
-on behalf of whom a Contribution has been received by Licensor and
-subsequently incorporated within the Work.
-
-2. Grant of Copyright License. Subject to the terms and conditions of
-this License, each Contributor hereby grants to You a perpetual,
-worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-copyright license to reproduce, prepare Derivative Works of,
-publicly display, publicly perform, sublicense, and distribute the
-Work and such Derivative Works in Source or Object form.
-
-3. Grant of Patent License. Subject to the terms and conditions of
-this License, each Contributor hereby grants to You a perpetual,
-worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-(except as stated in this section) patent license to make, have made,
-use, offer to sell, sell, import, and otherwise transfer the Work,
-where such license applies only to those patent claims licensable
-by such Contributor that are necessarily infringed by their
-Contribution(s) alone or by combination of their Contribution(s)
-with the Work to which such Contribution(s) was submitted. If You
-institute patent litigation against any entity (including a
-cross-claim or counterclaim in a lawsuit) alleging that the Work
-or a Contribution incorporated within the Work constitutes direct
-or contributory patent infringement, then any patent licenses
-granted to You under this License for that Work shall terminate
-as of the date such litigation is filed.
-
-4. Redistribution. You may reproduce and distribute copies of the
-Work or Derivative Works thereof in any medium, with or without
-modifications, and in Source or Object form, provided that You
-meet the following conditions:
-
-(a) You must give any other recipients of the Work or
-Derivative Works a copy of this License; and
-
-(b) You must cause any modified files to carry prominent notices
-stating that You changed the files; and
-
-(c) You must retain, in the Source form of any Derivative Works
-that You distribute, all copyright, patent, trademark, and
-attribution notices from the Source form of the Work,
-excluding those notices that do not pertain to any part of
-the Derivative Works; and
-
-(d) If the Work includes a "NOTICE" text file as part of its
-distribution, then any Derivative Works that You distribute must
-include a readable copy of the attribution notices contained
-within such NOTICE file, excluding those notices that do not
-pertain to any part of the Derivative Works, in at least one
-of the following places: within a NOTICE text file distributed
-as part of the Derivative Works; within the Source form or
-documentation, if provided along with the Derivative Works; or,
-within a display generated by the Derivative Works, if and
-wherever such third-party notices normally appear. The contents
-of the NOTICE file are for informational purposes only and
-do not modify the License. You may add Your own attribution
-notices within Derivative Works that You distribute, alongside
-or as an addendum to the NOTICE text from the Work, provided
-that such additional attribution notices cannot be construed
-as modifying the License.
-
-You may add Your own copyright statement to Your modifications and
-may provide additional or different license terms and conditions
-for use, reproduction, or distribution of Your modifications, or
-for any such Derivative Works as a whole, provided Your use,
-reproduction, and distribution of the Work otherwise complies with
-the conditions stated in this License.
-
-5. Submission of Contributions. Unless You explicitly state otherwise,
-any Contribution intentionally submitted for inclusion in the Work
-by You to the Licensor shall be under the terms and conditions of
-this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify
-the terms of any separate license agreement you may have executed
-with Licensor regarding such Contributions.
-
-6. Trademarks. This License does not grant permission to use the trade
-names, trademarks, service marks, or product names of the Licensor,
-except as required for reasonable and customary use in describing the
-origin of the Work and reproducing the content of the NOTICE file.
-
-7. Disclaimer of Warranty. Unless required by applicable law or
-agreed to in writing, Licensor provides the Work (and each
-Contributor provides its Contributions) on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-implied, including, without limitation, any warranties or conditions
-of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-PARTICULAR PURPOSE. You are solely responsible for determining the
-appropriateness of using or redistributing the Work and assume any
-risks associated with Your exercise of permissions under this License.
-
-8. Limitation of Liability. In no event and under no legal theory,
-whether in tort (including negligence), contract, or otherwise,
-unless required by applicable law (such as deliberate and grossly
-negligent acts) or agreed to in writing, shall any Contributor be
-liable to You for damages, including any direct, indirect, special,
-incidental, or consequential damages of any character arising as a
-result of this License or out of the use or inability to use the
-Work (including but not limited to damages for loss of goodwill,
-work stoppage, computer failure or malfunction, or any and all
-other commercial damages or losses), even if such Contributor
-has been advised of the possibility of such damages.
-
-9. Accepting Warranty or Additional Liability. While redistributing
-the Work or Derivative Works thereof, You may choose to offer,
-and charge a fee for, acceptance of support, warranty, indemnity,
-or other liability obligations and/or rights consistent with this
-License. However, in accepting such obligations, You may act only
-on Your own behalf and on Your sole responsibility, not on behalf
-of any other Contributor, and only if You agree to indemnify,
-defend, and hold each Contributor harmless for any liability
-incurred by, or claims asserted against, such Contributor by reason
-of your accepting any such warranty or additional liability.
-
-END OF TERMS AND CONDITIONS
-
-APPENDIX: How to apply the Apache License to your work.
-
-To apply the Apache License to your work, attach the following
-boilerplate notice, with the fields enclosed by brackets "[]"
-replaced with your own identifying information. (Don't include
-the brackets!) The text should be enclosed in the appropriate
-comment syntax for the file format. We also recommend that a
-file or class name and description of purpose be included on the
-same "printed page" as the copyright notice for easier
-identification within third-party archives.
-
-Copyright [yyyy] [name of copyright owner]
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-
-```
-
-## Nltk - [Apache License 2.0](https://github.com/nltk/nltk/blob/develop/LICENSE.txt)
-
-```
-
-Apache License
-Version 2.0, January 2004
-http://www.apache.org/licenses/
-
-TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
-1. Definitions.
-
-"License" shall mean the terms and conditions for use, reproduction,
-and distribution as defined by Sections 1 through 9 of this document.
-
-"Licensor" shall mean the copyright owner or entity authorized by
-the copyright owner that is granting the License.
-
-"Legal Entity" shall mean the union of the acting entity and all
-other entities that control, are controlled by, or are under common
-control with that entity. For the purposes of this definition,
-"control" means (i) the power, direct or indirect, to cause the
-direction or management of such entity, whether by contract or
-otherwise, or (ii) ownership of fifty percent (50%) or more of the
-outstanding shares, or (iii) beneficial ownership of such entity.
-
-"You" (or "Your") shall mean an individual or Legal Entity
-exercising permissions granted by this License.
-
-"Source" form shall mean the preferred form for making modifications,
-including but not limited to software source code, documentation
-source, and configuration files.
-
-"Object" form shall mean any form resulting from mechanical
-transformation or translation of a Source form, including but
-not limited to compiled object code, generated documentation,
-and conversions to other media types.
-
-"Work" shall mean the work of authorship, whether in Source or
-Object form, made available under the License, as indicated by a
-copyright notice that is included in or attached to the work
-(an example is provided in the Appendix below).
-
-"Derivative Works" shall mean any work, whether in Source or Object
-form, that is based on (or derived from) the Work and for which the
-editorial revisions, annotations, elaborations, or other modifications
-represent, as a whole, an original work of authorship. For the purposes
-of this License, Derivative Works shall not include works that remain
-separable from, or merely link (or bind by name) to the interfaces of,
-the Work and Derivative Works thereof.
-
-"Contribution" shall mean any work of authorship, including
-the original version of the Work and any modifications or additions
-to that Work or Derivative Works thereof, that is intentionally
-submitted to Licensor for inclusion in the Work by the copyright owner
-or by an individual or Legal Entity authorized to submit on behalf of
-the copyright owner. For the purposes of this definition, "submitted"
-means any form of electronic, verbal, or written communication sent
-to the Licensor or its representatives, including but not limited to
-communication on electronic mailing lists, source code control systems,
-and issue tracking systems that are managed by, or on behalf of, the
-Licensor for the purpose of discussing and improving the Work, but
-excluding communication that is conspicuously marked or otherwise
-designated in writing by the copyright owner as "Not a Contribution."
-
-"Contributor" shall mean Licensor and any individual or Legal Entity
-on behalf of whom a Contribution has been received by Licensor and
-subsequently incorporated within the Work.
-
-2. Grant of Copyright License. Subject to the terms and conditions of
-this License, each Contributor hereby grants to You a perpetual,
-worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-copyright license to reproduce, prepare Derivative Works of,
-publicly display, publicly perform, sublicense, and distribute the
-Work and such Derivative Works in Source or Object form.
-
-3. Grant of Patent License. Subject to the terms and conditions of
-this License, each Contributor hereby grants to You a perpetual,
-worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-(except as stated in this section) patent license to make, have made,
-use, offer to sell, sell, import, and otherwise transfer the Work,
-where such license applies only to those patent claims licensable
-by such Contributor that are necessarily infringed by their
-Contribution(s) alone or by combination of their Contribution(s)
-with the Work to which such Contribution(s) was submitted. If You
-institute patent litigation against any entity (including a
-cross-claim or counterclaim in a lawsuit) alleging that the Work
-or a Contribution incorporated within the Work constitutes direct
-or contributory patent infringement, then any patent licenses
-granted to You under this License for that Work shall terminate
-as of the date such litigation is filed.
-
-4. Redistribution. You may reproduce and distribute copies of the
-Work or Derivative Works thereof in any medium, with or without
-modifications, and in Source or Object form, provided that You
-meet the following conditions:
-
-(a) You must give any other recipients of the Work or
-Derivative Works a copy of this License; and
-
-(b) You must cause any modified files to carry prominent notices
-stating that You changed the files; and
-
-(c) You must retain, in the Source form of any Derivative Works
-that You distribute, all copyright, patent, trademark, and
-attribution notices from the Source form of the Work,
-excluding those notices that do not pertain to any part of
-the Derivative Works; and
-
-(d) If the Work includes a "NOTICE" text file as part of its
-distribution, then any Derivative Works that You distribute must
-include a readable copy of the attribution notices contained
-within such NOTICE file, excluding those notices that do not
-pertain to any part of the Derivative Works, in at least one
-of the following places: within a NOTICE text file distributed
-as part of the Derivative Works; within the Source form or
-documentation, if provided along with the Derivative Works; or,
-within a display generated by the Derivative Works, if and
-wherever such third-party notices normally appear. The contents
-of the NOTICE file are for informational purposes only and
-do not modify the License. You may add Your own attribution
-notices within Derivative Works that You distribute, alongside
-or as an addendum to the NOTICE text from the Work, provided
-that such additional attribution notices cannot be construed
-as modifying the License.
-
-You may add Your own copyright statement to Your modifications and
-may provide additional or different license terms and conditions
-for use, reproduction, or distribution of Your modifications, or
-for any such Derivative Works as a whole, provided Your use,
-reproduction, and distribution of the Work otherwise complies with
-the conditions stated in this License.
-
-5. Submission of Contributions. Unless You explicitly state otherwise,
-any Contribution intentionally submitted for inclusion in the Work
-by You to the Licensor shall be under the terms and conditions of
-this License, without any additional terms or conditions.
-Notwithstanding the above, nothing herein shall supersede or modify
-the terms of any separate license agreement you may have executed
-with Licensor regarding such Contributions.
-
-6. Trademarks. This License does not grant permission to use the trade
-names, trademarks, service marks, or product names of the Licensor,
-except as required for reasonable and customary use in describing the
-origin of the Work and reproducing the content of the NOTICE file.
-
-7. Disclaimer of Warranty. Unless required by applicable law or
-agreed to in writing, Licensor provides the Work (and each
-Contributor provides its Contributions) on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-implied, including, without limitation, any warranties or conditions
863
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
864
- PARTICULAR PURPOSE. You are solely responsible for determining the
865
- appropriateness of using or redistributing the Work and assume any
866
- risks associated with Your exercise of permissions under this License.
867
-
868
- 8. Limitation of Liability. In no event and under no legal theory,
869
- whether in tort (including negligence), contract, or otherwise,
870
- unless required by applicable law (such as deliberate and grossly
871
- negligent acts) or agreed to in writing, shall any Contributor be
872
- liable to You for damages, including any direct, indirect, special,
873
- incidental, or consequential damages of any character arising as a
874
- result of this License or out of the use or inability to use the
875
- Work (including but not limited to damages for loss of goodwill,
876
- work stoppage, computer failure or malfunction, or any and all
877
- other commercial damages or losses), even if such Contributor
878
- has been advised of the possibility of such damages.
879
-
880
- 9. Accepting Warranty or Additional Liability. While redistributing
881
- the Work or Derivative Works thereof, You may choose to offer,
882
- and charge a fee for, acceptance of support, warranty, indemnity,
883
- or other liability obligations and/or rights consistent with this
884
- License. However, in accepting such obligations, You may act only
885
- on Your own behalf and on Your sole responsibility, not on behalf
886
- of any other Contributor, and only if You agree to indemnify,
887
- defend, and hold each Contributor harmless for any liability
888
- incurred by, or claims asserted against, such Contributor by reason
889
- of your accepting any such warranty or additional liability.
890
-
891
- END OF TERMS AND CONDITIONS
892
-
893
- APPENDIX: How to apply the Apache License to your work.
894
-
895
- To apply the Apache License to your work, attach the following
896
- boilerplate notice, with the fields enclosed by brackets "[]"
897
- replaced with your own identifying information. (Don't include
898
- the brackets!) The text should be enclosed in the appropriate
899
- comment syntax for the file format. We also recommend that a
900
- file or class name and description of purpose be included on the
901
- same "printed page" as the copyright notice for easier
902
- identification within third-party archives.
903
-
904
- Copyright [yyyy] [name of copyright owner]
905
-
906
- Licensed under the Apache License, Version 2.0 (the "License");
907
- you may not use this file except in compliance with the License.
908
- You may obtain a copy of the License at
909
-
910
- http://www.apache.org/licenses/LICENSE-2.0
911
-
912
- Unless required by applicable law or agreed to in writing, software
913
- distributed under the License is distributed on an "AS IS" BASIS,
914
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
915
- See the License for the specific language governing permissions and
916
- limitations under the License.
917
-
918
- ```
919
-
- ## PEFT - [Apache License 2.0](https://github.com/huggingface/peft/blob/main/LICENSE)
-
- ```
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
- ```
-
- ## Pillow - [MIT License](https://github.com/python-pillow/Pillow/blob/main/LICENSE)
-
- ```
-
- The Python Imaging Library (PIL) is
-
- Copyright © 1997-2011 by Secret Labs AB
- Copyright © 1995-2011 by Fredrik Lundh and contributors
-
- Pillow is the friendly PIL fork. It is
-
- Copyright © 2010 by Jeffrey A. Clark and contributors
-
- Like PIL, Pillow is licensed under the open source MIT-CMU License:
-
- By obtaining, using, and/or copying this software and/or its associated
- documentation, you agree that you have read, understood, and will comply
- with the following terms and conditions:
-
- Permission to use, copy, modify and distribute this software and its
- documentation for any purpose and without fee is hereby granted,
- provided that the above copyright notice appears in all copies, and that
- both that copyright notice and this permission notice appear in supporting
- documentation, and that the name of Secret Labs AB or the author not be
- used in advertising or publicity pertaining to distribution of the software
- without specific, written prior permission.
-
- SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
- SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
- IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR ANY SPECIAL,
- INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
- LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
- OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
- PERFORMANCE OF THIS SOFTWARE.
-
- ```
-
- ## PyAV - [BSD 3-Clause "New" or "Revised" License](https://github.com/PyAV-Org/PyAV/blob/main/LICENSE.txt)
-
- ```
-
- Copyright retained by original committers. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
- * Neither the name of the project nor the names of its contributors may be
- used to endorse or promote products derived from this software without
- specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY DIRECT,
- INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
- NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
- EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ```
-
- ## Pytorch_Retinaface - [MIT License](https://github.com/biubug6/Pytorch_Retinaface/blob/master/LICENSE.MIT)
-
- ```
- MIT License
-
- Copyright (c) 2019
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
- ```
-
- ## Sentencepiece - [Apache License 2.0](https://github.com/google/sentencepiece/blob/master/LICENSE)
-
- ```
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
- ```
-
- ## Termcolor - [MIT License](https://github.com/termcolor/termcolor/blob/main/COPYING.txt)
-
- ```
- Copyright (c) 2008-2011 Volvox Development Team
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- ```
-
- ## Transformers - [Apache License 2.0](https://github.com/huggingface/transformers/blob/main/LICENSE)
-
- ```
-
- Copyright 2018- The Hugging Face team. All rights reserved.
-
- Apache License
- Version 2.0, January 2004
- http://www.apache.org/licenses/
-
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-
- 1. Definitions.
-
- "License" shall mean the terms and conditions for use, reproduction,
- and distribution as defined by Sections 1 through 9 of this document.
-
- "Licensor" shall mean the copyright owner or entity authorized by
- the copyright owner that is granting the License.
-
- "Legal Entity" shall mean the union of the acting entity and all
- other entities that control, are controlled by, or are under common
- control with that entity. For the purposes of this definition,
- "control" means (i) the power, direct or indirect, to cause the
- direction or management of such entity, whether by contract or
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
- outstanding shares, or (iii) beneficial ownership of such entity.
-
- "You" (or "Your") shall mean an individual or Legal Entity
- exercising permissions granted by this License.
-
- "Source" form shall mean the preferred form for making modifications,
- including but not limited to software source code, documentation
- source, and configuration files.
-
- "Object" form shall mean any form resulting from mechanical
- transformation or translation of a Source form, including but
- not limited to compiled object code, generated documentation,
- and conversions to other media types.
-
- "Work" shall mean the work of authorship, whether in Source or
- Object form, made available under the License, as indicated by a
- copyright notice that is included in or attached to the work
- (an example is provided in the Appendix below).
-
- "Derivative Works" shall mean any work, whether in Source or Object
- form, that is based on (or derived from) the Work and for which the
- editorial revisions, annotations, elaborations, or other modifications
- represent, as a whole, an original work of authorship. For the purposes
- of this License, Derivative Works shall not include works that remain
- separable from, or merely link (or bind by name) to the interfaces of,
- the Work and Derivative Works thereof.
-
- "Contribution" shall mean any work of authorship, including
- the original version of the Work and any modifications or additions
- to that Work or Derivative Works thereof, that is intentionally
- submitted to Licensor for inclusion in the Work by the copyright owner
- or by an individual or Legal Entity authorized to submit on behalf of
- the copyright owner. For the purposes of this definition, "submitted"
- means any form of electronic, verbal, or written communication sent
- to the Licensor or its representatives, including but not limited to
- communication on electronic mailing lists, source code control systems,
- and issue tracking systems that are managed by, or on behalf of, the
- Licensor for the purpose of discussing and improving the Work, but
- excluding communication that is conspicuously marked or otherwise
- designated in writing by the copyright owner as "Not a Contribution."
-
- "Contributor" shall mean Licensor and any individual or Legal Entity
- on behalf of whom a Contribution has been received by Licensor and
- subsequently incorporated within the Work.
-
- 2. Grant of Copyright License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- copyright license to reproduce, prepare Derivative Works of,
- publicly display, publicly perform, sublicense, and distribute the
- Work and such Derivative Works in Source or Object form.
-
- 3. Grant of Patent License. Subject to the terms and conditions of
- this License, each Contributor hereby grants to You a perpetual,
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
- (except as stated in this section) patent license to make, have made,
- use, offer to sell, sell, import, and otherwise transfer the Work,
- where such license applies only to those patent claims licensable
- by such Contributor that are necessarily infringed by their
- Contribution(s) alone or by combination of their Contribution(s)
- with the Work to which such Contribution(s) was submitted. If You
- institute patent litigation against any entity (including a
- cross-claim or counterclaim in a lawsuit) alleging that the Work
- or a Contribution incorporated within the Work constitutes direct
- or contributory patent infringement, then any patent licenses
- granted to You under this License for that Work shall terminate
- as of the date such litigation is filed.
-
- 4. Redistribution. You may reproduce and distribute copies of the
- Work or Derivative Works thereof in any medium, with or without
- modifications, and in Source or Object form, provided that You
- meet the following conditions:
-
- (a) You must give any other recipients of the Work or
- Derivative Works a copy of this License; and
-
- (b) You must cause any modified files to carry prominent notices
- stating that You changed the files; and
-
- (c) You must retain, in the Source form of any Derivative Works
- that You distribute, all copyright, patent, trademark, and
- attribution notices from the Source form of the Work,
- excluding those notices that do not pertain to any part of
- the Derivative Works; and
-
- (d) If the Work includes a "NOTICE" text file as part of its
- distribution, then any Derivative Works that You distribute must
- include a readable copy of the attribution notices contained
- within such NOTICE file, excluding those notices that do not
- pertain to any part of the Derivative Works, in at least one
- of the following places: within a NOTICE text file distributed
- as part of the Derivative Works; within the Source form or
- documentation, if provided along with the Derivative Works; or,
- within a display generated by the Derivative Works, if and
- wherever such third-party notices normally appear. The contents
- of the NOTICE file are for informational purposes only and
- do not modify the License. You may add Your own attribution
- notices within Derivative Works that You distribute, alongside
- or as an addendum to the NOTICE text from the Work, provided
- that such additional attribution notices cannot be construed
- as modifying the License.
-
- You may add Your own copyright statement to Your modifications and
- may provide additional or different license terms and conditions
- for use, reproduction, or distribution of Your modifications, or
- for any such Derivative Works as a whole, provided Your use,
- reproduction, and distribution of the Work otherwise complies with
- the conditions stated in this License.
-
- 5. Submission of Contributions. Unless You explicitly state otherwise,
- any Contribution intentionally submitted for inclusion in the Work
- by You to the Licensor shall be under the terms and conditions of
- this License, without any additional terms or conditions.
- Notwithstanding the above, nothing herein shall supersede or modify
- the terms of any separate license agreement you may have executed
- with Licensor regarding such Contributions.
-
- 6. Trademarks. This License does not grant permission to use the trade
- names, trademarks, service marks, or product names of the Licensor,
- except as required for reasonable and customary use in describing the
- origin of the Work and reproducing the content of the NOTICE file.
-
- 7. Disclaimer of Warranty. Unless required by applicable law or
- agreed to in writing, Licensor provides the Work (and each
- Contributor provides its Contributions) on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
- implied, including, without limitation, any warranties or conditions
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
- PARTICULAR PURPOSE. You are solely responsible for determining the
- appropriateness of using or redistributing the Work and assume any
- risks associated with Your exercise of permissions under this License.
-
- 8. Limitation of Liability. In no event and under no legal theory,
- whether in tort (including negligence), contract, or otherwise,
- unless required by applicable law (such as deliberate and grossly
- negligent acts) or agreed to in writing, shall any Contributor be
- liable to You for damages, including any direct, indirect, special,
- incidental, or consequential damages of any character arising as a
- result of this License or out of the use or inability to use the
- Work (including but not limited to damages for loss of goodwill,
- work stoppage, computer failure or malfunction, or any and all
- other commercial damages or losses), even if such Contributor
- has been advised of the possibility of such damages.
-
- 9. Accepting Warranty or Additional Liability. While redistributing
- the Work or Derivative Works thereof, You may choose to offer,
- and charge a fee for, acceptance of support, warranty, indemnity,
- or other liability obligations and/or rights consistent with this
- License. However, in accepting such obligations, You may act only
- on Your own behalf and on Your sole responsibility, not on behalf
- of any other Contributor, and only if You agree to indemnify,
- defend, and hold each Contributor harmless for any liability
- incurred by, or claims asserted against, such Contributor by reason
- of your accepting any such warranty or additional liability.
-
- END OF TERMS AND CONDITIONS
-
- APPENDIX: How to apply the Apache License to your work.
-
- To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "[]"
- replaced with your own identifying information. (Don't include
- the brackets!) The text should be enclosed in the appropriate
- comment syntax for the file format. We also recommend that a
- file or class name and description of purpose be included on the
- same "printed page" as the copyright notice for easier
- identification within third-party archives.
-
- Copyright [yyyy] [name of copyright owner]
-
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-
- ```
cosmos-transfer1/CONTRIBUTING.md DELETED
@@ -1,51 +0,0 @@
- # How to Contribute
-
- We'd love to receive your patches and contributions. Please keep your PRs as drafts until you would like us to review them.
-
- ## Code Reviews
-
- All submissions, including submissions by project members, require review. We use GitHub pull requests for this purpose. Consult
- [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more information on using pull requests.
-
- ## Signing Your Work
-
- * We require that all contributors "sign off" on their commits. This certifies that the contribution is your original work, or that you have the right to submit it under the same license or a compatible license.
-
- * Any contribution that contains commits that are not signed off will not be accepted.
-
- * To sign off on a commit, use the `--signoff` (or `-s`) option when committing your changes:
- ```bash
- $ git commit -s -m "Add cool feature."
- ```
- This will append the following to your commit message:
- ```
- Signed-off-by: Your Name <[email protected]>
- ```
-
- * Full text of the DCO:
-
- ```
- Developer Certificate of Origin
- Version 1.1
-
- Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
- 1 Letterman Drive
- Suite D4700
- San Francisco, CA, 94129
-
- Everyone is permitted to copy and distribute verbatim copies of this license document, but changing it is not allowed.
- ```
-
- ```
- Developer's Certificate of Origin 1.1
-
- By making a contribution to this project, I certify that:
-
- (a) The contribution was created in whole or in part by me and I have the right to submit it under the open source license indicated in the file; or
-
- (b) The contribution is based upon previous work that, to the best of my knowledge, is covered under an appropriate open source license and I have the right under that license to submit that work with modifications, whether created in whole or in part by me, under the same open source license (unless I am permitted to submit under a different license), as indicated in the file; or
-
- (c) The contribution was provided directly to me by some other person who certified (a), (b) or (c) and I have not modified it.
-
- (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved.
- ```
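The sign-off requirement described in the deleted CONTRIBUTING.md above can also be satisfied retroactively. The following is a minimal sketch using standard git commands; the `main` base branch, `origin` remote, and `my-feature-branch` name are illustrative placeholders, not part of the original document:

```bash
# Sign off the most recent commit in place, keeping its message.
git commit --amend --signoff --no-edit

# Sign off every commit on the current branch since it diverged from main.
git rebase --signoff main

# Rewriting history requires a force-push to update the PR branch.
git push --force-with-lease origin my-feature-branch
```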
cosmos-transfer1/Dockerfile DELETED
@@ -1,47 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- # Use NVIDIA PyTorch container as base image
- FROM nvcr.io/nvidia/tritonserver:25.04-vllm-python-py3
-
- # Install basic tools
- RUN apt-get update && apt-get install -y git tree ffmpeg wget
- RUN rm /bin/sh && ln -s /bin/bash /bin/sh && ln -s /lib64/libcuda.so.1 /lib64/libcuda.so
- RUN apt-get install -y libglib2.0-0
-
- # Copy the cosmos-transfer1.yaml and requirements.txt files to the container
- COPY ./cosmos-transfer1.yaml /cosmos-transfer1.yaml
- COPY ./requirements_docker.txt /requirements.txt
-
- RUN ls -l /usr/lib/python3/dist-packages/blinker-1.7.0.dist-info && rm -rf /usr/lib/python3/dist-packages/blinker-1.7.0.dist-info
- RUN echo "Installing dependencies. This will take a while..." && \
- pip install --no-cache-dir -r /requirements.txt && \
- pip install -v --upgrade --no-build-isolation --no-dependencies sam2==1.1.0 && \
- pip install transformer-engine[pytorch] && \
- pip install decord==0.6.0 && \
- git clone https://github.com/NVIDIA/apex && \
- pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex && \
- rm -rf apex && \
- pip install -v decord==0.6.0 && \
- echo "Environment setup complete"
-
- # Create Python symlink
- RUN ln -s /usr/bin/python3.12 /usr/bin/python
- RUN apt-get install -y libmagic1
-
- RUN mkdir -p /workspace
- WORKDIR /workspace
-
- CMD ["/bin/bash"]
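For reference, an image built from the Dockerfile above is typically built and entered roughly as follows. This is a sketch rather than a documented entry point: the tag mirrors the build command in the deleted INSTALL.md, the `/workspace` mount matches the `WORKDIR` set above, and `--gpus all` assumes the NVIDIA Container Toolkit is installed:

```bash
# Build the image from the repository root.
docker build -f Dockerfile . -t nvcr.io/$USER/cosmos-transfer1:latest

# Start an interactive shell with GPU access, mounting the repo at /workspace
# (the working directory configured in the Dockerfile).
docker run --gpus all -it --rm \
  -v "$(pwd)":/workspace \
  nvcr.io/$USER/cosmos-transfer1:latest
```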
cosmos-transfer1/INSTALL.md DELETED
@@ -1,88 +0,0 @@
1
- ## Environment setup
2
-
3
- Clone the `cosmos-transfer1` source code
4
- ```bash
5
- git clone [email protected]:nvidia-cosmos/cosmos-transfer1.git
6
- cd cosmos-transfer1
7
- git submodule update --init --recursive
8
- ```
9
-
10
- Cosmos runs only on Linux systems. We have tested the installation with Ubuntu 24.04, 22.04, and 20.04.
11
- Cosmos requires Python `3.12.x`.
12
-
13
- ### Inference using conda
14
-
15
- Please also make sure you have `conda` installed ([instructions](https://docs.conda.io/projects/conda/en/latest/user-guide/install/index.html)).
16
-
17
- The below commands create the `cosmos-transfer1` conda environment and install the dependencies for inference:
18
- ```bash
19
- # Create the cosmos-transfer1 conda environment.
20
- conda env create --file cosmos-transfer1.yaml
21
- # Activate the cosmos-transfer1 conda environment.
22
- conda activate cosmos-transfer1
23
- # Install the dependencies.
24
- pip install -r requirements.txt
25
- # Install vllm
26
- pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
27
- export VLLM_ATTENTION_BACKEND=FLASHINFER
28
- pip install vllm==0.9.0
29
- # Install decord
30
- pip install decord==0.6.0
31
- # Patch Transformer engine linking issues in conda environments.
32
- ln -sf $CONDA_PREFIX/lib/python3.12/site-packages/nvidia/*/include/* $CONDA_PREFIX/include/
33
- ln -sf $CONDA_PREFIX/lib/python3.12/site-packages/nvidia/*/include/* $CONDA_PREFIX/include/python3.12
34
- # Install Transformer engine.
35
- pip install transformer-engine[pytorch]
36
- ```
37
-
38
- To test the environment setup for inference, run:
39
- ```bash
40
- PYTHONPATH=$(pwd) python scripts/test_environment.py
41
- ```
42
-
43
- ### Inference using docker
44
-
45
- If you prefer a containerized environment, you can build and run this repo's Dockerfile to get an environment with all the packages pre-installed. Because this environment does not use conda, there is no need to specify `CUDA_HOME=$CONDA_PREFIX` when invoking this repo's scripts.
46
-
47
- This requires Docker to be present on your system, with the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html) installed.
48
-
49
- ```bash
50
- docker build -f Dockerfile . -t nvcr.io/$USER/cosmos-transfer1:latest
51
- ```
52
-
53
- Note: If you encounter permission issues while mounting local files inside the Docker container, you can make the folders in your current directory writable by all users (including Docker) using this helpful alias
54
- ```bash
55
- alias share='sudo chown -R ${USER}:users $PWD && sudo chmod g+w $PWD'
56
- ```
57
- before running the container.
58
-
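- Once the image is built, you can start a container with GPU access. The following is a sketch: the tag matches the build command above, but the mount point and flags are assumptions that depend on your setup.
- ```bash
- # Run the image with all GPUs visible and the current repo mounted at /workspace
- docker run --gpus all -it --rm -v $(pwd):/workspace nvcr.io/$USER/cosmos-transfer1:latest
- ```
- 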
59
- ### Training
60
-
61
- The below commands create the `cosmos-transfer1` conda environment and install the dependencies for training. The setup is the same as for inference, plus Apex for full bfloat16 training.
62
- ```bash
63
- # Create the cosmos-transfer1 conda environment.
64
- conda env create --file cosmos-transfer1.yaml
65
- # Activate the cosmos-transfer1 conda environment.
66
- conda activate cosmos-transfer1
67
- # Install the dependencies.
68
- pip install -r requirements.txt
69
- # Install vllm
70
- pip install https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl
71
- export VLLM_ATTENTION_BACKEND=FLASHINFER
72
- pip install vllm==0.9.0
73
- # Install decord
74
- pip install decord==0.6.0
75
- # Patch Transformer engine linking issues in conda environments.
76
- ln -sf $CONDA_PREFIX/lib/python3.12/site-packages/nvidia/*/include/* $CONDA_PREFIX/include/
77
- ln -sf $CONDA_PREFIX/lib/python3.12/site-packages/nvidia/*/include/* $CONDA_PREFIX/include/python3.12
78
- # Install Transformer engine.
79
- pip install transformer-engine[pytorch]
80
- # Install Apex for full training with bfloat16.
81
- git clone https://github.com/NVIDIA/apex
82
- pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./apex
83
- ```
84
-
85
- You can test the environment setup for post-training with
86
- ```bash
87
- PYTHONPATH=$(pwd) python scripts/test_environment.py --training
88
- ```
 
cosmos-transfer1/LICENSE DELETED
@@ -1,201 +0,0 @@
1
- Apache License
2
- Version 2.0, January 2004
3
- http://www.apache.org/licenses/
4
-
5
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
-
7
- 1. Definitions.
8
-
9
- "License" shall mean the terms and conditions for use, reproduction,
10
- and distribution as defined by Sections 1 through 9 of this document.
11
-
12
- "Licensor" shall mean the copyright owner or entity authorized by
13
- the copyright owner that is granting the License.
14
-
15
- "Legal Entity" shall mean the union of the acting entity and all
16
- other entities that control, are controlled by, or are under common
17
- control with that entity. For the purposes of this definition,
18
- "control" means (i) the power, direct or indirect, to cause the
19
- direction or management of such entity, whether by contract or
20
- otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
- outstanding shares, or (iii) beneficial ownership of such entity.
22
-
23
- "You" (or "Your") shall mean an individual or Legal Entity
24
- exercising permissions granted by this License.
25
-
26
- "Source" form shall mean the preferred form for making modifications,
27
- including but not limited to software source code, documentation
28
- source, and configuration files.
29
-
30
- "Object" form shall mean any form resulting from mechanical
31
- transformation or translation of a Source form, including but
32
- not limited to compiled object code, generated documentation,
33
- and conversions to other media types.
34
-
35
- "Work" shall mean the work of authorship, whether in Source or
36
- Object form, made available under the License, as indicated by a
37
- copyright notice that is included in or attached to the work
38
- (an example is provided in the Appendix below).
39
-
40
- "Derivative Works" shall mean any work, whether in Source or Object
41
- form, that is based on (or derived from) the Work and for which the
42
- editorial revisions, annotations, elaborations, or other modifications
43
- represent, as a whole, an original work of authorship. For the purposes
44
- of this License, Derivative Works shall not include works that remain
45
- separable from, or merely link (or bind by name) to the interfaces of,
46
- the Work and Derivative Works thereof.
47
-
48
- "Contribution" shall mean any work of authorship, including
49
- the original version of the Work and any modifications or additions
50
- to that Work or Derivative Works thereof, that is intentionally
51
- submitted to Licensor for inclusion in the Work by the copyright owner
52
- or by an individual or Legal Entity authorized to submit on behalf of
53
- the copyright owner. For the purposes of this definition, "submitted"
54
- means any form of electronic, verbal, or written communication sent
55
- to the Licensor or its representatives, including but not limited to
56
- communication on electronic mailing lists, source code control systems,
57
- and issue tracking systems that are managed by, or on behalf of, the
58
- Licensor for the purpose of discussing and improving the Work, but
59
- excluding communication that is conspicuously marked or otherwise
60
- designated in writing by the copyright owner as "Not a Contribution."
61
-
62
- "Contributor" shall mean Licensor and any individual or Legal Entity
63
- on behalf of whom a Contribution has been received by Licensor and
64
- subsequently incorporated within the Work.
65
-
66
- 2. Grant of Copyright License. Subject to the terms and conditions of
67
- this License, each Contributor hereby grants to You a perpetual,
68
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
- copyright license to reproduce, prepare Derivative Works of,
70
- publicly display, publicly perform, sublicense, and distribute the
71
- Work and such Derivative Works in Source or Object form.
72
-
73
- 3. Grant of Patent License. Subject to the terms and conditions of
74
- this License, each Contributor hereby grants to You a perpetual,
75
- worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
- (except as stated in this section) patent license to make, have made,
77
- use, offer to sell, sell, import, and otherwise transfer the Work,
78
- where such license applies only to those patent claims licensable
79
- by such Contributor that are necessarily infringed by their
80
- Contribution(s) alone or by combination of their Contribution(s)
81
- with the Work to which such Contribution(s) was submitted. If You
82
- institute patent litigation against any entity (including a
83
- cross-claim or counterclaim in a lawsuit) alleging that the Work
84
- or a Contribution incorporated within the Work constitutes direct
85
- or contributory patent infringement, then any patent licenses
86
- granted to You under this License for that Work shall terminate
87
- as of the date such litigation is filed.
88
-
89
- 4. Redistribution. You may reproduce and distribute copies of the
90
- Work or Derivative Works thereof in any medium, with or without
91
- modifications, and in Source or Object form, provided that You
92
- meet the following conditions:
93
-
94
- (a) You must give any other recipients of the Work or
95
- Derivative Works a copy of this License; and
96
-
97
- (b) You must cause any modified files to carry prominent notices
98
- stating that You changed the files; and
99
-
100
- (c) You must retain, in the Source form of any Derivative Works
101
- that You distribute, all copyright, patent, trademark, and
102
- attribution notices from the Source form of the Work,
103
- excluding those notices that do not pertain to any part of
104
- the Derivative Works; and
105
-
106
- (d) If the Work includes a "NOTICE" text file as part of its
107
- distribution, then any Derivative Works that You distribute must
108
- include a readable copy of the attribution notices contained
109
- within such NOTICE file, excluding those notices that do not
110
- pertain to any part of the Derivative Works, in at least one
111
- of the following places: within a NOTICE text file distributed
112
- as part of the Derivative Works; within the Source form or
113
- documentation, if provided along with the Derivative Works; or,
114
- within a display generated by the Derivative Works, if and
115
- wherever such third-party notices normally appear. The contents
116
- of the NOTICE file are for informational purposes only and
117
- do not modify the License. You may add Your own attribution
118
- notices within Derivative Works that You distribute, alongside
119
- or as an addendum to the NOTICE text from the Work, provided
120
- that such additional attribution notices cannot be construed
121
- as modifying the License.
122
-
123
- You may add Your own copyright statement to Your modifications and
124
- may provide additional or different license terms and conditions
125
- for use, reproduction, or distribution of Your modifications, or
126
- for any such Derivative Works as a whole, provided Your use,
127
- reproduction, and distribution of the Work otherwise complies with
128
- the conditions stated in this License.
129
-
130
- 5. Submission of Contributions. Unless You explicitly state otherwise,
131
- any Contribution intentionally submitted for inclusion in the Work
132
- by You to the Licensor shall be under the terms and conditions of
133
- this License, without any additional terms or conditions.
134
- Notwithstanding the above, nothing herein shall supersede or modify
135
- the terms of any separate license agreement you may have executed
136
- with Licensor regarding such Contributions.
137
-
138
- 6. Trademarks. This License does not grant permission to use the trade
139
- names, trademarks, service marks, or product names of the Licensor,
140
- except as required for reasonable and customary use in describing the
141
- origin of the Work and reproducing the content of the NOTICE file.
142
-
143
- 7. Disclaimer of Warranty. Unless required by applicable law or
144
- agreed to in writing, Licensor provides the Work (and each
145
- Contributor provides its Contributions) on an "AS IS" BASIS,
146
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
- implied, including, without limitation, any warranties or conditions
148
- of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
- PARTICULAR PURPOSE. You are solely responsible for determining the
150
- appropriateness of using or redistributing the Work and assume any
151
- risks associated with Your exercise of permissions under this License.
152
-
153
- 8. Limitation of Liability. In no event and under no legal theory,
154
- whether in tort (including negligence), contract, or otherwise,
155
- unless required by applicable law (such as deliberate and grossly
156
- negligent acts) or agreed to in writing, shall any Contributor be
157
- liable to You for damages, including any direct, indirect, special,
158
- incidental, or consequential damages of any character arising as a
159
- result of this License or out of the use or inability to use the
160
- Work (including but not limited to damages for loss of goodwill,
161
- work stoppage, computer failure or malfunction, or any and all
162
- other commercial damages or losses), even if such Contributor
163
- has been advised of the possibility of such damages.
164
-
165
- 9. Accepting Warranty or Additional Liability. While redistributing
166
- the Work or Derivative Works thereof, You may choose to offer,
167
- and charge a fee for, acceptance of support, warranty, indemnity,
168
- or other liability obligations and/or rights consistent with this
169
- License. However, in accepting such obligations, You may act only
170
- on Your own behalf and on Your sole responsibility, not on behalf
171
- of any other Contributor, and only if You agree to indemnify,
172
- defend, and hold each Contributor harmless for any liability
173
- incurred by, or claims asserted against, such Contributor by reason
174
- of your accepting any such warranty or additional liability.
175
-
176
- END OF TERMS AND CONDITIONS
177
-
178
- APPENDIX: How to apply the Apache License to your work.
179
-
180
- To apply the Apache License to your work, attach the following
181
- boilerplate notice, with the fields enclosed by brackets "[]"
182
- replaced with your own identifying information. (Don't include
183
- the brackets!) The text should be enclosed in the appropriate
184
- comment syntax for the file format. We also recommend that a
185
- file or class name and description of purpose be included on the
186
- same "printed page" as the copyright notice for easier
187
- identification within third-party archives.
188
-
189
- Copyright [yyyy] [name of copyright owner]
190
-
191
- Licensed under the Apache License, Version 2.0 (the "License");
192
- you may not use this file except in compliance with the License.
193
- You may obtain a copy of the License at
194
-
195
- http://www.apache.org/licenses/LICENSE-2.0
196
-
197
- Unless required by applicable law or agreed to in writing, software
198
- distributed under the License is distributed on an "AS IS" BASIS,
199
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
- See the License for the specific language governing permissions and
201
- limitations under the License.
 
cosmos-transfer1/README.md DELETED
@@ -1,102 +0,0 @@
1
- <p align="center">
2
- <img src="assets/nvidia-cosmos-header.png" alt="NVIDIA Cosmos Header">
3
- </p>
4
-
5
- ### [Product Website](https://www.nvidia.com/en-us/ai/cosmos/) | [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-transfer1-67c9d328196453be6e568d3e) | [Paper](https://arxiv.org/abs/2503.14492) | [Paper Website](https://research.nvidia.com/labs/dir/cosmos-transfer1/)
6
-
7
- Cosmos-Transfer1 is a key branch of Cosmos World Foundation Models (WFMs) specialized for multimodal controllable conditional world generation or world2world transfer. The three main branches of Cosmos WFMs are [cosmos-predict](https://github.com/nvidia-cosmos/cosmos-predict1), [cosmos-transfer](https://github.com/nvidia-cosmos/cosmos-transfer1), and [cosmos-reason](https://github.com/nvidia-cosmos/cosmos-reason1). We visualize the architecture of Cosmos-Transfer1 in the following figure.
8
-
9
- <p align="center">
10
- <img src="assets/transfer1_diagram.png" alt="Cosmos-Transfer1 Architecture Diagram">
11
- </p>
12
-
13
-
14
- Cosmos-Transfer1 includes the following:
15
- - **ControlNet-based single modality conditional world generation** where a user can generate visual simulation based on one of the following modalities: segmentation video, depth video, edge video, blur video, LiDAR video, or HDMap video. Cosmos-Transfer1 generates a video based on the single-modality conditional input, a user text prompt, and, optionally, an input RGB video frame prompt (which could be from the last video generation result when operating in the autoregressive setting). We will use Cosmos-Transfer1-7B [Modality] to refer to the model operating in this setting. For example, Cosmos-Transfer1-7B [Depth] refers to a depth ControlNet model.
16
- - **MultiControlNet-based multimodal conditional world generation** where a user can generate visual simulation based on any combination of segmentation video, depth video, edge video, and blur video (LiDAR video and HDMap video in the AV sample) with a spatiotemporal control map to control the strength of each modality across space and time. Cosmos-Transfer1 generates a video based on the multimodal conditional inputs, a user text prompt, and, optionally, an input RGB video frame prompt (which could be from the last video generation result when operating in the autoregressive setting). This is the preferred mode of Cosmos-Transfer1. We will refer to it as Cosmos-Transfer1-7B.
17
- - **4KUpscaler** for upscaling a 720p-resolution video to a 4K-resolution video.
18
- - **Post-training scripts** for helping Physical AI builders post-train pre-trained Cosmos-Transfer1 for their applications.
19
- - **Pre-training scripts** for helping Physical AI builders train their own Cosmos-Transfer1 models from scratch.
20
-
21
- ## News
22
- - [2025/05] **Cosmos AV Single2MultiView** is available! Now you can create dynamic, multi-view clips from just one video. Try it out and tell us what you think!
23
- - [Inference guide](examples/inference_cosmos_transfer1_7b_sample_av_single2multiview.md)
24
- - [Build your own or PyTorch post-training](examples/training_cosmos_transfer_7B_sample_AV.md)
25
-
26
- - [Hugging Face model](https://huggingface.co/nvidia/Cosmos-Transfer1-7B-Sample-AV-Single2MultiView)
27
-
28
- - [2025/04] [Post training](README.md#post-train-pre-trained-cosmos-transfer1-models) is available! Now you can customize Transfer1 models in your own way. Please try it out; we look forward to your feedback.
29
-
30
- ## Example Model Behavior
31
-
32
- [Cosmos-Transfer LiDAR + HDMap Conditional Inputs -> World](https://github.com/nvidia-cosmos/cosmos-transfer1)
33
-
34
- <video src="https://github.com/user-attachments/assets/169cf5c5-de59-44db-b1bf-19fb57cb7e2e">
35
- Your browser does not support the video tag.
36
- </video>
37
-
38
- [Cosmos-Transfer Multimodal Conditional Inputs -> World](https://github.com/nvidia-cosmos/cosmos-transfer1)
39
-
40
- <video src="https://github.com/user-attachments/assets/4c1da01f-c3fd-4b6c-b084-f5ef653abb80">
41
- Your browser does not support the video tag.
42
- </video>
43
-
44
- ## Getting Started
45
-
46
- We provide a comprehensive set of examples to illustrate how to perform inference, post-training, etc., with Cosmos-Transfer1. Click a relevant example below and start your Cosmos journey.
47
-
48
- ### Installation
49
-
50
- Please refer to [INSTALL.md](INSTALL.md) for general instructions on environment setup.
51
-
52
- ### Inference with pre-trained Cosmos-Transfer1 models
53
-
54
- * [Inference with pre-trained Cosmos-Transfer1-7B](/examples/inference_cosmos_transfer1_7b.md) **[with multi-GPU support]**
55
- * [Inference with pre-trained Cosmos-Transfer1-7B-Sample-AV](/examples/inference_cosmos_transfer1_7b_sample_av.md) **[with multi-GPU support]**
56
- * [Inference with pre-trained Cosmos-Transfer1-7B-4KUpscaler](/examples/inference_cosmos_transfer1_7b_4kupscaler.md) **[with multi-GPU support]**
57
- * [Inference with pre-trained Cosmos-Transfer1-7B (Depth)](examples/inference_cosmos_transfer1_7b_depth.md)
58
- * [Inference with pre-trained Cosmos-Transfer1-7B (Segmentation)](examples/inference_cosmos_transfer1_7b_seg.md)
59
- * [Inference with pre-trained Cosmos-Transfer1-7B (Edge)](examples/inference_cosmos_transfer1_7b.md#example-1-single-control-edge)
60
- * [Inference with pre-trained Cosmos-Transfer1-7B (Vis)](examples/inference_cosmos_transfer1_7b_vis.md)
61
- * [Inference with pre-trained Cosmos-Transfer1pt1-7B [Keypoint]](/examples/inference_cosmos_transfer1pt1_7b_keypoint.md)
62
- * [Inference with pre-trained Cosmos-Transfer1-7B-Sample-AV-Multiview](/examples/inference_cosmos_transfer1_7b_sample_av_single2multiview.md)
63
-
64
- ### Post-train pre-trained Cosmos-Transfer1 models
65
-
66
- * [Post-train pre-trained Cosmos-Transfer1-7B [Depth | Edge | Keypoint | Segmentation | Vis]](examples/training_cosmos_transfer_7b.md) **[with multi-GPU support]**
67
- * [Post-train pre-trained Cosmos-Transfer1-7B-Sample-AV [LiDAR|HDMap]](examples/training_cosmos_transfer_7B_sample_AV.md) **[with multi-GPU support]**
68
- * [Post-train pre-trained Cosmos-Transfer1-7B-Sample-AV-Multiview[LiDAR|HDMap]](examples/training_cosmos_transfer_7B_sample_AV.md) **[with multi-GPU support]**
69
-
70
- ### Build your own Cosmos-Transfer1 models from scratch
71
-
72
- * [Pre-train Cosmos-Transfer1-7B [Depth | Edge | Keypoint | Segmentation | Vis]](examples/training_cosmos_transfer_7b.md) **[with multi-GPU support]**
73
- * [Pre-train Cosmos-Transfer1-7B-Sample-AV [LiDAR|HDMap]](examples/training_cosmos_transfer_7B_sample_AV.md) **[with multi-GPU support]**
74
- * [Pre-train Cosmos-Transfer1-7B-Sample-AV-Multiview[LiDAR|HDMap]](examples/training_cosmos_transfer_7B_sample_AV.md) **[with multi-GPU support]**
75
-
76
- ### Workflow
77
-
78
- * [Robotics Augmentation Workflow](/cosmos_transfer1/auxiliary/robot_augmentation/README.md): Scene augmentation for robotic manipulation, mapping one synthetic robotics example to multiple realistic examples
79
-
80
-
81
- <video src="https://github.com/user-attachments/assets/6dee15f5-9d8b-469a-a92a-3419cb466d44">
82
- Your browser does not support the video tag.
83
- </video>
84
-
85
- ## Cosmos-Transfer1 Models
86
-
87
- * [Cosmos-Transfer1-7B](https://huggingface.co/nvidia/Cosmos-Transfer1-7B): multimodal controllable conditional world generation with adaptive spatiotemporal control map. The supported modalities include segmentation, depth, canny edge, and blur visual.
88
- * [Cosmos-Transfer1-7B [Depth | Edge | Keypoint | Segmentation | Vis]](https://huggingface.co/nvidia/Cosmos-Transfer1-7B): single modality controllable conditional world generation. This is Cosmos-Transfer1-7B operating in the single-modality case, where it reduces to a ControlNet.
89
- * [Cosmos-Transfer1-7B-Sample-AV](https://huggingface.co/nvidia/Cosmos-Transfer1-7B-Sample-AV): multimodal controllable conditional world generation with adaptive spatiotemporal control map specialized for autonomous vehicle applications. The supported modalities include LiDAR and HDMap.
90
- * [Cosmos-Transfer1-7B [LiDAR | HDMap]](https://huggingface.co/nvidia/Cosmos-Transfer1-7B-Sample-AV): single modality controllable conditional world generation for autonomous vehicle applications. This is Cosmos-Transfer1-7B-Sample-AV operating in the single-modality case, where it reduces to a ControlNet.
91
- * [Cosmos-Transfer1-7B-4KUpscaler](https://huggingface.co/nvidia/Cosmos-Transfer1-7B-4KUpscaler): a 4K upscaler that super-resolves 720p videos to 4K.
92
-
93
-
94
- ## License and Contact
95
-
96
- This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.
97
-
98
- This model includes safety and content moderation features powered by Llama Guard 3. Llama Guard 3 is used solely as a content input filter and is subject to its own license.
99
-
100
- NVIDIA Cosmos source code is released under the [Apache 2 License](https://www.apache.org/licenses/LICENSE-2.0).
101
-
102
- NVIDIA Cosmos models are released under the [NVIDIA Open Model License](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). For a custom license, please contact [[email protected]](mailto:[email protected]).
 
cosmos-transfer1/checkpoints/README.md DELETED
@@ -1,3 +0,0 @@
1
- # Checkpoint directory
2
-
3
- Follow our instructions for downloading checkpoints in [Cosmos Diffusion Inference](../cosmos_transfer1/diffusion/README.md#download-checkpoints). Cosmos checkpoints will be downloaded to this directory.
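- 
- One way to fetch a model into this directory is with the Hugging Face CLI. This is a sketch: the repo id is one of the models listed in the main README, but the local layout shown here is an assumption and the official download instructions may differ.
- 
- ```bash
- # Download the Cosmos-Transfer1-7B weights into checkpoints/ (illustrative layout)
- huggingface-cli download nvidia/Cosmos-Transfer1-7B --local-dir checkpoints/nvidia/Cosmos-Transfer1-7B
- ```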
 
cosmos-transfer1/cosmos-transfer1.yaml DELETED
@@ -1,30 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- name: cosmos-transfer1
17
- channels:
18
- - conda-forge
19
- dependencies:
20
- - python=3.12
21
- - pip=25.0
22
- - cmake
23
- - ninja
24
- - libgl
25
- - ffmpeg
26
- - gcc=12.4.0
27
- - gxx=12.4.0
28
- - cuda=12.4
29
- - cuda-nvcc=12.4
30
- - cuda-toolkit=12.4
 
cosmos-transfer1/cosmos_transfer1/auxiliary/depth_anything/inference/__init__.py DELETED
File without changes
cosmos-transfer1/cosmos_transfer1/auxiliary/depth_anything/inference/depth_anything_pipeline.py DELETED
@@ -1,55 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import argparse
17
-
18
- from PIL import Image
19
-
20
- from cosmos_transfer1.auxiliary.depth_anything.model.depth_anything import DepthAnythingModel
21
-
22
-
23
- def parse_args():
24
- parser = argparse.ArgumentParser(description="Depth Estimation using Depth Anything V2")
25
- parser.add_argument("--input", type=str, required=True, help="Path to input image or video file")
26
- parser.add_argument("--output", type=str, required=True, help="Path to save the output image or video")
27
- parser.add_argument(
28
- "--mode",
29
- type=str,
30
- choices=["image", "video"],
31
- default="image",
32
- help="Processing mode: 'image' for a single image, 'video' for a video file",
33
- )
34
- return parser.parse_args()
35
-
36
-
37
- def main():
38
- args = parse_args()
39
- model = DepthAnythingModel()
40
-
41
- if args.mode == "image":
42
- # Load the input image and predict its depth
43
- image = Image.open(args.input).convert("RGB")
44
- depth_image = model.predict_depth(image)
45
- depth_image.save(args.output)
46
- print(f"Depth image saved to {args.output}")
47
- elif args.mode == "video":
48
- # Process the video and save the output
49
- out_path = model(args.input, args.output)  # DepthAnythingModel.__call__ handles video input
50
- if out_path:
51
- print(f"Depth video saved to {out_path}")
52
-
53
-
54
- if __name__ == "__main__":
55
- main()
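- 
- 
- # Example CLI usage (a sketch; the input/output paths are placeholders):
- #
- #   PYTHONPATH=$(pwd) python cosmos_transfer1/auxiliary/depth_anything/inference/depth_anything_pipeline.py \
- #       --input input.jpg --output outputs/depth.png --mode image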
 
cosmos-transfer1/cosmos_transfer1/auxiliary/depth_anything/model/__init__.py DELETED
File without changes
cosmos-transfer1/cosmos_transfer1/auxiliary/depth_anything/model/depth_anything.py DELETED
@@ -1,151 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import os
17
-
18
- import cv2
19
- import imageio
20
- import numpy as np
21
- import torch
22
- from PIL import Image
23
- from transformers import AutoImageProcessor, AutoModelForDepthEstimation
24
-
25
- from cosmos_transfer1.checkpoints import DEPTH_ANYTHING_MODEL_CHECKPOINT
26
- from cosmos_transfer1.utils import log
27
-
28
-
29
- class DepthAnythingModel:
30
- def __init__(self):
31
- """
32
- Initialize the Depth Anything model and its image processor.
33
- """
34
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
35
- # Load image processor and model with half precision
36
- print(f"Loading Depth Anything model - {DEPTH_ANYTHING_MODEL_CHECKPOINT}...")
37
- self.image_processor = AutoImageProcessor.from_pretrained(
38
- DEPTH_ANYTHING_MODEL_CHECKPOINT,
39
- torch_dtype=torch.float16,
40
- trust_remote_code=True,
41
- )
42
- self.model = AutoModelForDepthEstimation.from_pretrained(
43
- DEPTH_ANYTHING_MODEL_CHECKPOINT,
44
- torch_dtype=torch.float16,
45
- trust_remote_code=True,
46
- ).to(self.device)
47
-
48
- def predict_depth(self, image: Image.Image) -> Image.Image:
49
- """
50
- Process a single PIL image and return a depth map as a uint16 PIL Image.
51
- """
52
- # Prepare inputs for the model
53
- inputs = self.image_processor(images=image, return_tensors="pt")
54
- # Move all tensors to the proper device with half precision
55
- inputs = {k: v.to(self.device, dtype=torch.float16) for k, v in inputs.items()}
56
-
57
- with torch.no_grad():
58
- outputs = self.model(**inputs)
59
- predicted_depth = outputs.predicted_depth
60
-
61
- # Interpolate the predicted depth to the original image size
62
- prediction = torch.nn.functional.interpolate(
63
- predicted_depth.unsqueeze(1),
64
- size=image.size[::-1], # PIL image size is (width, height), interpolate expects (height, width)
65
- mode="bicubic",
66
- align_corners=False,
67
- )
68
-
69
- # Convert the output tensor to a numpy array and save as a depth image
70
- output = prediction.squeeze().cpu().numpy()
71
- depth_image = DepthAnythingModel.save_depth(output)
72
- return depth_image
73
-
74
- def __call__(self, input_video: str, output_video: str = "depth.mp4") -> str:
75
- """
76
- Process a video file frame-by-frame to produce a depth-estimated video.
77
- The output video is saved as an MP4 file.
78
- """
79
-
80
- log.info(f"Processing video: {input_video} to generate depth video: {output_video}")
81
- assert os.path.exists(input_video)
82
-
83
- cap = cv2.VideoCapture(input_video)
84
- if not cap.isOpened():
85
- print("Error: Cannot open video file.")
86
- return
87
-
88
- # Retrieve video properties
89
- frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
90
- frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
91
- fps = cap.get(cv2.CAP_PROP_FPS)
92
-
93
- depths = []
94
- while True:
95
- ret, frame = cap.read()
96
- if not ret:
97
- break
98
-
99
- # Convert frame from BGR to RGB and then to PIL Image
100
- image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
101
- inputs = self.image_processor(images=image, return_tensors="pt")
102
- inputs = {k: v.to(self.device, dtype=torch.float16) for k, v in inputs.items()}
103
-
104
- with torch.no_grad():
105
- outputs = self.model(**inputs)
106
- predicted_depth = outputs.predicted_depth
107
-
108
- # For video processing, take the first output and interpolate to original size
109
- prediction = torch.nn.functional.interpolate(
110
- predicted_depth[0].unsqueeze(0).unsqueeze(0),
111
- size=(frame_height, frame_width),
112
- mode="bicubic",
113
- align_corners=False,
114
- )
115
- depth = prediction.squeeze().cpu().numpy()
116
- depths += [depth]
117
- cap.release()
118
-
119
- depths = np.stack(depths)
120
- depths_normed = (depths - depths.min()) / (depths.max() - depths.min() + 1e-8) * 255.0
121
- depths_normed = depths_normed.astype(np.uint8)
122
-
123
- os.makedirs(os.path.dirname(output_video), exist_ok=True)
124
- self.write_video(depths_normed, output_video, fps=fps)
125
- return output_video
126
-
127
- @staticmethod
128
- def save_depth(output: np.ndarray) -> Image.Image:
129
- """
130
- Convert the raw depth output (float values) into a uint16 PIL Image.
131
- """
132
- depth_min = output.min()
133
- depth_max = output.max()
134
- max_val = (2**16) - 1 # Maximum value for uint16
135
-
136
- if depth_max - depth_min > np.finfo("float").eps:
137
- out_array = max_val * (output - depth_min) / (depth_max - depth_min)
138
- else:
139
- out_array = np.zeros_like(output)
140
-
141
- formatted = out_array.astype("uint16")
142
- depth_image = Image.fromarray(formatted, mode="I;16")
143
- return depth_image
144
-
145
- @staticmethod
146
- def write_video(frames, output_path, fps=30):
147
- with imageio.get_writer(output_path, fps=fps, macro_block_size=8) as writer:
148
- for frame in frames:
149
- if len(frame.shape) == 2: # single channel
150
- frame = frame[:, :, None].repeat(3, axis=2)
151
- writer.append_data(frame)
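- 
- 
- # Minimal usage sketch (paths are placeholders):
- #
- #   model = DepthAnythingModel()
- #   model("input.mp4", "outputs/depth.mp4")  # video -> normalized depth video via __call__
- #   depth = model.predict_depth(Image.open("frame.png").convert("RGB"))  # image -> uint16 depth map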
 
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/README.md DELETED
@@ -1,17 +0,0 @@
1
- # Cosmos Guardrail
2
-
3
- This page outlines a set of tools to ensure content safety in Cosmos. For implementation details, please consult the [Cosmos paper](https://research.nvidia.com/publication/2025-01_cosmos-world-foundation-model-platform-physical-ai).
4
-
5
- ## Overview
6
-
7
- Our guardrail system consists of two stages: pre-Guard and post-Guard.
8
-
9
- Cosmos pre-Guard models are applied to text input, including input prompts and upsampled prompts.
10
-
11
- * Blocklist: a keyword list checker for detecting harmful keywords
12
- * Llama Guard 3: an LLM-based approach for blocking harmful prompts
13
-
14
- Cosmos post-Guard models are applied to video frames generated by Cosmos models.
15
-
16
- * Video Content Safety Filter: a classifier trained to distinguish between safe and unsafe video frames
17
- * Face Blur Filter: a face detection and blurring module
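- 
- A minimal sketch of how a pre-Guard text check is composed (the `Aegis` and `GuardrailRunner` classes are defined in the modules below; the checkpoint path is a placeholder):
- 
- ```python
- from cosmos_transfer1.auxiliary.guardrail.aegis.aegis import Aegis
- from cosmos_transfer1.auxiliary.guardrail.common.core import GuardrailRunner
- 
- # Compose one or more safety models into a runner, then check a text prompt.
- aegis = Aegis(checkpoint_dir="checkpoints")
- runner = GuardrailRunner(safety_models=[aegis])
- is_safe, message = runner.run_safety_check("A robot arm stacks boxes in a warehouse")
- ```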
 
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/__init__.py DELETED
@@ -1,14 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
 
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/aegis/__init__.py DELETED
@@ -1,14 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
 
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/aegis/aegis.py DELETED
@@ -1,135 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import argparse
17
- import os
18
-
19
- import torch
20
- from peft import PeftModel
21
- from transformers import AutoModelForCausalLM, AutoTokenizer
22
-
23
- from cosmos_transfer1.auxiliary.guardrail.aegis.categories import UNSAFE_CATEGORIES
24
- from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner
25
- from cosmos_transfer1.checkpoints import AEGIS_MODEL_CHECKPOINT, LLAMA_GUARD_MODEL_CHECKPOINT
26
- from cosmos_transfer1.utils import log, misc
27
-
28
- SAFE = misc.Color.green("SAFE")
29
- UNSAFE = misc.Color.red("UNSAFE")
30
-
31
-
32
- class Aegis(ContentSafetyGuardrail):
33
- def __init__(
34
- self,
35
- checkpoint_dir: str,
36
- device="cuda" if torch.cuda.is_available() else "cpu",
37
- ) -> None:
38
- self.checkpoint_dir = checkpoint_dir
39
- self.device = device
40
- self.dtype = torch.bfloat16
41
- base_model_id = "meta-llama/LlamaGuard-7b"
42
- aegis_adapter = "nvidia/Aegis-AI-Content-Safety-LlamaGuard-Defensive-1.0"
43
-
44
- base_model_dir = os.path.join(self.checkpoint_dir, base_model_id)
45
- aegis_adapter_dir = os.path.join(self.checkpoint_dir, aegis_adapter)
46
-
47
- base_model = AutoModelForCausalLM.from_pretrained(base_model_id, cache_dir=base_model_dir)
48
- self.tokenizer = AutoTokenizer.from_pretrained(base_model_id, cache_dir=base_model_dir)
49
- self.model = PeftModel.from_pretrained(base_model, aegis_adapter, cache_dir=aegis_adapter_dir)
50
-
51
- self.model.to(self.device, dtype=self.dtype).eval()
52
-
53
- def get_moderation_prompt(self, user_prompt: str) -> str:
54
- """Create the moderation prompt for the Aegis model."""
55
- full_prompt = f"""[INST] Task: Check if there is unsafe content in 'User' messages in conversations according our safety policy with the below categories.
56
-
57
- <BEGIN UNSAFE CONTENT CATEGORIES>
58
- {"".join([f"{category['instruction']}" for category in UNSAFE_CATEGORIES])}
59
- <END UNSAFE CONTENT CATEGORIES>
60
-
61
- <BEGIN CONVERSATION>
62
-
63
- """
64
- full_prompt += f"User: {user_prompt}\n\n"
65
- full_prompt += """<END CONVERSATION>
66
-
67
- Provide your safety assessment for User in the above conversation:
68
- - First line must read 'safe' or 'unsafe'.
69
- - If unsafe, a second line must include a comma-separated list of violated categories. [/INST]"""
70
-
71
- return full_prompt
72
-
73
- def get_aegis_block_message(self, moderation_output: str) -> str:
74
- """Extract the blocked category and reason from the Aegis model output."""
75
- block_msg = "Prompt blocked by Aegis."
76
- try:
77
- # Extract blocked category and reason
78
- if len(moderation_output.split("\n")) >= 2:
79
- moderation_output_list = moderation_output.split("\n")
80
- blocked_category = int(moderation_output_list[1].lstrip("O")) - 1
81
- if 0 <= blocked_category < len(UNSAFE_CATEGORIES):
82
- blocked_reason = UNSAFE_CATEGORIES[blocked_category]["blocked_reason"]
83
- blocked_category_name = UNSAFE_CATEGORIES[blocked_category]["category"]
84
- block_msg = f"{blocked_category_name}: {blocked_reason}"
85
- except Exception as e:
86
- log.warning(f"Unable to extract blocked category and reason from Aegis output: {e}")
87
- return block_msg
88
-
89
- def filter_aegis_output(self, prompt: str) -> tuple[bool, str]:
90
- """Filter the Aegis model output and return the safety status and message."""
91
- full_prompt = self.get_moderation_prompt(prompt)
92
- inputs = self.tokenizer([full_prompt], add_special_tokens=False, return_tensors="pt").to(self.device)
93
- output = self.model.generate(**inputs, max_new_tokens=100, pad_token_id=self.tokenizer.eos_token_id)
94
- prompt_len = inputs["input_ids"].shape[-1]
95
- moderation_output = self.tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
96
-
97
- if "unsafe" in moderation_output.lower():
98
- block_msg = self.get_aegis_block_message(moderation_output)
99
- return False, block_msg
100
- else:
101
- return True, ""
102
-
103
- def is_safe(self, prompt: str) -> tuple[bool, str]:
104
- """Check if the input prompt is safe according to the Aegis model."""
105
- try:
106
- return self.filter_aegis_output(prompt)
107
- except Exception as e:
108
- log.error(f"Unexpected error occurred when running Aegis guardrail: {e}")
109
- return True, "Unexpected error occurred when running Aegis guardrail."
110
-
111
-
112
- def parse_args():
113
- parser = argparse.ArgumentParser()
114
- parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
115
- parser.add_argument(
116
- "--checkpoint_dir",
117
- type=str,
118
- help="Path to the Aegis checkpoint folder",
119
- default="checkpoints",
120
- )
121
- return parser.parse_args()
122
-
123
-
124
- def main(args):
125
- aegis = Aegis(checkpoint_dir=args.checkpoint_dir)
126
- runner = GuardrailRunner(safety_models=[aegis])
127
- with misc.timer("aegis safety check"):
128
- safety, message = runner.run_safety_check(args.prompt)
129
- log.info(f"Input is: {'SAFE' if safety else 'UNSAFE'}")
130
- log.info(f"Message: {message}") if not safety else None
131
-
132
-
133
- if __name__ == "__main__":
134
- args = parse_args()
135
- main(args)
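- 
- 
- # Example CLI usage (a sketch; assumes the Llama Guard base model and Aegis adapter
- # are already available under checkpoints/):
- #
- #   PYTHONPATH=$(pwd) python cosmos_transfer1/auxiliary/guardrail/aegis/aegis.py \
- #       --prompt "A robot arm stacks boxes in a warehouse" --checkpoint_dir checkpoints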
 
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/aegis/categories.py DELETED
@@ -1,192 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- VIOLENCE = {
17
- "category": "violence",
18
- "blocked_reason": """This prompt is blocked due to the presence of violent content.""",
19
- "instruction": """O1: Violence.
20
- Should not
21
- - Help people plan or engage in violence.
22
- - Encourage people to engage in violence.
23
- - Contain unpleasant vivid violent imagery or violent description that arouses discomfort in the reader.
24
- Can
25
- - Provide information on violence.
26
- - Discuss the topics of violence, and can discuss historical events involving violence.
27
- """,
28
- }
29
-
30
- SEXUAL = {
31
- "category": "sexual",
32
- "blocked_reason": """This prompt is blocked due to the presence of sexual content.""",
33
- "instruction": """O2: Sexual.
34
- Should not
35
- - Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
36
- - Engage or encourage sexual activity that is considered illegal.
37
- Can
38
- - Discuss topics related to sexuality and sexual education.
39
- - Provide general information regarding sexual activities.
40
- - Provide guidance on topics related to sexual health.
41
- """,
42
- }
43
-
44
- CRIMINAL_PLANNING = {
45
- "category": "criminal_planning",
46
- "blocked_reason": """This prompt is blocked due to the presence of criminal planning/confessions content.""",
47
- "instruction": """O3: Criminal Planning/Confessions.
48
- Should not
49
- - Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
50
- - Encourage people to engage in criminal activities or condone criminal behavior.
51
- - Have a confession of crime that was committed or will be committed in future.
52
- Can
53
- - Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.
54
- - Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.
55
- """,
56
- }
57
-
58
- WEAPONS = {
59
- "category": "weapons",
60
- "blocked_reason": """This prompt is blocked due to the presence of weapons content.""",
61
- "instruction": """O4: Guns and Illegal Weapons.
62
- Should not
63
- - Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
64
- - Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
65
- - Assist or encourage people to execute or own other illegal weapons for example biological agents, or chemical weapons, including instructions on how to create or use them.
66
- Can
67
- - Discuss firearms and the arguments for and against firearm ownership.
68
- """,
69
- }
70
-
71
- SUBSTANCE_ABUSE = {
72
- "category": "substance_abuse",
73
- "blocked_reason": """This prompt is blocked due to the presence of substance abuse content.""",
74
- "instruction": """O5: Controlled/Regulated Substances.
75
- Should not
76
- - Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
77
- - Assist or encourage people to create such substances when it is illegal to do so.
78
- Can
79
- - Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are).
80
- - Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation.
81
- """,
82
- }
83
-
84
- SUICIDE = {
85
- "category": "suicide",
86
- "blocked_reason": "This prompt is blocked due to the presence of suicide and self harm content.",
87
- "instruction": """O6: Suicide and Self Harm.
88
- Should not
89
- - Encourage people to harm themselves.
90
- - Romanticize or condone self-harm or suicide.
91
- - Provide information regarding the methods of suicide or self-harm.
92
- - Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
93
- - Contain confessions or intent of self harm
94
- Can
95
- - Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions).
96
- Should
97
- - Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves.
98
- """,
99
- }
100
-
101
- CSAM = {
102
- "category": "csam",
103
- "blocked_reason": "This prompt contains material related to Child Sexual Abuse",
104
- "instruction": """O7: Sexual (minor).
105
- Should not
106
- - Assist, engage or encourage intention to engage in sexual activity with an underage/minor.
107
- - Discuss or plan sexual acts, sharing of media of sexual nature, personal information or intention to meet in person for sexual activity with a minor.
108
- - Should not include discussion where private information is shared with underage minors for purposes of acting on them and taking actions that are sexual in nature including meeting or sharing media.
109
- - Should not discuss child abuse or engage, encourage, assist in sexual grooming/trafficking of children.
110
- Can
111
- - Provide information about sex education for use in schools.
112
- """,
113
- }
114
-
115
- HATRED = {
116
- "category": "hatred",
117
- "blocked_reason": "This prompt is blocked due to the presence of hate speech content.",
118
- "instruction": """O8: Hate/Identity Hate.
119
- Should not
120
- - Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their
121
- race, color, ethnicity, religion, national origin, sexual orientation, gender, gender identity, or disability or caste.
122
- - Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
123
- - Should not discuss prejudice towards a group classified as protected according to the law of Germany, UK, USA or any other country.
124
- - Should not contain slurs.
125
- - Should not contain text that is distasteful, offensive, or hateful that causes others to feel grief, anger, or fear.
126
- Can
127
- - Provide information on discrimination.
128
- - Discuss the topics of discrimination, and can discuss historical events involving discrimination.
129
- - Discuss neutral statements referring to group identity and contextualized hate speech (that talks about hate but does not incite hate).
130
- """,
131
- }
132
-
133
- HARASSMENT = {
134
- "category": "harassment",
135
- "blocked_reason": "This prompt is blocked due to the presence of harassment content.",
136
- "instruction": """O10: Harassment.
137
- Should not
138
- - Have content that may be used to torment or annoy individuals in real life, or make harassment more likely to occur.
139
- - Engage in harassment, misconduct and unlawful discrimination
140
- - Single out an individual or marginalization and/or retaliation based on protected characteristics like Race, Color, Gender, Sex,
141
- Sexual orientation, Gender identity, and gender expression, National origin, Ethnicity, Disability (including being regarded as disabled),
142
- Religion, Age (40+), Pregnancy (including pregnancy, childbirth or related medical conditions), Genetic information, Military or veteran status, Citizenship status, Political activity, or affiliation,
143
- - Taking or requesting statutorily protected leave Body characteristics.
144
- - Promise to give a benefit, or a threat to retaliate or take an adverse action based on a response to harassment and misconduct.
145
- - Bullying.
146
- - Being derogatory and humiliating toward an individual but not necessarily protected characteristics under law.
147
- - Making rude or insulting comments, demeaning, and objectifying terms toward an individual.
148
- Can
149
- - Discuss negative impacts of harassment and bullying based on protected characteristics like race, color, gender, sex, nationality, within a contextualized setting.
150
- - Provide information on protected characteristics or refers, cites material that describes what protected characteristics are.
151
- - Provide information or refer or cite material that informs what harassment is.
152
- """,
153
- }
154
-
155
- THREAT = {
156
- "category": "threat",
157
- "blocked_reason": "This prompt is blocked due to the presence of threat content.",
158
- "instruction": """O11: Threat.
159
- Should not
160
- - Discuss a communication of intent to inflict harm or loss on another person.
161
- - Should not contain discussion of intention of intimidation which is a tactic used between conflicting parties to make the other timid or psychologically insecure for coercion or control.
162
- - Should not contain intention of knowingly putting another person in fear of bodily injury.
163
- - Should not assist or plan injury, physical or mental damage, act or instance of injury, or material or any kind of loss to a person.
164
- - Should not contain threats that are forbidden by law as intent to obtain a monetary advantage or to compel a person to act against their will.
165
- Can
166
- - Provide information about threat is and the meaning of threat.
167
- """,
168
- }
169
-
170
- PROFANITY = {
171
- "category": "profanity",
172
- "blocked_reason": "This prompt is blocked due to the presence of profanity content.",
173
- "instruction": """O12: Profanity.
174
- Should not
175
- - Contain swear words, curse words, or other obscene or profane language
176
- - This includes offensive words used without any intention to act on them or without any explicit targeted recipient
177
- """,
178
- }
179
-
180
- UNSAFE_CATEGORIES = [
181
- VIOLENCE,
182
- SEXUAL,
183
- CRIMINAL_PLANNING,
184
- WEAPONS,
185
- SUBSTANCE_ABUSE,
186
- SUICIDE,
187
- CSAM,
188
- HATRED,
189
- HARASSMENT,
190
- THREAT,
191
- PROFANITY,
192
- ]
 
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/blocklist/__init__.py DELETED
@@ -1,14 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/blocklist/blocklist.py DELETED
@@ -1,216 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import argparse
- import os
- import re
- import string
- from difflib import SequenceMatcher
-
- import nltk
- from better_profanity import profanity
-
- from cosmos_transfer1.auxiliary.guardrail.blocklist.utils import read_keyword_list_from_dir, to_ascii
- from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner
- from cosmos_transfer1.utils import log, misc
-
- CENSOR = misc.Color.red("*")
-
-
- class Blocklist(ContentSafetyGuardrail):
-     def __init__(
-         self,
-         checkpoint_dir: str,
-         guardrail_partial_match_min_chars: int = 6,
-         guardrail_partial_match_letter_count: float = 0.4,
-     ) -> None:
-         self.checkpoint_dir = os.path.join(checkpoint_dir, "nvidia/Cosmos-Guardrail1/blocklist")
-         nltk.data.path.append(os.path.join(self.checkpoint_dir, "nltk_data"))
-         self.lemmatizer = nltk.WordNetLemmatizer()
-         self.profanity = profanity
-         self.guardrail_partial_match_min_chars = guardrail_partial_match_min_chars
-         self.guardrail_partial_match_letter_count = guardrail_partial_match_letter_count
-
-         # Load blocklist and whitelist keywords
-         self.blocklist_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "custom"))
-         self.whitelist_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "whitelist"))
-         self.exact_match_words = read_keyword_list_from_dir(os.path.join(self.checkpoint_dir, "exact_match"))
-
-         self.profanity.load_censor_words(custom_words=self.blocklist_words, whitelist_words=self.whitelist_words)
-         log.debug(f"Loaded {len(self.blocklist_words)} words/phrases from blocklist")
-         log.debug(f"Whitelisted {len(self.whitelist_words)} words/phrases from whitelist")
-         log.debug(f"Loaded {len(self.exact_match_words)} exact match words/phrases from blocklist")
-
-     def uncensor_whitelist(self, input_prompt: str, censored_prompt: str) -> str:
-         """Explicitly uncensor words that are in the whitelist."""
-         input_words = input_prompt.split()
-         censored_words = censored_prompt.split()
-         whitelist_words = set(self.whitelist_words)
-         for i, token in enumerate(input_words):
-             if token.strip(string.punctuation).lower() in whitelist_words:
-                 censored_words[i] = token
-         censored_prompt = " ".join(censored_words)
-         return censored_prompt
-
-     def censor_prompt(self, input_prompt: str) -> tuple[bool, str]:
-         """Censor the prompt using the blocklist with better-profanity fuzzy matching.
-
-         Args:
-             input_prompt: input prompt to censor
-
-         Returns:
-             bool: True if the prompt is blocked, False otherwise
-             str: A message indicating why the prompt was blocked
-         """
-         censored_prompt = self.profanity.censor(input_prompt, censor_char=CENSOR)
-         # Uncensor whitelisted words that were censored by blocklist fuzzy matching
-         censored_prompt = self.uncensor_whitelist(input_prompt, censored_prompt)
-         if CENSOR in censored_prompt:
-             return True, f"Prompt blocked by censorship: Censored Prompt: {censored_prompt}"
-         return False, ""
-
-     @staticmethod
-     def check_partial_match(
-         normalized_prompt: str, normalized_word: str, guardrail_partial_match_letter_count: float
-     ) -> tuple[bool, str]:
-         """
-         Check robustly whether the normalized word and a same-length window of the prompt differ by at most
-         guardrail_partial_match_letter_count characters.
-
-         Args:
-             normalized_prompt: a string with many words
-             normalized_word: a string with one or multiple words; it is shorter than normalized_prompt
-             guardrail_partial_match_letter_count: maximum allowed difference in characters (float to allow partial characters)
-
-         Returns:
-             bool: True if a match is found, False otherwise
-             str: A message indicating why the prompt was blocked
-         """
-         prompt_words = normalized_prompt.split()
-         word_length = len(normalized_word.split())
-         max_similarity_ratio = (len(normalized_word) - float(guardrail_partial_match_letter_count)) / float(
-             len(normalized_word)
-         )
-
-         for i in range(len(prompt_words) - word_length + 1):
-             # Extract a substring from the prompt with the same number of words as the normalized_word
-             substring = " ".join(prompt_words[i : i + word_length])
-             similarity_ratio = SequenceMatcher(None, substring, normalized_word).ratio()
-             if similarity_ratio >= max_similarity_ratio:
-                 return (
-                     True,
-                     f"Prompt blocked by partial match blocklist: Prompt: {normalized_prompt}, Partial Match Word: {normalized_word}",
-                 )
-
-         return False, ""
-
-     @staticmethod
-     def check_against_whole_word_blocklist(
-         prompt: str,
-         blocklist: list[str],
-         guardrail_partial_match_min_chars: int = 6,
-         guardrail_partial_match_letter_count: float = 0.4,
-     ) -> tuple[bool, str]:
-         """
-         Check if the prompt contains any whole words from the blocklist.
-         The match is case insensitive and robust to multiple spaces between words.
-
-         Args:
-             prompt: input prompt to check
-             blocklist: list of words to check against
-             guardrail_partial_match_min_chars: minimum number of characters in a word to check for partial match
-             guardrail_partial_match_letter_count: maximum allowed difference in characters for partial match
-
-         Returns:
-             bool: True if a match is found, False otherwise
-             str: A message indicating why the prompt was blocked
-         """
-         # Normalize spaces and convert to lowercase
-         normalized_prompt = re.sub(r"\s+", " ", prompt).strip().lower()
-
-         for word in blocklist:
-             # Normalize spaces and convert to lowercase for each blocklist word
-             normalized_word = re.sub(r"\s+", " ", word).strip().lower()
-
-             # Use word boundaries to ensure whole word match
-             if re.search(r"\b" + re.escape(normalized_word) + r"\b", normalized_prompt):
-                 return True, f"Prompt blocked by exact match blocklist: Prompt: {prompt}, Exact Match Word: {word}"
-
-             # Check for partial match if the word is long enough
-             if len(normalized_word) >= guardrail_partial_match_min_chars:
-                 match, message = Blocklist.check_partial_match(
-                     normalized_prompt, normalized_word, guardrail_partial_match_letter_count
-                 )
-                 if match:
-                     return True, message
-
-         return False, ""
-
-     def is_safe(self, input_prompt: str = "") -> tuple[bool, str]:
-         """Check if the input prompt is safe using the blocklist."""
-         # Check if the input is empty
-         if not input_prompt:
-             return False, "Input is empty"
-         input_prompt = to_ascii(input_prompt)
-
-         # Check the full sentence for censored words
-         censored, message = self.censor_prompt(input_prompt)
-         if censored:
-             return False, message
-
-         # Check lemmatized words for censored words
-         tokens = nltk.word_tokenize(input_prompt)
-         lemmas = [self.lemmatizer.lemmatize(token) for token in tokens]
-         lemmatized_prompt = " ".join(lemmas)
-         censored, message = self.censor_prompt(lemmatized_prompt)
-         if censored:
-             return False, message
-
-         # Check for exact match blocklist words
-         censored, message = self.check_against_whole_word_blocklist(
-             input_prompt,
-             self.exact_match_words,
-             self.guardrail_partial_match_min_chars,
-             self.guardrail_partial_match_letter_count,
-         )
-         if censored:
-             return False, message
-
-         # If all these checks pass, the input is safe
-         return True, "Input is safe"
-
-
- def parse_args():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
-     parser.add_argument(
-         "--checkpoint_dir",
-         type=str,
-         help="Path to the Blocklist checkpoint folder",
-     )
-     return parser.parse_args()
-
-
- def main(args):
-     blocklist = Blocklist(checkpoint_dir=args.checkpoint_dir)
-     runner = GuardrailRunner(safety_models=[blocklist])
-     with misc.timer("blocklist safety check"):
-         safety, message = runner.run_safety_check(args.prompt)
-     log.info(f"Input is: {'SAFE' if safety else 'UNSAFE'}")
-     if not safety:
-         log.info(f"Message: {message}")
-
-
- if __name__ == "__main__":
-     args = parse_args()
-     main(args)
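To make the `check_partial_match` threshold concrete: a window of the prompt is blocked when its `SequenceMatcher` ratio is at least `(len(term) - letter_count) / len(term)`. A self-contained sketch with made-up terms (no package imports needed):

```python
from difflib import SequenceMatcher

term = "dangerous object"                  # illustrative blocklist entry
threshold = (len(term) - 0.4) / len(term)  # default budget of 0.4 characters

for candidate in ["dangerous object", "dangerus object"]:
    ratio = SequenceMatcher(None, candidate, term).ratio()
    print(f"{candidate!r}: ratio={ratio:.3f}, blocked={ratio >= threshold}")

# With the default 0.4-character budget, even a single dropped letter falls
# below the threshold, so the partial match is only slightly looser than exact.
```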
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/blocklist/utils.py DELETED
@@ -1,45 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- import re
-
- from cosmos_transfer1.utils import log
-
-
- def read_keyword_list_from_dir(folder_path: str) -> list[str]:
-     """Read keyword list from all files in a folder."""
-     output_list = []
-     file_list = []
-     # Get list of files in the folder
-     for file in os.listdir(folder_path):
-         if os.path.isfile(os.path.join(folder_path, file)):
-             file_list.append(file)
-
-     # Process each file
-     for file in file_list:
-         file_path = os.path.join(folder_path, file)
-         try:
-             with open(file_path, "r") as f:
-                 output_list.extend([line.strip() for line in f.readlines()])
-         except Exception as e:
-             log.error(f"Error reading file {file}: {str(e)}")
-
-     return output_list
-
-
- def to_ascii(prompt: str) -> str:
-     """Convert prompt to ASCII."""
-     return re.sub(r"[^\x00-\x7F]+", " ", prompt)
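Illustrative use of the two helpers; the keyword directory path follows the checkpoint layout assumed elsewhere in this package:

```python
from cosmos_transfer1.auxiliary.guardrail.blocklist.utils import read_keyword_list_from_dir, to_ascii

# Non-ASCII runs are collapsed to single spaces before matching.
print(to_ascii("café prompt ✓"))

# One keyword/phrase per line, aggregated across every file in the folder.
words = read_keyword_list_from_dir("checkpoints/nvidia/Cosmos-Guardrail1/blocklist/custom")
print(f"loaded {len(words)} keywords")
```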
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/common/__init__.py DELETED
File without changes
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/common/core.py DELETED
@@ -1,71 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- from typing import Any, Tuple
-
- import numpy as np
-
- from cosmos_transfer1.utils import log
-
-
- class ContentSafetyGuardrail:
-     def is_safe(self, **kwargs) -> Tuple[bool, str]:
-         raise NotImplementedError("Child classes must implement the is_safe method")
-
-
- class PostprocessingGuardrail:
-     def postprocess(self, frames: np.ndarray) -> np.ndarray:
-         raise NotImplementedError("Child classes must implement the postprocess method")
-
-
- class GuardrailRunner:
-     def __init__(
-         self,
-         safety_models: list[ContentSafetyGuardrail] | None = None,
-         generic_block_msg: str = "",
-         generic_safe_msg: str = "",
-         postprocessors: list[PostprocessingGuardrail] | None = None,
-     ):
-         self.safety_models = safety_models
-         self.generic_block_msg = generic_block_msg
-         self.generic_safe_msg = generic_safe_msg if generic_safe_msg else "Prompt is safe"
-         self.postprocessors = postprocessors
-
-     def run_safety_check(self, input: Any) -> Tuple[bool, str]:
-         """Run the safety check on the input."""
-         if not self.safety_models:
-             log.warning("No safety models found, returning safe")
-             return True, self.generic_safe_msg
-
-         for guardrail in self.safety_models:
-             guardrail_name = str(guardrail.__class__.__name__).upper()
-             log.debug(f"Running guardrail: {guardrail_name}")
-             safe, message = guardrail.is_safe(input)
-             if not safe:
-                 reasoning = self.generic_block_msg if self.generic_block_msg else f"{guardrail_name}: {message}"
-                 return False, reasoning
-         return True, self.generic_safe_msg
-
-     def postprocess(self, frames: np.ndarray) -> np.ndarray:
-         """Run the postprocessing on the video frames."""
-         if not self.postprocessors:
-             log.warning("No postprocessors found, returning original frames")
-             return frames
-
-         for guardrail in self.postprocessors:
-             guardrail_name = str(guardrail.__class__.__name__).upper()
-             log.debug(f"Running guardrail: {guardrail_name}")
-             frames = guardrail.postprocess(frames)
-         return frames
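A minimal sketch of extending these base classes: any object with a compatible `is_safe` can be chained in a `GuardrailRunner`. The length rule below is invented purely for illustration.

```python
from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner

class MaxLengthGuardrail(ContentSafetyGuardrail):
    """Toy check: block prompts longer than max_chars (illustrative only)."""

    def __init__(self, max_chars: int = 1024) -> None:
        self.max_chars = max_chars

    def is_safe(self, prompt: str) -> tuple[bool, str]:
        if len(prompt) > self.max_chars:
            return False, f"prompt exceeds {self.max_chars} characters"
        return True, ""

runner = GuardrailRunner(safety_models=[MaxLengthGuardrail()])
safe, message = runner.run_safety_check("a short prompt")
print(safe, message)  # True "Prompt is safe"
```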
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/common/io_utils.py DELETED
@@ -1,78 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import glob
- from dataclasses import dataclass
-
- import imageio
- import numpy as np
-
- from cosmos_transfer1.utils import log
-
-
- @dataclass
- class VideoData:
-     frames: np.ndarray  # Shape: [B, H, W, C]
-     fps: int
-     duration: int  # in seconds
-
-
- def get_video_filepaths(input_dir: str) -> list[str]:
-     """Get a list of filepaths for all videos in the input directory."""
-     paths = glob.glob(f"{input_dir}/**/*.mp4", recursive=True)
-     paths += glob.glob(f"{input_dir}/**/*.avi", recursive=True)
-     paths += glob.glob(f"{input_dir}/**/*.mov", recursive=True)
-     paths = sorted(paths)
-     log.debug(f"Found {len(paths)} videos")
-     return paths
-
-
- def read_video(filepath: str) -> VideoData:
-     """Read a video file and extract its frames and metadata."""
-     try:
-         reader = imageio.get_reader(filepath, "ffmpeg")
-     except Exception as e:
-         raise ValueError(f"Failed to read video file: {filepath}") from e
-
-     # Extract metadata from the video file
-     try:
-         metadata = reader.get_meta_data()
-         fps = metadata.get("fps")
-         duration = metadata.get("duration")
-     except Exception as e:
-         reader.close()
-         raise ValueError(f"Failed to extract metadata from video file: {filepath}") from e
-
-     # Extract frames from the video file
-     try:
-         frames = np.array([frame for frame in reader])
-     except Exception as e:
-         raise ValueError(f"Failed to extract frames from video file: {filepath}") from e
-     finally:
-         reader.close()
-
-     return VideoData(frames=frames, fps=fps, duration=duration)
-
-
- def save_video(filepath: str, frames: np.ndarray, fps: int) -> None:
-     """Save a video file from a sequence of frames."""
-     writer = None  # Guard against get_writer failing before assignment
-     try:
-         writer = imageio.get_writer(filepath, fps=fps, macro_block_size=1)
-         for frame in frames:
-             writer.append_data(frame)
-     except Exception as e:
-         raise ValueError(f"Failed to save video file to {filepath}") from e
-     finally:
-         if writer is not None:
-             writer.close()
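A round-trip sketch with these helpers (paths are illustrative):

```python
from cosmos_transfer1.auxiliary.guardrail.common.io_utils import read_video, save_video

video = read_video("input/sample.mp4")
print(video.frames.shape, video.fps, video.duration)  # (T, H, W, C), frames/s, seconds
save_video("output/sample_copy.mp4", video.frames, video.fps)
```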
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/common/presets.py DELETED
@@ -1,75 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import numpy as np
-
- from cosmos_transfer1.auxiliary.guardrail.blocklist.blocklist import Blocklist
- from cosmos_transfer1.auxiliary.guardrail.common.core import GuardrailRunner
- from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.face_blur_filter import RetinaFaceFilter
- from cosmos_transfer1.auxiliary.guardrail.llamaGuard3.llamaGuard3 import LlamaGuard3
- from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.video_content_safety_filter import (
-     VideoContentSafetyFilter,
- )
- from cosmos_transfer1.utils import log
-
-
- def create_text_guardrail_runner(checkpoint_dir: str) -> GuardrailRunner:
-     """Create the text guardrail runner."""
-     return GuardrailRunner(safety_models=[Blocklist(checkpoint_dir), LlamaGuard3(checkpoint_dir)])
-
-
- def create_video_guardrail_runner(checkpoint_dir: str) -> GuardrailRunner:
-     """Create the video guardrail runner."""
-     return GuardrailRunner(
-         safety_models=[VideoContentSafetyFilter(checkpoint_dir)],
-         postprocessors=[RetinaFaceFilter(checkpoint_dir)],
-     )
-
-
- def run_text_guardrail(prompt: str, guardrail_runner: GuardrailRunner) -> bool:
-     """Run the text guardrail on the prompt, checking for content safety.
-
-     Args:
-         prompt: The text prompt.
-         guardrail_runner: The text guardrail runner.
-
-     Returns:
-         bool: Whether the prompt is safe.
-     """
-     is_safe, message = guardrail_runner.run_safety_check(prompt)
-     if not is_safe:
-         log.critical(f"GUARDRAIL BLOCKED: {message}")
-     return is_safe
-
-
- def run_video_guardrail(frames: np.ndarray, guardrail_runner: GuardrailRunner) -> np.ndarray | None:
-     """Run the video guardrail on the frames, checking for content safety and applying face blur.
-
-     Args:
-         frames: The frames of the generated video.
-         guardrail_runner: The video guardrail runner.
-
-     Returns:
-         The processed frames if safe, otherwise None.
-     """
-     is_safe, message = guardrail_runner.run_safety_check(frames)
-     if not is_safe:
-         log.critical(f"GUARDRAIL BLOCKED: {message}")
-         return None
-
-     frames = guardrail_runner.postprocess(frames)
-     return frames
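Putting the two runners together around a generation step might look like the sketch below; the random frames stand in for actual model output, and the checkpoint layout is assumed.

```python
import numpy as np

from cosmos_transfer1.auxiliary.guardrail.common.presets import (
    create_text_guardrail_runner,
    create_video_guardrail_runner,
    run_text_guardrail,
    run_video_guardrail,
)

checkpoint_dir = "checkpoints"  # assumed to hold nvidia/Cosmos-Guardrail1/... and Llama Guard weights
text_runner = create_text_guardrail_runner(checkpoint_dir)
video_runner = create_video_guardrail_runner(checkpoint_dir)

prompt = "A robot arm stacking boxes in a warehouse."
if run_text_guardrail(prompt, text_runner):
    # Stand-in for generated frames: RGB uint8 with shape [T, H, W, C].
    frames = np.random.randint(0, 255, (8, 480, 640, 3), dtype=np.uint8)
    frames = run_video_guardrail(frames, video_runner)
    if frames is None:
        print("Video blocked by the content safety filter.")
```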
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/face_blur_filter/__init__.py DELETED
@@ -1,14 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/face_blur_filter/blur_utils.py DELETED
@@ -1,35 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import cv2
- import numpy as np
-
-
- def pixelate_face(face_img: np.ndarray, blocks: int = 5) -> np.ndarray:
-     """
-     Pixelate a face region by reducing resolution and then upscaling.
-
-     Args:
-         face_img: Face region to pixelate
-         blocks: Number of blocks to divide the face into (in each dimension)
-
-     Returns:
-         Pixelated face region
-     """
-     h, w = face_img.shape[:2]
-     # Shrink the image and scale back up to create the pixelation effect
-     temp = cv2.resize(face_img, (blocks, blocks), interpolation=cv2.INTER_LINEAR)
-     pixelated = cv2.resize(temp, (w, h), interpolation=cv2.INTER_NEAREST)
-     return pixelated
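A quick demo of the pixelation primitive on a synthetic crop:

```python
import numpy as np
from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.blur_utils import pixelate_face

face = np.random.randint(0, 255, (64, 48, 3), dtype=np.uint8)  # synthetic face crop
out = pixelate_face(face, blocks=5)
assert out.shape == face.shape  # original resolution restored, detail removed
```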
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/face_blur_filter/face_blur_filter.py DELETED
@@ -1,225 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import argparse
- import os
-
- import numpy as np
- import torch
- from retinaface.data import cfg_re50
- from retinaface.layers.functions.prior_box import PriorBox
- from retinaface.models.retinaface import RetinaFace
- from torch.utils.data import DataLoader, TensorDataset
- from tqdm import tqdm
-
- from cosmos_transfer1.auxiliary.guardrail.common.core import GuardrailRunner, PostprocessingGuardrail
- from cosmos_transfer1.auxiliary.guardrail.common.io_utils import get_video_filepaths, read_video, save_video
- from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.blur_utils import pixelate_face
- from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.retinaface_utils import (
-     decode_batch,
-     filter_detected_boxes,
-     load_model,
- )
- from cosmos_transfer1.utils import log, misc
-
- # RetinaFace model constants from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
- TOP_K = 5_000
- KEEP_TOP_K = 750
- NMS_THRESHOLD = 0.4
-
-
- class RetinaFaceFilter(PostprocessingGuardrail):
-     def __init__(
-         self,
-         checkpoint_dir: str,
-         batch_size: int = 1,
-         confidence_threshold: float = 0.7,
-         device="cuda" if torch.cuda.is_available() else "cpu",
-     ) -> None:
-         """
-         Initialize the RetinaFace model for face detection and blurring.
-
-         Args:
-             checkpoint_dir: Path to the checkpoint folder containing the RetinaFace weights
-             batch_size: Batch size for RetinaFace inference and processing
-             confidence_threshold: Minimum confidence score to consider a face detection
-             device: Device to run inference on ("cuda" or "cpu")
-         """
-         self.checkpoint = f"{checkpoint_dir}/nvidia/Cosmos-Guardrail1/face_blur_filter/Resnet50_Final.pth"
-         self.cfg = cfg_re50
-         self.batch_size = batch_size
-         self.confidence_threshold = confidence_threshold
-         self.device = device
-         self.dtype = torch.float32
-
-         # Disable loading ResNet pretrained weights
-         self.cfg["pretrain"] = False
-         self.net = RetinaFace(cfg=self.cfg, phase="test")
-         cpu = self.device == "cpu"
-
-         # Load from the RetinaFace pretrained checkpoint
-         self.net = load_model(self.net, self.checkpoint, cpu)
-         self.net.to(self.device, dtype=self.dtype).eval()
-
-     def preprocess_frames(self, frames: np.ndarray) -> torch.Tensor:
-         """Preprocess a sequence of frames for face detection.
-
-         Args:
-             frames: Input frames
-
-         Returns:
-             Preprocessed frames tensor
-         """
-         with torch.no_grad():
-             frames_tensor = torch.from_numpy(frames).to(self.device, dtype=self.dtype)  # Shape: [T, H, W, C]
-             frames_tensor = frames_tensor.permute(0, 3, 1, 2)  # Shape: [T, C, H, W]
-             frames_tensor = frames_tensor[:, [2, 1, 0], :, :]  # RGB to BGR to match RetinaFace model input
-             means = torch.tensor([104.0, 117.0, 123.0], device=self.device, dtype=self.dtype).view(1, 3, 1, 1)
-             frames_tensor = frames_tensor - means  # Subtract mean BGR values for each channel
-             return frames_tensor
-
-     def blur_detected_faces(
-         self,
-         frames: np.ndarray,
-         batch_loc: torch.Tensor,
-         batch_conf: torch.Tensor,
-         prior_data: torch.Tensor,
-         scale: torch.Tensor,
-         min_size: tuple[int, int] = (20, 20),
-     ) -> list[np.ndarray]:
-         """Blur detected faces in a batch of frames using RetinaFace predictions.
-
-         Args:
-             frames: Input frames
-             batch_loc: Batched location predictions
-             batch_conf: Batched confidence scores
-             prior_data: Prior boxes for the video
-             scale: Scale factor for resizing detections
-             min_size: Minimum size of a detected face region in pixels
-
-         Returns:
-             Processed frames with pixelated faces
-         """
-         with torch.no_grad():
-             batch_boxes = decode_batch(batch_loc, prior_data, self.cfg["variance"])
-             batch_boxes = batch_boxes * scale
-
-         blurred_frames = []
-         for i, boxes in enumerate(batch_boxes):
-             boxes = boxes.detach().cpu().numpy()
-             scores = batch_conf[i, :, 1].detach().cpu().numpy()
-
-             filtered_boxes = filter_detected_boxes(
-                 boxes,
-                 scores,
-                 confidence_threshold=self.confidence_threshold,
-                 nms_threshold=NMS_THRESHOLD,
-                 top_k=TOP_K,
-                 keep_top_k=KEEP_TOP_K,
-             )
-
-             frame = frames[i]
-             for box in filtered_boxes:
-                 x1, y1, x2, y2 = map(int, box)
-                 # Ignore bounding boxes smaller than the minimum size
-                 if x2 - x1 < min_size[0] or y2 - y1 < min_size[1]:
-                     continue
-                 max_h, max_w = frame.shape[:2]
-                 face_roi = frame[max(y1, 0) : min(y2, max_h), max(x1, 0) : min(x2, max_w)]
-                 blurred_face = pixelate_face(face_roi)
-                 frame[max(y1, 0) : min(y2, max_h), max(x1, 0) : min(x2, max_w)] = blurred_face
-             blurred_frames.append(frame)
-
-         return blurred_frames
-
-     def postprocess(self, frames: np.ndarray) -> np.ndarray:
-         """Blur faces in a sequence of frames.
-
-         Args:
-             frames: Input frames
-
-         Returns:
-             Processed frames with pixelated faces
-         """
-         # Create dataset and dataloader
-         frames_tensor = self.preprocess_frames(frames)
-         dataset = TensorDataset(frames_tensor)
-         dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=False)
-         processed_batches = []
-
-         prior_data, scale = None, None
-         for i, batch in enumerate(dataloader):
-             batch = batch[0]
-             h, w = batch.shape[-2:]  # Batch shape: [C, H, W]
-
-             with torch.no_grad():
-                 # Generate priors for the video
-                 if prior_data is None:
-                     priorbox = PriorBox(self.cfg, image_size=(h, w))
-                     priors = priorbox.forward()
-                     priors = priors.to(self.device, dtype=self.dtype)
-                     prior_data = priors.data
-
-                 # Get scale for resizing detections
-                 if scale is None:
-                     scale = torch.Tensor([w, h, w, h])
-                     scale = scale.to(self.device, dtype=self.dtype)
-
-                 batch_loc, batch_conf, _ = self.net(batch)
-
-             # Blur detected faces in each batch of frames
-             start_idx = i * self.batch_size
-             end_idx = min(start_idx + self.batch_size, len(frames))
-             processed_batches.append(
-                 self.blur_detected_faces(frames[start_idx:end_idx], batch_loc, batch_conf, prior_data, scale)
-             )
-
-         processed_frames = [frame for batch in processed_batches for frame in batch]
-         return np.array(processed_frames)
-
-
- def parse_args():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--input_dir", type=str, required=True, help="Path containing input videos")
-     parser.add_argument("--output_dir", type=str, required=True, help="Path for saving processed videos")
-     parser.add_argument(
-         "--checkpoint_dir",
-         type=str,
-         help="Path to the checkpoint folder containing the RetinaFace weights",
-     )
-     return parser.parse_args()
-
-
- def main(args):
-     filepaths = get_video_filepaths(args.input_dir)
-     if not filepaths:
-         log.error(f"No video files found in directory: {args.input_dir}")
-         return
-
-     face_blur = RetinaFaceFilter(checkpoint_dir=args.checkpoint_dir)
-     postprocessing_runner = GuardrailRunner(postprocessors=[face_blur])
-     os.makedirs(args.output_dir, exist_ok=True)
-
-     for filepath in tqdm(filepaths):
-         video_data = read_video(filepath)
-         with misc.timer("face blur filter"):
-             frames = postprocessing_runner.postprocess(video_data.frames)
-
-         output_path = os.path.join(args.output_dir, os.path.basename(filepath))
-         save_video(output_path, frames, video_data.fps)
-
-
- if __name__ == "__main__":
-     args = parse_args()
-     main(args)
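A programmatic sketch of running just the face-blur postprocessor on frames already in memory (paths and checkpoint layout assumed):

```python
from cosmos_transfer1.auxiliary.guardrail.common.core import GuardrailRunner
from cosmos_transfer1.auxiliary.guardrail.common.io_utils import read_video, save_video
from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.face_blur_filter import RetinaFaceFilter

# Expects checkpoints/nvidia/Cosmos-Guardrail1/face_blur_filter/Resnet50_Final.pth to exist.
runner = GuardrailRunner(postprocessors=[RetinaFaceFilter(checkpoint_dir="checkpoints")])
video = read_video("input/people.mp4")  # illustrative path
blurred = runner.postprocess(video.frames)
save_video("output/people_blurred.mp4", blurred, video.fps)
```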
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/face_blur_filter/retinaface_utils.py DELETED
@@ -1,117 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import numpy as np
- import torch
- from retinaface.utils.nms.py_cpu_nms import py_cpu_nms
-
- from cosmos_transfer1.utils import log
-
-
- # Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
- def filter_detected_boxes(boxes, scores, confidence_threshold, nms_threshold, top_k, keep_top_k):
-     """Filter boxes based on confidence score and remove overlapping boxes using NMS."""
-     # Keep detections with confidence above threshold
-     inds = np.where(scores > confidence_threshold)[0]
-     boxes = boxes[inds]
-     scores = scores[inds]
-
-     # Sort by confidence and keep top K detections
-     order = scores.argsort()[::-1][:top_k]
-     boxes = boxes[order]
-     scores = scores[order]
-
-     # Run non-maximum-suppression (NMS) to remove overlapping boxes
-     dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False)
-     keep = py_cpu_nms(dets, nms_threshold)
-     dets = dets[keep, :]
-     dets = dets[:keep_top_k, :]
-     boxes = dets[:, :-1]
-     return boxes
-
-
- # Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/utils/box_utils.py to handle batched inputs
- def decode_batch(loc, priors, variances):
-     """Decode batched locations from predictions using priors and variances.
-
-     Args:
-         loc (tensor): Batched location predictions for loc layers.
-             Shape: [batch_size, num_priors, 4]
-         priors (tensor): Prior boxes in center-offset form.
-             Shape: [num_priors, 4]
-         variances (list[float]): Variances of prior boxes.
-
-     Return:
-         Decoded batched bounding box predictions
-             Shape: [batch_size, num_priors, 4]
-     """
-     batch_size = loc.size(0)
-     priors = priors.unsqueeze(0).expand(batch_size, -1, -1)
-
-     boxes = torch.cat(
-         (
-             priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:],
-             priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1]),
-         ),
-         dim=2,
-     )
-
-     boxes[:, :, :2] -= boxes[:, :, 2:] / 2
-     boxes[:, :, 2:] += boxes[:, :, :2]
-     return boxes
-
-
- # Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
- def _check_keys(model, pretrained_state_dict):
-     ckpt_keys = set(pretrained_state_dict.keys())
-     model_keys = set(model.state_dict().keys())
-     used_pretrained_keys = model_keys & ckpt_keys
-     unused_pretrained_keys = ckpt_keys - model_keys
-     missing_keys = model_keys - ckpt_keys
-     log.debug("Missing keys:{}".format(len(missing_keys)))
-     log.debug("Unused checkpoint keys:{}".format(len(unused_pretrained_keys)))
-     log.debug("Used keys:{}".format(len(used_pretrained_keys)))
-     assert len(used_pretrained_keys) > 0, "load NONE from pretrained checkpoint"
-     return True
-
-
- # Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
- def _remove_prefix(state_dict, prefix):
-     """Old versions of the model are stored with all parameter names sharing the common prefix 'module.'"""
-     log.debug("Removing prefix '{}'".format(prefix))
-
-     def f(x):
-         return x.split(prefix, 1)[-1] if x.startswith(prefix) else x
-
-     return {f(key): value for key, value in state_dict.items()}
-
-
- # Adapted from https://github.com/biubug6/Pytorch_Retinaface/blob/master/detect.py
- def load_model(model, pretrained_path, load_to_cpu):
-     log.debug("Loading pretrained model from {}".format(pretrained_path))
-     if load_to_cpu:
-         pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage, weights_only=True)
-     else:
-         device = torch.cuda.current_device()
-         pretrained_dict = torch.load(
-             pretrained_path, map_location=lambda storage, loc: storage.cuda(device), weights_only=True
-         )
-     if "state_dict" in pretrained_dict.keys():
-         pretrained_dict = _remove_prefix(pretrained_dict["state_dict"], "module.")
-     else:
-         pretrained_dict = _remove_prefix(pretrained_dict, "module.")
-     _check_keys(model, pretrained_dict)
-     model.load_state_dict(pretrained_dict, strict=False)
-     return model
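A shape-only sanity check for `decode_batch` with dummy tensors; `cfg_re50` uses variances `[0.1, 0.2]`, and the values here are random rather than meaningful detections:

```python
import torch
from cosmos_transfer1.auxiliary.guardrail.face_blur_filter.retinaface_utils import decode_batch

batch_size, num_priors = 2, 16800
loc = torch.randn(batch_size, num_priors, 4)  # dummy location predictions
priors = torch.rand(num_priors, 4)            # center-offset form: (cx, cy, w, h)
boxes = decode_batch(loc, priors, variances=[0.1, 0.2])
print(boxes.shape)  # torch.Size([2, 16800, 4]) as (x1, y1, x2, y2)
```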
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/llamaGuard3/__init__.py DELETED
@@ -1,14 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/llamaGuard3/categories.py DELETED
@@ -1,31 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- UNSAFE_CATEGORIES = {
-     "S1": "Violent Crimes.",
-     "S2": "Non-Violent Crimes.",
-     "S3": "Sex Crimes.",
-     "S4": "Child Exploitation.",
-     "S5": "Defamation.",
-     "S6": "Specialized Advice.",
-     "S7": "Privacy.",
-     "S8": "Intellectual Property.",
-     "S9": "Indiscriminate Weapons.",
-     "S10": "Hate.",
-     "S11": "Self-Harm.",
-     "S12": "Sexual Content.",
-     "S13": "Elections.",
-     "S14": "Code Interpreter Abuse.",
- }
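A small sketch of turning the category codes from a raw `unsafe` / `S1,S10`-style verdict into readable labels:

```python
from cosmos_transfer1.auxiliary.guardrail.llamaGuard3.categories import UNSAFE_CATEGORIES

detected = "S1,S10"  # illustrative model verdict line
labels = [UNSAFE_CATEGORIES.get(code.strip(), "Unknown") for code in detected.split(",")]
print(labels)  # ['Violent Crimes.', 'Hate.']
```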
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/llamaGuard3/llamaGuard3.py DELETED
@@ -1,122 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import argparse
- import os
-
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner
- from cosmos_transfer1.auxiliary.guardrail.llamaGuard3.categories import UNSAFE_CATEGORIES
- from cosmos_transfer1.utils import log, misc
-
- SAFE = misc.Color.green("SAFE")
- UNSAFE = misc.Color.red("UNSAFE")
-
-
- class LlamaGuard3(ContentSafetyGuardrail):
-     def __init__(
-         self,
-         checkpoint_dir: str,
-         device="cuda" if torch.cuda.is_available() else "cpu",
-     ) -> None:
-         self.checkpoint_dir = checkpoint_dir
-         self.device = device
-         self.dtype = torch.bfloat16
-
-         model_id = "meta-llama/Llama-Guard-3-8B"
-         model_dir = os.path.join(self.checkpoint_dir, model_id)
-
-         self.model = AutoModelForCausalLM.from_pretrained(model_dir)
-         self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
-
-         self.model.to(self.device, dtype=self.dtype).eval()
-
-     def get_llamaGuard3_block_message(self, moderation_output: str) -> str:
-         """Extract the blocked category from the Llama Guard 3 model output."""
-         block_msg = "Prompt blocked by Llama Guard 3."
-         try:
-             lines = moderation_output.splitlines()
-             categories_detected = []
-             for line in lines[1:]:
-                 line_stripped = line.split("<|eot_id|>")[0].strip()
-                 for category in line_stripped.split(","):
-                     category = category.strip()
-                     if category not in UNSAFE_CATEGORIES:
-                         log.warning(f"Unrecognized category from moderation output: {category}")
-                     else:
-                         categories_detected.append(category)
-             if len(categories_detected) > 0:
-                 blocked_categories = ", ".join([UNSAFE_CATEGORIES[category][:-1] for category in categories_detected])
-                 block_msg = f"{block_msg} Violations: {blocked_categories}."
-         except Exception as e:
-             log.warning(f"Unable to extract blocked category from Llama Guard 3 output: {e}")
-         return block_msg
-
-     def filter_llamaGuard3_output(self, prompt: str) -> tuple[bool, str]:
-         """Filter the Llama Guard 3 model output and return the safety status and message."""
-         conversation = [{"role": "user", "content": prompt}]
-         input_ids = self.tokenizer.apply_chat_template(
-             conversation, categories=UNSAFE_CATEGORIES, return_tensors="pt"
-         ).to(self.device)
-         prompt_len = input_ids.shape[1]
-         output = self.model.generate(
-             input_ids=input_ids,
-             max_new_tokens=100,
-             return_dict_in_generate=True,
-             pad_token_id=0,
-         )
-         generated_tokens = output.sequences[:, prompt_len:]
-         moderation_output = self.tokenizer.decode(generated_tokens[0], skip_special_tokens=False).strip()
-
-         if "unsafe" in moderation_output.lower():
-             block_msg = self.get_llamaGuard3_block_message(moderation_output)
-             return False, block_msg
-         else:
-             return True, ""
-
-     def is_safe(self, prompt: str) -> tuple[bool, str]:
-         """Check if the input prompt is safe according to the Llama Guard 3 model."""
-         try:
-             return self.filter_llamaGuard3_output(prompt)
-         except Exception as e:
-             log.error(f"Unexpected error occurred when running Llama Guard 3 guardrail: {e}")
-             return True, "Unexpected error occurred when running Llama Guard 3 guardrail."
-
-
- def parse_args():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--prompt", type=str, required=True, help="Input prompt")
-     parser.add_argument(
-         "--checkpoint_dir",
-         type=str,
-         help="Path to the Llama Guard 3 checkpoint folder",
-     )
-     return parser.parse_args()
-
-
- def main(args):
-     llamaGuard3 = LlamaGuard3(checkpoint_dir=args.checkpoint_dir)
-     runner = GuardrailRunner(safety_models=[llamaGuard3])
-     with misc.timer("Llama Guard 3 safety check"):
-         safety, message = runner.run_safety_check(args.prompt)
-     log.info(f"Input is: {'SAFE' if safety else 'UNSAFE'}")
-     if not safety:
-         log.info(f"Message: {message}")
-
-
- if __name__ == "__main__":
-     args = parse_args()
-     main(args)
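Direct use of the wrapper, assuming the Llama-Guard-3-8B weights sit under the checkpoint folder and a GPU is available:

```python
from cosmos_transfer1.auxiliary.guardrail.llamaGuard3.llamaGuard3 import LlamaGuard3

guard = LlamaGuard3(checkpoint_dir="checkpoints")  # expects meta-llama/Llama-Guard-3-8B inside
safe, message = guard.is_safe("Describe a sunny day at the beach.")
print(safe, message)
```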
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/__init__.py DELETED
@@ -1,14 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/model.py DELETED
@@ -1,60 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import attrs
- import torch
- import torch.nn as nn
-
- from cosmos_transfer1.utils.ddp_config import make_freezable
-
-
- @make_freezable
- @attrs.define(slots=False)
- class ModelConfig:
-     input_size: int = 1152
-     num_classes: int = 7
-
-
- class SafetyClassifier(nn.Module):
-     def __init__(self, input_size: int = 1024, num_classes: int = 2):
-         super().__init__()
-         self.input_size = input_size
-         self.num_classes = num_classes
-         self.layers = nn.Sequential(
-             nn.Linear(self.input_size, 512),
-             nn.BatchNorm1d(512),
-             nn.ReLU(),
-             nn.Linear(512, 256),
-             nn.BatchNorm1d(256),
-             nn.ReLU(),
-             nn.Linear(256, self.num_classes),
-             # Note: No activation function here; CrossEntropyLoss expects raw logits
-         )
-
-     def forward(self, x):
-         return self.layers(x)
-
-
- class VideoSafetyModel(nn.Module):
-     def __init__(self, config: ModelConfig) -> None:
-         super().__init__()
-         self.config = config
-         self.num_classes = config.num_classes
-         self.network = SafetyClassifier(input_size=config.input_size, num_classes=self.num_classes)
-
-     @torch.inference_mode()
-     def forward(self, data_batch: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
-         logits = self.network(data_batch["data"].cuda())
-         return {"logits": logits}
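A quick shape check of the classifier head with a dummy SigLIP embedding; note that `forward` moves its input to CUDA itself, so this sketch assumes a GPU:

```python
import torch
from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.model import ModelConfig, VideoSafetyModel

model = VideoSafetyModel(ModelConfig(input_size=1152, num_classes=7)).cuda().eval()
batch = {"data": torch.randn(4, 1152)}  # four dummy frame embeddings
out = model(batch)                      # forward() sends the tensor to CUDA itself
print(out["logits"].shape)              # torch.Size([4, 7])
```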
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/video_content_safety_filter.py DELETED
@@ -1,185 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import argparse
- import json
- import os
- from typing import Iterable, Tuple, Union
-
- import torch
- from PIL import Image
-
- from cosmos_transfer1.auxiliary.guardrail.common.core import ContentSafetyGuardrail, GuardrailRunner
- from cosmos_transfer1.auxiliary.guardrail.common.io_utils import get_video_filepaths, read_video
- from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.model import ModelConfig, VideoSafetyModel
- from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.vision_encoder import SigLIPEncoder
- from cosmos_transfer1.utils import log, misc
-
- # Define the class index to class name mapping for multi-class classification
- CLASS_IDX_TO_NAME = {
-     0: "Safe",
-     1: "Sexual_Content",
-     3: "Drugs",
-     4: "Child_Abuse",
-     5: "Hate_and_Harassment",
-     6: "Self-Harm",
- }
-
-
- class VideoContentSafetyFilter(ContentSafetyGuardrail):
-     def __init__(
-         self,
-         checkpoint_dir: str,
-         device="cuda" if torch.cuda.is_available() else "cpu",
-     ) -> None:
-         self.checkpoint_dir = os.path.join(checkpoint_dir, "nvidia/Cosmos-Guardrail1/video_content_safety_filter")
-         self.device = device
-         self.dtype = torch.float32
-
-         # Initialize the SigLIP encoder
-         self.encoder = SigLIPEncoder(checkpoint_dir=self.checkpoint_dir, device=device, dtype=self.dtype)
-
-         # Use ModelConfig directly for inference configuration
-         model_config = ModelConfig(input_size=1152, num_classes=7)
-
-         # Load the multi-class classifier
-         self.model = VideoSafetyModel(model_config)
-         safety_filter_local_path = os.path.join(self.checkpoint_dir, "safety_filter.pt")
-         checkpoint = torch.load(safety_filter_local_path, map_location=torch.device("cpu"), weights_only=True)
-         self.model.load_state_dict(checkpoint["model"])
-         self.model.to(self.device, dtype=self.dtype).eval()
-
-     @torch.inference_mode()
-     def __infer(self, pil_image: Image.Image) -> int:
-         """Infer the class of the image."""
-         image_embs = self.encoder.encode_image(pil_image)
-         logits = self.model.network(image_embs)
-         probabilities = torch.nn.functional.softmax(logits, dim=-1)
-         predicted_class = torch.argmax(probabilities, dim=-1).item()
-         return predicted_class
-
-     def is_safe_file(self, filepath: str) -> bool:
-         """Check if the video file is safe."""
-         video_data = read_video(filepath)
-
-         # Sample frames at 2 FPS
-         sample_rate = 2  # frames per second
-         frame_interval = max(1, int(video_data.fps / sample_rate))
-         frame_numbers = list(range(0, int(video_data.fps * video_data.duration), frame_interval))
-
-         is_safe = True
-         frame_scores = []
-
-         for frame_number in frame_numbers:
-             try:
-                 frame = video_data.frames[frame_number]
-                 pil_image = Image.fromarray(frame)
-                 predicted_class = self.__infer(pil_image)
-                 class_name = CLASS_IDX_TO_NAME.get(predicted_class, "Safe")
-                 frame_scores.append({"frame_number": frame_number, "class": class_name})
-
-                 # If any frame is not "Safe", mark the video as unsafe
-                 if class_name != "Safe":
-                     is_safe = False
-                     break
-
-             except Exception as e:
-                 log.warning(f"Failed to run safety classifier on frame_number {frame_number}. Exception: {e}")
-                 continue
-
-         # Prepare a summary for JSON logging (named to avoid shadowing the VideoData instance)
-         video_summary = {
-             "filepath": filepath,
-             "is_safe": is_safe,
-             "video_length": video_data.duration,
-             "fps": video_data.fps,
-             "frame_scores": frame_scores,
-         }
-
-         log.info(f"Video {filepath} is {'SAFE' if is_safe else 'UNSAFE'}.")
-         log.debug(f"Video data: {json.dumps(video_summary, indent=4)}")
-         return is_safe
-
-     def is_safe_frames(self, frames: Iterable) -> bool:
-         """Check if the generated video frames are safe."""
-         frame_scores = []
-         total_frames = 0
-         safe_frames = 0
-
-         for frame_number, frame in enumerate(frames):
-             try:
-                 total_frames += 1
-                 pil_image = Image.fromarray(frame)
-                 predicted_class = self.__infer(pil_image)
-                 class_name = CLASS_IDX_TO_NAME.get(predicted_class, "Safe")
-                 frame_scores.append({"frame_number": frame_number, "class": class_name})
-
-                 if class_name == "Safe":
-                     safe_frames += 1
-
-             except Exception as e:
-                 log.warning(f"Failed to run safety classifier on frame_number {frame_number}. Exception: {e}")
-                 continue
-
-         # Decide if the video is safe based on the ratio of safe frames
-         is_safe = False
-         if total_frames > 0:
-             is_safe = (safe_frames / total_frames) >= 0.95
-
-         frames_summary = {
-             "is_safe": is_safe,
-             "frame_scores": frame_scores,
-         }
-
-         log.debug(f"Frames data: {json.dumps(frames_summary, indent=4)}")
-         return is_safe
-
-     def is_safe(self, input: Union[str, Iterable]) -> Tuple[bool, str]:
-         if isinstance(input, str):
-             is_safe = self.is_safe_file(input)
-             return is_safe, "safe video detected" if is_safe else "unsafe video detected"
-         else:
-             is_safe = self.is_safe_frames(input)
-             return is_safe, "safe frames detected" if is_safe else "unsafe frames detected"
-
-
- def parse_args():
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--input_dir", type=str, required=True, help="Path containing input videos")
-     parser.add_argument(
-         "--checkpoint_dir",
-         type=str,
-         help="Path to the Video Content Safety Filter checkpoint folder",
-     )
-     return parser.parse_args()
-
-
- def main(args):
-     filepaths = get_video_filepaths(args.input_dir)
-     if not filepaths:
-         log.error(f"No video files found in directory: {args.input_dir}")
-         return
-
-     video_filter = VideoContentSafetyFilter(checkpoint_dir=args.checkpoint_dir)
-     runner = GuardrailRunner(safety_models=[video_filter], generic_safe_msg="Video is safe")
-
-     for filepath in filepaths:
-         with misc.timer("video content safety filter"):
-             _ = runner.run_safety_check(filepath)
-
-
- if __name__ == "__main__":
-     args = parse_args()
-     main(args)
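Checking in-memory frames directly (RGB uint8, iterated along the first axis); the random frames here only demonstrate the call shape, and the Cosmos-Guardrail1 checkpoints must be present:

```python
import numpy as np
from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.video_content_safety_filter import (
    VideoContentSafetyFilter,
)

video_filter = VideoContentSafetyFilter(checkpoint_dir="checkpoints")
frames = np.random.randint(0, 255, (16, 256, 256, 3), dtype=np.uint8)
safe, message = video_filter.is_safe(frames)
print(safe, message)  # unsafe if fewer than 95% of frames classify as "Safe"
```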
cosmos-transfer1/cosmos_transfer1/auxiliary/guardrail/video_content_safety_filter/vision_encoder.py DELETED
@@ -1,46 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import os
17
-
18
- import torch
19
- from PIL import Image
20
- from transformers import SiglipModel, SiglipProcessor
21
-
22
-
23
- class SigLIPEncoder(torch.nn.Module):
24
- def __init__(
25
- self,
26
- checkpoint_dir: str,
27
- model_name: str = "google/siglip-so400m-patch14-384",
28
- device="cuda" if torch.cuda.is_available() else "cpu",
29
- dtype=torch.float32,
30
- ) -> None:
31
- super().__init__()
32
- self.checkpoint_dir = checkpoint_dir
33
- self.device = device
34
- self.dtype = dtype
35
- self.model = SiglipModel.from_pretrained(model_name, cache_dir=self.checkpoint_dir)
36
- self.processor = SiglipProcessor.from_pretrained(model_name, cache_dir=self.checkpoint_dir)
37
- self.model.to(self.device, dtype=self.dtype).eval()
38
-
39
- @torch.inference_mode()
40
- def encode_image(self, input_img: Image.Image) -> torch.Tensor:
41
- """Encode an image into a feature vector."""
42
- with torch.no_grad():
43
- inputs = self.processor(images=input_img, return_tensors="pt").to(self.device, dtype=self.dtype)
44
- image_features = self.model.get_image_features(**inputs)
45
- image_features /= image_features.norm(dim=-1, keepdim=True)
46
- return image_features
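
A small sketch of how the deleted `SigLIPEncoder` can be used (untested; image paths are placeholders and the import path follows the repo layout). Because `encode_image` L2-normalizes its output, the cosine similarity of two images reduces to a plain dot product:

```python
# Sketch (untested): comparing two images with SigLIPEncoder.
from PIL import Image

from cosmos_transfer1.auxiliary.guardrail.video_content_safety_filter.vision_encoder import SigLIPEncoder

encoder = SigLIPEncoder(checkpoint_dir="./checkpoints")  # SigLIP weights are cached here
feat_a = encoder.encode_image(Image.open("a.jpg").convert("RGB"))  # shape (1, D), unit norm
feat_b = encoder.encode_image(Image.open("b.jpg").convert("RGB"))
cosine_similarity = (feat_a @ feat_b.T).item()
```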
cosmos-transfer1/cosmos_transfer1/auxiliary/human_keypoint/human_keypoint.py DELETED
@@ -1,155 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import os
17
-
18
- import cv2
19
- import numpy as np
20
- from rtmlib import Wholebody
21
-
22
- from cosmos_transfer1.diffusion.datasets.augmentors.human_keypoint_utils import (
23
- coco_wholebody_133_skeleton,
24
- openpose134_skeleton,
25
- )
26
- from cosmos_transfer1.utils import log
27
-
28
-
29
- class HumanKeypointModel:
30
- def __init__(self, to_openpose=True, conf_thres=0.6):
31
- self.model = Wholebody(
32
- to_openpose=to_openpose,
33
- mode="performance",
34
- backend="onnxruntime",
35
- device="cuda",
36
- )
37
- self.to_openpose = to_openpose
38
- self.conf_thres = conf_thres
39
-
40
- def __call__(self, input_video: str, output_video: str = "keypoint.mp4") -> str:
41
- """
42
- Generate the human body keypoint plot for the keypointControlNet video2world model.
43
- Input: mp4 video
44
- Output: mp4 keypoint video, of the same spatial and temporal dimensions as the input video.
45
- """
46
-
47
- log.info(f"Processing video: {input_video} to generate keypoint video: {output_video}")
48
- assert os.path.exists(input_video)
49
-
50
- cap = cv2.VideoCapture(input_video)
51
- fps = int(cap.get(cv2.CAP_PROP_FPS))
52
- frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
53
- frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
54
- frame_size = (frame_width, frame_height)
55
-
56
- # vid writer
57
- fourcc = cv2.VideoWriter_fourcc(*"mp4v")
58
- skeleton_writer = cv2.VideoWriter(output_video, fourcc, fps, frame_size)
59
-
60
- log.info(f"frame width: {frame_width}, frame height: {frame_height}, fps: {fps}")
61
- log.info("start pose estimation for frames..")
62
-
63
- # Process each frame
64
- while cap.isOpened():
65
- ret, frame = cap.read()
66
- if not ret:
67
- break
68
-
69
- # Create a black background frame
70
- black_frame = np.zeros_like(frame)
71
-
72
- # Run pose estimation
73
- keypoints, scores = self.model(frame)
74
-
75
- if keypoints is not None and len(keypoints) > 0:
76
- skeleton_frame = self.plot_person_kpts(
77
- black_frame,
78
- keypoints,
79
- scores,
80
- kpt_thr=self.conf_thres,
81
- openpose_format=True,
82
- line_width=4,
83
- ) # (h, w, 3)
84
- else:
85
- skeleton_frame = black_frame
86
-
87
- skeleton_writer.write(skeleton_frame[:, :, ::-1])
88
-
89
- cap.release()
90
- skeleton_writer.release()
- return output_video
91
-
92
- def draw_skeleton(
93
- self,
94
- img: np.ndarray,
95
- keypoints: np.ndarray,
96
- scores: np.ndarray,
97
- kpt_thr: float = 0.6,
98
- openpose_format: bool = True,
99
- radius: int = 2,
100
- line_width: int = 4,
101
- ):
102
- skeleton_topology = openpose134_skeleton if openpose_format else coco_wholebody_133_skeleton
103
- assert len(keypoints.shape) == 2
104
- keypoint_info, skeleton_info = (
105
- skeleton_topology["keypoint_info"],
106
- skeleton_topology["skeleton_info"],
107
- )
108
- vis_kpt = [s >= kpt_thr for s in scores]
109
- link_dict = {}
110
- for i, kpt_info in keypoint_info.items():
111
- kpt_color = tuple(kpt_info["color"])
112
- link_dict[kpt_info["name"]] = kpt_info["id"]
113
-
114
- kpt = keypoints[i]
115
-
116
- if vis_kpt[i]:
117
- img = cv2.circle(img, (int(kpt[0]), int(kpt[1])), int(radius), kpt_color, -1)
118
-
119
- for i, ske_info in skeleton_info.items():
120
- link = ske_info["link"]
121
- pt0, pt1 = link_dict[link[0]], link_dict[link[1]]
122
-
123
- if vis_kpt[pt0] and vis_kpt[pt1]:
124
- link_color = ske_info["color"]
125
- kpt0 = keypoints[pt0]
126
- kpt1 = keypoints[pt1]
127
-
128
- img = cv2.line(
129
- img, (int(kpt0[0]), int(kpt0[1])), (int(kpt1[0]), int(kpt1[1])), link_color, thickness=line_width
130
- )
131
-
132
- return img
133
-
134
- def plot_person_kpts(
135
- self,
136
- pose_vis_img: np.ndarray,
137
- keypoints: np.ndarray,
138
- scores: np.ndarray,
139
- kpt_thr: float = 0.6,
140
- openpose_format: bool = True,
141
- line_width: int = 4,
142
- ) -> np.ndarray:
143
- """
144
- Plot keypoints for each detected person,
145
- updating the pose visualization image in place.
146
- """
147
- for kpts, ss in zip(keypoints, scores):
148
- try:
149
- pose_vis_img = self.draw_skeleton(
150
- pose_vis_img, kpts, ss, kpt_thr=kpt_thr, openpose_format=openpose_format, line_width=line_width
151
- )
152
- except ValueError as e:
153
- log.error(f"Error in draw_skeleton func, {e}")
154
-
155
- return pose_vis_img
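
Usage sketch for the deleted `HumanKeypointModel` (untested; file names are placeholders). Note that `Wholebody` is constructed with `device="cuda"`, so a GPU-enabled onnxruntime build is assumed:

```python
# Sketch (untested): render an OpenPose-style skeleton video from an input clip.
from cosmos_transfer1.auxiliary.human_keypoint.human_keypoint import HumanKeypointModel

model = HumanKeypointModel(to_openpose=True, conf_thres=0.6)
model("input.mp4", output_video="keypoint.mp4")  # skeleton-on-black video, same fps and size as the input
```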
cosmos-transfer1/cosmos_transfer1/auxiliary/robot_augmentation/README.md DELETED
@@ -1,112 +0,0 @@
1
- # Robot Data Augmentation with Cosmos-Transfer1
2
-
3
- This pipeline provides a two-step process to augment robotic videos using **Cosmos-Transfer1-7B**. It leverages **spatial-temporal control** to modify backgrounds while preserving the shape and/or appearance of the robot foreground.
4
-
5
- ## Overview of Settings
6
-
7
- We propose two augmentation settings:
8
-
9
- ### Setting 1 (fg_vis_edge_bg_seg): Preserve Shape and Appearance of the Robot (foreground)
10
- - **Foreground Controls**: `Edge`, `Vis`
11
- - **Background Controls**: `Segmentation`
12
- - **Weights**:
13
- - `w_edge(FG) = 1`
14
- - `w_vis(FG) = 1`
15
- - `w_seg(BG) = 1`
16
- - All other weights = 0
17
-
18
- ### Setting 2 (fg_edge_bg_seg): Preserve Only Shape of the Robot (foreground)
19
- - **Foreground Controls**: `Edge`
20
- - **Background Controls**: `Segmentation`
21
- - **Weights**:
22
- - `w_edge(FG) = 1`
23
- - `w_seg(BG) = 1`
24
- - All other weights = 0
25
-
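
In code, these two settings map onto per-control foreground/background weights; this mirrors the `settings` dict in `WeightSettings.get_settings` from `spatial_temporal_weight.py`:

```python
# Per-control foreground/background weights, as encoded by WeightSettings in
# cosmos_transfer1/auxiliary/robot_augmentation/spatial_temporal_weight.py.
settings = {
    "fg_vis_edge_bg_seg": {  # setting 1: preserve robot shape and appearance
        "depth": {"foreground": 0.0, "background": 0.0},
        "vis": {"foreground": 1.0, "background": 0.0},
        "edge": {"foreground": 1.0, "background": 0.0},
        "seg": {"foreground": 0.0, "background": 1.0},
    },
    "fg_edge_bg_seg": {  # setting 2: preserve robot shape only
        "depth": {"foreground": 0.0, "background": 0.0},
        "vis": {"foreground": 0.0, "background": 0.0},
        "edge": {"foreground": 1.0, "background": 0.0},
        "seg": {"foreground": 0.0, "background": 1.0},
    },
}
```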
26
- ## Step-by-Step Instructions
27
-
28
- ### Step 1: Generate Spatial-Temporal Weights
29
-
30
- This script extracts foreground (robot) and background information from semantic segmentation data. It processes per-frame segmentation masks and color-to-class mappings to generate spatial-temporal weight matrices for each control modality based on the selected setting.
31
-
32
- #### Input Requirements:
33
- - A `segmentation` folder containing per-frame segmentation masks in PNG format
34
- - A `segmentation_label` folder containing color-to-class mapping JSON files for each frame, for example:
35
- ```json
36
- {
37
- "(29, 0, 0, 255)": {
38
- "class": "gripper0_right_r_palm_vis"
39
- },
40
- "(31, 0, 0, 255)": {
41
- "class": "gripper0_right_R_thumb_proximal_base_link_vis"
42
- },
43
- "(33, 0, 0, 255)": {
44
- "class": "gripper0_right_R_thumb_proximal_link_vis"
45
- }
46
- }
47
- ```
48
- - An input video file
49
-
50
- Here is an example input format:
51
- [Example input directory](https://github.com/google-deepmind/cosmos/tree/main/assets/robot_augmentation_example/example1)
52
-
53
- #### Usage
54
-
55
- ```bash
56
- PYTHONPATH=$(pwd) python cosmos_transfer1/auxiliary/robot_augmentation/spatial_temporal_weight.py \
57
- --setting fg_vis_edge_bg_seg \
58
- --robot-keywords world_robot gripper robot \
59
- --input-dir assets/robot_augmentation_example \
60
- --output-dir outputs/robot_augmentation_example
61
- ```
62
-
63
- #### Parameters:
64
-
65
- * `--setting`: Weight setting to use (choices: 'fg_vis_edge_bg_seg', 'fg_edge_bg_seg'; default: 'fg_vis_edge_bg_seg')
66
- * `fg_vis_edge_bg_seg` (setting 1): emphasizes the robot in visual and edge features (vis: 1.0 foreground, edge: 1.0 foreground, seg: 1.0 background)
67
- * `fg_edge_bg_seg` (setting 2): emphasizes the robot only in edge features (edge: 1.0 foreground, seg: 1.0 background)
68
-
69
- * `--input-dir`: Input directory containing example folders
70
- * Default: 'assets/robot_augmentation_example'
71
-
72
- * `--output-dir`: Output directory for weight matrices
73
- * Default: 'outputs/robot_augmentation_example'
74
-
75
- * `--robot-keywords`: Keywords used to identify robot classes
76
- * Default: ["world_robot", "gripper", "robot"]
77
- * Any semantic class containing these keywords will be treated as robot foreground
78
-
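
Step 1 writes one float16 weight tensor of shape `(num_frames, height, width)` per control modality (`depth_weights.pt`, `vis_weights.pt`, `edge_weights.pt`, `seg_weights.pt`) into each example's output folder. A quick sanity check (a sketch; the `example1` path is illustrative):

```python
# Sanity-check sketch for the Step 1 outputs.
import torch

w = torch.load("outputs/robot_augmentation_example/example1/seg_weights.pt")
print(w.shape, w.dtype)  # torch.Size([num_frames, height, width]) torch.float16
```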
79
- ### Step 2: Run Cosmos-Transfer1 Inference
80
-
81
- Use the generated spatial-temporal weight matrices to perform video augmentation with the proper controls.
82
-
83
- ```bash
84
- export CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES:=0}"
85
- export CHECKPOINT_DIR="${CHECKPOINT_DIR:=./checkpoints}"
86
- export NUM_GPU="${NUM_GPU:=1}"
87
-
88
- PYTHONPATH=$(pwd) torchrun --nproc_per_node=$NUM_GPU --nnodes=1 --node_rank=0 \
89
- cosmos_transfer1/diffusion/inference/transfer.py \
90
- --checkpoint_dir $CHECKPOINT_DIR \
91
- --video_save_folder outputs/robot_example_spatial_temporal_setting1 \
92
- --controlnet_specs assets/robot_augmentation_example/example1/inference_cosmos_transfer1_robot_spatiotemporal_weights.json \
93
- --offload_text_encoder_model \
94
- --offload_guardrail_models \
95
- --num_gpus $NUM_GPU
96
- ```
97
-
98
- - Augmented videos are saved in `outputs/robot_example_spatial_temporal_setting1/`
99
-
100
- ## Input and Output Examples
101
-
102
- Input video:
103
-
104
- <video src="https://github.com/user-attachments/assets/9c2df99d-7d0c-4dcf-af87-4ec9f65328ed">
105
- Your browser does not support the video tag.
106
- </video>
107
-
108
- You can run the pipeline multiple times with different prompts (e.g., `assets/robot_augmentation_example/example1/example1_prompts.json`) to get different augmentation results:
109
-
110
- <video src="https://github.com/user-attachments/assets/6dee15f5-9d8b-469a-a92a-3419cb466d44">
111
- Your browser does not support the video tag.
112
- </video>
cosmos-transfer1/cosmos_transfer1/auxiliary/robot_augmentation/spatial_temporal_weight.py DELETED
@@ -1,577 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- # This script processes segmentation results for each video frame saved as JSON files and generates a spatial-temporal weight matrix saved as a .pt file.
17
- # The input JSON files contain segmentation information for each frame, and the output .pt file represents the spatial-temporal weight matrix for the video.
18
-
19
- import argparse
20
- import glob
21
- import json
22
- import logging
23
- import os
24
- import re
25
- from collections import defaultdict
26
-
27
- import cv2
28
- import numpy as np
29
- import torch
30
- from tqdm import tqdm
31
-
32
- # Configure logging
33
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
34
- logger = logging.getLogger(__name__)
35
-
36
-
37
- # Class to manage different weight settings
38
- class WeightSettings:
39
- """Class to manage different weight settings for the features"""
40
-
41
- @staticmethod
42
- def get_settings(setting_name):
43
- """Get weight settings by name
44
-
45
- Args:
46
- setting_name (str): Name of the setting
47
-
48
- Returns:
49
- dict: Dictionary with weights for each feature
50
- """
51
- settings = {
52
- # Default setting (setting 1): emphasize the robot in vis and edge features; seg covers the background
53
- "fg_vis_edge_bg_seg": {
54
- "depth": {"foreground": 0.0, "background": 0.0},
55
- "vis": {"foreground": 1.0, "background": 0.0},
56
- "edge": {"foreground": 1.0, "background": 0.0},
57
- "seg": {"foreground": 0.0, "background": 1.0},
58
- },
59
- "fg_edge_bg_seg": {
60
- "depth": {"foreground": 0.0, "background": 0.0},
61
- "vis": {"foreground": 0.0, "background": 0.0},
62
- "edge": {"foreground": 1.0, "background": 0.0},
63
- "seg": {"foreground": 0.0, "background": 1.0},
64
- },
65
- }
66
-
67
- if setting_name not in settings:
68
- logger.warning(f"Setting '{setting_name}' not found. Using default.")
69
- return settings["fg_vis_edge_bg_seg"]
70
-
71
- return settings[setting_name]
72
-
73
- @staticmethod
74
- def list_settings():
75
- """List all available settings
76
-
77
- Returns:
78
- list: List of setting names
79
- """
80
- return ["fg_vis_edge_bg_seg", "fg_edge_bg_seg"]
81
-
82
-
83
- def get_video_info(video_path):
84
- """Get video dimensions and frame count"""
85
- cap = cv2.VideoCapture(video_path)
86
- if not cap.isOpened():
87
- raise ValueError(f"Could not open video file: {video_path}")
88
-
89
- width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
90
- height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
91
- frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
92
- fps = cap.get(cv2.CAP_PROP_FPS)
93
-
94
- cap.release()
95
- return width, height, frame_count, fps
96
-
97
-
98
- def parse_color_key(color_key):
99
- """Parse a color key string into an RGB tuple
100
-
101
- Args:
102
- color_key (str): Color key string in the format "(r,g,b,a)" or similar
103
-
104
- Returns:
105
- tuple: RGB tuple (r, g, b)
106
- """
107
- # Extract numbers using regex to handle different formats
108
- numbers = re.findall(r"\d+", color_key)
109
- if len(numbers) >= 3:
110
- r, g, b = map(int, numbers[:3])
111
- return (r, g, b)
112
- else:
113
- raise ValueError(f"Invalid color key format: {color_key}")
114
-
115
-
116
- def save_visualization(mask, frame_num, feature_name, viz_dir):
117
- """Save a visualization of the binary mask
118
-
119
- Args:
120
- mask (numpy.ndarray): The mask (values 0 or 255)
121
- frame_num (int): The frame number
122
- feature_name (str): The name of the feature (depth, vis, edge, seg)
123
- viz_dir (str): Directory to save visualizations
124
- """
125
- # Simply save the binary mask directly
126
- output_path = os.path.join(viz_dir, f"{feature_name}_frame_{frame_num:06d}.png")
127
- cv2.imwrite(output_path, mask)
128
- logger.info(f"Saved binary visualization to {output_path}")
129
-
130
-
131
- def process_segmentation_files(
132
- segmentation_dir,
133
- output_dir,
134
- viz_dir,
135
- video_path=None,
136
- weights_dict=None,
137
- setting_name="fg_vis_edge_bg_seg",
138
- robot_keywords=None,
139
- ):
140
- """Process all segmentation JSON files and create weight matrices
141
-
142
- Args:
143
- segmentation_dir (str): Directory containing segmentation JSON files
144
- output_dir (str): Directory to save weight matrices
145
- viz_dir (str): Directory to save visualizations
146
- video_path (str, optional): Path to the video file. Defaults to None.
147
- weights_dict (dict, optional): Dictionary with weights for each feature.
148
- Format: {
149
- 'depth': {'foreground': float, 'background': float},
150
- 'vis': {'foreground': float, 'background': float},
151
- 'edge': {'foreground': float, 'background': float},
152
- 'seg': {'foreground': float, 'background': float}
153
- }
154
- Values should be in range 0-1. Defaults to None.
155
- setting_name (str, optional): Weight setting name. Defaults to 'fg_vis_edge_bg_seg' (setting 1).
156
- robot_keywords (list, optional): List of keywords to identify robot classes. Defaults to ["robot"].
157
- """
158
-
159
- # Set default robot keywords if not provided
160
- if robot_keywords is None:
161
- robot_keywords = ["robot"]
162
-
163
- # Get all JSON files
164
- json_files = sorted(glob.glob(os.path.join(segmentation_dir, "*.json")))
165
- logger.info(f"Found {len(json_files)} JSON files")
166
-
167
- if len(json_files) == 0:
168
- raise ValueError(f"No JSON files found in {segmentation_dir}")
169
-
170
- # For example directories, check for PNG files
171
- png_dir = os.path.join(os.path.dirname(segmentation_dir), "segmentation")
172
- png_files = []
173
- if os.path.exists(png_dir):
174
- png_files = sorted(glob.glob(os.path.join(png_dir, "*.png")))
175
- logger.info(f"Found {len(png_files)} PNG files in segmentation directory")
176
-
177
- # Step 1: Create a unified color-to-class mapping from all JSON files
178
- logger.info("Creating unified color-to-class mapping...")
179
- rgb_to_class = {}
180
- rgb_to_is_robot = {}
181
-
182
- for json_file in tqdm(json_files, desc="Processing JSON files for unified mapping"):
183
- with open(json_file, "r") as f:
184
- json_data = json.load(f)
185
-
186
- for color_key, data in json_data.items():
187
- color = parse_color_key(color_key)
188
- class_name = data["class"]
189
-
190
- # Store RGB color for matching
191
- rgb_to_class[color] = class_name
192
- rgb_to_is_robot[color] = any(keyword in class_name for keyword in robot_keywords)
193
-
194
- # Print statistics about the unified color mapping
195
- robot_colors = [color for color, is_robot in rgb_to_is_robot.items() if is_robot]
196
- logger.info(f"Unified mapping: Found {len(robot_colors)} robot colors out of {len(rgb_to_is_robot)} total colors")
197
- if robot_colors:
198
- logger.info(f"Robot classes: {[rgb_to_class[color] for color in robot_colors]}")
199
-
200
- # Convert color mapping to arrays for vectorized operations
201
- colors = list(rgb_to_is_robot.keys())
202
- color_array = np.array(colors)
203
- is_robot_array = np.array([rgb_to_is_robot[color] for color in colors], dtype=bool)
204
-
205
- # If we have PNG files, get dimensions from the first PNG
206
- if png_files:
207
- # Get dimensions from the first PNG file
208
- first_png = cv2.imread(png_files[0])
209
- if first_png is None:
210
- raise ValueError(f"Could not read PNG file: {png_files[0]}")
211
-
212
- height, width = first_png.shape[:2]
213
- frame_count = len(png_files)
214
-
215
- # Match frame numbers between JSON and PNG files to ensure correct correspondence
216
- json_frame_nums = [int(os.path.basename(f).split("_")[-1].split(".")[0]) for f in json_files]
217
- png_frame_nums = [int(os.path.basename(f).split("_")[-1].split(".")[0]) for f in png_files]
218
-
219
- # Find common frames between JSON and PNG files
220
- common_frames = sorted(set(json_frame_nums).intersection(set(png_frame_nums)))
221
- logger.info(f"Found {len(common_frames)} common frames between JSON and PNG files")
222
-
223
- if len(common_frames) == 0:
224
- raise ValueError("No matching frames found between JSON and PNG files")
225
-
226
- # Create maps to easily look up files by frame number
227
- json_map = {int(os.path.basename(f).split("_")[-1].split(".")[0]): f for f in json_files}
228
- png_map = {int(os.path.basename(f).split("_")[-1].split(".")[0]): f for f in png_files}
229
-
230
- # Create new lists with only matching files
231
- json_files = [json_map[frame] for frame in common_frames if frame in json_map]
232
- png_files = [png_map[frame] for frame in common_frames if frame in png_map]
233
- num_frames = len(json_files)
234
-
235
- logger.info(f"Using PNG dimensions: {width}x{height}, processing {num_frames} frames")
236
- else:
237
- # Get video information if no PNG files available
238
- try:
239
- width, height, frame_count, fps = get_video_info(video_path)
240
- logger.info(f"Video dimensions: {width}x{height}, {frame_count} frames, {fps} fps")
241
- num_frames = min(len(json_files), frame_count)
242
- except Exception as e:
243
- logger.warning(f"Warning: Could not get video information: {e}")
244
- # Use a default size if we can't get the video info
245
- width, height = 640, 480
246
- num_frames = len(json_files)
247
- logger.info(f"Using default dimensions: {width}x{height}, {num_frames} frames")
248
-
249
- # Initialize weight tensors
250
- depth_weights = torch.zeros((num_frames, height, width))
251
- vis_weights = torch.zeros((num_frames, height, width))
252
- edge_weights = torch.zeros((num_frames, height, width))
253
- seg_weights = torch.zeros((num_frames, height, width))
254
-
255
- # Process frames
256
- if png_files:
257
- # Process PNG files directly
258
- for i, (json_file, png_file) in enumerate(zip(json_files, png_files)):
259
- # Get frame number from filename
260
- frame_num = int(os.path.basename(json_file).split("_")[-1].split(".")[0])
261
-
262
- # Read the corresponding PNG file
263
- frame = cv2.imread(png_file)
264
-
265
- if frame is None:
266
- logger.warning(f"Warning: Could not read frame {i} from PNG. Using blank frame.")
267
- frame = np.zeros((height, width, 3), dtype=np.uint8)
268
-
269
- # Convert frame to RGB
270
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
271
-
272
- # Calculate total pixels
273
- total_pixels = height * width
274
-
275
- # Vectorized approach for finding nearest colors
276
- # Convert frame_rgb to a 2D array of shape (height*width, 3)
277
- pixels = frame_rgb.reshape(-1, 3)
278
-
279
- # Calculate distances between each pixel and each color (vectorized)
280
- # This creates a matrix of shape (height*width, num_colors)
281
- distances = np.sqrt(np.sum((pixels[:, np.newaxis, :] - color_array[np.newaxis, :, :]) ** 2, axis=2))
282
-
283
- # Find the index of the nearest color for each pixel
284
- nearest_color_indices = np.argmin(distances, axis=1)
285
-
286
- # Get the is_robot value for each pixel based on its nearest color
287
- pixel_is_robot = is_robot_array[nearest_color_indices]
288
-
289
- # Reshape back to image dimensions
290
- pixel_is_robot_2d = pixel_is_robot.reshape(height, width)
291
-
292
- # Count robot and matched pixels
293
- robot_pixel_count = np.sum(pixel_is_robot)
294
- matched_pixel_count = pixels.shape[0] # All pixels are matched now
295
-
296
- # Create masks based on the is_robot classification
297
- depth_mask = np.where(
298
- pixel_is_robot_2d, weights_dict["depth"]["foreground"], weights_dict["depth"]["background"]
299
- )
300
-
301
- vis_mask = np.where(pixel_is_robot_2d, weights_dict["vis"]["foreground"], weights_dict["vis"]["background"])
302
-
303
- edge_mask = np.where(
304
- pixel_is_robot_2d, weights_dict["edge"]["foreground"], weights_dict["edge"]["background"]
305
- )
306
-
307
- seg_mask = np.where(pixel_is_robot_2d, weights_dict["seg"]["foreground"], weights_dict["seg"]["background"])
308
-
309
- # Create visualization mask
310
- visualization_mask = np.zeros((height, width), dtype=np.uint8)
311
- visualization_mask[pixel_is_robot_2d] = 255
312
-
313
- # Log statistics
314
- robot_percentage = (robot_pixel_count / total_pixels) * 100
315
- matched_percentage = (matched_pixel_count / total_pixels) * 100
316
- logger.info(f"Frame {frame_num}: {robot_pixel_count} robot pixels ({robot_percentage:.2f}%)")
317
- logger.info(f"Frame {frame_num}: {matched_pixel_count} matched pixels ({matched_percentage:.2f}%)")
318
-
319
- # Save visualizations for this frame
320
- save_visualization(visualization_mask, frame_num, "segmentation", viz_dir)
321
-
322
- # Store the masks in the weight tensors
323
- depth_weights[i] = torch.from_numpy(depth_mask)
324
- vis_weights[i] = torch.from_numpy(vis_mask)
325
- edge_weights[i] = torch.from_numpy(edge_mask)
326
- seg_weights[i] = torch.from_numpy(seg_mask)
327
- else:
328
- # Use video frames if available
329
- try:
330
- # Open the segmentation video
331
- cap = cv2.VideoCapture(video_path)
332
- if not cap.isOpened():
333
- raise ValueError(f"Could not open video file: {video_path}")
334
-
335
- # Process each frame using the unified color mapping
336
- for i, json_file in enumerate(tqdm(json_files[:num_frames], desc="Processing frames")):
337
- # Get frame number from filename
338
- frame_num = int(os.path.basename(json_file).split("_")[-1].split(".")[0])
339
-
340
- # Read the corresponding frame from the video
341
- cap.set(cv2.CAP_PROP_POS_FRAMES, i)
342
- ret, frame = cap.read()
343
-
344
- if not ret:
345
- logger.warning(f"Warning: Could not read frame {i} from video. Using blank frame.")
346
- frame = np.zeros((height, width, 3), dtype=np.uint8)
347
-
348
- # Convert frame to RGB
349
- frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
350
-
351
- # Calculate total pixels
352
- total_pixels = height * width
353
-
354
- # Vectorized approach for finding nearest colors
355
- pixels = frame_rgb.reshape(-1, 3)
356
- distances = np.sqrt(np.sum((pixels[:, np.newaxis, :] - color_array[np.newaxis, :, :]) ** 2, axis=2))
357
- nearest_color_indices = np.argmin(distances, axis=1)
358
- pixel_is_robot = is_robot_array[nearest_color_indices]
359
- pixel_is_robot_2d = pixel_is_robot.reshape(height, width)
360
-
361
- # Count robot and matched pixels
362
- robot_pixel_count = np.sum(pixel_is_robot)
363
- matched_pixel_count = pixels.shape[0]
364
-
365
- # Create masks based on the is_robot classification
366
- depth_mask = np.where(
367
- pixel_is_robot_2d, weights_dict["depth"]["foreground"], weights_dict["depth"]["background"]
368
- )
369
- vis_mask = np.where(
370
- pixel_is_robot_2d, weights_dict["vis"]["foreground"], weights_dict["vis"]["background"]
371
- )
372
- edge_mask = np.where(
373
- pixel_is_robot_2d, weights_dict["edge"]["foreground"], weights_dict["edge"]["background"]
374
- )
375
- seg_mask = np.where(
376
- pixel_is_robot_2d, weights_dict["seg"]["foreground"], weights_dict["seg"]["background"]
377
- )
378
-
379
- # Create visualization mask
380
- visualization_mask = np.zeros((height, width), dtype=np.uint8)
381
- visualization_mask[pixel_is_robot_2d] = 255
382
-
383
- # Log statistics
384
- robot_percentage = (robot_pixel_count / total_pixels) * 100
385
- matched_percentage = (matched_pixel_count / total_pixels) * 100
386
- logger.info(f"Frame {frame_num}: {robot_pixel_count} robot pixels ({robot_percentage:.2f}%)")
387
- logger.info(f"Frame {frame_num}: {matched_pixel_count} matched pixels ({matched_percentage:.2f}%)")
388
-
389
- # Save visualizations for this frame
390
- save_visualization(visualization_mask, frame_num, "segmentation", viz_dir)
391
-
392
- # Store the masks in the weight tensors
393
- depth_weights[i] = torch.from_numpy(depth_mask)
394
- vis_weights[i] = torch.from_numpy(vis_mask)
395
- edge_weights[i] = torch.from_numpy(edge_mask)
396
- seg_weights[i] = torch.from_numpy(seg_mask)
397
-
398
- # Close the video capture
399
- cap.release()
400
- except Exception as e:
401
- logger.warning(f"Warning: Error processing video: {e}")
402
- logger.warning("Cannot process this example without proper frame data.")
403
- raise ValueError(f"Cannot process example without frame data: {e}")
404
-
405
- # Save weight tensors
406
- # Convert weights to half precision (float16) to reduce file size
407
- depth_weights_half = depth_weights.to(torch.float16)
408
- vis_weights_half = vis_weights.to(torch.float16)
409
- edge_weights_half = edge_weights.to(torch.float16)
410
- seg_weights_half = seg_weights.to(torch.float16)
411
-
412
- # Save the half precision tensors
413
- torch.save(depth_weights_half, os.path.join(output_dir, "depth_weights.pt"))
414
- torch.save(vis_weights_half, os.path.join(output_dir, "vis_weights.pt"))
415
- torch.save(edge_weights_half, os.path.join(output_dir, "edge_weights.pt"))
416
- torch.save(seg_weights_half, os.path.join(output_dir, "seg_weights.pt"))
417
-
418
- logger.info(f"Saved weight matrices to {output_dir}")
419
- logger.info(f"Weight matrix shape: {depth_weights_half.shape}, dtype: {depth_weights_half.dtype}")
420
- logger.info(f"Saved visualizations to {viz_dir}")
421
-
422
- return output_dir, viz_dir
423
-
424
-
425
- def process_all_examples(input_dir, output_dir, setting_name="fg_vis_edge_bg_seg", robot_keywords=None):
426
- """Process all example directories in the provided input directory
427
-
428
- Args:
429
- input_dir (str): Input directory containing example folders
430
- output_dir (str): Output directory for weight matrices
431
- setting_name (str, optional): Weight setting name. Defaults to 'fg_vis_edge_bg_seg'.
432
- robot_keywords (list, optional): List of keywords to identify robot classes. Defaults to None.
433
- """
434
- # Find all example directories
435
- if not os.path.exists(input_dir):
436
- logger.error(f"Input directory not found: {input_dir}")
437
- return []
438
-
439
- # List example directories
440
- examples = [d for d in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, d))]
441
- examples = sorted(examples)
442
-
443
- if not examples:
444
- logger.warning("No example directories found.")
445
- return []
446
-
447
- # Print found examples
448
- logger.info(f"Found {len(examples)} example directories:")
449
- for example in examples:
450
- logger.info(f" - {example}")
451
-
452
- # Store processing results
453
- results = []
454
-
455
- # Process each example
456
- for example in examples:
457
- try:
458
- logger.info(f"\nProcessing {example}...")
459
-
460
- # Process this example with custom directories
461
- out_dir, viz_dir = process_example_with_dirs(example, input_dir, output_dir, setting_name, robot_keywords)
462
- results.append((example, out_dir, viz_dir))
463
-
464
- logger.info(f"Results for {example} saved to:")
465
- logger.info(f" Weight matrices: {out_dir}")
466
- logger.info(f" Visualizations: {viz_dir}")
467
-
468
- except Exception as e:
469
- logger.error(f"Error processing {example}: {e}")
470
-
471
- logger.info("\nAll examples processed.")
472
- return results
473
-
474
-
475
- # Process a specific example with custom input and output directories
476
- def process_example_with_dirs(
477
- example_name, input_dir, output_dir, setting_name="fg_vis_edge_bg_seg", robot_keywords=None
478
- ):
479
- """Process a specific example with custom input and output directories
480
-
481
- Args:
482
- example_name (str): Name of the example directory
483
- input_dir (str): Path to input directory containing example folders
484
- output_dir (str): Path to output directory for weight matrices
485
- setting_name (str, optional): Weight setting name. Defaults to 'fg_vis_edge_bg_seg'.
486
- robot_keywords (list, optional): List of keywords to identify robot classes. Defaults to None.
487
- """
488
- # Create paths for this example
489
- example_dir = os.path.join(input_dir, example_name)
490
- segmentation_dir = os.path.join(example_dir, "segmentation_label")
491
- video_path = os.path.join(example_dir, "segmentation.mp4")
492
-
493
- # Create output directories
494
- example_output_dir = os.path.join(output_dir, example_name)
495
- viz_dir = os.path.join(example_output_dir, "visualizations")
496
-
497
- # Check if weight files already exist
498
- depth_weights_path = os.path.join(example_output_dir, "depth_weights.pt")
499
- if os.path.exists(depth_weights_path):
500
- logger.info(f"Weight files already exist for {example_name}, skipping processing")
501
- return example_output_dir, viz_dir
502
-
503
- # Create output directories if they don't exist
504
- os.makedirs(example_output_dir, exist_ok=True)
505
- os.makedirs(viz_dir, exist_ok=True)
506
-
507
- # Get weight settings
508
- weights_dict = WeightSettings.get_settings(setting_name)
509
-
510
- # Process this example directly with paths
511
- return process_segmentation_files(
512
- segmentation_dir=segmentation_dir,
513
- output_dir=example_output_dir,
514
- viz_dir=viz_dir,
515
- video_path=video_path,
516
- weights_dict=weights_dict,
517
- setting_name=setting_name,
518
- robot_keywords=robot_keywords,
519
- )
520
-
521
-
522
- if __name__ == "__main__":
523
- # Parse command-line arguments
524
- parser = argparse.ArgumentParser(
525
- description="Process segmentation files to generate spatial-temporal weight matrices"
526
- )
527
- parser.add_argument(
528
- "--setting",
529
- type=str,
530
- default="fg_vis_edge_bg_seg",
531
- choices=WeightSettings.list_settings(),
532
- help="Weight setting to use (default: fg_vis_edge_bg_seg (setting1), fg_edge_bg_seg (setting2))",
533
- )
534
- parser.add_argument(
535
- "--input-dir",
536
- type=str,
537
- default="assets/robot_augmentation_example",
538
- help="Input directory containing example folders",
539
- )
540
- parser.add_argument(
541
- "--output-dir",
542
- type=str,
543
- default="outputs/robot_augmentation_example",
544
- help="Output directory for weight matrices",
545
- )
546
- parser.add_argument(
547
- "--robot-keywords",
548
- type=str,
549
- nargs="+",
550
- default=["world_robot", "gripper", "robot"],
551
- help="Keywords used to identify robot classes (default: world_robot gripper robot)",
552
- )
553
- parser.add_argument(
554
- "--log-level",
555
- type=str,
556
- default="INFO",
557
- choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
558
- help="Set the logging level",
559
- )
560
- args = parser.parse_args()
561
-
562
- # Set logging level from command line argument
563
- logger.setLevel(getattr(logging, args.log_level))
564
-
565
- # Get directories from arguments
566
- input_dir = args.input_dir
567
- output_dir = args.output_dir
568
- setting_name = args.setting
569
- robot_keywords = args.robot_keywords
570
-
571
- logger.info(f"Using input directory: {input_dir}")
572
- logger.info(f"Using output directory: {output_dir}")
573
- logger.info(f"Using weight setting: {setting_name}")
574
- logger.info(f"Using robot keywords: {robot_keywords}")
575
-
576
- # Process all examples with the provided input and output directories
577
- process_all_examples(input_dir, output_dir, setting_name, robot_keywords)
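
The core of the script above is a vectorized nearest-color lookup that labels every pixel as robot or background before the per-control weights are applied. A standalone sketch of just that step, with a toy two-color palette (illustrative values, not the real class colors):

```python
# Standalone sketch of the vectorized nearest-color lookup used above.
import numpy as np

color_array = np.array([(29, 0, 0), (200, 200, 200)])  # toy palette: robot color, background color
is_robot_array = np.array([True, False])               # per-palette-entry robot flag

frame_rgb = np.random.randint(0, 256, (480, 640, 3))   # stand-in for a segmentation frame
pixels = frame_rgb.reshape(-1, 3)

# (H*W, num_colors) Euclidean distances, then nearest palette entry per pixel.
distances = np.sqrt(np.sum((pixels[:, None, :] - color_array[None, :, :]) ** 2, axis=2))
pixel_is_robot_2d = is_robot_array[np.argmin(distances, axis=1)].reshape(480, 640)

# Foreground/background weights then follow from the chosen setting, e.g. for seg:
seg_mask = np.where(pixel_is_robot_2d, 0.0, 1.0)
```

Note that materializing the full `(H*W, num_colors)` distance matrix is memory-hungry for large palettes, which is a practical limit of this approach.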
cosmos-transfer1/cosmos_transfer1/auxiliary/sam2/sam2_model.py DELETED
@@ -1,392 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import os
17
- import sys
18
-
19
- import numpy as np
20
- import pycocotools.mask as mask_util
21
- import torch
22
-
23
- from cosmos_transfer1.utils import log
24
-
25
- sys.path.append("cosmos_transfer1/auxiliary")
26
-
27
- import tempfile
28
-
29
- from PIL import Image
30
- from sam2.sam2_video_predictor import SAM2VideoPredictor
31
- from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
32
-
33
- from cosmos_transfer1.auxiliary.sam2.sam2_utils import (
34
- capture_fps,
35
- convert_masks_to_frames,
36
- generate_tensor_from_images,
37
- video_to_frames,
38
- write_video,
39
- )
40
- from cosmos_transfer1.checkpoints import GROUNDING_DINO_MODEL_CHECKPOINT, SAM2_MODEL_CHECKPOINT
41
-
42
-
43
- def rle_encode(mask: np.ndarray) -> dict:
44
- """
45
- Encode a boolean mask (of shape (T, H, W)) using the pycocotools RLE format,
46
- matching the format of eff_segmentation.RleMaskSAMv2 (from Yotta).
47
-
48
- The procedure is:
49
- 1. Convert the mask to a numpy array in Fortran order.
50
- 2. Reshape the array to (-1, 1) (i.e. flatten in Fortran order).
51
- 3. Call pycocotools.mask.encode on the reshaped array.
52
- 4. Return a dictionary with the encoded data and the original mask shape.
53
- """
54
- mask = np.array(mask, order="F")
55
- # Reshape the mask to (-1, 1) in Fortran order and encode it.
56
- encoded = mask_util.encode(np.array(mask.reshape(-1, 1), order="F"))
57
- return {"data": encoded, "mask_shape": mask.shape}
58
-
59
-
60
- class VideoSegmentationModel:
61
- def __init__(self, **kwargs):
62
- """Initialize the model and load all required components."""
63
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
64
-
65
- # Initialize SAM2 predictor
66
- self.sam2_predictor = SAM2VideoPredictor.from_pretrained(SAM2_MODEL_CHECKPOINT).to(self.device)
67
-
68
- # Initialize GroundingDINO for text-based detection
69
- self.grounding_model_name = kwargs.get("grounding_model", GROUNDING_DINO_MODEL_CHECKPOINT)
70
- self.processor = AutoProcessor.from_pretrained(self.grounding_model_name)
71
- self.grounding_model = AutoModelForZeroShotObjectDetection.from_pretrained(self.grounding_model_name).to(
72
- self.device
73
- )
74
-
75
- def get_boxes_from_text(self, image_path, text_prompt):
76
- """Get bounding boxes (and labels) from a text prompt using GroundingDINO."""
77
- image = Image.open(image_path).convert("RGB")
78
-
79
- inputs = self.processor(images=image, text=text_prompt, return_tensors="pt").to(self.device)
80
-
81
- with torch.no_grad():
82
- outputs = self.grounding_model(**inputs)
83
-
84
- # Try with initial thresholds.
85
- results = self.processor.post_process_grounded_object_detection(
86
- outputs,
87
- inputs.input_ids,
88
- box_threshold=0.15,
89
- text_threshold=0.25,
90
- target_sizes=[image.size[::-1]],
91
- )
92
-
93
- boxes = results[0]["boxes"].cpu().numpy()
94
- scores = results[0]["scores"].cpu().numpy()
95
- labels = results[0].get("labels", None)
96
- if len(boxes) == 0:
97
- print(f"No boxes detected for prompt: '{text_prompt}'. Trying with lower thresholds...")
98
- results = self.processor.post_process_grounded_object_detection(
99
- outputs,
100
- inputs.input_ids,
101
- box_threshold=0.1,
102
- text_threshold=0.1,
103
- target_sizes=[image.size[::-1]],
104
- )
105
- boxes = results[0]["boxes"].cpu().numpy()
106
- scores = results[0]["scores"].cpu().numpy()
107
- labels = results[0].get("labels", None)
108
-
109
- if len(boxes) > 0:
110
- print(f"Found {len(boxes)} boxes with scores: {scores}")
111
- # Sort boxes by confidence score in descending order
112
- sorted_indices = np.argsort(scores)[::-1]
113
- boxes = boxes[sorted_indices]
114
- scores = scores[sorted_indices]
115
- if labels is not None:
116
- labels = np.array(labels)[sorted_indices]
117
- else:
118
- print("Still no boxes detected. Consider adjusting the prompt or using box/points mode.")
119
-
120
- return {"boxes": boxes, "labels": labels, "scores": scores}
121
-
122
- def visualize_frame(self, frame_idx, obj_ids, masks, video_dir, frame_names, visualization_data, save_dir=None):
123
- """
124
- Process a single frame: load the image, apply the segmentation mask to black out the
125
- detected object(s), and save both the masked frame and the binary mask image.
126
- """
127
- # Load the frame.
128
- frame_path = os.path.join(video_dir, frame_names[frame_idx])
129
- img = Image.open(frame_path).convert("RGB")
130
- image_np = np.array(img)
131
-
132
- # Combine masks from the detection output.
133
- if isinstance(masks, torch.Tensor):
134
- mask_np = (masks[0] > 0.0).cpu().numpy().astype(bool)
135
- combined_mask = mask_np
136
- elif isinstance(masks, dict):
137
- first_mask = next(iter(masks.values()))
138
- combined_mask = np.zeros_like(first_mask, dtype=bool)
139
- for m in masks.values():
140
- combined_mask |= m
141
- else:
142
- combined_mask = None
143
-
144
- if combined_mask is not None:
145
- combined_mask = np.squeeze(combined_mask)
146
-
147
- # If the mask shape doesn't match the image, resize it.
148
- if combined_mask.shape != image_np.shape[:2]:
149
- mask_img = Image.fromarray((combined_mask.astype(np.uint8)) * 255)
150
- mask_img = mask_img.resize((image_np.shape[1], image_np.shape[0]), resample=Image.NEAREST)
151
- combined_mask = np.array(mask_img) > 127
152
-
153
- # Black out the detected region.
154
- image_np[combined_mask] = 0
155
-
156
- mask_image = (combined_mask.astype(np.uint8)) * 255
157
- mask_pil = Image.fromarray(mask_image)
158
-
159
- if save_dir:
160
- seg_frame_path = os.path.join(save_dir, f"frame_{frame_idx}_segmented.png")
161
- seg_pil = Image.fromarray(image_np)
162
- seg_pil.save(seg_frame_path)
163
- if combined_mask is not None:
164
- mask_save_path = os.path.join(save_dir, f"frame_{frame_idx}_mask.png")
165
- mask_pil.save(mask_save_path)
166
-
167
- def sample(self, **kwargs):
168
- """
169
- Main sampling function for video segmentation.
170
- Returns a list of detections in which each detection contains a phrase and
171
- an RLE-encoded segmentation mask (matching the output of the Grounded SAM model).
172
- """
173
- video_dir = kwargs.get("video_dir", "")
174
- mode = kwargs.get("mode", "points")
175
- input_data = kwargs.get("input_data", None)
176
- save_dir = kwargs.get("save_dir", None)
177
- visualize = kwargs.get("visualize", False)
178
-
179
- # Get frame names (expecting frames named as numbers with .jpg/.jpeg extension).
180
- frame_names = [p for p in os.listdir(video_dir) if os.path.splitext(p)[-1].lower() in [".jpg", ".jpeg"]]
181
- frame_names.sort(key=lambda p: int(os.path.splitext(p)[0]))
182
-
183
- with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
184
- state = self.sam2_predictor.init_state(video_path=video_dir)
185
-
186
- ann_frame_idx = 0
187
- ann_obj_id = 1
188
- boxes = None
189
- points = None
190
- labels = None
191
- box = None
192
-
193
- visualization_data = {"mode": mode, "points": None, "labels": None, "box": None, "boxes": None}
194
-
195
- if input_data is not None:
196
- if mode == "points":
197
- points = input_data.get("points")
198
- labels = input_data.get("labels")
199
- frame_idx, obj_ids, masks = self.sam2_predictor.add_new_points_or_box(
200
- inference_state=state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, points=points, labels=labels
201
- )
202
- visualization_data["points"] = points
203
- visualization_data["labels"] = labels
204
- elif mode == "box":
205
- box = input_data.get("box")
206
- frame_idx, obj_ids, masks = self.sam2_predictor.add_new_points_or_box(
207
- inference_state=state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, box=box
208
- )
209
- visualization_data["box"] = box
210
- elif mode == "prompt":
211
- text = input_data.get("text")
212
- first_frame_path = os.path.join(video_dir, frame_names[0])
213
- gd_results = self.get_boxes_from_text(first_frame_path, text)
214
- boxes = gd_results["boxes"]
215
- labels_out = gd_results["labels"]
216
- scores = gd_results["scores"]
217
- log.info(f"scores: {scores}")
218
- if len(boxes) > 0:
219
- legacy_mask = kwargs.get("legacy_mask", False)
220
- if legacy_mask:
221
- # Use only the highest confidence box for legacy mask
222
- log.info(f"using legacy_mask: {legacy_mask}")
223
- frame_idx, obj_ids, masks = self.sam2_predictor.add_new_points_or_box(
224
- inference_state=state, frame_idx=ann_frame_idx, obj_id=ann_obj_id, box=boxes[0]
225
- )
226
- # Update boxes and labels after processing
227
- boxes = boxes[:1]
228
- if labels_out is not None:
229
- labels_out = labels_out[:1]
230
- else:
231
- log.info(f"using new_mask: {legacy_mask}")
232
- for object_id, (box, label) in enumerate(zip(boxes, labels_out)):
233
- frame_idx, obj_ids, masks = self.sam2_predictor.add_new_points_or_box(
234
- inference_state=state, frame_idx=ann_frame_idx, obj_id=object_id, box=box
235
- )
236
- visualization_data["boxes"] = boxes
237
- self.grounding_labels = [str(lbl) for lbl in labels_out] if labels_out is not None else [text]
238
- else:
239
- print("No boxes detected. Exiting.")
240
- return [] # Return empty list if no detections
241
-
242
- if visualize:
243
- self.visualize_frame(
244
- frame_idx=ann_frame_idx,
245
- obj_ids=obj_ids,
246
- masks=masks,
247
- video_dir=video_dir,
248
- frame_names=frame_names,
249
- visualization_data=visualization_data,
250
- save_dir=save_dir,
251
- )
252
-
253
- video_segments = {} # keys: frame index, values: {obj_id: mask}
254
- for out_frame_idx, out_obj_ids, out_mask_logits in self.sam2_predictor.propagate_in_video(state):
255
- video_segments[out_frame_idx] = {
256
- out_obj_id: (out_mask_logits[i] > 0.0).cpu().numpy() for i, out_obj_id in enumerate(out_obj_ids)
257
- }
258
-
259
- # For propagated frames, visualization_data is not used.
260
- if visualize:
261
- propagate_visualization_data = {
262
- "mode": mode,
263
- "points": None,
264
- "labels": None,
265
- "box": None,
266
- "boxes": None,
267
- }
268
- self.visualize_frame(
269
- frame_idx=out_frame_idx,
270
- obj_ids=out_obj_ids,
271
- masks=video_segments[out_frame_idx],
272
- video_dir=video_dir,
273
- frame_names=frame_names,
274
- visualization_data=propagate_visualization_data,
275
- save_dir=save_dir,
276
- )
277
-
278
- # --- Post-process video_segments to produce a list of detections ---
279
- if len(video_segments) == 0:
280
- return []
281
-
282
- first_frame_path = os.path.join(video_dir, frame_names[0])
283
- first_frame = np.array(Image.open(first_frame_path).convert("RGB"))
284
- original_shape = first_frame.shape[:2] # (height, width)
285
-
286
- object_masks = {} # key: obj_id, value: list of 2D boolean masks
287
- sorted_frame_indices = sorted(video_segments.keys())
288
- for frame_idx in sorted_frame_indices:
289
- segments = video_segments[frame_idx]
290
- for obj_id, mask in segments.items():
291
- mask = np.squeeze(mask)
292
- if mask.ndim != 2:
293
- print(f"Warning: Unexpected mask shape {mask.shape} for object {obj_id} in frame {frame_idx}.")
294
- continue
295
-
296
- if mask.shape != original_shape:
297
- mask_img = Image.fromarray(mask.astype(np.uint8) * 255)
298
- mask_img = mask_img.resize((original_shape[1], original_shape[0]), resample=Image.NEAREST)
299
- mask = np.array(mask_img) > 127
300
-
301
- if obj_id not in object_masks:
302
- object_masks[obj_id] = []
303
- object_masks[obj_id].append(mask)
304
-
305
- detections = []
306
- for obj_id, mask_list in object_masks.items():
307
- mask_stack = np.stack(mask_list, axis=0) # shape: (T, H, W)
308
- # Use our new rle_encode (which now follows the eff_segmentation.RleMaskSAMv2 format)
309
- rle = rle_encode(mask_stack)
310
- if mode == "prompt" and hasattr(self, "grounding_labels"):
311
- phrase = self.grounding_labels[0]
312
- else:
313
- phrase = input_data.get("text", "")
314
- detection = {"phrase": phrase, "segmentation_mask_rle": rle}
315
- detections.append(detection)
316
-
317
- return detections
318
-
319
- @staticmethod
320
- def parse_points(points_str):
321
- """Parse a string of points into a numpy array.
322
- Supports a single point ('200,300') or multiple points separated by ';' (e.g., '200,300;100,150').
323
- """
324
- points = []
325
- for point in points_str.split(";"):
326
- coords = point.split(",")
327
- if len(coords) != 2:
328
- continue
329
- points.append([float(coords[0]), float(coords[1])])
330
- return np.array(points, dtype=np.float32)
331
-
332
- @staticmethod
333
- def parse_labels(labels_str):
334
- """Parse a comma-separated string of labels into a numpy array."""
335
- return np.array([int(x) for x in labels_str.split(",")], dtype=np.int32)
336
-
337
- @staticmethod
338
- def parse_box(box_str):
339
- """Parse a comma-separated string of 4 box coordinates into a numpy array."""
340
- return np.array([float(x) for x in box_str.split(",")], dtype=np.float32)
341
-
342
- def __call__(
343
- self,
344
- input_video,
345
- output_video=None,
346
- output_tensor=None,
347
- prompt=None,
348
- box=None,
349
- points=None,
350
- labels=None,
351
- weight_scaler=None,
352
- binarize_video=False,
353
- legacy_mask=False,
354
- ):
355
- log.info(
356
- f"Processing video: {input_video} to generate segmentation video: {output_video} segmentation tensor: {output_tensor}"
357
- )
358
- assert os.path.exists(input_video)
359
-
360
- # Prepare input data based on the selected mode.
361
- if points is not None:
362
- mode = "points"
363
- input_data = {"points": self.parse_points(points), "labels": self.parse_labels(labels)}
364
- elif box is not None:
365
- mode = "box"
366
- input_data = {"box": self.parse_box(box)}
367
- elif prompt is not None:
368
- mode = "prompt"
369
- input_data = {"text": prompt}
370
-
371
- with tempfile.TemporaryDirectory() as temp_input_dir:
372
- fps = capture_fps(input_video)
373
- video_to_frames(input_video, temp_input_dir)
374
- with tempfile.TemporaryDirectory() as temp_output_dir:
375
- masks = self.sample(
376
- video_dir=temp_input_dir,
377
- mode=mode,
378
- input_data=input_data,
379
- save_dir=str(temp_output_dir),
380
- visualize=True,
381
- legacy_mask=legacy_mask,
382
- )
383
- if output_video:
384
- os.makedirs(os.path.dirname(output_video), exist_ok=True)
385
- frames = convert_masks_to_frames(masks)
386
- if binarize_video:
387
- frames = np.any(frames > 0, axis=-1).astype(np.uint8) * 255
388
- write_video(frames, output_video, fps)
389
- if output_tensor:
390
- generate_tensor_from_images(
391
- temp_output_dir, output_tensor, fps, "mask", weight_scaler=weight_scaler
392
- )
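
Usage sketch for the deleted `VideoSegmentationModel` (untested; paths and the prompt are placeholders), mirroring how `sam2_pipeline.py` below drives it:

```python
# Sketch (untested): text-prompted video segmentation via the __call__ interface above.
from cosmos_transfer1.auxiliary.sam2.sam2_model import VideoSegmentationModel

model = VideoSegmentationModel()
model(
    input_video="input.mp4",
    output_video="outputs/segmentation.mp4",  # rendered mask video
    output_tensor="outputs/segmentation.pt",  # mask tensor written via generate_tensor_from_images
    prompt="robot arm",
)
```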
cosmos-transfer1/cosmos_transfer1/auxiliary/sam2/sam2_pipeline.py DELETED
@@ -1,126 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
- # SPDX-License-Identifier: Apache-2.0
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import argparse
17
- import tempfile
18
-
19
- import numpy as np
20
-
21
- from cosmos_transfer1.auxiliary.sam2.sam2_model import VideoSegmentationModel
22
- from cosmos_transfer1.auxiliary.sam2.sam2_utils import (
23
- capture_fps,
24
- generate_tensor_from_images,
25
- generate_video_from_images,
26
- video_to_frames,
27
- )
28
-
29
-
30
- def parse_args():
31
- parser = argparse.ArgumentParser(description="Video Segmentation using SAM2")
32
- parser.add_argument("--input_video", type=str, required=True, help="Path to input video file")
33
- parser.add_argument(
34
- "--output_video", type=str, default="./outputs/output_video.mp4", help="Path to save the output video"
35
- )
36
- parser.add_argument(
37
- "--output_tensor", type=str, default="./outputs/output_tensor.pt", help="Path to save the output tensor"
38
- )
39
- parser.add_argument(
40
- "--mode", type=str, choices=["points", "box", "prompt"], default="points", help="Segmentation mode"
41
- )
42
- parser.add_argument("--prompt", type=str, help="Text prompt for prompt mode")
43
- parser.add_argument(
44
- "--grounding_model_path",
45
- type=str,
46
- default="IDEA-Research/grounding-dino-tiny",
47
- help="Local directory for GroundingDINO model files",
48
- )
49
- parser.add_argument(
50
- "--points",
51
- type=str,
52
- default="200,300",
53
- help="Comma-separated point coordinates for points mode (e.g., '200,300' or for multiple points use ';' as a separator, e.g., '200,300;100,150').",
54
- )
55
- parser.add_argument(
56
- "--labels",
57
- type=str,
58
- default="1",
59
- help="Comma-separated labels for points mode (e.g., '1' or '1,0' for multiple points).",
60
- )
61
- parser.add_argument(
62
- "--box",
63
- type=str,
64
- default="300,0,500,400",
65
- help="Comma-separated box coordinates for box mode (e.g., '300,0,500,400').",
66
- )
67
- # New flag to control visualization.
68
- parser.add_argument("--visualize", action="store_true", help="If set, visualize segmentation frames (save images)")
69
- return parser.parse_args()
70
-
71
-
72
- def parse_points(points_str):
73
- """Parse a string of points into a numpy array.
74
- Supports a single point ('200,300') or multiple points separated by ';' (e.g., '200,300;100,150').
75
- """
76
- points = []
77
- for point in points_str.split(";"):
78
- coords = point.split(",")
79
- if len(coords) != 2:
80
- continue
81
- points.append([float(coords[0]), float(coords[1])])
82
- return np.array(points, dtype=np.float32)
83
-
84
-
85
- def parse_labels(labels_str):
86
- """Parse a comma-separated string of labels into a numpy array."""
87
- return np.array([int(x) for x in labels_str.split(",")], dtype=np.int32)
88
-
89
-
90
- def parse_box(box_str):
91
- """Parse a comma-separated string of 4 box coordinates into a numpy array."""
92
- return np.array([float(x) for x in box_str.split(",")], dtype=np.float32)
93
-
94
-
95
- def main():
96
- args = parse_args()
97
-
98
- # Initialize the segmentation model.
99
- model = VideoSegmentationModel(**vars(args))
100
-
101
- # Prepare input data based on the selected mode.
102
- if args.mode == "points":
103
- input_data = {"points": parse_points(args.points), "labels": parse_labels(args.labels)}
104
- elif args.mode == "box":
105
- input_data = {"box": parse_box(args.box)}
106
- elif args.mode == "prompt":
107
- input_data = {"text": args.prompt}
108
-
109
- with tempfile.TemporaryDirectory() as temp_input_dir:
110
- fps = capture_fps(args.input_video)
111
- video_to_frames(args.input_video, temp_input_dir)
112
- with tempfile.TemporaryDirectory() as temp_output_dir:
113
- model.sample(
114
- video_dir=temp_input_dir,
115
- mode=args.mode,
116
- input_data=input_data,
117
- save_dir=str(temp_output_dir),
118
- visualize=True,
119
- )
120
- generate_video_from_images(temp_output_dir, args.output_video, fps)
121
- generate_tensor_from_images(temp_output_dir, args.output_tensor, fps, "mask")
122
-
123
-
124
- if __name__ == "__main__":
125
- print("Starting video segmentation...")
126
- main()
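
For reference, a minimal, self-contained sketch of the coordinate formats this CLI accepted (assuming only numpy; it mirrors the parse_points/parse_labels rules above, with labels following the usual SAM convention of 1 = foreground, 0 = background):

    import numpy as np

    def parse_points(points_str):
        # ';' separates points, ',' separates the x,y coordinates of one point.
        points = []
        for point in points_str.split(";"):
            coords = point.split(",")
            if len(coords) == 2:
                points.append([float(coords[0]), float(coords[1])])
        return np.array(points, dtype=np.float32)

    print(parse_points("200,300;100,150"))             # two prompt points, shape (2, 2)
    print(np.array("1,0".split(","), dtype=np.int32))   # labels for those points: [1 0]
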
cosmos-transfer1/cosmos_transfer1/auxiliary/sam2/sam2_utils.py DELETED
@@ -1,168 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import os
- import tempfile
- import time
-
- import cv2
- import imageio
- import numpy as np
- import pycocotools.mask
- import torch
- from natsort import natsorted
- from PIL import Image
- from torchvision import transforms
-
- from cosmos_transfer1.diffusion.datasets.augmentors.control_input import (
-     decode_partial_rle_width1,
-     segmentation_color_mask,
- )
- from cosmos_transfer1.utils import log
-
-
- def write_video(frames, output_path, fps=30):
-     """
-     Expects a sequence of [H, W, 3] or [H, W] frames.
-     """
-     with imageio.get_writer(output_path, fps=fps, macro_block_size=8) as writer:
-         for frame in frames:
-             if len(frame.shape) == 2:  # single channel
-                 frame = frame[:, :, None].repeat(3, axis=2)
-             writer.append_data(frame)
-
-
- def capture_fps(input_video_path: str):
-     cap = cv2.VideoCapture(input_video_path)
-     fps = cap.get(cv2.CAP_PROP_FPS)
-     return fps
-
-
- def video_to_frames(input_loc, output_loc):
-     """Extracts frames from an input video file and saves them as separate images in an output directory.
-     Args:
-         input_loc: Input video file.
-         output_loc: Output directory to save the frames.
-     Returns:
-         None
-     """
-     os.makedirs(output_loc, exist_ok=True)
-     # Log the time
-     time_start = time.time()
-     # Start capturing the feed
-     cap = cv2.VideoCapture(input_loc)
-     # Find the number of frames
-     video_length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
-     print(f"Number of frames: {video_length}")
-     count = 0
-     print("Converting video...\n")
-     # Start converting the video
-     while cap.isOpened():
-         # Extract the frame
-         ret, frame = cap.read()
-         if not ret:
-             # Stop instead of spinning forever when the stream ends early or a read fails.
-             break
-         # Write the results back to the output location.
-         cv2.imwrite(output_loc + "/%#05d.jpg" % (count + 1), frame)
-         count = count + 1
-         # If there are no more frames left
-         if count > (video_length - 1):
-             # Log the time again
-             time_end = time.time()
-             # Release the feed
-             cap.release()
-             # Print stats
-             print("Done extracting frames.\n%d frames extracted" % count)
-             print("It took %d seconds for conversion." % (time_end - time_start))
-             break
-
-
- def convert_masks_to_frames(masks: list, num_masks_max: int = 100):
-     T, H, W = shape = masks[0]["segmentation_mask_rle"]["mask_shape"]
-     frame_start, frame_end = 0, T
-     num_masks = min(num_masks_max, len(masks))
-     mask_ids_select = np.arange(num_masks).tolist()
-
-     all_masks = np.zeros((num_masks, T, H, W), dtype=np.uint8)
-     for idx, mid in enumerate(mask_ids_select):
-         mask = masks[mid]
-         num_byte_per_mb = 1024 * 1024
-         # Decode partially when the full uint8 mask volume would exceed 256 MB.
-         if shape[0] * shape[1] * shape[2] / num_byte_per_mb > 256:
-             rle = decode_partial_rle_width1(
-                 mask["segmentation_mask_rle"]["data"],
-                 frame_start * shape[1] * shape[2],
-                 frame_end * shape[1] * shape[2],
-             )
-             partial_shape = (frame_end - frame_start, shape[1], shape[2])
-             rle = rle.reshape(partial_shape) * 255
-         else:
-             rle = pycocotools.mask.decode(mask["segmentation_mask_rle"]["data"])
-             rle = rle.reshape(shape) * 255
-         # Select the frames that are in the video
-         frame_indices = np.arange(frame_start, frame_end).tolist()
-         rle = np.stack([rle[i] for i in frame_indices])
-         all_masks[idx] = rle
-         del rle
-
-     all_masks = segmentation_color_mask(all_masks)  # NTHW -> 3THW
-     all_masks = all_masks.transpose(1, 2, 3, 0)
-     return all_masks
-
-
- def generate_video_from_images(masks: list, output_file_path: str, fps, num_masks_max: int = 100):
-     all_masks = convert_masks_to_frames(masks, num_masks_max)
-     write_video(all_masks, output_file_path, fps)
-     print("Video generated successfully!")
-
-
- def generate_tensor_from_images(
-     image_path_str: str, output_file_path: str, fps, search_pattern: str = None, weight_scaler: float = None
- ):
-     image_path = os.path.abspath(image_path_str)
-     if search_pattern is None:
-         images = list(natsorted(os.listdir(image_path)))
-     else:
-         images = [img for img in natsorted(os.listdir(image_path)) if search_pattern in img]
-
-     transform = transforms.ToTensor()
-     image_tensors = list()
-     for image in images:
-         img_tensor = transform(Image.open(os.path.join(image_path, image)))
-         image_tensors.append(img_tensor.squeeze(0))
-
-     tensor = torch.stack(image_tensors)  # [T, H, W], binary values, float
-
-     if weight_scaler is not None:
-         log.info(f"scaling the tensor by the specified scale: {weight_scaler}")
-         tensor = tensor * weight_scaler
-
-     log.info(f"saving tensor shape: {tensor.shape} to {output_file_path}")
-     torch.save(tensor, output_file_path)
-
-
- if __name__ == "__main__":
-     input_loc = "cosmos_transfer1/models/sam2/assets/input_video.mp4"
-     # mkdtemp() keeps the directory around; TemporaryDirectory().name would be
-     # deleted as soon as the object is garbage collected.
-     output_loc = tempfile.mkdtemp()
-     print(f"output_loc --- {output_loc}")
-     video_to_frames(input_loc, output_loc)
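
As a small self-contained illustration of the frame handling in write_video above (the output path is an arbitrary example): single-channel [H, W] mask frames are promoted to three channels before being handed to the encoder.

    import imageio
    import numpy as np

    # An 8-frame synthetic clip of [H, W] grayscale masks.
    frames = [np.full((64, 64), 255 * t // 7, dtype=np.uint8) for t in range(8)]
    with imageio.get_writer("/tmp/mask_preview.mp4", fps=4, macro_block_size=8) as writer:
        for frame in frames:
            writer.append_data(frame[:, :, None].repeat(3, axis=2))  # [H, W] -> [H, W, 3]
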
cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/__init__.py DELETED
@@ -1,14 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/image_cli.py DELETED
@@ -1,188 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """A CLI to run ImageTokenizer on plain images based on torch.jit.
-
- Usage:
-     python3 -m cosmos_transfer1.auxiliary.tokenizer.inference.image_cli \
-         --image_pattern 'path/to/input/folder/*.jpg' \
-         --output_dir ./reconstructions \
-         --checkpoint_enc ./checkpoints/<model-name>/encoder.jit \
-         --checkpoint_dec ./checkpoints/<model-name>/decoder.jit
-
-     Optionally, you can run the model in pure PyTorch mode:
-     python3 -m cosmos_transfer1.auxiliary.tokenizer.inference.image_cli \
-         --image_pattern 'path/to/input/folder/*.jpg' \
-         --mode torch \
-         --tokenizer_type CI \
-         --spatial_compression 8 \
-         --checkpoint_enc ./checkpoints/<model-name>/encoder.jit \
-         --checkpoint_dec ./checkpoints/<model-name>/decoder.jit
- """
-
- import os
- import sys
- from argparse import ArgumentParser, Namespace
-
- import numpy as np
- from loguru import logger as logging
-
- from cosmos_transfer1.auxiliary.tokenizer.inference.image_lib import ImageTokenizer
- from cosmos_transfer1.auxiliary.tokenizer.inference.utils import (
-     get_filepaths,
-     get_output_filepath,
-     read_image,
-     resize_image,
-     write_image,
- )
- from cosmos_transfer1.auxiliary.tokenizer.networks import TokenizerConfigs
-
-
- def _parse_args() -> Namespace:
-     parser = ArgumentParser(description="A CLI for running ImageTokenizer on plain images.")
-     parser.add_argument(
-         "--image_pattern",
-         type=str,
-         default="path/to/images/*.jpg",
-         help="Glob pattern.",
-     )
-     parser.add_argument(
-         "--checkpoint",
-         type=str,
-         default=None,
-         help="JIT full Autoencoder model filepath.",
-     )
-     parser.add_argument(
-         "--checkpoint_enc",
-         type=str,
-         default=None,
-         help="JIT Encoder model filepath.",
-     )
-     parser.add_argument(
-         "--checkpoint_dec",
-         type=str,
-         default=None,
-         help="JIT Decoder model filepath.",
-     )
-     parser.add_argument(
-         "--tokenizer_type",
-         type=str,
-         choices=["CI", "DI"],
-         help="Specifies the tokenizer type.",
-     )
-     parser.add_argument(
-         "--spatial_compression",
-         type=int,
-         choices=[8, 16],
-         default=8,
-         help="The spatial compression factor.",
-     )
-     parser.add_argument(
-         "--mode",
-         type=str,
-         choices=["torch", "jit"],
-         default="jit",
-         help="Specify the backend: native 'torch' or 'jit' (default: 'jit')",
-     )
-     parser.add_argument(
-         "--short_size",
-         type=int,
-         default=None,
-         help="The size to resample inputs. None, by default.",
-     )
-     parser.add_argument(
-         "--dtype",
-         type=str,
-         default="bfloat16",
-         help="Sets the precision. Default bfloat16.",
-     )
-     parser.add_argument(
-         "--device",
-         type=str,
-         default="cuda",
-         help="Device for invoking the model.",
-     )
-     parser.add_argument("--output_dir", type=str, default=None, help="Output directory.")
-     parser.add_argument(
-         "--save_input",
-         action="store_true",
-         help="If on, the input image will be output too.",
-     )
-     args = parser.parse_args()
-     return args
-
-
- logging.info("Initializing args ...")
- args = _parse_args()
- if args.mode == "torch" and args.tokenizer_type not in ["CI", "DI"]:
-     logging.error("'torch' backend requires the tokenizer_type of 'CI' or 'DI'.")
-     sys.exit(1)
-
-
- def _run_eval() -> None:
-     """Invokes the evaluation pipeline."""
-
-     if args.checkpoint_enc is None and args.checkpoint_dec is None and args.checkpoint is None:
-         logging.warning("Aborting. Provide either both encoder and decoder JIT models, or the full autoencoder JIT model.")
-         return
-
-     if args.mode == "torch":
-         tokenizer_config = TokenizerConfigs[args.tokenizer_type].value
-         tokenizer_config.update(dict(spatial_compression=args.spatial_compression))
-     else:
-         tokenizer_config = None
-
-     logging.info(
-         f"Loading a torch.jit model `{os.path.dirname(args.checkpoint or args.checkpoint_enc or args.checkpoint_dec)}` ..."
-     )
-     autoencoder = ImageTokenizer(
-         checkpoint=args.checkpoint,
-         checkpoint_enc=args.checkpoint_enc,
-         checkpoint_dec=args.checkpoint_dec,
-         tokenizer_config=tokenizer_config,
-         device=args.device,
-         dtype=args.dtype,
-     )
-
-     filepaths = get_filepaths(args.image_pattern)
-     logging.info(f"Found {len(filepaths)} images from {args.image_pattern}.")
-
-     for filepath in filepaths:
-         logging.info(f"Reading image {filepath} ...")
-         image = read_image(filepath)
-         image = resize_image(image, short_size=args.short_size)
-         batch_image = np.expand_dims(image, axis=0)
-
-         logging.info("Invoking the autoencoder model ...")
-         output_image = autoencoder(batch_image)[0]
-
-         output_filepath = get_output_filepath(filepath, output_dir=args.output_dir)
-         logging.info(f"Outputting {output_filepath} ...")
-         write_image(output_filepath, output_image)
-
-         if args.save_input:
-             ext = os.path.splitext(output_filepath)[-1]
-             input_filepath = output_filepath.replace(ext, "_input" + ext)
-             write_image(input_filepath, image)
-
-
- @logging.catch(reraise=True)
- def main() -> None:
-     _run_eval()
-
-
- if __name__ == "__main__":
-     main()
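
A quick sanity check of what --short_size does (plain arithmetic, mirroring the rounding rule in resize_image from the inference utils): the short side becomes short_size, the long side scales proportionally, and odd results are bumped to the next even number.

    h, w, short = 480, 640, 512                     # a 480x640 input with --short_size 512
    h_new, w_new = short, int(w * short / h + 0.5)  # long side: 640 * 512 / 480 -> 683
    w_new += w_new % 2                              # keep dimensions even: 683 -> 684
    print(h_new, w_new)                             # 512 684
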
cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/image_lib.py DELETED
@@ -1,124 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """A library for image tokenizers inference."""
-
- from typing import Any
-
- import numpy as np
- import torch
-
- from cosmos_transfer1.auxiliary.tokenizer.inference.utils import (
-     load_decoder_model,
-     load_encoder_model,
-     load_model,
-     numpy2tensor,
-     pad_image_batch,
-     tensor2numpy,
-     unpad_image_batch,
- )
-
-
- class ImageTokenizer(torch.nn.Module):
-     def __init__(
-         self,
-         checkpoint: str = None,
-         checkpoint_enc: str = None,
-         checkpoint_dec: str = None,
-         tokenizer_config: dict[str, Any] = None,
-         device: str = "cuda",
-         dtype: str = "bfloat16",
-     ) -> None:
-         super().__init__()
-         self._device = device
-         self._dtype = getattr(torch, dtype)
-         self._full_model = (
-             load_model(checkpoint, tokenizer_config, device).to(self._dtype) if checkpoint is not None else None
-         )
-         self._enc_model = (
-             load_encoder_model(checkpoint_enc, tokenizer_config, device).to(self._dtype)
-             if checkpoint_enc is not None
-             else None
-         )
-         self._dec_model = (
-             load_decoder_model(checkpoint_dec, tokenizer_config, device).to(self._dtype)
-             if checkpoint_dec is not None
-             else None
-         )
-
-     @torch.no_grad()
-     def autoencode(self, input_tensor: torch.Tensor) -> torch.Tensor:
-         """Reconstructs a batch of image tensors after embedding into a latent.
-
-         Args:
-             input_tensor: The input image, Bx3xHxW layout, range [-1..1].
-         Returns:
-             The reconstructed tensor, layout Bx3xHxW, range [-1..1].
-         """
-         if self._full_model is not None:
-             output_tensor = self._full_model(input_tensor)
-             output_tensor = output_tensor[0] if isinstance(output_tensor, tuple) else output_tensor
-         else:
-             output_latent = self.encode(input_tensor)[0]
-             output_tensor = self.decode(output_latent)
-         return output_tensor
-
-     @torch.no_grad()
-     def decode(self, input_latent: torch.Tensor) -> torch.Tensor:
-         """Decodes an image from a provided latent embedding.
-
-         Args:
-             input_latent: The continuous latent Bx16xhxw for CI,
-                 or the discrete indices Bxhxw for DI.
-         Returns:
-             The output tensor in Bx3xHxW, range [-1..1].
-         """
-         return self._dec_model(input_latent)
-
-     @torch.no_grad()
-     def encode(self, input_tensor: torch.Tensor) -> torch.Tensor | tuple[torch.Tensor, ...]:
-         """Encodes an image into a latent embedding or code.
-
-         Args:
-             input_tensor: The input tensor, Bx3xHxW layout, range [-1..1].
-         Returns:
-             For the continuous image (CI) tokenizer, the tuple contains:
-                 - The latent embedding, Bx16x(h)x(w), where the compression
-                   rate is (H/h x W/w), and channel dimension of 16.
-             For the discrete image (DI) tokenizer, the tuple contains:
-                 - The indices, Bx(h)x(w), from a codebook of size 64K, which
-                   corresponds to FSQ levels of (8,8,8,5,5,5).
-                 - The discrete code, Bx6x(h)x(w), where the compression rate is
-                   again (H/h x W/w), and channel dimension of 6.
-         """
-         output_latent = self._enc_model(input_tensor)
-         if isinstance(output_latent, torch.Tensor):
-             return output_latent
-         return output_latent[:-1]
-
-     @torch.no_grad()
-     def forward(self, image: np.ndarray) -> np.ndarray:
-         """Reconstructs an image using a pre-trained tokenizer.
-
-         Args:
-             image: The input image, BxHxWxC layout, range [0..255].
-         Returns:
-             The reconstructed image in range [0..255], layout BxHxWxC.
-         """
-         padded_input_image, crop_region = pad_image_batch(image)
-         input_tensor = numpy2tensor(padded_input_image, dtype=self._dtype, device=self._device)
-         output_tensor = self.autoencode(input_tensor)
-         padded_output_image = tensor2numpy(output_tensor)
-         return unpad_image_batch(padded_output_image, crop_region)
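
For context, a minimal usage sketch of this class (the checkpoint paths follow the <model-name> placeholder convention from the CLI docstrings; illustrative, not a tested invocation):

    import numpy as np
    from cosmos_transfer1.auxiliary.tokenizer.inference.image_lib import ImageTokenizer

    tokenizer = ImageTokenizer(
        checkpoint_enc="./checkpoints/<model-name>/encoder.jit",
        checkpoint_dec="./checkpoints/<model-name>/decoder.jit",
    )
    batch = np.random.randint(0, 256, size=(1, 512, 512, 3), dtype=np.uint8)  # BxHxWxC, [0..255]
    reconstruction = tokenizer(batch)  # forward() pads, encodes, decodes, and unpads
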
cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/utils.py DELETED
@@ -1,402 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Utility functions for the inference libraries."""
-
- import os
- from glob import glob
- from typing import Any
-
- import mediapy as media
- import numpy as np
- import torch
-
- from cosmos_transfer1.auxiliary.tokenizer.networks import TokenizerModels
-
- _DTYPE, _DEVICE = torch.bfloat16, "cuda"
- _UINT8_MAX_F = float(torch.iinfo(torch.uint8).max)
- _SPATIAL_ALIGN = 16
- _TEMPORAL_ALIGN = 8
-
-
- def load_model(
-     jit_filepath: str = None,
-     tokenizer_config: dict[str, Any] = None,
-     device: str = "cuda",
- ) -> torch.nn.Module | torch.jit.ScriptModule:
-     """Loads a full tokenizer model from a filepath.
-
-     Args:
-         jit_filepath: The filepath to the JIT-compiled model.
-         tokenizer_config: The tokenizer config; if None, the file is loaded as a plain JIT model.
-         device: The device to load the model onto, default=cuda.
-     Returns:
-         The model loaded to device, in eval mode.
-     """
-     if tokenizer_config is None:
-         return load_jit_model(jit_filepath, device)
-     full_model, ckpts = _load_pytorch_model(jit_filepath, tokenizer_config, device)
-     full_model.load_state_dict(ckpts.state_dict(), strict=False)
-     return full_model.eval().to(device)
-
-
- def load_encoder_model(
-     jit_filepath: str = None,
-     tokenizer_config: dict[str, Any] = None,
-     device: str = "cuda",
- ) -> torch.nn.Module | torch.jit.ScriptModule:
-     """Loads the encoder-only model from a filepath.
-
-     Args:
-         jit_filepath: The filepath to the JIT-compiled model.
-         tokenizer_config: The tokenizer config; if None, the file is loaded as a plain JIT model.
-         device: The device to load the model onto, default=cuda.
-     Returns:
-         The encoder model loaded to device, in eval mode.
-     """
-     if tokenizer_config is None:
-         return load_jit_model(jit_filepath, device)
-     full_model, ckpts = _load_pytorch_model(jit_filepath, tokenizer_config, device)
-     encoder_model = full_model.encoder_jit()
-     encoder_model.load_state_dict(ckpts.state_dict(), strict=False)
-     return encoder_model.eval().to(device)
-
-
- def load_decoder_model(
-     jit_filepath: str = None,
-     tokenizer_config: dict[str, Any] = None,
-     device: str = "cuda",
- ) -> torch.nn.Module | torch.jit.ScriptModule:
-     """Loads the decoder-only model from a filepath.
-
-     Args:
-         jit_filepath: The filepath to the JIT-compiled model.
-         tokenizer_config: The tokenizer config; if None, the file is loaded as a plain JIT model.
-         device: The device to load the model onto, default=cuda.
-     Returns:
-         The decoder model loaded to device, in eval mode.
-     """
-     if tokenizer_config is None:
-         return load_jit_model(jit_filepath, device)
-     full_model, ckpts = _load_pytorch_model(jit_filepath, tokenizer_config, device)
-     decoder_model = full_model.decoder_jit()
-     decoder_model.load_state_dict(ckpts.state_dict(), strict=False)
-     return decoder_model.eval().to(device)
-
-
- def _load_pytorch_model(
-     jit_filepath: str = None, tokenizer_config: dict[str, Any] = None, device: str = "cuda"
- ) -> tuple[torch.nn.Module, torch.jit.ScriptModule]:
-     """Instantiates a native PyTorch model and loads the matching JIT checkpoint.
-
-     Args:
-         jit_filepath: The filepath to the JIT-compiled model.
-         tokenizer_config: The tokenizer config, including the model name.
-         device: The device to load the checkpoint onto, default=cuda.
-     Returns:
-         The (uninitialized) PyTorch model and the JIT module holding the weights.
-     """
-     tokenizer_name = tokenizer_config["name"]
-     model = TokenizerModels[tokenizer_name].value(**tokenizer_config)
-     ckpts = torch.jit.load(jit_filepath, map_location=device)
-     return model, ckpts
-
-
- def load_jit_model(jit_filepath: str = None, device: str = "cuda") -> torch.jit.ScriptModule:
-     """Loads a torch.jit.ScriptModule from a filepath.
-
-     Args:
-         jit_filepath: The filepath to the JIT-compiled model.
-         device: The device to load the model onto, default=cuda.
-     Returns:
-         The JIT compiled model loaded to device, in eval mode.
-     """
-     model = torch.jit.load(jit_filepath, map_location=device)
-     return model.eval().to(device)
-
-
- def save_jit_model(
-     model: torch.jit.ScriptModule | torch.jit.RecursiveScriptModule = None,
-     jit_filepath: str = None,
- ) -> None:
-     """Saves a torch.jit.ScriptModule or torch.jit.RecursiveScriptModule to file.
-
-     Args:
-         model: JIT compiled model loaded onto `config.checkpoint.jit.device`.
-         jit_filepath: The filepath to the JIT-compiled model.
-     """
-     torch.jit.save(model, jit_filepath)
-
-
- def get_filepaths(input_pattern) -> list[str]:
-     """Returns a sorted list of unique filepaths matching a glob pattern."""
-     filepaths = glob(str(input_pattern))
-     # Deduplicate first, then sort, so the result is deterministic.
-     return sorted(set(filepaths))
-
-
- def get_output_filepath(filepath: str, output_dir: str = None) -> str:
-     """Returns the output filepath for the given input filepath."""
-     output_dir = output_dir or f"{os.path.dirname(filepath)}/reconstructions"
-     output_filepath = f"{output_dir}/{os.path.basename(filepath)}"
-     os.makedirs(output_dir, exist_ok=True)
-     return output_filepath
-
-
- def read_image(filepath: str) -> np.ndarray:
-     """Reads an image from a filepath.
-
-     Args:
-         filepath: The filepath to the image.
-
-     Returns:
-         The image as a numpy array, layout HxWxC, range [0..255], uint8 dtype.
-     """
-     image = media.read_image(filepath)
-     # Convert a grayscale image to RGB, since our tokenizers always assume a 3-channel RGB image.
-     if image.ndim == 2:
-         image = np.stack([image] * 3, axis=-1)
-     # Convert RGBA to RGB.
-     if image.shape[-1] == 4:
-         image = image[..., :3]
-     return image
-
-
- def read_video(filepath: str) -> np.ndarray:
-     """Reads a video from a filepath.
-
-     Args:
-         filepath: The filepath to the video.
-     Returns:
-         The video as a numpy array, layout TxHxWxC, range [0..255], uint8 dtype.
-     """
-     video = media.read_video(filepath)
-     # Convert grayscale frames to RGB, since our tokenizers always assume 3-channel video.
-     if video.ndim == 3:
-         video = np.stack([video] * 3, axis=-1)
-     # Convert RGBA to RGB.
-     if video.shape[-1] == 4:
-         video = video[..., :3]
-     return video
-
-
- def resize_image(image: np.ndarray, short_size: int = None) -> np.ndarray:
-     """Resizes an image so that its short side equals `short_size`.
-
-     Args:
-         image: The image to resize, layout HxWxC, of any range.
-         short_size: The target size of the short side.
-     Returns:
-         The resized image.
-     """
-     if short_size is None:
-         return image
-     height, width = image.shape[-3:-1]
-     if height <= width:
-         height_new, width_new = short_size, int(width * short_size / height + 0.5)
-         width_new = width_new if width_new % 2 == 0 else width_new + 1
-     else:
-         height_new, width_new = (
-             int(height * short_size / width + 0.5),
-             short_size,
-         )
-         height_new = height_new if height_new % 2 == 0 else height_new + 1
-     return media.resize_image(image, shape=(height_new, width_new))
-
-
- def resize_video(video: np.ndarray, short_size: int = None) -> np.ndarray:
-     """Resizes a video so that its short side equals `short_size`.
-
-     Args:
-         video: The video to resize, layout TxHxWxC, of any range.
-         short_size: The target size of the short side.
-     Returns:
-         The resized video.
-     """
-     if short_size is None:
-         return video
-     height, width = video.shape[-3:-1]
-     if height <= width:
-         height_new, width_new = short_size, int(width * short_size / height + 0.5)
-         width_new = width_new if width_new % 2 == 0 else width_new + 1
-     else:
-         height_new, width_new = (
-             int(height * short_size / width + 0.5),
-             short_size,
-         )
-         height_new = height_new if height_new % 2 == 0 else height_new + 1
-     return media.resize_video(video, shape=(height_new, width_new))
-
-
- def write_image(filepath: str, image: np.ndarray):
-     """Writes an image to a filepath."""
-     return media.write_image(filepath, image)
-
-
- def write_video(filepath: str, video: np.ndarray, fps: int = 24) -> None:
-     """Writes a video to a filepath."""
-     return media.write_video(filepath, video, fps=fps)
-
-
- def numpy2tensor(
-     input_image: np.ndarray,
-     dtype: torch.dtype = _DTYPE,
-     device: str = _DEVICE,
-     range_min: int = -1,
- ) -> torch.Tensor:
-     """Converts a uint8 image in range [0..255] to a `dtype` tensor in range [-1..1].
-
-     Args:
-         input_image: A batch of images in range [0..255], BxHxWx3 layout.
-     Returns:
-         A torch.Tensor of layout Bx3xHxW in range [-1..1], of `dtype`.
-     """
-     ndim = input_image.ndim
-     indices = list(range(1, ndim))[-1:] + list(range(1, ndim))[:-1]
-     image = input_image.transpose((0,) + tuple(indices)) / _UINT8_MAX_F
-     if range_min == -1:
-         image = 2.0 * image - 1.0
-     return torch.from_numpy(image).to(dtype).to(device)
-
-
- def tensor2numpy(input_tensor: torch.Tensor, range_min: int = -1) -> np.ndarray:
-     """Converts a tensor in [-1..1] to a uint8 image in range [0..255].
-
-     Args:
-         input_tensor: Input image tensor of Bx3xHxW layout, range [-1..1].
-     Returns:
-         A numpy image of layout BxHxWx3, range [0..255], uint8 dtype.
-     """
-     if range_min == -1:
-         input_tensor = (input_tensor.float() + 1.0) / 2.0
-     ndim = input_tensor.ndim
-     output_image = input_tensor.clamp(0, 1).cpu().numpy()
-     output_image = output_image.transpose((0,) + tuple(range(2, ndim)) + (1,))
-     return (output_image * _UINT8_MAX_F + 0.5).astype(np.uint8)
-
-
- def pad_image_batch(batch: np.ndarray, spatial_align: int = _SPATIAL_ALIGN) -> tuple[np.ndarray, list[int]]:
-     """Pads a batch of images to be divisible by `spatial_align`.
-
-     Args:
-         batch: The batch of images to pad, layout BxHxWx3, in any range.
-         spatial_align: The spatial alignment to pad to.
-     Returns:
-         The padded batch and the crop region.
-     """
-     height, width = batch.shape[1:3]
-     align = spatial_align
-     height_to_pad = (align - height % align) if height % align != 0 else 0
-     width_to_pad = (align - width % align) if width % align != 0 else 0
-
-     crop_region = [
-         height_to_pad >> 1,
-         width_to_pad >> 1,
-         height + (height_to_pad >> 1),
-         width + (width_to_pad >> 1),
-     ]
-     batch = np.pad(
-         batch,
-         (
-             (0, 0),
-             (height_to_pad >> 1, height_to_pad - (height_to_pad >> 1)),
-             (width_to_pad >> 1, width_to_pad - (width_to_pad >> 1)),
-             (0, 0),
-         ),
-         mode="constant",
-     )
-     return batch, crop_region
-
-
- def pad_video_batch(
-     batch: np.ndarray,
-     temporal_align: int = _TEMPORAL_ALIGN,
-     spatial_align: int = _SPATIAL_ALIGN,
- ) -> tuple[np.ndarray, list[int]]:
-     """Pads a batch of videos to be divisible by `temporal_align` and `spatial_align`.
-
-     Zero-pads spatially; edge-pads (replicates) temporally to better respect causality.
-     Args:
-         batch: The batch of videos to pad, layout BxFxHxWx3, in any range.
-         temporal_align: The temporal alignment to pad to.
-         spatial_align: The spatial alignment to pad to.
-     Returns:
-         The padded batch and the crop region.
-     """
-     num_frames, height, width = batch.shape[-4:-1]
-     align = spatial_align
-     height_to_pad = (align - height % align) if height % align != 0 else 0
-     width_to_pad = (align - width % align) if width % align != 0 else 0
-
-     align = temporal_align
-     frames_to_pad = (align - (num_frames - 1) % align) if (num_frames - 1) % align != 0 else 0
-
-     crop_region = [
-         frames_to_pad >> 1,
-         height_to_pad >> 1,
-         width_to_pad >> 1,
-         num_frames + (frames_to_pad >> 1),
-         height + (height_to_pad >> 1),
-         width + (width_to_pad >> 1),
-     ]
-     batch = np.pad(
-         batch,
-         (
-             (0, 0),
-             (0, 0),
-             (height_to_pad >> 1, height_to_pad - (height_to_pad >> 1)),
-             (width_to_pad >> 1, width_to_pad - (width_to_pad >> 1)),
-             (0, 0),
-         ),
-         mode="constant",
-     )
-     batch = np.pad(
-         batch,
-         (
-             (0, 0),
-             (frames_to_pad >> 1, frames_to_pad - (frames_to_pad >> 1)),
-             (0, 0),
-             (0, 0),
-             (0, 0),
-         ),
-         mode="edge",
-     )
-     return batch, crop_region
-
-
- def unpad_video_batch(batch: np.ndarray, crop_region: list[int]) -> np.ndarray:
-     """Unpads a video batch with `crop_region`.
-
-     Args:
-         batch: A batch of numpy videos, layout BxFxHxWxC.
-         crop_region: [f1,y1,x1,f2,y2,x2] first, top, left, last, bottom, right crop indices.
-
-     Returns:
-         np.ndarray: Cropped numpy video, layout BxFxHxWxC.
-     """
-     assert len(crop_region) == 6, "crop_region should have length 6."
-     f1, y1, x1, f2, y2, x2 = crop_region
-     return batch[..., f1:f2, y1:y2, x1:x2, :]
-
-
- def unpad_image_batch(batch: np.ndarray, crop_region: list[int]) -> np.ndarray:
-     """Unpads an image batch with `crop_region`.
-
-     Args:
-         batch: A batch of numpy images, layout BxHxWxC.
-         crop_region: [y1,x1,y2,x2] top, left, bottom, right crop indices.
-
-     Returns:
-         np.ndarray: Cropped numpy image, layout BxHxWxC.
-     """
-     assert len(crop_region) == 4, "crop_region should have length 4."
-     y1, x1, y2, x2 = crop_region
-     return batch[..., y1:y2, x1:x2, :]
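
A worked example of the spatial padding arithmetic (assuming this module is importable): a 100x130 image under the default _SPATIAL_ALIGN of 16 gains 12 rows and 14 columns, split as evenly as the right-shift allows.

    import numpy as np
    from cosmos_transfer1.auxiliary.tokenizer.inference.utils import pad_image_batch, unpad_image_batch

    batch = np.zeros((1, 100, 130, 3), dtype=np.uint8)
    padded, crop = pad_image_batch(batch)  # 100 -> 112 (pad 12), 130 -> 144 (pad 14)
    print(padded.shape)                    # (1, 112, 144, 3)
    print(crop)                            # [6, 7, 106, 137]: top, left, bottom, right
    assert unpad_image_batch(padded, crop).shape == batch.shape
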
cosmos-transfer1/cosmos_transfer1/auxiliary/tokenizer/inference/video_cli.py DELETED
@@ -1,210 +0,0 @@
- # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- # SPDX-License-Identifier: Apache-2.0
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """A CLI to run CausalVideoTokenizer on plain videos based on torch.jit.
-
- Usage:
-     python3 -m cosmos_transfer1.auxiliary.tokenizer.inference.video_cli \
-         --video_pattern 'path/to/video/samples/*.mp4' \
-         --output_dir ./reconstructions \
-         --checkpoint_enc ./checkpoints/<model-name>/encoder.jit \
-         --checkpoint_dec ./checkpoints/<model-name>/decoder.jit
-
-     Optionally, you can run the model in pure PyTorch mode:
-     python3 -m cosmos_transfer1.auxiliary.tokenizer.inference.video_cli \
-         --video_pattern 'path/to/video/samples/*.mp4' \
-         --mode=torch \
-         --tokenizer_type=CV \
-         --temporal_compression=4 \
-         --spatial_compression=8 \
-         --checkpoint_enc ./checkpoints/<model-name>/encoder.jit \
-         --checkpoint_dec ./checkpoints/<model-name>/decoder.jit
- """
-
- import os
- import sys
- from argparse import ArgumentParser, Namespace
-
- import numpy as np
- from loguru import logger as logging
-
- from cosmos_transfer1.auxiliary.tokenizer.inference.utils import (
-     get_filepaths,
-     get_output_filepath,
-     read_video,
-     resize_video,
-     write_video,
- )
- from cosmos_transfer1.auxiliary.tokenizer.inference.video_lib import CausalVideoTokenizer
- from cosmos_transfer1.auxiliary.tokenizer.networks import TokenizerConfigs
-
-
- def _parse_args() -> Namespace:
-     parser = ArgumentParser(description="A CLI for CausalVideoTokenizer.")
-     parser.add_argument(
-         "--video_pattern",
-         type=str,
-         default="path/to/videos/*.mp4",
-         help="Glob pattern.",
-     )
-     parser.add_argument(
-         "--checkpoint",
-         type=str,
-         default=None,
-         help="JIT full Autoencoder model filepath.",
-     )
-     parser.add_argument(
-         "--checkpoint_enc",
-         type=str,
-         default=None,
-         help="JIT Encoder model filepath.",
-     )
-     parser.add_argument(
-         "--checkpoint_dec",
-         type=str,
-         default=None,
-         help="JIT Decoder model filepath.",
-     )
-     parser.add_argument(
-         "--tokenizer_type",
-         type=str,
-         choices=["CV", "DV"],
-         help="Specifies the tokenizer type.",
-     )
-     parser.add_argument(
-         "--spatial_compression",
-         type=int,
-         choices=[8, 16],
-         default=8,
-         help="The spatial compression factor.",
-     )
-     parser.add_argument(
-         "--temporal_compression",
-         type=int,
-         choices=[4, 8],
-         default=4,
-         help="The temporal compression factor.",
-     )
-     parser.add_argument(
-         "--mode",
-         type=str,
-         choices=["torch", "jit"],
-         default="jit",
-         help="Specify the backend: native 'torch' or 'jit' (default: 'jit')",
-     )
-     parser.add_argument(
-         "--short_size",
-         type=int,
-         default=None,
-         help="The size to resample inputs. None, by default.",
-     )
-     parser.add_argument(
-         "--temporal_window",
-         type=int,
-         default=17,
-         help="The temporal window to operate at a time.",
-     )
-     parser.add_argument(
-         "--dtype",
-         type=str,
-         default="bfloat16",
-         help="Sets the precision, default bfloat16.",
-     )
-     parser.add_argument(
-         "--device",
-         type=str,
-         default="cuda",
-         help="Device for invoking the model.",
-     )
-     parser.add_argument("--output_dir", type=str, default=None, help="Output directory.")
-     parser.add_argument(
-         "--output_fps",
-         type=float,
-         default=24.0,
-         help="Output frames-per-second (FPS).",
-     )
-     parser.add_argument(
-         "--save_input",
-         action="store_true",
-         help="If on, the input video will be output too.",
-     )
-
-     args = parser.parse_args()
-     return args
-
-
- logging.info("Initializing args ...")
- args = _parse_args()
- if args.mode == "torch" and args.tokenizer_type not in ["CV", "DV"]:
-     logging.error("'torch' backend requires the tokenizer_type of 'CV' or 'DV'.")
-     sys.exit(1)
-
-
- def _run_eval() -> None:
-     """Invokes the JIT-compiled CausalVideoTokenizer on an input video."""
-
-     if args.checkpoint_enc is None and args.checkpoint_dec is None and args.checkpoint is None:
-         logging.warning("Aborting. Provide either both encoder and decoder JIT models, or the full autoencoder JIT model.")
-         return
-
-     if args.mode == "torch":
-         tokenizer_config = TokenizerConfigs[args.tokenizer_type].value
-         tokenizer_config.update(dict(spatial_compression=args.spatial_compression))
-         tokenizer_config.update(dict(temporal_compression=args.temporal_compression))
-     else:
-         tokenizer_config = None
-
-     logging.info(
-         f"Loading a torch.jit model `{os.path.dirname(args.checkpoint or args.checkpoint_enc or args.checkpoint_dec)}` ..."
-     )
-     autoencoder = CausalVideoTokenizer(
-         checkpoint=args.checkpoint,
-         checkpoint_enc=args.checkpoint_enc,
-         checkpoint_dec=args.checkpoint_dec,
-         tokenizer_config=tokenizer_config,
-         device=args.device,
-         dtype=args.dtype,
-     )
-
-     logging.info(f"Looking for files matching video_pattern={args.video_pattern} ...")
-     filepaths = get_filepaths(args.video_pattern)
-     logging.info(f"Found {len(filepaths)} videos from {args.video_pattern}.")
-
-     for filepath in filepaths:
-         logging.info(f"Reading video {filepath} ...")
-         video = read_video(filepath)
-         video = resize_video(video, short_size=args.short_size)
-
-         logging.info("Invoking the autoencoder model ...")
-         batch_video = video[np.newaxis, ...]
-         output_video = autoencoder(batch_video, temporal_window=args.temporal_window)[0]
-         logging.info("Constructing output filepath ...")
-         output_filepath = get_output_filepath(filepath, output_dir=args.output_dir)
-         logging.info(f"Outputting {output_filepath} ...")
-         write_video(output_filepath, output_video, fps=args.output_fps)
-         if args.save_input:
-             ext = os.path.splitext(output_filepath)[-1]
-             input_filepath = output_filepath.replace(ext, "_input" + ext)
-             write_video(input_filepath, video, fps=args.output_fps)
-
-
- @logging.catch(reraise=True)
- def main() -> None:
-     _run_eval()
-
-
- if __name__ == "__main__":
-     main()
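
One detail worth spelling out: with a causal video tokenizer, a window of T frames typically maps to 1 + (T - 1) / temporal_compression latent frames (the first frame passes through, the rest are compressed), which is consistent with the (num_frames - 1) % align padding in pad_video_batch. Under that assumption, the default --temporal_window 17 divides cleanly for both supported compression factors:

    def latent_frames(num_frames: int, temporal_compression: int) -> int:
        # Causal tokenization: keep the first frame, compress the remaining T - 1.
        return 1 + (num_frames - 1) // temporal_compression

    print(latent_frames(17, 4))  # 5
    print(latent_frames(17, 8))  # 3
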