jbilcke-hf HF Staff committed on
Commit
ecd5028
·
1 Parent(s): 7c52128

upgrade finetrainers + gradio

README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🎥
4
  colorFrom: gray
5
  colorTo: gray
6
  sdk: gradio
7
- sdk_version: 5.15.0
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
 
4
  colorFrom: gray
5
  colorTo: gray
6
  sdk: gradio
7
+ sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: true
10
  license: apache-2.0
docs/huggingface/Downloading files from the hub.md ADDED
@@ -0,0 +1,270 @@
1
+ [](#downloading-files)Downloading files
2
+ =======================================
3
+
4
+ [](#download-a-single-file)Download a single file
5
+ -------------------------------------------------
6
+
7
+ ### [](#huggingface_hub.hf_hub_download)hf\_hub\_download
8
+
9
+ #### huggingface\_hub.hf\_hub\_download
10
+
11
+ [](#huggingface_hub.hf_hub_download)[< source \>](https://github.com/huggingface/huggingface_hub/blob/v0.29.2/src/huggingface_hub/file_download.py#L663)
12
+
13
+ ( repo\_id: str, filename: str, subfolder: typing.Optional\[str\] = None, repo\_type: typing.Optional\[str\] = None, revision: typing.Optional\[str\] = None, library\_name: typing.Optional\[str\] = None, library\_version: typing.Optional\[str\] = None, cache\_dir: typing.Union\[str, pathlib.Path, NoneType\] = None, local\_dir: typing.Union\[str, pathlib.Path, NoneType\] = None, user\_agent: typing.Union\[typing.Dict, str, NoneType\] = None, force\_download: bool = False, proxies: typing.Optional\[typing.Dict\] = None, etag\_timeout: float = 10, token: typing.Union\[bool, str, NoneType\] = None, local\_files\_only: bool = False, headers: typing.Optional\[typing.Dict\[str, str\]\] = None, endpoint: typing.Optional\[str\] = None, resume\_download: typing.Optional\[bool\] = None, force\_filename: typing.Optional\[str\] = None, local\_dir\_use\_symlinks: typing.Union\[bool, typing.Literal\['auto'\]\] = 'auto' ) → `str`
14
+
16
+
17
+ Parameters
18
+
19
+ * [](#huggingface_hub.hf_hub_download.repo_id)**repo\_id** (`str`) — A user or an organization name and a repo name separated by a `/`.
20
+ * [](#huggingface_hub.hf_hub_download.filename)**filename** (`str`) — The name of the file in the repo.
21
+ * [](#huggingface_hub.hf_hub_download.subfolder)**subfolder** (`str`, _optional_) — An optional value corresponding to a folder inside the model repo.
22
+ * [](#huggingface_hub.hf_hub_download.repo_type)**repo\_type** (`str`, _optional_) — Set to `"dataset"` or `"space"` if downloading from a dataset or space, `None` or `"model"` if downloading from a model. Default is `None`.
23
+ * [](#huggingface_hub.hf_hub_download.revision)**revision** (`str`, _optional_) — An optional Git revision id which can be a branch name, a tag, or a commit hash.
24
+ * [](#huggingface_hub.hf_hub_download.library_name)**library\_name** (`str`, _optional_) — The name of the library to which the object corresponds.
25
+ * [](#huggingface_hub.hf_hub_download.library_version)**library\_version** (`str`, _optional_) — The version of the library.
26
+ * [](#huggingface_hub.hf_hub_download.cache_dir)**cache\_dir** (`str`, `Path`, _optional_) — Path to the folder where cached files are stored.
27
+ * [](#huggingface_hub.hf_hub_download.local_dir)**local\_dir** (`str` or `Path`, _optional_) — If provided, the downloaded file will be placed under this directory.
28
+ * [](#huggingface_hub.hf_hub_download.user_agent)**user\_agent** (`dict`, `str`, _optional_) — The user-agent info in the form of a dictionary or a string.
29
+ * [](#huggingface_hub.hf_hub_download.force_download)**force\_download** (`bool`, _optional_, defaults to `False`) — Whether the file should be downloaded even if it already exists in the local cache.
30
+ * [](#huggingface_hub.hf_hub_download.proxies)**proxies** (`dict`, _optional_) — Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
31
+ * [](#huggingface_hub.hf_hub_download.etag_timeout)**etag\_timeout** (`float`, _optional_, defaults to `10`) — When fetching the ETag, how many seconds to wait for the server to send data before giving up; this value is passed to `requests.request`.
32
+ * [](#huggingface_hub.hf_hub_download.token)**token** (`str`, `bool`, _optional_) — A token to be used for the download.
33
+
34
+ * If `True`, the token is read from the HuggingFace config folder.
35
+ * If a string, it’s used as the authentication token.
36
+
37
+ * [](#huggingface_hub.hf_hub_download.local_files_only)**local\_files\_only** (`bool`, _optional_, defaults to `False`) — If `True`, avoid downloading the file and return the path to the local cached file if it exists.
38
+ * [](#huggingface_hub.hf_hub_download.headers)**headers** (`dict`, _optional_) — Additional headers to be sent with the request.
39
+
40
+ Returns
41
+
43
+
44
+ `str`
45
+
47
+
48
+ Local path of the file, or, if networking is off, the last version of the file cached on disk.
49
+
50
+ Raises
51
+
53
+
54
+ [RepositoryNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.RepositoryNotFoundError) or [RevisionNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.RevisionNotFoundError) or [EntryNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.EntryNotFoundError) or [LocalEntryNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.LocalEntryNotFoundError) or `EnvironmentError` or `OSError` or `ValueError`
55
+
57
+
58
+ * [RepositoryNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.RepositoryNotFoundError) — If the repository to download from cannot be found. This may be because it doesn’t exist, or because it is set to `private` and you do not have access.
59
+ * [RevisionNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.RevisionNotFoundError) — If the revision to download from cannot be found.
60
+ * [EntryNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.EntryNotFoundError) — If the file to download cannot be found.
61
+ * [LocalEntryNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.LocalEntryNotFoundError) — If network is disabled or unavailable and file is not found in cache.
62
+ * [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) — If `token=True` but the token cannot be found.
63
+ * [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) — If ETag cannot be determined.
64
+ * [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) — If some parameter value is invalid.
65
+
66
+ Download a given file if it’s not already present in the local cache.
67
+
68
+ The new cache file layout looks like this:
69
+
70
+ * The cache directory contains one subfolder per repo\_id (namespaced by repo type)
71
+ * inside each repo folder:
72
+ * refs is a list of the latest known revision => commit\_hash pairs
73
+ * blobs contains the actual file blobs (identified by their git-sha or sha256, depending on whether they’re LFS files or not)
74
+ * snapshots contains one subfolder per commit, each “commit” contains the subset of the files that have been resolved at that particular commit. Each filename is a symlink to the blob at that particular commit.
75
+
76
+ [](#huggingface_hub.hf_hub_download.example)
77
+
79
+
80
+ \[ 96\] .
81
+ └── \[ 160\] models\--julien-c--EsperBERTo-small
82
+ ├── \[ 160\] blobs
83
+ │ ├── \[321M\] 403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
84
+ │ ├── \[ 398\] 7cb18dc9bafbfcf74629a4b760af1b160957a83e
85
+ │ └── \[1.4K\] d7edf6bd2a681fb0175f7735299831ee1b22b812
86
+ ├── \[ 96\] refs
87
+ │ └── \[ 40\] main
88
+ └── \[ 128\] snapshots
89
+ ├── \[ 128\] 2439f60ef33a0d46d85da5001d52aeda5b00ce9f
90
+ │ ├── \[ 52\] README.md -> ../../blobs/d7edf6bd2a681fb0175f7735299831ee1b22b812
91
+ │ └── \[ 76\] pytorch\_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
92
+ └── \[ 128\] bbc77c8132af1cc5cf678da3f1ddf2de43606d48
93
+ ├── \[ 52\] README.md -> ../../blobs/7cb18dc9bafbfcf74629a4b760af1b160957a83e
94
+ └── \[ 76\] pytorch\_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
95
+
96
+ If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir` to store some metadata related to the downloaded files. While this mechanism is not as robust as the main cache-system, it’s optimized for regularly pulling the latest version of a repository.
97
+
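A minimal usage sketch of both behaviours (the repo id and filename below are the illustrative values used elsewhere on this page; any repo/file pair works the same way):

```python
from huggingface_hub import hf_hub_download

# Download a single file into the shared cache and get its local path.
path = hf_hub_download(repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin")
print(path)  # .../snapshots/<commit_hash>/pytorch_model.bin

# Download into a plain folder instead of the cache (a .cache/huggingface/
# metadata folder is created at the root of local_dir).
path = hf_hub_download(
    repo_id="julien-c/EsperBERTo-small",
    filename="pytorch_model.bin",
    local_dir="./EsperBERTo-small",
)
```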
98
+ ### [](#huggingface_hub.hf_hub_url)hf\_hub\_url
99
+
100
+ #### huggingface\_hub.hf\_hub\_url
101
+
102
+ [](#huggingface_hub.hf_hub_url)[< source \>](https://github.com/huggingface/huggingface_hub/blob/v0.29.2/src/huggingface_hub/file_download.py#L171)
103
+
104
+ ( repo\_id: str, filename: str, subfolder: typing.Optional\[str\] = None, repo\_type: typing.Optional\[str\] = None, revision: typing.Optional\[str\] = None, endpoint: typing.Optional\[str\] = None )
105
+
106
+ Parameters
107
+
108
+ * [](#huggingface_hub.hf_hub_url.repo_id)**repo\_id** (`str`) — A namespace (user or an organization) name and a repo name separated by a `/`.
109
+ * [](#huggingface_hub.hf_hub_url.filename)**filename** (`str`) — The name of the file in the repo.
110
+ * [](#huggingface_hub.hf_hub_url.subfolder)**subfolder** (`str`, _optional_) — An optional value corresponding to a folder inside the repo.
111
+ * [](#huggingface_hub.hf_hub_url.repo_type)**repo\_type** (`str`, _optional_) — Set to `"dataset"` or `"space"` if downloading from a dataset or space, `None` or `"model"` if downloading from a model. Default is `None`.
112
+ * [](#huggingface_hub.hf_hub_url.revision)**revision** (`str`, _optional_) — An optional Git revision id which can be a branch name, a tag, or a commit hash.
113
+
114
+ Construct the URL of a file from the given information.
115
+
116
+ The resolved address can either be a huggingface.co-hosted url, or a link to Cloudfront (a Content Delivery Network, or CDN) for large files which are more than a few MBs.
117
+
118
+ [](#huggingface_hub.hf_hub_url.example)
119
+
120
+ Example:
121
+
123
+
124
+ \>>> from huggingface\_hub import hf\_hub\_url
125
+
126
+ \>>> hf\_hub\_url(
127
+ ... repo\_id="julien-c/EsperBERTo-small", filename="pytorch\_model.bin"
128
+ ... )
129
+ 'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch\_model.bin'
130
+
131
+ Notes:
132
+
133
+ Cloudfront is replicated over the globe so downloads are way faster for the end user (and it also lowers our bandwidth costs).
134
+
135
+ Cloudfront aggressively caches files by default (default TTL is 24 hours), however this is not an issue here because we implement a git-based versioning system on huggingface.co, which means that we store the files on S3/Cloudfront in a content-addressable way (i.e., the file name is its hash). Using content-addressable filenames means cache can’t ever be stale.
136
+
137
+ In terms of client-side caching from this library, we base our caching on the objects’ entity tag (`ETag`), which is an identifier of a specific version of a resource \[1\]. An object’s ETag is: its git-sha1 if stored in git, or its sha256 if stored in git-lfs.
138
+
139
+ References:
140
+
141
+ * \[1\] [https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag)
142
+
143
+ [](#huggingface_hub.snapshot_download)Download a snapshot of the repo
144
+ ---------------------------------------------------------------------
145
+
146
+ #### huggingface\_hub.snapshot\_download
147
+
148
+ [](#huggingface_hub.snapshot_download)[< source \>](https://github.com/huggingface/huggingface_hub/blob/v0.29.2/src/huggingface_hub/_snapshot_download.py#L20)
149
+
150
+ ( repo\_id: str, repo\_type: typing.Optional\[str\] = None, revision: typing.Optional\[str\] = None, cache\_dir: typing.Union\[str, pathlib.Path, NoneType\] = None, local\_dir: typing.Union\[str, pathlib.Path, NoneType\] = None, library\_name: typing.Optional\[str\] = None, library\_version: typing.Optional\[str\] = None, user\_agent: typing.Union\[typing.Dict, str, NoneType\] = None, proxies: typing.Optional\[typing.Dict\] = None, etag\_timeout: float = 10, force\_download: bool = False, token: typing.Union\[bool, str, NoneType\] = None, local\_files\_only: bool = False, allow\_patterns: typing.Union\[typing.List\[str\], str, NoneType\] = None, ignore\_patterns: typing.Union\[typing.List\[str\], str, NoneType\] = None, max\_workers: int = 8, tqdm\_class: typing.Optional\[tqdm.asyncio.tqdm\_asyncio\] = None, headers: typing.Optional\[typing.Dict\[str, str\]\] = None, endpoint: typing.Optional\[str\] = None, local\_dir\_use\_symlinks: typing.Union\[bool, typing.Literal\['auto'\]\] = 'auto', resume\_download: typing.Optional\[bool\] = None ) → `str`
151
+
153
+
154
+ Parameters
155
+
156
+ * [](#huggingface_hub.snapshot_download.repo_id)**repo\_id** (`str`) — A user or an organization name and a repo name separated by a `/`.
157
+ * [](#huggingface_hub.snapshot_download.repo_type)**repo\_type** (`str`, _optional_) — Set to `"dataset"` or `"space"` if downloading from a dataset or space, `None` or `"model"` if downloading from a model. Default is `None`.
158
+ * [](#huggingface_hub.snapshot_download.revision)**revision** (`str`, _optional_) — An optional Git revision id which can be a branch name, a tag, or a commit hash.
159
+ * [](#huggingface_hub.snapshot_download.cache_dir)**cache\_dir** (`str`, `Path`, _optional_) — Path to the folder where cached files are stored.
160
+ * [](#huggingface_hub.snapshot_download.local_dir)**local\_dir** (`str` or `Path`, _optional_) — If provided, the downloaded files will be placed under this directory.
161
+ * [](#huggingface_hub.snapshot_download.library_name)**library\_name** (`str`, _optional_) — The name of the library to which the object corresponds.
162
+ * [](#huggingface_hub.snapshot_download.library_version)**library\_version** (`str`, _optional_) — The version of the library.
163
+ * [](#huggingface_hub.snapshot_download.user_agent)**user\_agent** (`str`, `dict`, _optional_) — The user-agent info in the form of a dictionary or a string.
164
+ * [](#huggingface_hub.snapshot_download.proxies)**proxies** (`dict`, _optional_) — Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
165
+ * [](#huggingface_hub.snapshot_download.etag_timeout)**etag\_timeout** (`float`, _optional_, defaults to `10`) — When fetching the ETag, how many seconds to wait for the server to send data before giving up; this value is passed to `requests.request`.
166
+ * [](#huggingface_hub.snapshot_download.force_download)**force\_download** (`bool`, _optional_, defaults to `False`) — Whether the file should be downloaded even if it already exists in the local cache.
167
+ * [](#huggingface_hub.snapshot_download.token)**token** (`str`, `bool`, _optional_) — A token to be used for the download.
168
+
169
+ * If `True`, the token is read from the HuggingFace config folder.
170
+ * If a string, it’s used as the authentication token.
171
+
172
+ * [](#huggingface_hub.snapshot_download.headers)**headers** (`dict`, _optional_) — Additional headers to include in the request. Those headers take precedence over the others.
173
+ * [](#huggingface_hub.snapshot_download.local_files_only)**local\_files\_only** (`bool`, _optional_, defaults to `False`) — If `True`, avoid downloading the file and return the path to the local cached file if it exists.
174
+ * [](#huggingface_hub.snapshot_download.allow_patterns)**allow\_patterns** (`List[str]` or `str`, _optional_) — If provided, only files matching at least one pattern are downloaded.
175
+ * [](#huggingface_hub.snapshot_download.ignore_patterns)**ignore\_patterns** (`List[str]` or `str`, _optional_) — If provided, files matching any of the patterns are not downloaded.
176
+ * [](#huggingface_hub.snapshot_download.max_workers)**max\_workers** (`int`, _optional_) — Number of concurrent threads to download files (1 thread = 1 file download). Defaults to 8.
177
+ * [](#huggingface_hub.snapshot_download.tqdm_class)**tqdm\_class** (`tqdm`, _optional_) — If provided, overwrites the default behavior for the progress bar. Passed argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior. Note that the `tqdm_class` is not passed to each individual download. Defaults to the custom HF progress bar that can be disabled by setting `HF_HUB_DISABLE_PROGRESS_BARS` environment variable.
178
+
179
+ Returns
180
+
182
+
183
+ `str`
184
+
186
+
187
+ folder path of the repo snapshot.
188
+
189
+ Raises
190
+
192
+
193
+ [RepositoryNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.RepositoryNotFoundError) or [RevisionNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.RevisionNotFoundError) or `EnvironmentError` or `OSError` or `ValueError`
194
+
196
+
197
+ * [RepositoryNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.RepositoryNotFoundError) — If the repository to download from cannot be found. This may be because it doesn’t exist, or because it is set to `private` and you do not have access.
198
+ * [RevisionNotFoundError](/docs/huggingface_hub/v0.29.2/en/package_reference/utilities#huggingface_hub.errors.RevisionNotFoundError) — If the revision to download from cannot be found.
199
+ * [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) — If `token=True` and the token cannot be found.
200
+ * [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) — if ETag cannot be determined.
201
+ * [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) — if some parameter value is invalid.
202
+
203
+ Download repo files.
204
+
205
+ Download a whole snapshot of a repo’s files at the specified revision. This is useful when you want all files from a repo, because you don’t know which ones you will need a priori. All files are nested inside a folder in order to keep their actual filename relative to that folder. You can also filter which files to download using `allow_patterns` and `ignore_patterns`.
206
+
207
+ If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this option, the `cache_dir` will not be used and a `.cache/huggingface/` folder will be created at the root of `local_dir` to store some metadata related to the downloaded files. While this mechanism is not as robust as the main cache-system, it’s optimized for regularly pulling the latest version of a repository.
208
+
209
+ An alternative would be to clone the repo but this requires git and git-lfs to be installed and properly configured. It is also not possible to filter which files to download when cloning a repository using git.
210
+
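A short sketch of both behaviours described above (the repo id and patterns are illustrative):

```python
from huggingface_hub import snapshot_download

# Download the whole repo at its latest revision; the returned path is the snapshot folder.
folder = snapshot_download(repo_id="julien-c/EsperBERTo-small")

# Only fetch small text/config files and skip the large binaries.
folder = snapshot_download(
    repo_id="julien-c/EsperBERTo-small",
    allow_patterns=["*.json", "*.txt"],
    ignore_patterns=["*.bin"],
)
print(folder)
```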
211
+ [](#get-metadata-about-a-file)Get metadata about a file
212
+ -------------------------------------------------------
213
+
214
+ ### [](#huggingface_hub.get_hf_file_metadata)get\_hf\_file\_metadata
215
+
216
+ #### huggingface\_hub.get\_hf\_file\_metadata
217
+
218
+ [](#huggingface_hub.get_hf_file_metadata)[< source \>](https://github.com/huggingface/huggingface_hub/blob/v0.29.2/src/huggingface_hub/file_download.py#L1246)
219
+
220
+ ( url: str, token: typing.Union\[bool, str, NoneType\] = None, proxies: typing.Optional\[typing.Dict\] = None, timeout: typing.Optional\[float\] = 10, library\_name: typing.Optional\[str\] = None, library\_version: typing.Optional\[str\] = None, user\_agent: typing.Union\[typing.Dict, str, NoneType\] = None, headers: typing.Optional\[typing.Dict\[str, str\]\] = None )
221
+
222
+ Parameters
223
+
224
+ * [](#huggingface_hub.get_hf_file_metadata.url)**url** (`str`) — File url, for example returned by [hf\_hub\_url()](/docs/huggingface_hub/v0.29.2/en/package_reference/file_download#huggingface_hub.hf_hub_url).
225
+ * [](#huggingface_hub.get_hf_file_metadata.token)**token** (`str` or `bool`, _optional_) — A token to be used for the download.
226
+
227
+ * If `True`, the token is read from the HuggingFace config folder.
228
+ * If `False` or `None`, no token is provided.
229
+ * If a string, it’s used as the authentication token.
230
+
231
+ * [](#huggingface_hub.get_hf_file_metadata.proxies)**proxies** (`dict`, _optional_) — Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
232
+ * [](#huggingface_hub.get_hf_file_metadata.timeout)**timeout** (`float`, _optional_, defaults to 10) — How many seconds to wait for the server to send metadata before giving up.
233
+ * [](#huggingface_hub.get_hf_file_metadata.library_name)**library\_name** (`str`, _optional_) — The name of the library to which the object corresponds.
234
+ * [](#huggingface_hub.get_hf_file_metadata.library_version)**library\_version** (`str`, _optional_) — The version of the library.
235
+ * [](#huggingface_hub.get_hf_file_metadata.user_agent)**user\_agent** (`dict`, `str`, _optional_) — The user-agent info in the form of a dictionary or a string.
236
+ * [](#huggingface_hub.get_hf_file_metadata.headers)**headers** (`dict`, _optional_) — Additional headers to be sent with the request.
237
+
238
+ Fetch metadata of a file versioned on the Hub for a given url.
239
+
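A small sketch combining hf\_hub\_url() and get\_hf\_file\_metadata() (the repo id and filename are illustrative):

```python
from huggingface_hub import get_hf_file_metadata, hf_hub_url

url = hf_hub_url(repo_id="julien-c/EsperBERTo-small", filename="pytorch_model.bin")
metadata = get_hf_file_metadata(url)

# commit_hash identifies the revision, etag the exact file content, size is in bytes.
print(metadata.commit_hash, metadata.etag, metadata.size)
```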
240
+ ### [](#huggingface_hub.HfFileMetadata)HfFileMetadata
241
+
242
+ ### class huggingface\_hub.HfFileMetadata
243
+
244
+ [](#huggingface_hub.HfFileMetadata)[< source \>](https://github.com/huggingface/huggingface_hub/blob/v0.29.2/src/huggingface_hub/file_download.py#L147)
245
+
246
+ ( commit\_hash: typing.Optional\[str\], etag: typing.Optional\[str\], location: str, size: typing.Optional\[int\] )
247
+
248
+ Parameters
249
+
250
+ * [](#huggingface_hub.HfFileMetadata.commit_hash)**commit\_hash** (`str`, _optional_) — The commit\_hash related to the file.
251
+ * [](#huggingface_hub.HfFileMetadata.etag)**etag** (`str`, _optional_) — Etag of the file on the server.
252
+ * [](#huggingface_hub.HfFileMetadata.location)**location** (`str`) — URL from which the file can be downloaded, either a Hub URL or a CDN URL.
253
+ * [](#huggingface_hub.HfFileMetadata.size)**size** (`int`, _optional_) — Size of the file in bytes. In case of an LFS file, contains the size of the actual LFS file, not the pointer.
254
+
255
+ Data structure containing information about a file versioned on the Hub.
256
+
257
+ Returned by [get\_hf\_file\_metadata()](/docs/huggingface_hub/v0.29.2/en/package_reference/file_download#huggingface_hub.get_hf_file_metadata) based on a URL.
258
+
259
+ [](#caching)Caching
260
+ -------------------
261
+
262
+ The methods displayed above are designed to work with a caching system that prevents re-downloading files. The caching system was updated in v0.8.0 to become the central cache-system shared across libraries that depend on the Hub.
263
+
264
+ Read the [cache-system guide](../guides/manage-cache) for a detailed presentation of caching at HF.
265
+
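As a quick illustration of the shared cache (the repo id and filename are illustrative; `scan_cache_dir()` is a `huggingface_hub` helper for inspecting the cache):

```python
from huggingface_hub import hf_hub_download, scan_cache_dir

# The second call is served from the cache and returns the same local path
# without hitting the network again.
first = hf_hub_download("julien-c/EsperBERTo-small", "config.json")
second = hf_hub_download("julien-c/EsperBERTo-small", "config.json")
assert first == second

# Inspect what is currently stored in the cache.
report = scan_cache_dir()
print(report.size_on_disk_str, len(report.repos))
```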
docs/huggingface/HfApi Client API Reference.md ADDED
The diff for this file is too large to render. See raw diff
 
docs/huggingface/Load a dataset from the hub.md ADDED
@@ -0,0 +1,126 @@
1
+ [](#load-a-dataset-from-the-hub)Load a dataset from the Hub
2
+ ===========================================================
3
+
4
+ Finding high-quality datasets that are reproducible and accessible can be difficult. One of 🤗 Datasets’ main goals is to provide a simple way to load a dataset of any format or type. The easiest way to get started is to discover an existing dataset on the [Hugging Face Hub](https://huggingface.co/datasets) - a community-driven collection of datasets for tasks in NLP, computer vision, and audio - and use 🤗 Datasets to download and generate the dataset.
5
+
6
+ This tutorial uses the [rotten\_tomatoes](https://huggingface.co/datasets/rotten_tomatoes) and [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) datasets, but feel free to load any dataset you want and follow along. Head over to the Hub now and find a dataset for your task!
7
+
8
+ [](#load-a-dataset)Load a dataset
9
+ ---------------------------------
10
+
11
+ Before you take the time to download a dataset, it’s often helpful to quickly get some general information about a dataset. A dataset’s information is stored inside [DatasetInfo](/docs/datasets/v3.3.2/en/package_reference/main_classes#datasets.DatasetInfo) and can include information such as the dataset description, features, and dataset size.
12
+
13
+ Use the [load\_dataset\_builder()](/docs/datasets/v3.3.2/en/package_reference/loading_methods#datasets.load_dataset_builder) function to load a dataset builder and inspect a dataset’s attributes without committing to downloading it:
14
+
16
+
17
+ \>>> from datasets import load\_dataset\_builder
18
+ \>>> ds\_builder = load\_dataset\_builder("cornell-movie-review-data/rotten\_tomatoes")
19
+
20
+ \# Inspect dataset description
21
+ \>>> ds\_builder.info.description
22
+ Movie Review Dataset. This is a dataset of containing 5,331 positive and 5,331 negative processed sentences from Rotten Tomatoes movie reviews. This data was first used in Bo Pang and Lillian Lee, \`\`Seeing stars: Exploiting class relationships for sentiment categorization with respect to rating scales.'', Proceedings of the ACL, 2005.
23
+
24
+ \# Inspect dataset features
25
+ \>>> ds\_builder.info.features
26
+ {'label': ClassLabel(names=\['neg', 'pos'\], id\=None),
27
+ 'text': Value(dtype='string', id\=None)}
28
+
29
+ If you’re happy with the dataset, then load it with [load\_dataset()](/docs/datasets/v3.3.2/en/package_reference/loading_methods#datasets.load_dataset):
30
+
32
+
33
+ \>>> from datasets import load\_dataset
34
+
35
+ \>>> dataset = load\_dataset("cornell-movie-review-data/rotten\_tomatoes", split="train")
36
+
37
+ [](#splits)Splits
38
+ -----------------
39
+
40
+ A split is a specific subset of a dataset like `train` and `test`. List a dataset’s split names with the [get\_dataset\_split\_names()](/docs/datasets/v3.3.2/en/package_reference/loading_methods#datasets.get_dataset_split_names) function:
41
+
43
+
44
+ \>>> from datasets import get\_dataset\_split\_names
45
+
46
+ \>>> get\_dataset\_split\_names("cornell-movie-review-data/rotten\_tomatoes")
47
+ \['train', 'validation', 'test'\]
48
+
49
+ Then you can load a specific split with the `split` parameter. Loading a dataset `split` returns a [Dataset](/docs/datasets/v3.3.2/en/package_reference/main_classes#datasets.Dataset) object:
50
+
52
+
53
+ \>>> from datasets import load\_dataset
54
+
55
+ \>>> dataset = load\_dataset("cornell-movie-review-data/rotten\_tomatoes", split="train")
56
+ \>>> dataset
57
+ Dataset({
58
+ features: \['text', 'label'\],
59
+ num\_rows: 8530
60
+ })
61
+
62
+ If you don’t specify a `split`, 🤗 Datasets returns a [DatasetDict](/docs/datasets/v3.3.2/en/package_reference/main_classes#datasets.DatasetDict) object instead:
63
+
65
+
66
+ \>>> from datasets import load\_dataset
67
+
68
+ \>>> dataset = load\_dataset("cornell-movie-review-data/rotten\_tomatoes")
69
+ DatasetDict({
70
+ train: Dataset({
71
+ features: \['text', 'label'\],
72
+ num\_rows: 8530
73
+ })
74
+ validation: Dataset({
75
+ features: \['text', 'label'\],
76
+ num\_rows: 1066
77
+ })
78
+ test: Dataset({
79
+ features: \['text', 'label'\],
80
+ num\_rows: 1066
81
+ })
82
+ })
83
+
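A `DatasetDict` behaves like a dictionary of splits, so you can index it to get a `Dataset` and then index rows (a small sketch based on the load above):

```python
from datasets import load_dataset

dataset = load_dataset("cornell-movie-review-data/rotten_tomatoes")

train = dataset["train"]   # pick a split -> Dataset
print(train.num_rows)      # 8530
print(train[0])            # first row: {'text': ..., 'label': ...}
```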
84
+ [](#configurations)Configurations
85
+ ---------------------------------
86
+
87
+ Some datasets contain several sub-datasets. For example, the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset has several sub-datasets, each one containing audio data in a different language. These sub-datasets are known as _configurations_ or _subsets_, and you must explicitly select one when loading the dataset. If you don’t provide a configuration name, 🤗 Datasets will raise a `ValueError` and remind you to choose a configuration.
88
+
89
+ Use the [get\_dataset\_config\_names()](/docs/datasets/v3.3.2/en/package_reference/loading_methods#datasets.get_dataset_config_names) function to retrieve a list of all the possible configurations available to your dataset:
90
+
92
+
93
+ \>>> from datasets import get\_dataset\_config\_names
94
+
95
+ \>>> configs = get\_dataset\_config\_names("PolyAI/minds14")
96
+ \>>> print(configs)
97
+ \['cs-CZ', 'de-DE', 'en-AU', 'en-GB', 'en-US', 'es-ES', 'fr-FR', 'it-IT', 'ko-KR', 'nl-NL', 'pl-PL', 'pt-PT', 'ru-RU', 'zh-CN', 'all'\]
98
+
99
+ Then load the configuration you want:
100
+
102
+
103
+ \>>> from datasets import load\_dataset
104
+
105
+ \>>> mindsFR = load\_dataset("PolyAI/minds14", "fr-FR", split="train")
106
+
107
+ [](#remote-code)Remote code
108
+ ---------------------------
109
+
110
+ Certain dataset repositories contain a loading script with the Python code used to generate the dataset. All files and code uploaded to the Hub are scanned for malware (refer to the Hub security documentation for more information), but you should still review the dataset loading scripts and authors to avoid executing malicious code on your machine. You should set `trust_remote_code=True` to use a dataset with a loading script, or you will get an error:
111
+
113
+
114
+ \>>> from datasets import get\_dataset\_config\_names, get\_dataset\_split\_names, load\_dataset
115
+
116
+ \>>> c4 = load\_dataset("c4", "en", split="train", trust\_remote\_code=True)
117
+ \>>> get\_dataset\_config\_names("c4", trust\_remote\_code=True)
118
+ \['en', 'realnewslike', 'en.noblocklist', 'en.noclean'\]
119
+ \>>> get\_dataset\_split\_names("c4", "en", trust\_remote\_code=True)
120
+ \['train', 'validation'\]
121
+
122
+ For security reasons, 🤗 Datasets does not allow running dataset loading scripts by default, and you have to pass `trust_remote_code=True` to load datasets that require running a dataset script.
123
+
docs/huggingface/Search the Hub.md ADDED
@@ -0,0 +1,61 @@
1
+ [](#search-the-hub)Search the Hub
2
+ =================================
3
+
4
+ In this tutorial, you will learn how to search models, datasets and spaces on the Hub using `huggingface_hub`.
5
+
6
+ [](#how-to-list-repositories-)How to list repositories?
7
+ --------------------------------------------------------
8
+
9
+ The `huggingface_hub` library includes an HTTP client [HfApi](/docs/huggingface_hub/v0.29.2/en/package_reference/hf_api#huggingface_hub.HfApi) to interact with the Hub. Among other things, it can list models, datasets, and Spaces stored on the Hub:
10
+
12
+
13
+ \>>> from huggingface\_hub import HfApi
14
+ \>>> api = HfApi()
15
+ \>>> models = api.list\_models()
16
+
17
+ The output of [list\_models()](/docs/huggingface_hub/v0.29.2/en/package_reference/hf_api#huggingface_hub.HfApi.list_models) is an iterator over the models stored on the Hub.
18
+
19
+ Similarly, you can use [list\_datasets()](/docs/huggingface_hub/v0.29.2/en/package_reference/hf_api#huggingface_hub.HfApi.list_datasets) to list datasets and [list\_spaces()](/docs/huggingface_hub/v0.29.2/en/package_reference/hf_api#huggingface_hub.HfApi.list_spaces) to list Spaces.
20
+
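A minimal sketch of all three helpers; since the listings are lazy iterators, the `limit` argument keeps the number of fetched results small (values here are illustrative):

```python
from huggingface_hub import HfApi

api = HfApi()

# Iterate over a handful of models instead of the full listing.
for model in api.list_models(limit=5):
    print(model.id)

# The same pattern works for datasets and Spaces.
datasets = list(api.list_datasets(limit=5))
spaces = list(api.list_spaces(limit=5))
```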
21
+ [](#how-to-filter-repositories-)How to filter repositories?
22
+ ------------------------------------------------------------
23
+
24
+ Listing repositories is great but now you might want to filter your search. The list helpers have several attributes like:
25
+
26
+ * `filter`
27
+ * `author`
28
+ * `search`
29
+ * …
30
+
31
+ Let’s see an example that gets all models on the Hub that do image classification, have been trained on the imagenet dataset, and run with PyTorch.
32
+
34
+
35
+ models = api.list\_models(
36
+ task="image-classification",
37
+ library="pytorch",
38
+ trained\_dataset="imagenet",
39
+ )
40
+
41
+ While filtering, you can also sort the results and take only the top ones. For example, the following snippet fetches the top 5 most downloaded datasets on the Hub:
42
+
44
+
45
+ \>>> list(list\_datasets(sort="downloads", direction=-1, limit=5))
46
+ \[DatasetInfo(
47
+ id\='argilla/databricks-dolly-15k-curated-en',
48
+ author='argilla',
49
+ sha='4dcd1dedbe148307a833c931b21ca456a1fc4281',
50
+ last\_modified=datetime.datetime(2023, 10, 2, 12, 32, 53, tzinfo=datetime.timezone.utc),
51
+ private=False,
52
+ downloads=8889377,
53
+ (...)
54
+
55
+ To explore available filters on the Hub, visit [models](https://huggingface.co/models) and [datasets](https://huggingface.co/datasets) pages in your browser, search for some parameters and look at the values in the URL.
56
+
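For instance, the `search` and `author` parameters mirror the search box and namespace filter on those pages (the values below are only illustrative):

```python
from huggingface_hub import HfApi

api = HfApi()

# Free-text search on repo names, restricted to one author/organization.
models = api.list_models(search="bert", author="google", limit=5)
for model in models:
    print(model.id, model.downloads)
```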
finetrainers/args.py CHANGED
@@ -316,6 +316,7 @@ class BaseArgs:
316
  # Dataset arguments
317
  dataset_config: str = None
318
  dataset_shuffle_buffer_size: int = 1
 
319
  precomputation_items: int = 512
320
  precomputation_dir: Optional[str] = None
321
  precomputation_once: bool = False
@@ -420,6 +421,7 @@ class BaseArgs:
420
  dataset_arguments = {
421
  "dataset_config": self.dataset_config,
422
  "dataset_shuffle_buffer_size": self.dataset_shuffle_buffer_size,
 
423
  "precomputation_items": self.precomputation_items,
424
  "precomputation_dir": self.precomputation_dir,
425
  "precomputation_once": self.precomputation_once,
@@ -625,6 +627,7 @@ def _add_model_arguments(parser: argparse.ArgumentParser) -> None:
625
  def _add_dataset_arguments(parser: argparse.ArgumentParser) -> None:
626
  parser.add_argument("--dataset_config", type=str, required=True)
627
  parser.add_argument("--dataset_shuffle_buffer_size", type=int, default=1)
 
628
  parser.add_argument("--precomputation_items", type=int, default=512)
629
  parser.add_argument("--precomputation_dir", type=str, default=None)
630
  parser.add_argument("--precomputation_once", action="store_true")
@@ -761,6 +764,7 @@ def _map_to_args_type(args: Dict[str, Any]) -> BaseArgs:
761
  # Dataset arguments
762
  result_args.dataset_config = args.dataset_config
763
  result_args.dataset_shuffle_buffer_size = args.dataset_shuffle_buffer_size
 
764
  result_args.precomputation_items = args.precomputation_items
765
  result_args.precomputation_dir = args.precomputation_dir or os.path.join(args.output_dir, "precomputed")
766
  result_args.precomputation_once = args.precomputation_once
 
316
  # Dataset arguments
317
  dataset_config: str = None
318
  dataset_shuffle_buffer_size: int = 1
319
+ enable_precomputation: bool = False
320
  precomputation_items: int = 512
321
  precomputation_dir: Optional[str] = None
322
  precomputation_once: bool = False
 
421
  dataset_arguments = {
422
  "dataset_config": self.dataset_config,
423
  "dataset_shuffle_buffer_size": self.dataset_shuffle_buffer_size,
424
+ "enable_precomputation": self.enable_precomputation,
425
  "precomputation_items": self.precomputation_items,
426
  "precomputation_dir": self.precomputation_dir,
427
  "precomputation_once": self.precomputation_once,
 
627
  def _add_dataset_arguments(parser: argparse.ArgumentParser) -> None:
628
  parser.add_argument("--dataset_config", type=str, required=True)
629
  parser.add_argument("--dataset_shuffle_buffer_size", type=int, default=1)
630
+ parser.add_argument("--enable_precomputation", action="store_true")
631
  parser.add_argument("--precomputation_items", type=int, default=512)
632
  parser.add_argument("--precomputation_dir", type=str, default=None)
633
  parser.add_argument("--precomputation_once", action="store_true")
 
764
  # Dataset arguments
765
  result_args.dataset_config = args.dataset_config
766
  result_args.dataset_shuffle_buffer_size = args.dataset_shuffle_buffer_size
767
+ result_args.enable_precomputation = args.enable_precomputation
768
  result_args.precomputation_items = args.precomputation_items
769
  result_args.precomputation_dir = args.precomputation_dir or os.path.join(args.output_dir, "precomputed")
770
  result_args.precomputation_once = args.precomputation_once
finetrainers/config.py CHANGED
@@ -3,6 +3,7 @@ from typing import Type
3
 
4
  from .models import ModelSpecification
5
  from .models.cogvideox import CogVideoXModelSpecification
 
6
  from .models.hunyuan_video import HunyuanVideoModelSpecification
7
  from .models.ltx_video import LTXVideoModelSpecification
8
  from .models.wan import WanModelSpecification
@@ -10,6 +11,7 @@ from .models.wan import WanModelSpecification
10
 
11
  class ModelType(str, Enum):
12
  COGVIDEOX = "cogvideox"
 
13
  HUNYUAN_VIDEO = "hunyuan_video"
14
  LTX_VIDEO = "ltx_video"
15
  WAN = "wan"
@@ -21,6 +23,14 @@ class TrainingType(str, Enum):
21
 
22
 
23
  SUPPORTED_MODEL_CONFIGS = {
 
 
 
 
 
 
 
 
24
  ModelType.HUNYUAN_VIDEO: {
25
  TrainingType.LORA: HunyuanVideoModelSpecification,
26
  TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
@@ -29,10 +39,6 @@ SUPPORTED_MODEL_CONFIGS = {
29
  TrainingType.LORA: LTXVideoModelSpecification,
30
  TrainingType.FULL_FINETUNE: LTXVideoModelSpecification,
31
  },
32
- ModelType.COGVIDEOX: {
33
- TrainingType.LORA: CogVideoXModelSpecification,
34
- TrainingType.FULL_FINETUNE: CogVideoXModelSpecification,
35
- },
36
  ModelType.WAN: {
37
  TrainingType.LORA: WanModelSpecification,
38
  TrainingType.FULL_FINETUNE: WanModelSpecification,
 
3
 
4
  from .models import ModelSpecification
5
  from .models.cogvideox import CogVideoXModelSpecification
6
+ from .models.cogview4 import CogView4ModelSpecification
7
  from .models.hunyuan_video import HunyuanVideoModelSpecification
8
  from .models.ltx_video import LTXVideoModelSpecification
9
  from .models.wan import WanModelSpecification
 
11
 
12
  class ModelType(str, Enum):
13
  COGVIDEOX = "cogvideox"
14
+ COGVIEW4 = "cogview4"
15
  HUNYUAN_VIDEO = "hunyuan_video"
16
  LTX_VIDEO = "ltx_video"
17
  WAN = "wan"
 
23
 
24
 
25
  SUPPORTED_MODEL_CONFIGS = {
26
+ ModelType.COGVIDEOX: {
27
+ TrainingType.LORA: CogVideoXModelSpecification,
28
+ TrainingType.FULL_FINETUNE: CogVideoXModelSpecification,
29
+ },
30
+ ModelType.COGVIEW4: {
31
+ TrainingType.LORA: CogView4ModelSpecification,
32
+ TrainingType.FULL_FINETUNE: CogView4ModelSpecification,
33
+ },
34
  ModelType.HUNYUAN_VIDEO: {
35
  TrainingType.LORA: HunyuanVideoModelSpecification,
36
  TrainingType.FULL_FINETUNE: HunyuanVideoModelSpecification,
 
39
  TrainingType.LORA: LTXVideoModelSpecification,
40
  TrainingType.FULL_FINETUNE: LTXVideoModelSpecification,
41
  },
 
 
 
 
42
  ModelType.WAN: {
43
  TrainingType.LORA: WanModelSpecification,
44
  TrainingType.FULL_FINETUNE: WanModelSpecification,
finetrainers/data/__init__.py CHANGED
@@ -14,6 +14,14 @@ from .dataset import (
14
  initialize_dataset,
15
  wrap_iterable_dataset_for_preprocessing,
16
  )
17
- from .precomputation import DistributedDataPreprocessor, PreprocessedDataIterable
 
 
 
 
 
 
 
 
18
  from .sampler import ResolutionSampler
19
  from .utils import find_files
 
14
  initialize_dataset,
15
  wrap_iterable_dataset_for_preprocessing,
16
  )
17
+ from .precomputation import (
18
+ InMemoryDataIterable,
19
+ InMemoryDistributedDataPreprocessor,
20
+ InMemoryOnceDataIterable,
21
+ PrecomputedDataIterable,
22
+ PrecomputedDistributedDataPreprocessor,
23
+ PrecomputedOnceDataIterable,
24
+ initialize_preprocessor,
25
+ )
26
  from .sampler import ResolutionSampler
27
  from .utils import find_files
finetrainers/data/dataset.py CHANGED
@@ -29,10 +29,13 @@ decord.bridge.set_bridge("torch")
29
  logger = get_logger()
30
 
31
 
 
32
  MAX_PRECOMPUTABLE_ITEMS_LIMIT = 1024
33
  COMMON_CAPTION_FILES = ["prompt.txt", "prompts.txt", "caption.txt", "captions.txt"]
34
  COMMON_VIDEO_FILES = ["video.txt", "videos.txt"]
35
  COMMON_IMAGE_FILES = ["image.txt", "images.txt"]
 
 
36
 
37
 
38
  class ImageCaptionFilePairDataset(torch.utils.data.IterableDataset, torch.distributed.checkpoint.stateful.Stateful):
@@ -420,22 +423,69 @@ class VideoFolderDataset(torch.utils.data.IterableDataset, torch.distributed.che
420
 
421
 
422
  class ImageWebDataset(torch.utils.data.IterableDataset, torch.distributed.checkpoint.stateful.Stateful):
423
- def __init__(self, dataset_name: str, infinite: bool = False) -> None:
 
 
 
 
 
 
 
424
  super().__init__()
425
 
 
 
 
 
426
  self.dataset_name = dataset_name
427
  self.infinite = infinite
428
 
429
  data = datasets.load_dataset(dataset_name, split="train", streaming=True)
430
- data = data.rename_column("txt", "caption")
431
- for column_name in constants.SUPPORTED_IMAGE_FILE_EXTENSIONS:
432
- if column_name in data.column_names:
433
- data = data.cast_column(column_name, datasets.Image(mode="RGB"))
434
- data = data.rename_column(column_name, "image")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
435
 
436
  self._data = data
437
  self._sample_index = 0
438
  self._precomputable_once = False
 
 
439
 
440
  def _get_data_iter(self):
441
  if self._sample_index == 0:
@@ -446,6 +496,9 @@ class ImageWebDataset(torch.utils.data.IterableDataset, torch.distributed.checkp
446
  while True:
447
  for sample in self._get_data_iter():
448
  self._sample_index += 1
 
 
 
449
  yield sample
450
 
451
  if not self.infinite:
@@ -464,22 +517,69 @@ class ImageWebDataset(torch.utils.data.IterableDataset, torch.distributed.checkp
464
 
465
 
466
  class VideoWebDataset(torch.utils.data.IterableDataset, torch.distributed.checkpoint.stateful.Stateful):
467
- def __init__(self, dataset_name: str, infinite: bool = False) -> None:
 
 
 
 
 
 
 
468
  super().__init__()
469
 
 
 
 
 
470
  self.dataset_name = dataset_name
471
  self.infinite = infinite
472
 
473
  data = datasets.load_dataset(dataset_name, split="train", streaming=True)
474
- data = data.rename_column("txt", "caption")
475
- for column_name in constants.SUPPORTED_VIDEO_FILE_EXTENSIONS:
476
- if column_name in data.column_names:
477
- data = data.cast_column(column_name, datasets.Video())
478
- data = data.rename_column(column_name, "video")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
 
480
  self._data = data
481
  self._sample_index = 0
482
  self._precomputable_once = False
 
 
483
 
484
  def _get_data_iter(self):
485
  if self._sample_index == 0:
@@ -490,6 +590,9 @@ class VideoWebDataset(torch.utils.data.IterableDataset, torch.distributed.checkp
490
  while True:
491
  for sample in self._get_data_iter():
492
  self._sample_index += 1
 
 
 
493
  yield sample
494
 
495
  if not self.infinite:
@@ -600,11 +703,17 @@ class IterableDatasetPreprocessingWrapper(
600
  for sample in iter(self.dataset):
601
  if self.dataset_type == "image":
602
  if self.image_resolution_buckets:
 
 
 
603
  sample["image"] = FF.resize_to_nearest_bucket_image(
604
  sample["image"], self.image_resolution_buckets, self.reshape_mode
605
  )
606
  elif self.dataset_type == "video":
607
  if self.video_resolution_buckets:
 
 
 
608
  sample["video"], _first_frame_only = FF.resize_to_nearest_bucket_video(
609
  sample["video"], self.video_resolution_buckets, self.reshape_mode
610
  )
@@ -682,7 +791,12 @@ class IterableCombinedDataset(torch.utils.data.IterableDataset, torch.distribute
682
 
683
  # TODO(aryan): maybe write a test for this
684
  def initialize_dataset(
685
- dataset_name_or_root: str, dataset_type: str = "video", streaming: bool = True, infinite: bool = False
 
 
 
 
 
686
  ) -> torch.utils.data.IterableDataset:
687
  assert dataset_type in ["image", "video"]
688
 
@@ -692,7 +806,7 @@ def initialize_dataset(
692
  does_repo_exist_on_hub = False
693
 
694
  if does_repo_exist_on_hub:
695
- return _initialize_hub_dataset(dataset_name_or_root, dataset_type, infinite)
696
  else:
697
  return _initialize_local_dataset(dataset_name_or_root, dataset_type, infinite)
698
 
@@ -745,14 +859,33 @@ def _initialize_local_dataset(dataset_name_or_root: str, dataset_type: str, infi
745
  return dataset
746
 
747
 
748
- def _initialize_hub_dataset(dataset_name: str, dataset_type: str, infinite: bool = False):
 
 
749
  repo_file_list = list_repo_files(dataset_name, repo_type="dataset")
750
  if _has_data_caption_file_pairs(repo_file_list, remote=True):
751
  return _initialize_data_caption_file_dataset_from_hub(dataset_name, dataset_type, infinite)
752
  elif _has_data_file_caption_file_lists(repo_file_list, remote=True):
753
  return _initialize_data_file_caption_file_dataset_from_hub(dataset_name, dataset_type, infinite)
754
- else:
755
- return _initialize_webdataset(dataset_name, dataset_type, infinite)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
756
 
757
 
758
  def _initialize_data_caption_file_dataset_from_hub(
@@ -778,13 +911,14 @@ def _initialize_data_file_caption_file_dataset_from_hub(
778
 
779
 
780
  def _initialize_webdataset(
781
- dataset_name: str, dataset_type: str, infinite: bool = False
782
  ) -> torch.utils.data.IterableDataset:
783
  logger.info(f"Streaming webdataset {dataset_name} from the HF Hub")
 
784
  if dataset_type == "image":
785
- return ImageWebDataset(dataset_name, infinite=infinite)
786
  else:
787
- return VideoWebDataset(dataset_name, infinite=infinite)
788
 
789
 
790
  def _has_data_caption_file_pairs(root: Union[pathlib.Path, List[str]], remote: bool = False) -> bool:
 
29
  logger = get_logger()
30
 
31
 
32
+ # fmt: off
33
  MAX_PRECOMPUTABLE_ITEMS_LIMIT = 1024
34
  COMMON_CAPTION_FILES = ["prompt.txt", "prompts.txt", "caption.txt", "captions.txt"]
35
  COMMON_VIDEO_FILES = ["video.txt", "videos.txt"]
36
  COMMON_IMAGE_FILES = ["image.txt", "images.txt"]
37
+ COMMON_WDS_CAPTION_COLUMN_NAMES = ["txt", "text", "caption", "captions", "short_caption", "long_caption", "prompt", "prompts", "short_prompt", "long_prompt", "description", "descriptions", "alt_text", "alt_texts", "alt_caption", "alt_captions", "alt_prompt", "alt_prompts", "alt_description", "alt_descriptions", "image_description", "image_descriptions", "image_caption", "image_captions", "image_prompt", "image_prompts", "image_alt_text", "image_alt_texts", "image_alt_caption", "image_alt_captions", "image_alt_prompt", "image_alt_prompts", "image_alt_description", "image_alt_descriptions", "video_description", "video_descriptions", "video_caption", "video_captions", "video_prompt", "video_prompts", "video_alt_text", "video_alt_texts", "video_alt_caption", "video_alt_captions", "video_alt_prompt", "video_alt_prompts", "video_alt_description"]
38
+ # fmt: on
39
 
40
 
41
  class ImageCaptionFilePairDataset(torch.utils.data.IterableDataset, torch.distributed.checkpoint.stateful.Stateful):
 
423
 
424
 
425
  class ImageWebDataset(torch.utils.data.IterableDataset, torch.distributed.checkpoint.stateful.Stateful):
426
+ def __init__(
427
+ self,
428
+ dataset_name: str,
429
+ infinite: bool = False,
430
+ column_names: Union[str, List[str]] = "__auto__",
431
+ weights: Dict[str, float] = -1,
432
+ **kwargs,
433
+ ) -> None:
434
  super().__init__()
435
 
436
+ assert weights == -1 or isinstance(
437
+ weights, dict
438
+ ), "`weights` must be a dictionary of probabilities for each caption column"
439
+
440
  self.dataset_name = dataset_name
441
  self.infinite = infinite
442
 
443
  data = datasets.load_dataset(dataset_name, split="train", streaming=True)
444
+
445
+ if column_names == "__auto__":
446
+ if weights == -1:
447
+ caption_columns = [column for column in data.column_names if column in COMMON_WDS_CAPTION_COLUMN_NAMES]
448
+ if len(caption_columns) == 0:
449
+ raise ValueError(
450
+ f"No common caption column found in the dataset. Supported columns are: {COMMON_WDS_CAPTION_COLUMN_NAMES}"
451
+ )
452
+ weights = [1] * len(caption_columns)
453
+ else:
454
+ caption_columns = list(weights.keys())
455
+ weights = list(weights.values())
456
+ if not all(column in data.column_names for column in caption_columns):
457
+ raise ValueError(
458
+ f"Caption columns {caption_columns} not found in the dataset. Available columns are: {data.column_names}"
459
+ )
460
+ else:
461
+ if isinstance(column_names, str):
462
+ if column_names not in data.column_names:
463
+ raise ValueError(
464
+ f"Caption column {column_names} not found in the dataset. Available columns are: {data.column_names}"
465
+ )
466
+ caption_columns = [column_names]
467
+ weights = [1] if weights == -1 else [weights.get(column_names)]
468
+ elif isinstance(column_names, list):
469
+ if not all(column in data.column_names for column in column_names):
470
+ raise ValueError(
471
+ f"Caption columns {column_names} not found in the dataset. Available columns are: {data.column_names}"
472
+ )
473
+ caption_columns = column_names
474
+ weights = [1] if weights == -1 else [weights.get(column) for column in column_names]
475
+ else:
476
+ raise ValueError(f"Unsupported type for column_name: {type(column_names)}")
477
+
478
+ for column_names in constants.SUPPORTED_IMAGE_FILE_EXTENSIONS:
479
+ if column_names in data.column_names:
480
+ data = data.cast_column(column_names, datasets.Image(mode="RGB"))
481
+ data = data.rename_column(column_names, "image")
482
+ break
483
 
484
  self._data = data
485
  self._sample_index = 0
486
  self._precomputable_once = False
487
+ self._caption_columns = caption_columns
488
+ self._weights = weights
489
 
490
  def _get_data_iter(self):
491
  if self._sample_index == 0:
 
496
  while True:
497
  for sample in self._get_data_iter():
498
  self._sample_index += 1
499
+ caption_column = random.choices(self._caption_columns, weights=self._weights, k=1)[0]
500
+ sample["caption"] = sample[caption_column]
501
+ sample["image"] = _preprocess_image(sample["image"])
502
  yield sample
503
 
504
  if not self.infinite:
 
517
 
518
 
519
  class VideoWebDataset(torch.utils.data.IterableDataset, torch.distributed.checkpoint.stateful.Stateful):
520
+ def __init__(
521
+ self,
522
+ dataset_name: str,
523
+ infinite: bool = False,
524
+ column_names: Union[str, List[str]] = "__auto__",
525
+ weights: Dict[str, float] = -1,
526
+ **kwargs,
527
+ ) -> None:
528
  super().__init__()
529
 
530
+ assert weights == -1 or isinstance(
531
+ weights, dict
532
+ ), "`weights` must be a dictionary of probabilities for each caption column"
533
+
534
  self.dataset_name = dataset_name
535
  self.infinite = infinite
536
 
537
  data = datasets.load_dataset(dataset_name, split="train", streaming=True)
538
+
539
+ if column_names == "__auto__":
540
+ if weights == -1:
541
+ caption_columns = [column for column in data.column_names if column in COMMON_WDS_CAPTION_COLUMN_NAMES]
542
+ if len(caption_columns) == 0:
543
+ raise ValueError(
544
+ f"No common caption column found in the dataset. Supported columns are: {COMMON_WDS_CAPTION_COLUMN_NAMES}"
545
+ )
546
+ weights = [1] * len(caption_columns)
547
+ else:
548
+ caption_columns = list(weights.keys())
549
+ weights = list(weights.values())
550
+ if not all(column in data.column_names for column in caption_columns):
551
+ raise ValueError(
552
+ f"Caption columns {caption_columns} not found in the dataset. Available columns are: {data.column_names}"
553
+ )
554
+ else:
555
+ if isinstance(column_names, str):
556
+ if column_names not in data.column_names:
557
+ raise ValueError(
558
+ f"Caption column {column_names} not found in the dataset. Available columns are: {data.column_names}"
559
+ )
560
+ caption_columns = [column_names]
561
+ weights = [1] if weights == -1 else [weights.get(column_names)]
562
+ elif isinstance(column_names, list):
563
+ if not all(column in data.column_names for column in column_names):
564
+ raise ValueError(
565
+ f"Caption columns {column_names} not found in the dataset. Available columns are: {data.column_names}"
566
+ )
567
+ caption_columns = column_names
568
+ weights = [1] if weights == -1 else [weights.get(column) for column in column_names]
569
+ else:
570
+ raise ValueError(f"Unsupported type for column_name: {type(column_names)}")
571
+
572
+ for column_names in constants.SUPPORTED_VIDEO_FILE_EXTENSIONS:
573
+ if column_names in data.column_names:
574
+ data = data.cast_column(column_names, datasets.Video())
575
+ data = data.rename_column(column_names, "video")
576
+ break
577
 
578
  self._data = data
579
  self._sample_index = 0
580
  self._precomputable_once = False
581
+ self._caption_columns = caption_columns
582
+ self._weights = weights
583
 
584
  def _get_data_iter(self):
585
  if self._sample_index == 0:
 
590
  while True:
591
  for sample in self._get_data_iter():
592
  self._sample_index += 1
593
+ caption_column = random.choices(self._caption_columns, weights=self._weights, k=1)[0]
594
+ sample["caption"] = sample[caption_column]
595
+ sample["video"] = _preprocess_video(sample["video"])
596
  yield sample
597
 
598
  if not self.infinite:
 
703
  for sample in iter(self.dataset):
704
  if self.dataset_type == "image":
705
  if self.image_resolution_buckets:
706
+ sample["_original_num_frames"] = 1
707
+ sample["_original_height"] = sample["image"].size(1)
708
+ sample["_original_width"] = sample["image"].size(2)
709
  sample["image"] = FF.resize_to_nearest_bucket_image(
710
  sample["image"], self.image_resolution_buckets, self.reshape_mode
711
  )
712
  elif self.dataset_type == "video":
713
  if self.video_resolution_buckets:
714
+ sample["_original_num_frames"] = sample["video"].size(0)
715
+ sample["_original_height"] = sample["video"].size(2)
716
+ sample["_original_width"] = sample["video"].size(3)
717
  sample["video"], _first_frame_only = FF.resize_to_nearest_bucket_video(
718
  sample["video"], self.video_resolution_buckets, self.reshape_mode
719
  )
 
791
 
792
  # TODO(aryan): maybe write a test for this
793
  def initialize_dataset(
794
+ dataset_name_or_root: str,
795
+ dataset_type: str = "video",
796
+ streaming: bool = True,
797
+ infinite: bool = False,
798
+ *,
799
+ _caption_options: Optional[Dict[str, Any]] = None,
800
  ) -> torch.utils.data.IterableDataset:
801
  assert dataset_type in ["image", "video"]
802
 
 
806
  does_repo_exist_on_hub = False
807
 
808
  if does_repo_exist_on_hub:
809
+ return _initialize_hub_dataset(dataset_name_or_root, dataset_type, infinite, _caption_options=_caption_options)
810
  else:
811
  return _initialize_local_dataset(dataset_name_or_root, dataset_type, infinite)
812
 
 
859
  return dataset
860
 
861
 
862
+ def _initialize_hub_dataset(
863
+ dataset_name: str, dataset_type: str, infinite: bool = False, *, _caption_options: Optional[Dict[str, Any]] = None
864
+ ):
865
  repo_file_list = list_repo_files(dataset_name, repo_type="dataset")
866
  if _has_data_caption_file_pairs(repo_file_list, remote=True):
867
  return _initialize_data_caption_file_dataset_from_hub(dataset_name, dataset_type, infinite)
868
  elif _has_data_file_caption_file_lists(repo_file_list, remote=True):
869
  return _initialize_data_file_caption_file_dataset_from_hub(dataset_name, dataset_type, infinite)
870
+
871
+ has_tar_files = any(file.endswith(".tar") or file.endswith(".parquet") for file in repo_file_list)
872
+ if has_tar_files:
873
+ return _initialize_webdataset(dataset_name, dataset_type, infinite, _caption_options=_caption_options)
874
+
875
+ # TODO(aryan): This should be improved
876
+ caption_files = [pathlib.Path(file).name for file in repo_file_list if file.endswith(".txt")]
877
+ if len(caption_files) < MAX_PRECOMPUTABLE_ITEMS_LIMIT:
878
+ try:
879
+ dataset_root = snapshot_download(dataset_name, repo_type="dataset")
880
+ if dataset_type == "image":
881
+ dataset = ImageFolderDataset(dataset_root, infinite=infinite)
882
+ else:
883
+ dataset = VideoFolderDataset(dataset_root, infinite=infinite)
884
+ return dataset
885
+ except Exception:
886
+ pass
887
+
888
+ raise ValueError(f"Could not load dataset {dataset_name} from the HF Hub")
889
 
890
 
891
  def _initialize_data_caption_file_dataset_from_hub(
 
911
 
912
 
913
  def _initialize_webdataset(
914
+ dataset_name: str, dataset_type: str, infinite: bool = False, _caption_options: Optional[Dict[str, Any]] = None
915
  ) -> torch.utils.data.IterableDataset:
916
  logger.info(f"Streaming webdataset {dataset_name} from the HF Hub")
917
+ _caption_options = _caption_options or {}
918
  if dataset_type == "image":
919
+ return ImageWebDataset(dataset_name, infinite=infinite, **_caption_options)
920
  else:
921
+ return VideoWebDataset(dataset_name, infinite=infinite, **_caption_options)
922
 
923
 
924
  def _has_data_caption_file_pairs(root: Union[pathlib.Path, List[str]], remote: bool = False) -> bool:
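The caption handling added above lets a dataset expose several caption columns and sample one per example according to user-provided weights. A minimal, self-contained sketch of that sampling behavior (the column names and weights are hypothetical, not the finetrainers API):

    import random

    # Hypothetical example: two caption columns sampled at a 3:1 ratio.
    caption_columns = ["short_caption", "long_caption"]
    weights = [3, 1]

    sample = {"short_caption": "a cat", "long_caption": "a fluffy orange cat on a sofa"}

    # Pick one caption column per sample, proportionally to its weight, and expose it
    # under the unified "caption" key, mirroring the iterator changes above.
    chosen = random.choices(caption_columns, weights=weights, k=1)[0]
    sample["caption"] = sample[chosen]
    print(sample["caption"])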
finetrainers/data/precomputation.py CHANGED
@@ -1,13 +1,132 @@
1
  import pathlib
2
- from typing import Any, Callable, Dict, Iterable, Optional
3
 
4
  import torch
5
  from tqdm.auto import tqdm
6
 
7
  from .. import utils
 
8
 
9
 
10
- class DistributedDataPreprocessor:
11
  def __init__(
12
  self,
13
  rank: int,
@@ -15,13 +134,15 @@ class DistributedDataPreprocessor:
15
  processor_fn: Dict[str, Callable[[Dict[str, Any]], Dict[str, Any]]],
16
  save_dir: str,
17
  ) -> None:
 
 
18
  self._rank = rank
19
  self._num_items = num_items
20
  self._processor_fn = processor_fn
21
  self._save_dir = pathlib.Path(save_dir)
22
 
23
  self._cached_samples = []
24
- self._preprocessed_iterator: "PreprocessedDataIterable" = None
25
 
26
  self._save_dir.mkdir(parents=True, exist_ok=True)
27
 
@@ -59,9 +180,8 @@ class DistributedDataPreprocessor:
59
  if drop_samples:
60
  del self._cached_samples
61
  self._cached_samples = []
62
- utils.free_memory()
63
 
64
- self._preprocessed_iterator = PreprocessedDataIterable(self._rank, self._save_dir, data_type)
65
  return iter(self._preprocessed_iterator)
66
 
67
  def consume_once(
@@ -95,9 +215,8 @@ class DistributedDataPreprocessor:
95
  if drop_samples:
96
  del self._cached_samples
97
  self._cached_samples = []
98
- utils.free_memory()
99
 
100
- self._preprocessed_iterator = PreprocessedOnceDataIterable(self._rank, self._save_dir, data_type)
101
  return iter(self._preprocessed_iterator)
102
 
103
  @property
@@ -107,7 +226,70 @@ class DistributedDataPreprocessor:
107
  return self._preprocessed_iterator.requires_data
108
 
109
 
110
- class PreprocessedDataIterable:
111
  def __init__(self, rank: int, save_dir: str, data_type: str) -> None:
112
  self._rank = rank
113
  self._save_dir = pathlib.Path(save_dir)
@@ -130,7 +312,13 @@ class PreprocessedDataIterable:
130
  return self._requires_data
131
 
132
 
133
- class PreprocessedOnceDataIterable:
 
 
 
 
 
 
134
  def __init__(self, rank: int, save_dir: str, data_type: str) -> None:
135
  self._rank = rank
136
  self._save_dir = pathlib.Path(save_dir)
@@ -153,6 +341,31 @@ class PreprocessedOnceDataIterable:
153
  return self._requires_data
154
 
155
 
156
  def _save_item(rank: int, index: int, item: Dict[str, Any], directory: pathlib.Path, data_type: str) -> None:
157
  filename = directory / f"{data_type}-{rank}-{index}.pt"
158
  torch.save(item, filename.as_posix())
 
1
  import pathlib
2
+ from typing import Any, Callable, Dict, Iterable, List, Optional, Union
3
 
4
  import torch
5
  from tqdm.auto import tqdm
6
 
7
  from .. import utils
8
+ from ..logging import get_logger
9
 
10
 
11
+ logger = get_logger()
12
+
13
+
14
+ def initialize_preprocessor(
15
+ rank: int,
16
+ num_items: int,
17
+ processor_fn: Dict[str, Callable[[Dict[str, Any]], Dict[str, Any]]],
18
+ save_dir: Optional[str] = None,
19
+ enable_precomputation: bool = False,
20
+ ) -> Union["InMemoryDistributedDataPreprocessor", "PrecomputedDistributedDataPreprocessor"]:
21
+ if enable_precomputation:
22
+ return PrecomputedDistributedDataPreprocessor(rank, num_items, processor_fn, save_dir)
23
+ return InMemoryDistributedDataPreprocessor(rank, num_items, processor_fn)
24
+
25
+
26
+ class DistributedDataProcessorMixin:
27
+ def consume(self, *args, **kwargs):
28
+ raise NotImplementedError("DistributedDataProcessorMixin::consume must be implemented by the subclass.")
29
+
30
+ def consume_once(self, *args, **kwargs):
31
+ raise NotImplementedError("DistributedDataProcessorMixin::consume_once must be implemented by the subclass.")
32
+
33
+ @property
34
+ def requires_data(self):
35
+ raise NotImplementedError("DistributedDataProcessorMixin::requires_data must be implemented by the subclass.")
36
+
37
+
38
+ class InMemoryDistributedDataPreprocessor(DistributedDataProcessorMixin):
39
+ def __init__(
40
+ self, rank: int, num_items: int, processor_fn: Dict[str, Callable[[Dict[str, Any]], Dict[str, Any]]]
41
+ ) -> None:
42
+ super().__init__()
43
+
44
+ self._rank = rank
45
+ self._num_items = num_items
46
+ self._processor_fn = processor_fn
47
+
48
+ self._cached_samples = []
49
+ self._buffer = InMemoryDataBuffer(num_items)
50
+ self._preprocessed_iterator: Union["InMemoryDataIterable", "InMemoryOnceDataIterable"] = None
51
+
52
+ def consume(
53
+ self,
54
+ data_type: str,
55
+ components: Dict[str, Any],
56
+ data_iterator,
57
+ generator: Optional[torch.Generator] = None,
58
+ cache_samples: bool = False,
59
+ use_cached_samples: bool = False,
60
+ drop_samples: bool = False,
61
+ ) -> Iterable[Dict[str, Any]]:
62
+ if data_type not in self._processor_fn.keys():
63
+ raise ValueError(f"Invalid data type: {data_type}. Supported types: {list(self._processor_fn.keys())}")
64
+ if cache_samples:
65
+ if use_cached_samples:
66
+ raise ValueError("Cannot cache and use cached samples at the same time.")
67
+ if drop_samples:
68
+ raise ValueError("Cannot cache and drop samples at the same time.")
69
+
70
+ for i in range(self._num_items):
71
+ if use_cached_samples:
72
+ item = self._cached_samples[i]
73
+ else:
74
+ item = next(data_iterator)
75
+ if cache_samples:
76
+ self._cached_samples.append(item)
77
+ item = self._processor_fn[data_type](**item, **components, generator=generator)
78
+ self._buffer.add(data_type, item)
79
+
80
+ if drop_samples:
81
+ del self._cached_samples
82
+ self._cached_samples = []
83
+
84
+ self._preprocessed_iterator = InMemoryDataIterable(self._rank, data_type, self._buffer)
85
+ return iter(self._preprocessed_iterator)
86
+
87
+ def consume_once(
88
+ self,
89
+ data_type: str,
90
+ components: Dict[str, Any],
91
+ data_iterator,
92
+ generator: Optional[torch.Generator] = None,
93
+ cache_samples: bool = False,
94
+ use_cached_samples: bool = False,
95
+ drop_samples: bool = False,
96
+ ) -> Iterable[Dict[str, Any]]:
97
+ if data_type not in self._processor_fn.keys():
98
+ raise ValueError(f"Invalid data type: {data_type}. Supported types: {list(self._processor_fn.keys())}")
99
+ if cache_samples:
100
+ if use_cached_samples:
101
+ raise ValueError("Cannot cache and use cached samples at the same time.")
102
+ if drop_samples:
103
+ raise ValueError("Cannot cache and drop samples at the same time.")
104
+
105
+ for i in range(self._num_items):
106
+ if use_cached_samples:
107
+ item = self._cached_samples[i]
108
+ else:
109
+ item = next(data_iterator)
110
+ if cache_samples:
111
+ self._cached_samples.append(item)
112
+ item = self._processor_fn[data_type](**item, **components, generator=generator)
113
+ self._buffer.add(data_type, item)
114
+
115
+ if drop_samples:
116
+ del self._cached_samples
117
+ self._cached_samples = []
118
+
119
+ self._preprocessed_iterator = InMemoryOnceDataIterable(self._rank, data_type, self._buffer)
120
+ return iter(self._preprocessed_iterator)
121
+
122
+ @property
123
+ def requires_data(self):
124
+ if self._preprocessed_iterator is None:
125
+ return True
126
+ return self._preprocessed_iterator.requires_data
127
+
128
+
129
+ class PrecomputedDistributedDataPreprocessor(DistributedDataProcessorMixin):
130
  def __init__(
131
  self,
132
  rank: int,
 
134
  processor_fn: Dict[str, Callable[[Dict[str, Any]], Dict[str, Any]]],
135
  save_dir: str,
136
  ) -> None:
137
+ super().__init__()
138
+
139
  self._rank = rank
140
  self._num_items = num_items
141
  self._processor_fn = processor_fn
142
  self._save_dir = pathlib.Path(save_dir)
143
 
144
  self._cached_samples = []
145
+ self._preprocessed_iterator: Union["PrecomputedDataIterable", "PrecomputedOnceDataIterable"] = None
146
 
147
  self._save_dir.mkdir(parents=True, exist_ok=True)
148
 
 
180
  if drop_samples:
181
  del self._cached_samples
182
  self._cached_samples = []
 
183
 
184
+ self._preprocessed_iterator = PrecomputedDataIterable(self._rank, self._save_dir, data_type)
185
  return iter(self._preprocessed_iterator)
186
 
187
  def consume_once(
 
215
  if drop_samples:
216
  del self._cached_samples
217
  self._cached_samples = []
 
218
 
219
+ self._preprocessed_iterator = PrecomputedOnceDataIterable(self._rank, self._save_dir, data_type)
220
  return iter(self._preprocessed_iterator)
221
 
222
  @property
 
226
  return self._preprocessed_iterator.requires_data
227
 
228
 
229
+ class InMemoryDataIterable:
230
+ """
231
+ An iterator that loads data items from an in-memory buffer. Once all the data is consumed,
232
+ `requires_data` is set to True, indicating that more data is required and the preprocessor's
233
+ consume method should be called again.
234
+ """
235
+
236
+ def __init__(self, rank: int, data_type: str, buffer: "InMemoryDataBuffer") -> None:
237
+ self._rank = rank
238
+ self._data_type = data_type
239
+ self._buffer = buffer
240
+
241
+ self._requires_data = False
242
+
243
+ def __iter__(self) -> Iterable[Dict[str, Any]]:
244
+ while (length := self._buffer.get_length(self._data_type)) > 0:
245
+ if length <= 1:
246
+ self._requires_data = True
247
+ yield self._buffer.get(self._data_type)
248
+
249
+ def __len__(self) -> int:
250
+ return self._buffer.get_length(self._data_type)
251
+
252
+ @property
253
+ def requires_data(self):
254
+ return self._requires_data
255
+
256
+
257
+ class InMemoryOnceDataIterable:
258
+ """
259
+ An iterator that loads data items from an in-memory buffer. This iterator will never set
260
+ `requires_data` to True, as it is assumed that all the data was configured to be preprocessed
261
+ by the user. The data will be cycled from the buffer indefinitely.
262
+ """
263
+
264
+ def __init__(self, rank: int, data_type: str, buffer: "InMemoryDataBuffer") -> None:
265
+ self._rank = rank
266
+ self._data_type = data_type
267
+ self._buffer = buffer
268
+
269
+ self._requires_data = False
270
+
271
+ def __iter__(self) -> Iterable[Dict[str, Any]]:
272
+ assert len(self) > 0, "No data available in the buffer."
273
+ while True:
274
+ item = self._buffer.get(self._data_type)
275
+ yield item
276
+ self._buffer.add(self._data_type, item)
277
+
278
+ def __len__(self) -> int:
279
+ return self._buffer.get_length(self._data_type)
280
+
281
+ @property
282
+ def requires_data(self):
283
+ return self._requires_data
284
+
285
+
286
+ class PrecomputedDataIterable:
287
+ """
288
+ An iterator that loads a preconfigured number of data items from disk. Once all the data is
289
+ loaded, `requires_data` is set to True, indicating that more data is required and
290
+ the preprocessor's consume method should be called again.
291
+ """
292
+
293
  def __init__(self, rank: int, save_dir: str, data_type: str) -> None:
294
  self._rank = rank
295
  self._save_dir = pathlib.Path(save_dir)
 
312
  return self._requires_data
313
 
314
 
315
+ class PrecomputedOnceDataIterable:
316
+ """
317
+ An infinite iterator that loads preprocessed data from disk. Once initialized, this iterator
318
+ will never set `requires_data` to True, as it is assumed that all the data was configured to
319
+ be preprocessed by the user.
320
+ """
321
+
322
  def __init__(self, rank: int, save_dir: str, data_type: str) -> None:
323
  self._rank = rank
324
  self._save_dir = pathlib.Path(save_dir)
 
341
  return self._requires_data
342
 
343
 
344
+ class InMemoryDataBuffer:
345
+ def __init__(self, max_limit: int = -1) -> None:
346
+ self.max_limit = max_limit
347
+ self.buffer: Dict[str, List[Dict[str, Any]]] = {}
348
+
349
+ def add(self, data_type: str, item: Dict[str, Any]) -> None:
350
+ if data_type not in self.buffer:
351
+ self.buffer[data_type] = []
352
+ if self.max_limit != -1 and len(self.buffer[data_type]) >= self.max_limit:
353
+ logger.log_freq(
354
+ "WARN",
355
+ "IN_MEMORY_DATA_BUFFER_FULL",
356
+ "Buffer is full. Dropping the oldest item. This message will be logged every 64th time this happens.",
357
+ 64,
358
+ )
359
+ self.buffer[data_type].pop(0)
360
+ self.buffer[data_type].append(item)
361
+
362
+ def get(self, data_type: str) -> Dict[str, Any]:
363
+ return self.buffer[data_type].pop(0)
364
+
365
+ def get_length(self, data_type: str) -> int:
366
+ return len(self.buffer[data_type])
367
+
368
+
369
  def _save_item(rank: int, index: int, item: Dict[str, Any], directory: pathlib.Path, data_type: str) -> None:
370
  filename = directory / f"{data_type}-{rank}-{index}.pt"
371
  torch.save(item, filename.as_posix())
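The in-memory path added above revolves around a small FIFO buffer keyed by data type, selected via initialize_preprocessor when precomputation is disabled; when the buffer is full, the oldest item is dropped. A rough standalone sketch of that buffer behavior (simplified, no logging; names shortened and not part of the library):

    from typing import Any, Dict, List

    class TinyBuffer:
        """FIFO buffer keyed by data type; drops the oldest item when full."""
        def __init__(self, max_limit: int = -1) -> None:
            self.max_limit = max_limit
            self.buffer: Dict[str, List[Any]] = {}

        def add(self, data_type: str, item: Any) -> None:
            queue = self.buffer.setdefault(data_type, [])
            if self.max_limit != -1 and len(queue) >= self.max_limit:
                queue.pop(0)  # drop the oldest item
            queue.append(item)

        def get(self, data_type: str) -> Any:
            return self.buffer[data_type].pop(0)

    buf = TinyBuffer(max_limit=2)
    for i in range(3):
        buf.add("latent", {"step": i})
    print(buf.get("latent"))  # {'step': 1} -- item 0 was dropped when the buffer filled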
finetrainers/functional/image.py CHANGED
@@ -22,7 +22,7 @@ def resize_crop_image(image: torch.Tensor, size: Tuple[int, int]) -> torch.Tenso
22
 
23
 
24
  def bicubic_resize_image(image: torch.Tensor, size: Tuple[int, int]) -> torch.Tensor:
25
- return F.interpolate(image, size=size, mode="bicubic", align_corners=False)
26
 
27
 
28
  def find_nearest_resolution_image(image: torch.Tensor, resolution_buckets: List[Tuple[int, int]]) -> Tuple[int, int]:
 
22
 
23
 
24
  def bicubic_resize_image(image: torch.Tensor, size: Tuple[int, int]) -> torch.Tensor:
25
+ return F.interpolate(image.unsqueeze(0), size=size, mode="bicubic", align_corners=False)[0]
26
 
27
 
28
  def find_nearest_resolution_image(image: torch.Tensor, resolution_buckets: List[Tuple[int, int]]) -> Tuple[int, int]:
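The one-line change to bicubic_resize_image adds the batch dimension that torch.nn.functional.interpolate requires: bicubic interpolation in PyTorch only supports 4D (N, C, H, W) input, so calling it on a single 3D (C, H, W) image raises an error. A small sketch of the fixed call (assuming a CHW input tensor, as the new code does):

    import torch
    import torch.nn.functional as F

    image = torch.rand(3, 480, 640)  # (C, H, W), no batch dimension

    # mode="bicubic" needs 4D input, so the image is unsqueezed to (1, C, H, W)
    # before interpolation and the batch dimension is indexed away afterwards.
    resized = F.interpolate(image.unsqueeze(0), size=(256, 256), mode="bicubic", align_corners=False)[0]
    print(resized.shape)  # torch.Size([3, 256, 256])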
finetrainers/models/cogvideox/base_specification.py CHANGED
@@ -105,7 +105,7 @@ class CogVideoXModelSpecification(ModelSpecification):
105
  )
106
 
107
  if condition_model_processors is None:
108
- condition_model_processors = [T5Processor(["prompt_embeds", "prompt_attention_mask"])]
109
  if latent_model_processors is None:
110
  latent_model_processors = [CogVideoXLatentEncodeProcessor(["latents"])]
111
 
@@ -337,7 +337,6 @@ class CogVideoXModelSpecification(ModelSpecification):
337
  latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
338
  latent_model_conditions["image_rotary_emb"] = image_rotary_emb
339
  latent_model_conditions["ofs"] = ofs_emb
340
- condition_model_conditions["encoder_hidden_states"] = condition_model_conditions.pop("prompt_embeds")
341
 
342
  velocity = transformer(
343
  **latent_model_conditions,
 
105
  )
106
 
107
  if condition_model_processors is None:
108
+ condition_model_processors = [T5Processor(["encoder_hidden_states", "prompt_attention_mask"])]
109
  if latent_model_processors is None:
110
  latent_model_processors = [CogVideoXLatentEncodeProcessor(["latents"])]
111
 
 
337
  latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
338
  latent_model_conditions["image_rotary_emb"] = image_rotary_emb
339
  latent_model_conditions["ofs"] = ofs_emb
 
340
 
341
  velocity = transformer(
342
  **latent_model_conditions,
finetrainers/models/cogview4/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .base_specification import CogView4ModelSpecification
finetrainers/models/cogview4/base_specification.py ADDED
@@ -0,0 +1,395 @@
1
+ import os
2
+ from typing import Any, Dict, List, Optional, Tuple
3
+
4
+ import torch
5
+ from accelerate import init_empty_weights
6
+ from diffusers import (
7
+ AutoencoderKL,
8
+ CogView4Pipeline,
9
+ CogView4Transformer2DModel,
10
+ FlowMatchEulerDiscreteScheduler,
11
+ )
12
+ from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution
13
+ from transformers import AutoTokenizer, GlmModel
14
+
15
+ from ... import data
16
+ from ... import functional as FF
17
+ from ...logging import get_logger
18
+ from ...processors import CogView4GLMProcessor, ProcessorMixin
19
+ from ...typing import ArtifactType, SchedulerType
20
+ from ...utils import get_non_null_items
21
+ from ..modeling_utils import ModelSpecification
22
+
23
+
24
+ logger = get_logger()
25
+
26
+
27
+ class CogView4LatentEncodeProcessor(ProcessorMixin):
28
+ r"""
29
+ Processor to encode image/video into latents using the CogView4 VAE.
30
+
31
+ Args:
32
+ output_names (`List[str]`):
33
+ The names of the outputs that the processor returns. The outputs are in the following order:
34
+ - latents: The latents of the input image/video.
35
+ - original_size: The original size of the input image/video.
36
+ - target_size: The target size of the input image/video.
37
+ - crop_coords: The top-left crop coordinates of the input image/video.
38
+ """
39
+
40
+ def __init__(self, output_names: List[str]):
41
+ super().__init__()
42
+
43
+ self.output_names = output_names
44
+ assert len(self.output_names) == 4
45
+
46
+ def forward(
47
+ self,
48
+ vae: AutoencoderKL,
49
+ image: Optional[torch.Tensor] = None,
50
+ video: Optional[torch.Tensor] = None,
51
+ generator: Optional[torch.Generator] = None,
52
+ compute_posterior: bool = True,
53
+ _original_height: Optional[int] = None,
54
+ _original_width: Optional[int] = None,
55
+ ) -> Dict[str, torch.Tensor]:
56
+ device = vae.device
57
+ dtype = vae.dtype
58
+
59
+ if video is not None:
60
+ # TODO(aryan): perhaps better would be to flatten(0, 1), but need to account for reshaping sigmas accordingly
61
+ image = video[:, 0] # [B, F, C, H, W] -> [B, 1, C, H, W]
62
+
63
+ assert image.ndim == 4, f"Expected 4D tensor, got {image.ndim}D tensor"
64
+ image = image.to(device=device, dtype=vae.dtype)
65
+
66
+ if compute_posterior:
67
+ latents = vae.encode(image).latent_dist.sample(generator=generator)
68
+ latents = latents.to(dtype=dtype)
69
+ else:
70
+ if vae.use_slicing and image.shape[0] > 1:
71
+ encoded_slices = [vae._encode(x_slice) for x_slice in image.split(1)]
72
+ moments = torch.cat(encoded_slices)
73
+ else:
74
+ moments = vae._encode(image)
75
+ latents = moments.to(dtype=dtype)
76
+
77
+ batch_size = latents.size(0)
78
+ target_height = image.size(2)
79
+ target_width = image.size(3)
80
+ original_size = torch.tensor([(_original_height, _original_width)], device=device, dtype=dtype).repeat(
81
+ batch_size, 1
82
+ )
83
+ target_size = torch.tensor([(target_height, target_width)], device=device, dtype=dtype).repeat(batch_size, 1)
84
+ crop_coords = torch.tensor([(0, 0)], device=device, dtype=dtype).repeat(batch_size, 1)
85
+
86
+ return {
87
+ self.output_names[0]: latents,
88
+ self.output_names[1]: original_size,
89
+ self.output_names[2]: target_size,
90
+ self.output_names[3]: crop_coords,
91
+ }
92
+
93
+
94
+ class CogView4ModelSpecification(ModelSpecification):
95
+ def __init__(
96
+ self,
97
+ pretrained_model_name_or_path: str = "THUDM/CogView4-6B",
98
+ tokenizer_id: Optional[str] = None,
99
+ text_encoder_id: Optional[str] = None,
100
+ transformer_id: Optional[str] = None,
101
+ vae_id: Optional[str] = None,
102
+ text_encoder_dtype: torch.dtype = torch.bfloat16,
103
+ transformer_dtype: torch.dtype = torch.bfloat16,
104
+ vae_dtype: torch.dtype = torch.bfloat16,
105
+ revision: Optional[str] = None,
106
+ cache_dir: Optional[str] = None,
107
+ condition_model_processors: List[ProcessorMixin] = None,
108
+ latent_model_processors: List[ProcessorMixin] = None,
109
+ **kwargs,
110
+ ) -> None:
111
+ super().__init__(
112
+ pretrained_model_name_or_path=pretrained_model_name_or_path,
113
+ tokenizer_id=tokenizer_id,
114
+ text_encoder_id=text_encoder_id,
115
+ transformer_id=transformer_id,
116
+ vae_id=vae_id,
117
+ text_encoder_dtype=text_encoder_dtype,
118
+ transformer_dtype=transformer_dtype,
119
+ vae_dtype=vae_dtype,
120
+ revision=revision,
121
+ cache_dir=cache_dir,
122
+ )
123
+
124
+ if condition_model_processors is None:
125
+ condition_model_processors = [CogView4GLMProcessor(["encoder_hidden_states"])]
126
+ if latent_model_processors is None:
127
+ latent_model_processors = [
128
+ CogView4LatentEncodeProcessor(["latents", "original_size", "target_size", "crop_coords"])
129
+ ]
130
+
131
+ self.condition_model_processors = condition_model_processors
132
+ self.latent_model_processors = latent_model_processors
133
+
134
+ @property
135
+ def _resolution_dim_keys(self):
136
+ return {"latents": (2, 3)}
137
+
138
+ def load_condition_models(self) -> Dict[str, torch.nn.Module]:
139
+ if self.tokenizer_id is not None:
140
+ tokenizer = AutoTokenizer.from_pretrained(
141
+ self.tokenizer_id, revision=self.revision, cache_dir=self.cache_dir
142
+ )
143
+ else:
144
+ tokenizer = AutoTokenizer.from_pretrained(
145
+ self.pretrained_model_name_or_path,
146
+ subfolder="tokenizer",
147
+ revision=self.revision,
148
+ cache_dir=self.cache_dir,
149
+ )
150
+
151
+ if self.text_encoder_id is not None:
152
+ text_encoder = GlmModel.from_pretrained(
153
+ self.text_encoder_id,
154
+ torch_dtype=self.text_encoder_dtype,
155
+ revision=self.revision,
156
+ cache_dir=self.cache_dir,
157
+ )
158
+ else:
159
+ text_encoder = GlmModel.from_pretrained(
160
+ self.pretrained_model_name_or_path,
161
+ subfolder="text_encoder",
162
+ torch_dtype=self.text_encoder_dtype,
163
+ revision=self.revision,
164
+ cache_dir=self.cache_dir,
165
+ )
166
+
167
+ return {"tokenizer": tokenizer, "text_encoder": text_encoder}
168
+
169
+ def load_latent_models(self) -> Dict[str, torch.nn.Module]:
170
+ if self.vae_id is not None:
171
+ vae = AutoencoderKL.from_pretrained(
172
+ self.vae_id,
173
+ torch_dtype=self.vae_dtype,
174
+ revision=self.revision,
175
+ cache_dir=self.cache_dir,
176
+ )
177
+ else:
178
+ vae = AutoencoderKL.from_pretrained(
179
+ self.pretrained_model_name_or_path,
180
+ subfolder="vae",
181
+ torch_dtype=self.vae_dtype,
182
+ revision=self.revision,
183
+ cache_dir=self.cache_dir,
184
+ )
185
+
186
+ return {"vae": vae}
187
+
188
+ def load_diffusion_models(self) -> Dict[str, torch.nn.Module]:
189
+ if self.transformer_id is not None:
190
+ transformer = CogView4Transformer2DModel.from_pretrained(
191
+ self.transformer_id,
192
+ torch_dtype=self.transformer_dtype,
193
+ revision=self.revision,
194
+ cache_dir=self.cache_dir,
195
+ )
196
+ else:
197
+ transformer = CogView4Transformer2DModel.from_pretrained(
198
+ self.pretrained_model_name_or_path,
199
+ subfolder="transformer",
200
+ torch_dtype=self.transformer_dtype,
201
+ revision=self.revision,
202
+ cache_dir=self.cache_dir,
203
+ )
204
+
205
+ scheduler = FlowMatchEulerDiscreteScheduler()
206
+
207
+ return {"transformer": transformer, "scheduler": scheduler}
208
+
209
+ def load_pipeline(
210
+ self,
211
+ tokenizer: Optional[AutoTokenizer] = None,
212
+ text_encoder: Optional[GlmModel] = None,
213
+ transformer: Optional[CogView4Transformer2DModel] = None,
214
+ vae: Optional[AutoencoderKL] = None,
215
+ scheduler: Optional[FlowMatchEulerDiscreteScheduler] = None,
216
+ enable_slicing: bool = False,
217
+ enable_tiling: bool = False,
218
+ enable_model_cpu_offload: bool = False,
219
+ training: bool = False,
220
+ **kwargs,
221
+ ) -> CogView4Pipeline:
222
+ components = {
223
+ "tokenizer": tokenizer,
224
+ "text_encoder": text_encoder,
225
+ "transformer": transformer,
226
+ "vae": vae,
227
+ # Load the scheduler based on CogView4's config instead of using the default initialization being used for training
228
+ # "scheduler": scheduler,
229
+ }
230
+ components = get_non_null_items(components)
231
+
232
+ pipe = CogView4Pipeline.from_pretrained(
233
+ self.pretrained_model_name_or_path, **components, revision=self.revision, cache_dir=self.cache_dir
234
+ )
235
+ pipe.text_encoder.to(self.text_encoder_dtype)
236
+ pipe.vae.to(self.vae_dtype)
237
+
238
+ if not training:
239
+ pipe.transformer.to(self.transformer_dtype)
240
+
241
+ if enable_slicing:
242
+ pipe.vae.enable_slicing()
243
+ if enable_tiling:
244
+ pipe.vae.enable_tiling()
245
+ if enable_model_cpu_offload:
246
+ pipe.enable_model_cpu_offload()
247
+
248
+ return pipe
249
+
250
+ @torch.no_grad()
251
+ def prepare_conditions(
252
+ self,
253
+ tokenizer: AutoTokenizer,
254
+ text_encoder: GlmModel,
255
+ caption: str,
256
+ max_sequence_length: int = 1024,
257
+ **kwargs,
258
+ ) -> Dict[str, Any]:
259
+ conditions = {
260
+ "tokenizer": tokenizer,
261
+ "text_encoder": text_encoder,
262
+ "caption": caption,
263
+ "max_sequence_length": max_sequence_length,
264
+ **kwargs,
265
+ }
266
+ input_keys = set(conditions.keys())
267
+ conditions = super().prepare_conditions(**conditions)
268
+ conditions = {k: v for k, v in conditions.items() if k not in input_keys}
269
+ return conditions
270
+
271
+ @torch.no_grad()
272
+ def prepare_latents(
273
+ self,
274
+ vae: AutoencoderKL,
275
+ image: Optional[torch.Tensor] = None,
276
+ video: Optional[torch.Tensor] = None,
277
+ generator: Optional[torch.Generator] = None,
278
+ compute_posterior: bool = True,
279
+ _original_height: Optional[int] = None,
280
+ _original_width: Optional[int] = None,
281
+ **kwargs,
282
+ ) -> Dict[str, torch.Tensor]:
283
+ conditions = {
284
+ "vae": vae,
285
+ "image": image,
286
+ "video": video,
287
+ "generator": generator,
288
+ "compute_posterior": compute_posterior,
289
+ "_original_height": _original_height,
290
+ "_original_width": _original_width,
291
+ **kwargs,
292
+ }
293
+ input_keys = set(conditions.keys())
294
+ conditions = super().prepare_latents(**conditions)
295
+ conditions = {k: v for k, v in conditions.items() if k not in input_keys}
296
+ return conditions
297
+
298
+ def forward(
299
+ self,
300
+ transformer: CogView4Transformer2DModel,
301
+ condition_model_conditions: Dict[str, torch.Tensor],
302
+ latent_model_conditions: Dict[str, torch.Tensor],
303
+ sigmas: torch.Tensor,
304
+ generator: Optional[torch.Generator] = None,
305
+ compute_posterior: bool = True,
306
+ **kwargs,
307
+ ) -> Tuple[torch.Tensor, ...]:
308
+ if compute_posterior:
309
+ latents = latent_model_conditions.pop("latents")
310
+ else:
311
+ posterior = DiagonalGaussianDistribution(latent_model_conditions.pop("latents"))
312
+ latents = posterior.sample(generator=generator)
313
+ del posterior
314
+
315
+ latents = (latents - self.vae_config.shift_factor) * self.vae_config.scaling_factor
316
+ noise = torch.zeros_like(latents).normal_(generator=generator)
317
+ timesteps = (sigmas.flatten() * 1000.0).long()
318
+
319
+ base_image_sequence_length = 256
320
+ base_shift = 0.25
321
+ max_shift = 0.75
322
+
323
+ image_sequence_length = latents.size(2) * latents.size(3) // self.transformer_config.patch_size**2
324
+ mu = (image_sequence_length / base_image_sequence_length) ** 0.5
325
+ mu = mu * max_shift + base_shift
326
+ shifted_sigmas = mu / (mu + (1 / sigmas - 1) ** 1.0)
327
+ noisy_latents = FF.flow_match_xt(latents, noise, shifted_sigmas)
328
+
329
+ latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
330
+
331
+ pred = transformer(
332
+ **latent_model_conditions,
333
+ **condition_model_conditions,
334
+ timestep=timesteps,
335
+ return_dict=False,
336
+ )[0]
337
+ target = FF.flow_match_target(noise, latents)
338
+
339
+ # NOTE: shifted_sigmas loss weighting seems to work better than sigmas. Needs more investigation
340
+ # but let's keep it this way for now. Longer training runs should reveal more insights.
341
+ # return pred, target, sigmas
342
+ return pred, target, shifted_sigmas
343
+
344
+ def validation(
345
+ self,
346
+ pipeline: CogView4Pipeline,
347
+ prompt: str,
348
+ height: Optional[int] = None,
349
+ width: Optional[int] = None,
350
+ num_inference_steps: int = 50,
351
+ generator: Optional[torch.Generator] = None,
352
+ **kwargs,
353
+ ) -> List[ArtifactType]:
354
+ generation_kwargs = {
355
+ "prompt": prompt,
356
+ "height": height,
357
+ "width": width,
358
+ "num_inference_steps": num_inference_steps,
359
+ "generator": generator,
360
+ "return_dict": True,
361
+ "output_type": "pil",
362
+ }
363
+ generation_kwargs = get_non_null_items(generation_kwargs)
364
+ image = pipeline(**generation_kwargs).images[0]
365
+ return [data.ImageArtifact(value=image)]
366
+
367
+ def _save_lora_weights(
368
+ self,
369
+ directory: str,
370
+ transformer_state_dict: Optional[Dict[str, torch.Tensor]] = None,
371
+ scheduler: Optional[SchedulerType] = None,
372
+ *args,
373
+ **kwargs,
374
+ ) -> None:
375
+ # TODO(aryan): this needs refactoring
376
+ if transformer_state_dict is not None:
377
+ CogView4Pipeline.save_lora_weights(directory, transformer_state_dict, safe_serialization=True)
378
+ if scheduler is not None:
379
+ scheduler.save_pretrained(os.path.join(directory, "scheduler"))
380
+
381
+ def _save_model(
382
+ self,
383
+ directory: str,
384
+ transformer: CogView4Transformer2DModel,
385
+ transformer_state_dict: Optional[Dict[str, torch.Tensor]] = None,
386
+ scheduler: Optional[SchedulerType] = None,
387
+ ) -> None:
388
+ # TODO(aryan): this needs refactoring
389
+ if transformer_state_dict is not None:
390
+ with init_empty_weights():
391
+ transformer_copy = CogView4Transformer2DModel.from_config(transformer.config)
392
+ transformer_copy.load_state_dict(transformer_state_dict, strict=True, assign=True)
393
+ transformer_copy.save_pretrained(os.path.join(directory, "transformer"))
394
+ if scheduler is not None:
395
+ scheduler.save_pretrained(os.path.join(directory, "scheduler"))
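The CogView4 forward pass above applies a resolution-dependent shift to the flow-matching sigmas before noising the latents: the shift grows with the image token count relative to a base sequence length of 256. A numeric sketch of that computation (the constants mirror the code above; the resolution example assumes an 8x VAE downsample and patch size 2, which are illustrative):

    import torch

    base_image_sequence_length = 256
    base_shift, max_shift = 0.25, 0.75

    # e.g. 1024x1024 pixels -> 128x128 latents -> (128 // 2) ** 2 = 4096 image tokens
    image_sequence_length = 4096
    sigmas = torch.tensor([0.25, 0.5, 0.75])

    mu = (image_sequence_length / base_image_sequence_length) ** 0.5  # 4.0
    mu = mu * max_shift + base_shift                                  # 3.25
    shifted_sigmas = mu / (mu + (1 / sigmas - 1) ** 1.0)
    print(shifted_sigmas)  # larger resolutions push the sigmas toward 1 (more noise)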
finetrainers/models/hunyuan_video/base_specification.py CHANGED
@@ -117,10 +117,7 @@ class HunyuanVideoModelSpecification(ModelSpecification):
117
 
118
  @property
119
  def _resolution_dim_keys(self):
120
- # TODO
121
- return {
122
- "latents": (2, 3, 4),
123
- }
124
 
125
  def load_condition_models(self) -> Dict[str, torch.nn.Module]:
126
  if self.tokenizer_id is not None:
 
117
 
118
  @property
119
  def _resolution_dim_keys(self):
120
+ return {"latents": (2, 3, 4)}
 
 
 
121
 
122
  def load_condition_models(self) -> Dict[str, torch.nn.Module]:
123
  if self.tokenizer_id is not None:
finetrainers/models/ltx_video/base_specification.py CHANGED
@@ -120,7 +120,7 @@ class LTXVideoModelSpecification(ModelSpecification):
120
  )
121
 
122
  if condition_model_processors is None:
123
- condition_model_processors = [T5Processor(["prompt_embeds", "prompt_attention_mask"])]
124
  if latent_model_processors is None:
125
  latent_model_processors = [
126
  LTXLatentEncodeProcessor(["latents", "num_frames", "height", "width", "latents_mean", "latents_std"])
@@ -131,9 +131,7 @@ class LTXVideoModelSpecification(ModelSpecification):
131
 
132
  @property
133
  def _resolution_dim_keys(self):
134
- return {
135
- "latents": (2, 3, 4),
136
- }
137
 
138
  def load_condition_models(self) -> Dict[str, torch.nn.Module]:
139
  if self.tokenizer_id is not None:
@@ -342,8 +340,6 @@ class LTXVideoModelSpecification(ModelSpecification):
342
  sigmas = sigmas.view(-1, 1, 1).expand(-1, *noisy_latents.shape[1:-1], -1)
343
 
344
  latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
345
- condition_model_conditions["encoder_hidden_states"] = condition_model_conditions.pop("prompt_embeds")
346
- condition_model_conditions["encoder_attention_mask"] = condition_model_conditions.pop("prompt_attention_mask")
347
 
348
  # TODO(aryan): make this configurable
349
  frame_rate = 25
 
120
  )
121
 
122
  if condition_model_processors is None:
123
+ condition_model_processors = [T5Processor(["encoder_hidden_states", "encoder_attention_mask"])]
124
  if latent_model_processors is None:
125
  latent_model_processors = [
126
  LTXLatentEncodeProcessor(["latents", "num_frames", "height", "width", "latents_mean", "latents_std"])
 
131
 
132
  @property
133
  def _resolution_dim_keys(self):
134
+ return {"latents": (2, 3, 4)}
 
 
135
 
136
  def load_condition_models(self) -> Dict[str, torch.nn.Module]:
137
  if self.tokenizer_id is not None:
 
340
  sigmas = sigmas.view(-1, 1, 1).expand(-1, *noisy_latents.shape[1:-1], -1)
341
 
342
  latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
 
 
343
 
344
  # TODO(aryan): make this configurable
345
  frame_rate = 25
finetrainers/models/modeling_utils.py CHANGED
@@ -115,9 +115,6 @@ class ModelSpecification:
115
  f"ModelSpecification::load_pipeline is not implemented for {self.__class__.__name__}"
116
  )
117
 
118
- def collate_fn(self, batch: List[List[Dict[str, torch.Tensor]]]) -> Dict[str, torch.Tensor]:
119
- raise NotImplementedError(f"ModelSpecification::collate_fn is not implemented for {self.__class__.__name__}")
120
-
121
  def prepare_conditions(self, **kwargs) -> Dict[str, Any]:
122
  for processor in self.condition_model_processors:
123
  result = processor(**kwargs)
 
115
  f"ModelSpecification::load_pipeline is not implemented for {self.__class__.__name__}"
116
  )
117
 
 
 
 
118
  def prepare_conditions(self, **kwargs) -> Dict[str, Any]:
119
  for processor in self.condition_model_processors:
120
  result = processor(**kwargs)
finetrainers/models/wan/base_specification.py CHANGED
@@ -34,11 +34,6 @@ class WanLatentEncodeProcessor(ProcessorMixin):
34
  output_names (`List[str]`):
35
  The names of the outputs that the processor returns. The outputs are in the following order:
36
  - latents: The latents of the input image/video.
37
- - num_frames: The number of frames in the input video.
38
- - height: The height of the input image/video.
39
- - width: The width of the input image/video.
40
- - latents_mean: The latent channel means from the VAE state dict.
41
- - latents_std: The latent channel standard deviations from the VAE state dict.
42
  """
43
 
44
  def __init__(self, output_names: List[str]):
@@ -111,7 +106,7 @@ class WanModelSpecification(ModelSpecification):
111
  )
112
 
113
  if condition_model_processors is None:
114
- condition_model_processors = [T5Processor(["prompt_embeds", "prompt_attention_mask"])]
115
  if latent_model_processors is None:
116
  latent_model_processors = [WanLatentEncodeProcessor(["latents"])]
117
 
@@ -120,10 +115,7 @@ class WanModelSpecification(ModelSpecification):
120
 
121
  @property
122
  def _resolution_dim_keys(self):
123
- # TODO
124
- return {
125
- "latents": (2, 3, 4),
126
- }
127
 
128
  def load_condition_models(self) -> Dict[str, torch.nn.Module]:
129
  if self.tokenizer_id is not None:
@@ -303,7 +295,6 @@ class WanModelSpecification(ModelSpecification):
303
  noisy_latents = FF.flow_match_xt(latents, noise, sigmas)
304
 
305
  latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
306
- condition_model_conditions["encoder_hidden_states"] = condition_model_conditions.pop("prompt_embeds")
307
 
308
  timesteps = (sigmas.flatten() * 1000.0).long()
309
 
 
34
  output_names (`List[str]`):
35
  The names of the outputs that the processor returns. The outputs are in the following order:
36
  - latents: The latents of the input image/video.
 
 
 
 
 
37
  """
38
 
39
  def __init__(self, output_names: List[str]):
 
106
  )
107
 
108
  if condition_model_processors is None:
109
+ condition_model_processors = [T5Processor(["encoder_hidden_states", "prompt_attention_mask"])]
110
  if latent_model_processors is None:
111
  latent_model_processors = [WanLatentEncodeProcessor(["latents"])]
112
 
 
115
 
116
  @property
117
  def _resolution_dim_keys(self):
118
+ return {"latents": (2, 3, 4)}
 
 
 
119
 
120
  def load_condition_models(self) -> Dict[str, torch.nn.Module]:
121
  if self.tokenizer_id is not None:
 
295
  noisy_latents = FF.flow_match_xt(latents, noise, sigmas)
296
 
297
  latent_model_conditions["hidden_states"] = noisy_latents.to(latents)
 
298
 
299
  timesteps = (sigmas.flatten() * 1000.0).long()
300
 
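Across the CogVideoX, LTX-Video, and Wan specifications above, the text-encoder processors now emit "encoder_hidden_states" (and, for LTX, "encoder_attention_mask") up front, so the condition dict can be splatted straight into the transformer call instead of being popped and renamed inside forward. A toy illustration of the difference (placeholder values, not the real model call):

    condition_model_conditions = {"encoder_hidden_states": "<T5 embeddings>"}

    # Before: processors emitted "prompt_embeds", so forward() had to rename the key first:
    #     condition_model_conditions["encoder_hidden_states"] = condition_model_conditions.pop("prompt_embeds")
    # After: the processor already uses the name the transformer expects, so the dict
    # passes through unchanged via transformer(**condition_model_conditions, ...).
    def fake_transformer(encoder_hidden_states=None, **kwargs):
        return encoder_hidden_states

    print(fake_transformer(**condition_model_conditions))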
finetrainers/processors/__init__.py CHANGED
@@ -1,5 +1,6 @@
1
  from .base import ProcessorMixin
2
  from .clip import CLIPPooledProcessor
 
3
  from .llama import LlamaProcessor
4
  from .t5 import T5Processor
5
  from .text import CaptionEmbeddingDropoutProcessor, CaptionTextDropoutProcessor
 
1
  from .base import ProcessorMixin
2
  from .clip import CLIPPooledProcessor
3
+ from .glm import CogView4GLMProcessor
4
  from .llama import LlamaProcessor
5
  from .t5 import T5Processor
6
  from .text import CaptionEmbeddingDropoutProcessor, CaptionTextDropoutProcessor
finetrainers/processors/glm.py ADDED
@@ -0,0 +1,74 @@
1
+ from typing import Dict, List, Union
2
+
3
+ import torch
4
+ from transformers import AutoTokenizer, GlmModel
5
+
6
+ from .base import ProcessorMixin
7
+
8
+
9
+ class CogView4GLMProcessor(ProcessorMixin):
10
+ r"""
11
+ Processor for the GLM family of models. This processor is used to encode text inputs and return the embeddings
12
+ and attention masks for the input text.
13
+
14
+ This processor is specific to CogView4 but can be used with any other model.
15
+
16
+ Args:
17
+ output_names (`List[str]`):
18
+ The names of the outputs that the processor should return. The first output is the embeddings of the input
19
+ text and the second output is the attention mask for the input text.
20
+ """
21
+
22
+ def __init__(self, output_names: List[str]):
23
+ super().__init__()
24
+
25
+ self.output_names = output_names
26
+
27
+ assert len(self.output_names) == 1
28
+
29
+ def forward(
30
+ self,
31
+ tokenizer: AutoTokenizer,
32
+ text_encoder: GlmModel,
33
+ caption: Union[str, List[str]],
34
+ max_sequence_length: int,
35
+ ) -> Dict[str, torch.Tensor]:
36
+ r"""
37
+ Encode the input text and return the embeddings for the input text.
38
+
39
+ Args:
40
+ tokenizer (`AutoTokenizer`):
41
+ The tokenizer used to tokenize the input text.
42
+ text_encoder (`GlmModel`):
43
+ The text encoder used to encode the input text.
44
+ caption (`Union[str, List[str]]`):
45
+ The input text to be encoded.
46
+ max_sequence_length (`int`):
47
+ The maximum sequence length of the input text.
48
+ """
49
+ if isinstance(caption, str):
50
+ caption = [caption]
51
+
52
+ device = text_encoder.device
53
+ dtype = text_encoder.dtype
54
+
55
+ text_inputs = tokenizer(
56
+ caption,
57
+ padding="longest",
58
+ max_length=max_sequence_length,
59
+ truncation=True,
60
+ add_special_tokens=True,
61
+ return_tensors="pt",
62
+ )
63
+ text_input_ids = text_inputs.input_ids.to(device)
64
+
65
+ current_length = text_input_ids.size(1)
66
+ pad_length = 16 - current_length % 16
67
+ if pad_length > 0:
68
+ pad_ids = text_input_ids.new_full((text_input_ids.shape[0], pad_length), fill_value=tokenizer.pad_token_id)
69
+ text_input_ids = torch.cat([pad_ids, text_input_ids], dim=1)
70
+
71
+ prompt_embeds = text_encoder(text_input_ids, output_hidden_states=True).hidden_states[-2]
72
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
73
+
74
+ return {self.output_names[0]: prompt_embeds}
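The new GLM processor left-pads the tokenized prompt so its length is a multiple of 16 before encoding. A standalone sketch of just that padding arithmetic (note that, as written above, a length that is already a multiple of 16 gains a full extra block of 16 pad tokens, since pad_length is never reduced modulo 16):

    import torch

    pad_token_id = 0  # hypothetical pad id; the real one comes from the tokenizer

    def pad_to_multiple_of_16(text_input_ids: torch.Tensor) -> torch.Tensor:
        current_length = text_input_ids.size(1)
        pad_length = 16 - current_length % 16  # equals 16 when already aligned (see note above)
        if pad_length > 0:
            pad_ids = text_input_ids.new_full((text_input_ids.shape[0], pad_length), fill_value=pad_token_id)
            text_input_ids = torch.cat([pad_ids, text_input_ids], dim=1)
        return text_input_ids

    ids = torch.ones(1, 23, dtype=torch.long)
    print(pad_to_multiple_of_16(ids).shape)  # torch.Size([1, 32])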
finetrainers/trainer/sft_trainer/trainer.py CHANGED
@@ -2,6 +2,7 @@ import functools
2
  import json
3
  import math
4
  import os
 
5
  from pathlib import Path
6
  from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
7
 
@@ -33,6 +34,13 @@ logger = logging.get_logger()
33
 
34
 
35
  class SFTTrainer:
 
 
 
 
 
 
 
36
  def __init__(self, args: "BaseArgs", model_specification: "ModelSpecification") -> None:
37
  self.args = args
38
  self.state = State()
@@ -72,6 +80,7 @@ class SFTTrainer:
72
  patches.perform_patches_for_training(self.args, self.state.parallel_backend)
73
 
74
  self.model_specification = model_specification
 
75
 
76
  def run(self) -> None:
77
  try:
@@ -254,12 +263,15 @@ class SFTTrainer:
254
  data_root = config.pop("data_root", None)
255
  dataset_file = config.pop("dataset_file", None)
256
  dataset_type = config.pop("dataset_type")
 
257
 
258
  if data_root is not None and dataset_file is not None:
259
  raise ValueError("Both data_root and dataset_file cannot be provided in the same dataset config.")
260
 
261
  dataset_name_or_root = data_root or dataset_file
262
- dataset = data.initialize_dataset(dataset_name_or_root, dataset_type, streaming=True, infinite=True)
 
 
263
 
264
  if not dataset._precomputable_once and self.args.precomputation_once:
265
  raise ValueError(
@@ -369,9 +381,9 @@ class SFTTrainer:
369
  self.transformer.train()
370
  data_iterator = iter(self.dataloader)
371
 
372
- preprocessor = data.DistributedDataPreprocessor(
373
  rank=parallel_backend.rank,
374
- num_items=self.args.precomputation_items,
375
  processor_fn={
376
  "condition": self.model_specification.prepare_conditions,
377
  "latent": functools.partial(
@@ -379,6 +391,7 @@ class SFTTrainer:
379
  ),
380
  },
381
  save_dir=self.args.precomputation_dir,
 
382
  )
383
  precomputed_condition_iterator: Iterable[Dict[str, Any]] = None
384
  precomputed_latent_iterator: Iterable[Dict[str, Any]] = None
@@ -495,7 +508,6 @@ class SFTTrainer:
495
 
496
  if train_state.step % self.args.gradient_accumulation_steps == 0:
497
  # TODO(aryan): revisit no_sync() for FSDP
498
- # TODO(aryan): average the gradients for accumulation?
499
  self.optimizer.step()
500
  self.lr_scheduler.step()
501
  self.optimizer.zero_grad()
@@ -651,28 +663,29 @@ class SFTTrainer:
651
  # TODO(aryan): Currently, we only support WandB so we've hardcoded it here. Needs to be revisited.
652
  for index, (key, artifact) in enumerate(list(artifacts.items())):
653
  assert isinstance(artifact, (data.ImageArtifact, data.VideoArtifact))
 
 
654
  filename = "validation-" if not final_validation else "final-"
655
- filename += f"{step}-{parallel_backend.rank}-{index}-{prompt_filename}.{artifact.file_extension}"
656
  output_filename = os.path.join(self.args.output_dir, filename)
657
 
658
  if parallel_backend.is_main_process and artifact.file_extension == "mp4":
659
  main_process_prompts_to_filenames[PROMPT] = filename
660
 
661
- caption = f"{PROMPT} | (filename: {output_filename})"
662
  if artifact.type == "image" and artifact.value is not None:
663
  logger.debug(
664
  f"Saving image from rank={parallel_backend.rank} to {output_filename}",
665
  local_main_process_only=False,
666
  )
667
  artifact.value.save(output_filename)
668
- all_processes_artifacts.append(wandb.Image(output_filename, caption=caption))
669
  elif artifact.type == "video" and artifact.value is not None:
670
  logger.debug(
671
  f"Saving video from rank={parallel_backend.rank} to {output_filename}",
672
  local_main_process_only=False,
673
  )
674
  export_to_video(artifact.value, output_filename, fps=EXPORT_FPS)
675
- all_processes_artifacts.append(wandb.Video(output_filename, caption=caption))
676
 
677
  # 3. Cleanup & log artifacts
678
  parallel_backend.wait_for_everyone()
@@ -804,24 +817,16 @@ class SFTTrainer:
804
  component.to(device)
805
 
806
  def _set_components(self, components: Dict[str, Any]) -> None:
807
- # fmt: off
808
- component_names = ["tokenizer", "tokenizer_2", "tokenizer_3", "text_encoder", "text_encoder_2", "text_encoder_3", "transformer", "unet", "vae", "scheduler"]
809
- # fmt: on
810
-
811
- for component_name in component_names:
812
  existing_component = getattr(self, component_name, None)
813
  new_component = components.get(component_name, existing_component)
814
  setattr(self, component_name, new_component)
815
 
816
  def _delete_components(self, component_names: Optional[List[str]] = None) -> None:
817
  if component_names is None:
818
- # fmt: off
819
- component_names = ["tokenizer", "tokenizer_2", "tokenizer_3", "text_encoder", "text_encoder_2", "text_encoder_3", "transformer", "unet", "vae", "scheduler"]
820
- # fmt: on
821
-
822
  for component_name in component_names:
823
  setattr(self, component_name, None)
824
-
825
  utils.free_memory()
826
  utils.synchronize_device()
827
 
@@ -848,7 +853,6 @@ class SFTTrainer:
848
  training=True,
849
  )
850
  else:
851
- # TODO(aryan): this branch does not work yet, needs to be implemented
852
  self._delete_components()
853
 
854
  # Load the transformer weights from the final checkpoint if performing full-finetune
@@ -874,50 +878,101 @@ class SFTTrainer:
874
  self._move_components_to_device(list(components.values()))
875
  return pipeline
876
 
877
- def _prepare_data(self, preprocessor: data.DistributedDataPreprocessor, data_iterator):
878
- logger.info("Precomputed condition & latent data exhausted. Loading & preprocessing new data.")
879
- if self.args.precomputation_once:
880
- consume_fn = preprocessor.consume_once
  else:
882
- consume_fn = preprocessor.consume
883
-
884
- condition_components = self.model_specification.load_condition_models()
885
- component_names = list(condition_components.keys())
886
- component_modules = list(condition_components.values())
887
- self._set_components(condition_components)
888
- self._move_components_to_device(component_modules)
889
- precomputed_condition_iterator = consume_fn(
890
- "condition",
891
- components=condition_components,
892
- data_iterator=data_iterator,
893
- generator=self.state.generator,
894
- cache_samples=True,
895
- )
896
- self._delete_components(component_names)
897
- del condition_components, component_names, component_modules
898
-
899
- latent_components = self.model_specification.load_latent_models()
900
- if self.vae is not None:
901
- if self.args.enable_slicing:
902
- self.vae.enable_slicing()
903
- if self.args.enable_tiling:
904
- self.vae.enable_tiling()
905
- component_names = list(latent_components.keys())
906
- component_modules = list(latent_components.values())
907
- self._set_components(latent_components)
908
- self._move_components_to_device(component_modules)
909
- precomputed_latent_iterator = consume_fn(
910
- "latent",
911
- components=latent_components,
912
- data_iterator=data_iterator,
913
- generator=self.state.generator,
914
- use_cached_samples=True,
915
- drop_samples=True,
916
- )
917
- self._delete_components(component_names)
918
- del latent_components, component_names, component_modules
919
 
920
- return precomputed_condition_iterator, precomputed_latent_iterator
921
 
922
  def _get_training_info(self) -> Dict[str, Any]:
923
  info = self.args.to_dict()
 
2
  import json
3
  import math
4
  import os
5
+ import time
6
  from pathlib import Path
7
  from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Union
8
 
 
34
 
35
 
36
  class SFTTrainer:
37
+ # fmt: off
38
+ _all_component_names = ["tokenizer", "tokenizer_2", "tokenizer_3", "text_encoder", "text_encoder_2", "text_encoder_3", "transformer", "unet", "vae", "scheduler"]
39
+ _condition_component_names = ["tokenizer", "tokenizer_2", "tokenizer_3", "text_encoder", "text_encoder_2", "text_encoder_3"]
40
+ _latent_component_names = ["vae"]
41
+ _diffusion_component_names = ["transformer", "unet", "scheduler"]
42
+ # fmt: on
43
+
44
  def __init__(self, args: "BaseArgs", model_specification: "ModelSpecification") -> None:
45
  self.args = args
46
  self.state = State()
 
80
  patches.perform_patches_for_training(self.args, self.state.parallel_backend)
81
 
82
  self.model_specification = model_specification
83
+ self._are_condition_models_loaded = False
84
 
85
  def run(self) -> None:
86
  try:
 
263
  data_root = config.pop("data_root", None)
264
  dataset_file = config.pop("dataset_file", None)
265
  dataset_type = config.pop("dataset_type")
266
+ caption_options = config.pop("caption_options", {})
267
 
268
  if data_root is not None and dataset_file is not None:
269
  raise ValueError("Both data_root and dataset_file cannot be provided in the same dataset config.")
270
 
271
  dataset_name_or_root = data_root or dataset_file
272
+ dataset = data.initialize_dataset(
273
+ dataset_name_or_root, dataset_type, streaming=True, infinite=True, _caption_options=caption_options
274
+ )
275
 
276
  if not dataset._precomputable_once and self.args.precomputation_once:
277
  raise ValueError(
 
381
  self.transformer.train()
382
  data_iterator = iter(self.dataloader)
383
 
384
+ preprocessor = data.initialize_preprocessor(
385
  rank=parallel_backend.rank,
386
+ num_items=self.args.precomputation_items if self.args.enable_precomputation else 1,
387
  processor_fn={
388
  "condition": self.model_specification.prepare_conditions,
389
  "latent": functools.partial(
 
391
  ),
392
  },
393
  save_dir=self.args.precomputation_dir,
394
+ enable_precomputation=self.args.enable_precomputation,
395
  )
396
  precomputed_condition_iterator: Iterable[Dict[str, Any]] = None
397
  precomputed_latent_iterator: Iterable[Dict[str, Any]] = None
 
508
 
509
  if train_state.step % self.args.gradient_accumulation_steps == 0:
510
  # TODO(aryan): revisit no_sync() for FSDP
 
511
  self.optimizer.step()
512
  self.lr_scheduler.step()
513
  self.optimizer.zero_grad()
 
663
  # TODO(aryan): Currently, we only support WandB so we've hardcoded it here. Needs to be revisited.
664
  for index, (key, artifact) in enumerate(list(artifacts.items())):
665
  assert isinstance(artifact, (data.ImageArtifact, data.VideoArtifact))
666
+
667
+ time_, rank, ext = int(time.time()), parallel_backend.rank, artifact.file_extension
668
  filename = "validation-" if not final_validation else "final-"
669
+ filename += f"{step}-{rank}-{index}-{prompt_filename}-{time_}.{ext}"
670
  output_filename = os.path.join(self.args.output_dir, filename)
671
 
672
  if parallel_backend.is_main_process and artifact.file_extension == "mp4":
673
  main_process_prompts_to_filenames[PROMPT] = filename
674
 
 
675
  if artifact.type == "image" and artifact.value is not None:
676
  logger.debug(
677
  f"Saving image from rank={parallel_backend.rank} to {output_filename}",
678
  local_main_process_only=False,
679
  )
680
  artifact.value.save(output_filename)
681
+ all_processes_artifacts.append(wandb.Image(output_filename, caption=PROMPT))
682
  elif artifact.type == "video" and artifact.value is not None:
683
  logger.debug(
684
  f"Saving video from rank={parallel_backend.rank} to {output_filename}",
685
  local_main_process_only=False,
686
  )
687
  export_to_video(artifact.value, output_filename, fps=EXPORT_FPS)
688
+ all_processes_artifacts.append(wandb.Video(output_filename, caption=PROMPT))
689
 
690
  # 3. Cleanup & log artifacts
691
  parallel_backend.wait_for_everyone()
 
817
  component.to(device)
818
 
819
  def _set_components(self, components: Dict[str, Any]) -> None:
820
+ for component_name in self._all_component_names:
 
 
 
 
821
  existing_component = getattr(self, component_name, None)
822
  new_component = components.get(component_name, existing_component)
823
  setattr(self, component_name, new_component)
824
 
825
  def _delete_components(self, component_names: Optional[List[str]] = None) -> None:
826
  if component_names is None:
827
+ component_names = self._all_component_names
 
 
 
828
  for component_name in component_names:
829
  setattr(self, component_name, None)
 
830
  utils.free_memory()
831
  utils.synchronize_device()
832
 
 
853
  training=True,
854
  )
855
  else:
 
856
  self._delete_components()
857
 
858
  # Load the transformer weights from the final checkpoint if performing full-finetune
 
878
  self._move_components_to_device(list(components.values()))
879
  return pipeline
880
 
881
+ def _prepare_data(
882
+ self,
883
+ preprocessor: Union[data.InMemoryDistributedDataPreprocessor, data.PrecomputedDistributedDataPreprocessor],
884
+ data_iterator,
885
+ ):
886
+ if not self.args.enable_precomputation:
887
+ if not self._are_condition_models_loaded:
888
+ logger.info(
889
+ "Precomputation disabled. Loading in-memory data loaders. All components will be loaded on GPUs."
890
+ )
891
+ condition_components = self.model_specification.load_condition_models()
892
+ latent_components = self.model_specification.load_latent_models()
893
+ all_components = {**condition_components, **latent_components}
894
+ self._set_components(all_components)
895
+ self._move_components_to_device(list(all_components.values()))
896
+ utils._enable_vae_memory_optimizations(self.vae, self.args.enable_slicing, self.args.enable_tiling)
897
+ else:
898
+ condition_components = {k: v for k in self._condition_component_names if (v := getattr(self, k, None))}
899
+ latent_components = {k: v for k in self._latent_component_names if (v := getattr(self, k, None))}
900
+
901
+ condition_iterator = preprocessor.consume(
902
+ "condition",
903
+ components=condition_components,
904
+ data_iterator=data_iterator,
905
+ generator=self.state.generator,
906
+ cache_samples=True,
907
+ )
908
+ latent_iterator = preprocessor.consume(
909
+ "latent",
910
+ components=latent_components,
911
+ data_iterator=data_iterator,
912
+ generator=self.state.generator,
913
+ use_cached_samples=True,
914
+ drop_samples=True,
915
+ )
916
+
917
+ self._are_condition_models_loaded = True
918
  else:
919
+ logger.info("Precomputed condition & latent data exhausted. Loading & preprocessing new data.")
920
+
921
+ # TODO(aryan): This needs to be revisited. For some reason, the tests did not detect that self.transformer
922
+ # had become None after this but should have been loaded back from the checkpoint.
923
+ # parallel_backend = self.state.parallel_backend
924
+ # train_state = self.state.train_state
925
+ # self.checkpointer.save(
926
+ # train_state.step,
927
+ # force=True,
928
+ # _device=parallel_backend.device,
929
+ # _is_main_process=parallel_backend.is_main_process,
930
+ # )
931
+ # self._delete_components(component_names=["transformer", "unet"])
932
+
933
+ if self.args.precomputation_once:
934
+ consume_fn = preprocessor.consume_once
935
+ else:
936
+ consume_fn = preprocessor.consume
937
+
938
+ # Prepare condition iterators
939
+ condition_components = self.model_specification.load_condition_models()
940
+ component_names = list(condition_components.keys())
941
+ component_modules = list(condition_components.values())
942
+ self._set_components(condition_components)
943
+ self._move_components_to_device(component_modules)
944
+ condition_iterator = consume_fn(
945
+ "condition",
946
+ components=condition_components,
947
+ data_iterator=data_iterator,
948
+ generator=self.state.generator,
949
+ cache_samples=True,
950
+ )
951
+ self._delete_components(component_names)
952
+ del condition_components, component_names, component_modules
953
+
954
+ # Prepare latent iterators
955
+ latent_components = self.model_specification.load_latent_models()
956
+ utils._enable_vae_memory_optimizations(self.vae, self.args.enable_slicing, self.args.enable_tiling)
957
+ component_names = list(latent_components.keys())
958
+ component_modules = list(latent_components.values())
959
+ self._set_components(latent_components)
960
+ self._move_components_to_device(component_modules)
961
+ latent_iterator = consume_fn(
962
+ "latent",
963
+ components=latent_components,
964
+ data_iterator=data_iterator,
965
+ generator=self.state.generator,
966
+ use_cached_samples=True,
967
+ drop_samples=True,
968
+ )
969
+ self._delete_components(component_names)
970
+ del latent_components, component_names, component_modules
971
+
972
+ # self.checkpointer.load()
973
+ # self.transformer = self.checkpointer.states["model"].model[0]
974
 
975
+ return condition_iterator, latent_iterator
976
 
977
  def _get_training_info(self) -> Dict[str, Any]:
978
  info = self.args.to_dict()
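The trainer now forwards an optional caption_options entry from each dataset config into data.initialize_dataset. A hypothetical config fragment showing the shape such an entry might take (shown as a Python dict; the on-disk format and the key names inside caption_options are assumptions, mirroring the weighted caption-column sampling earlier in this commit):

    # Hypothetical dataset config entry -- placeholder dataset id and assumed keys.
    dataset_config = {
        "data_root": "hf-org/my-webdataset",
        "dataset_type": "video",
        "caption_options": {  # forwarded to initialize_dataset as _caption_options
            "column_names": ["short_caption", "long_caption"],
            "weights": {"short_caption": 3, "long_caption": 1},
        },
    }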
finetrainers/utils/__init__.py CHANGED
@@ -4,6 +4,7 @@ from typing import Any, Dict, List, Optional, Set, Tuple, Union
4
  from .activation_checkpoint import apply_activation_checkpointing
5
  from .data import determine_batch_size, should_perform_precomputation
6
  from .diffusion import (
 
7
  default_flow_shift,
8
  get_scheduler_alphas,
9
  get_scheduler_sigmas,
 
4
  from .activation_checkpoint import apply_activation_checkpointing
5
  from .data import determine_batch_size, should_perform_precomputation
6
  from .diffusion import (
7
+ _enable_vae_memory_optimizations,
8
  default_flow_shift,
9
  get_scheduler_alphas,
10
  get_scheduler_sigmas,
finetrainers/utils/diffusion.py CHANGED
@@ -143,3 +143,10 @@ def prepare_target(
143
  raise ValueError(f"Unsupported scheduler type {type(scheduler)}")
144
 
145
  return target
 
 
 
 
 
 
 
 
143
  raise ValueError(f"Unsupported scheduler type {type(scheduler)}")
144
 
145
  return target
146
+
147
+
148
+ def _enable_vae_memory_optimizations(vae, enable_slicing: bool = False, enable_tiling: bool = False):
149
+ if hasattr(vae, "enable_slicing") and enable_slicing:
150
+ vae.enable_slicing()
151
+ if hasattr(vae, "enable_tiling") and enable_tiling:
152
+ vae.enable_tiling()
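The new helper centralizes the VAE slicing/tiling toggles that were previously enabled inline in the trainer, and it only calls the methods the given VAE actually exposes. A short usage sketch (the checkpoint id is a placeholder):

    from diffusers import AutoencoderKL
    from finetrainers.utils import _enable_vae_memory_optimizations

    vae = AutoencoderKL.from_pretrained("some/checkpoint", subfolder="vae")  # placeholder id
    # Calls vae.enable_slicing() / vae.enable_tiling() only if the VAE has those methods.
    _enable_vae_memory_optimizations(vae, enable_slicing=True, enable_tiling=True)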
requirements.txt CHANGED
@@ -40,5 +40,5 @@ av==14.1.0
40
  git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
41
 
42
  # for our frontend
43
- gradio==5.15.0
44
  gradio_toggle
 
40
  git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
41
 
42
  # for our frontend
43
+ gradio==5.20.1
44
  gradio_toggle
requirements_without_flash_attention.txt CHANGED
@@ -39,5 +39,5 @@ av==14.1.0
39
  git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
40
 
41
  # for our frontend
42
- gradio==5.15.0
43
  gradio_toggle
 
39
  git+https://github.com/LLaVA-VL/LLaVA-NeXT.git
40
 
41
  # for our frontend
42
+ gradio==5.20.1
43
  gradio_toggle