# coding=utf-8
# Copyright 2022-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains utilities to manage the HF cache directory."""

import os
import shutil
import time
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, FrozenSet, List, Literal, Optional, Set, Union

from ..constants import HF_HUB_CACHE
from . import logging


logger = logging.get_logger(__name__)

REPO_TYPE_T = Literal["model", "dataset", "space"]

# List of OS-created helper files that need to be ignored
FILES_TO_IGNORE = [".DS_Store"]


class CacheNotFound(Exception):
    """Exception thrown when the Huggingface cache is not found."""

    cache_dir: Union[str, Path]

    def __init__(self, msg: str, cache_dir: Union[str, Path], *args, **kwargs):
        super().__init__(msg, *args, **kwargs)
        self.cache_dir = cache_dir


class CorruptedCacheException(Exception):
    """Exception for any unexpected structure in the Huggingface cache-system."""


@dataclass(frozen=True)
class CachedFileInfo:
    """Frozen data structure holding information about a single cached file.

    Args:
        file_name (`str`):
            Name of the file. Example: `config.json`.
        file_path (`Path`):
            Path of the file in the `snapshots` directory. The file path is a symlink
            referring to a blob in the `blobs` folder.
        blob_path (`Path`):
            Path of the blob file. This is equivalent to `file_path.resolve()`.
        size_on_disk (`int`):
            Size of the blob file in bytes.
        blob_last_accessed (`float`):
            Timestamp of the last time the blob file has been accessed (from any
            revision).
        blob_last_modified (`float`):
            Timestamp of the last time the blob file has been modified/created.

    <Tip warning={true}>

    `blob_last_accessed` and `blob_last_modified` reliability can depend on the OS you
    are using. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    for more details.

    </Tip>
    """

    file_name: str
    file_path: Path
    blob_path: Path
    size_on_disk: int
    blob_last_accessed: float
    blob_last_modified: float

    @property
    def blob_last_accessed_str(self) -> str:
        """
        (property) Timestamp of the last time the blob file has been accessed (from any
        revision), returned as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.blob_last_accessed)

    @property
    def blob_last_modified_str(self) -> str:
        """
        (property) Timestamp of the last time the blob file has been modified, returned
        as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.blob_last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Size of the blob file as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)


@dataclass(frozen=True)
class CachedRevisionInfo:
    """Frozen data structure holding information about a revision.

    A revision corresponds to a folder in the `snapshots` folder and is populated with
    the exact tree structure as the repo on the Hub but contains only symlinks. A
    revision can be either referenced by 1 or more `refs` or be "detached" (no refs).

    Args:
        commit_hash (`str`):
            Hash of the revision (unique).
            Example: `"9338f7b671827df886678df2bdd7cc7b4f36dffd"`.
        snapshot_path (`Path`):
            Path to the revision directory in the `snapshots` folder. It contains the
            exact tree structure as the repo on the Hub.
        files (`FrozenSet[CachedFileInfo]`):
            Set of [`~CachedFileInfo`] describing all files contained in the snapshot.
        refs (`FrozenSet[str]`):
            Set of `refs` pointing to this revision. If the revision has no `refs`, it
            is considered detached.
            Example: `{"main", "2.4.0"}` or `{"refs/pr/1"}`.
        size_on_disk (`int`):
            Sum of the blob file sizes that are symlink-ed by the revision.
        last_modified (`float`):
            Timestamp of the last time the revision has been created/modified.

    <Tip warning={true}>

    `last_accessed` cannot be determined correctly on a single revision as blob files
    are shared across revisions.

    </Tip>

    <Tip warning={true}>

    `size_on_disk` is not necessarily the sum of all file sizes because of possible
    duplicated files. Besides, only blobs are taken into account, not the (negligible)
    size of folders and symlinks.

    </Tip>
    """

    commit_hash: str
    snapshot_path: Path
    size_on_disk: int
    files: FrozenSet[CachedFileInfo]
    refs: FrozenSet[str]
    last_modified: float

    @property
    def last_modified_str(self) -> str:
        """
        (property) Timestamp of the last time the revision has been modified, returned
        as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of the blob file sizes as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    @property
    def nb_files(self) -> int:
        """
        (property) Total number of files in the revision.
        """
        return len(self.files)


@dataclass(frozen=True)
class CachedRepoInfo:
    """Frozen data structure holding information about a cached repository.

    Args:
        repo_id (`str`):
            Repo id of the repo on the Hub. Example: `"google/fleurs"`.
        repo_type (`Literal["dataset", "model", "space"]`):
            Type of the cached repo.
        repo_path (`Path`):
            Local path to the cached repo.
        size_on_disk (`int`):
            Sum of the blob file sizes in the cached repo.
        nb_files (`int`):
            Total number of blob files in the cached repo.
        revisions (`FrozenSet[CachedRevisionInfo]`):
            Set of [`~CachedRevisionInfo`] describing all revisions cached in the repo.
        last_accessed (`float`):
            Timestamp of the last time a blob file of the repo has been accessed.
        last_modified (`float`):
            Timestamp of the last time a blob file of the repo has been modified/created.

    <Tip warning={true}>

    `size_on_disk` is not necessarily the sum of all revisions sizes because of
    duplicated files. Besides, only blobs are taken into account, not the (negligible)
    size of folders and symlinks.

    </Tip>

    <Tip warning={true}>

    `last_accessed` and `last_modified` reliability can depend on the OS you are using.
    See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    for more details.

    </Tip>
    """

    repo_id: str
    repo_type: REPO_TYPE_T
    repo_path: Path
    size_on_disk: int
    nb_files: int
    revisions: FrozenSet[CachedRevisionInfo]
    last_accessed: float
    last_modified: float

    @property
    def last_accessed_str(self) -> str:
        """
        (property) Last time a blob file of the repo has been accessed, returned as a
        human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_accessed)

    @property
    def last_modified_str(self) -> str:
        """
        (property) Last time a blob file of the repo has been modified, returned as a
        human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of the blob file sizes as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    @property
    def refs(self) -> Dict[str, CachedRevisionInfo]:
        """
        (property) Mapping between `refs` and revision data structures.
        """
        return {ref: revision for revision in self.revisions for ref in revision.refs}
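

# Illustrative sketch of the `refs` property (assumes a populated cache; kept
# in a comment so nothing runs at import time):
#
#     repo = next(iter(scan_cache_dir().repos))
#     main_revision = repo.refs.get("main")
#     if main_revision is not None:
#         print(main_revision.commit_hash, main_revision.size_on_disk_str)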


@dataclass(frozen=True)
class DeleteCacheStrategy:
    """Frozen data structure holding the strategy to delete cached revisions.

    This object is not meant to be instantiated programmatically but to be returned by
    [`~utils.HFCacheInfo.delete_revisions`]. See documentation for usage example.

    Args:
        expected_freed_size (`int`):
            Expected freed size once strategy is executed.
        blobs (`FrozenSet[Path]`):
            Set of blob file paths to be deleted.
        refs (`FrozenSet[Path]`):
            Set of reference file paths to be deleted.
        repos (`FrozenSet[Path]`):
            Set of entire repo paths to be deleted.
        snapshots (`FrozenSet[Path]`):
            Set of snapshots to be deleted (directories of symlinks).
    """

    expected_freed_size: int
    blobs: FrozenSet[Path]
    refs: FrozenSet[Path]
    repos: FrozenSet[Path]
    snapshots: FrozenSet[Path]

    @property
    def expected_freed_size_str(self) -> str:
        """
        (property) Expected size that will be freed as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.expected_freed_size)

    def execute(self) -> None:
        """Execute the defined strategy.

        <Tip warning={true}>

        If this method is interrupted, the cache might get corrupted. Deletion order is
        implemented so that references and symlinks are deleted before the actual blob
        files.

        </Tip>

        <Tip warning={true}>

        This method is irreversible. If executed, cached files are erased and must be
        downloaded again.

        </Tip>
        """
        # Deletion order matters. Blobs are deleted last so that the user can't end
        # up in a state where a `ref` refers to a missing snapshot or a snapshot
        # symlink refers to a deleted blob.

        # Delete entire repos
        for path in self.repos:
            _try_delete_path(path, path_type="repo")

        # Delete snapshot directories
        for path in self.snapshots:
            _try_delete_path(path, path_type="snapshot")

        # Delete refs files
        for path in self.refs:
            _try_delete_path(path, path_type="ref")

        # Delete blob files
        for path in self.blobs:
            _try_delete_path(path, path_type="blob")

        logger.info(f"Cache deletion done. Saved {self.expected_freed_size_str}.")


@dataclass(frozen=True)
class HFCacheInfo:
    """Frozen data structure holding information about the entire cache-system.

    This data structure is returned by [`scan_cache_dir`] and is immutable.

    Args:
        size_on_disk (`int`):
            Sum of all valid repo sizes in the cache-system.
        repos (`FrozenSet[CachedRepoInfo]`):
            Set of [`~CachedRepoInfo`] describing all valid cached repos found on the
            cache-system while scanning.
        warnings (`List[CorruptedCacheException]`):
            List of [`~CorruptedCacheException`] that occurred while scanning the cache.
            Those exceptions are captured so that the scan can continue. Corrupted repos
            are skipped from the scan.

    <Tip warning={true}>

    Here `size_on_disk` is equal to the sum of all repo sizes (only blobs). However, if
    some cached repos are corrupted, their sizes are not taken into account.

    </Tip>
    """

    size_on_disk: int
    repos: FrozenSet[CachedRepoInfo]
    warnings: List[CorruptedCacheException]

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of all valid repo sizes in the cache-system as a human-readable
        string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    def delete_revisions(self, *revisions: str) -> DeleteCacheStrategy:
        """Prepare the strategy to delete one or more revisions cached locally.

        Input revisions can be any revision hash. If a revision hash is not found in the
        local cache, a warning is thrown but no error is raised. Revisions can be from
        different cached repos since hashes are unique across repos.

        Examples:

        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> cache_info = scan_cache_dir()
        >>> delete_strategy = cache_info.delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa"
        ... )
        >>> print(f"Will free {delete_strategy.expected_freed_size_str}.")
        Will free 7.9K.
        >>> delete_strategy.execute()
        Cache deletion done. Saved 7.9K.
        ```

        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> scan_cache_dir().delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
        ...     "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
        ...     "6c0e6080953db56375760c0471a8c5f2929baf11",
        ... ).execute()
        Cache deletion done. Saved 8.6G.
        ```

        <Tip warning={true}>

        `delete_revisions` returns a [`~utils.DeleteCacheStrategy`] object that needs to
        be executed. The [`~utils.DeleteCacheStrategy`] is not meant to be modified but
        allows having a dry run before actually executing the deletion.

        </Tip>
        """
        hashes_to_delete: Set[str] = set(revisions)

        repos_with_revisions: Dict[CachedRepoInfo, Set[CachedRevisionInfo]] = defaultdict(set)

        for repo in self.repos:
            for revision in repo.revisions:
                if revision.commit_hash in hashes_to_delete:
                    repos_with_revisions[repo].add(revision)
                    hashes_to_delete.remove(revision.commit_hash)

        if len(hashes_to_delete) > 0:
            logger.warning(f"Revision(s) not found - cannot delete them: {', '.join(hashes_to_delete)}")

        delete_strategy_blobs: Set[Path] = set()
        delete_strategy_refs: Set[Path] = set()
        delete_strategy_repos: Set[Path] = set()
        delete_strategy_snapshots: Set[Path] = set()
        delete_strategy_expected_freed_size = 0

        for affected_repo, revisions_to_delete in repos_with_revisions.items():
            other_revisions = affected_repo.revisions - revisions_to_delete

            # If no other revisions, it means all revisions are deleted
            # -> delete the entire cached repo
            if len(other_revisions) == 0:
                delete_strategy_repos.add(affected_repo.repo_path)
                delete_strategy_expected_freed_size += affected_repo.size_on_disk
                continue

            # Some revisions of the repo will be deleted but not all. We need to filter
            # which blob files will not be linked anymore.
            for revision_to_delete in revisions_to_delete:
                # Snapshot dir
                delete_strategy_snapshots.add(revision_to_delete.snapshot_path)

                # Refs dir
                for ref in revision_to_delete.refs:
                    delete_strategy_refs.add(affected_repo.repo_path / "refs" / ref)

                # Blobs dir
                for file in revision_to_delete.files:
                    if file.blob_path not in delete_strategy_blobs:
                        is_file_alone = True
                        for revision in other_revisions:
                            for rev_file in revision.files:
                                if file.blob_path == rev_file.blob_path:
                                    is_file_alone = False
                                    break
                            if not is_file_alone:
                                break

                        # Blob file not referenced by remaining revisions -> delete
                        if is_file_alone:
                            delete_strategy_blobs.add(file.blob_path)
                            delete_strategy_expected_freed_size += file.size_on_disk

        # Return the strategy instead of executing it.
        return DeleteCacheStrategy(
            blobs=frozenset(delete_strategy_blobs),
            refs=frozenset(delete_strategy_refs),
            repos=frozenset(delete_strategy_repos),
            snapshots=frozenset(delete_strategy_snapshots),
            expected_freed_size=delete_strategy_expected_freed_size,
        )
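

# A sketch of a common follow-up (assumes a populated cache): delete every
# "detached" revision, i.e. every cached revision that no ref points to.
#
#     cache_info = scan_cache_dir()
#     detached_hashes = [
#         revision.commit_hash
#         for repo in cache_info.repos
#         for revision in repo.revisions
#         if not revision.refs
#     ]
#     cache_info.delete_revisions(*detached_hashes).execute()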


def scan_cache_dir(cache_dir: Optional[Union[str, Path]] = None) -> HFCacheInfo:
    """Scan the entire HF cache-system and return a [`~HFCacheInfo`] structure.

    Use `scan_cache_dir` in order to programmatically scan your cache-system. The cache
    will be scanned repo by repo. If a repo is corrupted, a [`~CorruptedCacheException`]
    will be thrown internally but captured and returned in the [`~HFCacheInfo`]
    structure. Only valid repos get a proper report.

    ```py
    >>> from huggingface_hub import scan_cache_dir

    >>> hf_cache_info = scan_cache_dir()
    HFCacheInfo(
        size_on_disk=3398085269,
        repos=frozenset({
            CachedRepoInfo(
                repo_id='t5-small',
                repo_type='model',
                repo_path=PosixPath(...),
                size_on_disk=970726914,
                nb_files=11,
                revisions=frozenset({
                    CachedRevisionInfo(
                        commit_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5',
                        size_on_disk=970726339,
                        snapshot_path=PosixPath(...),
                        files=frozenset({
                            CachedFileInfo(
                                file_name='config.json',
                                size_on_disk=1197,
                                file_path=PosixPath(...),
                                blob_path=PosixPath(...),
                            ),
                            CachedFileInfo(...),
                            ...
                        }),
                    ),
                    CachedRevisionInfo(...),
                    ...
                }),
            ),
            CachedRepoInfo(...),
            ...
        }),
        warnings=[
            CorruptedCacheException("Snapshots dir doesn't exist in cached repo: ..."),
            CorruptedCacheException(...),
            ...
        ],
    )
    ```

    You can also print a detailed report directly from the `huggingface-cli` using:
    ```text
    > huggingface-cli scan-cache
    REPO ID                     REPO TYPE SIZE ON DISK NB FILES REFS                LOCAL PATH
    --------------------------- --------- ------------ -------- ------------------- -------------------------------------------------------------------------
    glue                        dataset         116.3K       15 1.17.0, main, 2.4.0 /Users/lucain/.cache/huggingface/hub/datasets--glue
    google/fleurs               dataset          64.9M        6 main, refs/pr/1     /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs
    Jean-Baptiste/camembert-ner model           441.0M        7 main                /Users/lucain/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner
    bert-base-cased             model             1.9G       13 main                /Users/lucain/.cache/huggingface/hub/models--bert-base-cased
    t5-base                     model            10.1K        3 main                /Users/lucain/.cache/huggingface/hub/models--t5-base
    t5-small                    model           970.7M       11 refs/pr/1, main     /Users/lucain/.cache/huggingface/hub/models--t5-small

    Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G.
    Got 1 warning(s) while scanning. Use -vvv to print details.
    ```

    Args:
        cache_dir (`str` or `Path`, `optional`):
            Cache directory to scan. Defaults to the default HF cache directory.

    <Tip warning={true}>

    Raises:

        `CacheNotFound`
          If the cache directory does not exist.

        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
          If the cache directory is a file, instead of a directory.

    </Tip>

    Returns: a [`~HFCacheInfo`] object.
    """
    if cache_dir is None:
        cache_dir = HF_HUB_CACHE

    cache_dir = Path(cache_dir).expanduser().resolve()
    if not cache_dir.exists():
        raise CacheNotFound(
            f"Cache directory not found: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.",
            cache_dir=cache_dir,
        )

    if cache_dir.is_file():
        raise ValueError(
            f"Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable."
        )

    repos: Set[CachedRepoInfo] = set()
    warnings: List[CorruptedCacheException] = []
    for repo_path in cache_dir.iterdir():
        if repo_path.name == ".locks":  # skip './.locks/' folder
            continue
        try:
            repos.add(_scan_cached_repo(repo_path))
        except CorruptedCacheException as e:
            warnings.append(e)

    return HFCacheInfo(
        repos=frozenset(repos),
        size_on_disk=sum(repo.size_on_disk for repo in repos),
        warnings=warnings,
    )


def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo:
    """Scan a single cache repo and return information about it.

    Any unexpected behavior will raise a [`~CorruptedCacheException`].
    """
    if not repo_path.is_dir():
        raise CorruptedCacheException(f"Repo path is not a directory: {repo_path}")

    if "--" not in repo_path.name:
        raise CorruptedCacheException(f"Repo path is not a valid HuggingFace cache directory: {repo_path}")

    repo_type, repo_id = repo_path.name.split("--", maxsplit=1)
    repo_type = repo_type[:-1]  # "models" -> "model"
    repo_id = repo_id.replace("--", "/")  # "google--fleurs" -> "google/fleurs"

    if repo_type not in {"dataset", "model", "space"}:
        raise CorruptedCacheException(
            f"Repo type must be `dataset`, `model` or `space`, found `{repo_type}` ({repo_path})."
        )

    blob_stats: Dict[Path, os.stat_result] = {}  # Key is blob_path, value is blob stats

    snapshots_path = repo_path / "snapshots"
    refs_path = repo_path / "refs"

    if not snapshots_path.exists() or not snapshots_path.is_dir():
        raise CorruptedCacheException(f"Snapshots dir doesn't exist in cached repo: {snapshots_path}")

    # Scan over `refs` directory

    # key is revision hash, value is set of refs
    refs_by_hash: Dict[str, Set[str]] = defaultdict(set)
    if refs_path.exists():
        # Example of `refs` directory
        # ── refs
        #    ├── main
        #    └── refs
        #        └── pr
        #            └── 1
        if refs_path.is_file():
            raise CorruptedCacheException(f"Refs directory cannot be a file: {refs_path}")

        for ref_path in refs_path.glob("**/*"):
            # glob("**/*") iterates over all files and directories -> skip directories
            if ref_path.is_dir():
                continue

            ref_name = str(ref_path.relative_to(refs_path))
            with ref_path.open() as f:
                commit_hash = f.read()

            refs_by_hash[commit_hash].add(ref_name)

    # Scan snapshots directory
    cached_revisions: Set[CachedRevisionInfo] = set()
    for revision_path in snapshots_path.iterdir():
        # Ignore OS-created helper files
        if revision_path.name in FILES_TO_IGNORE:
            continue

        if revision_path.is_file():
            raise CorruptedCacheException(f"Snapshots folder corrupted. Found a file: {revision_path}")

        cached_files = set()
        for file_path in revision_path.glob("**/*"):
            # glob("**/*") iterates over all files and directories -> skip directories
            if file_path.is_dir():
                continue

            blob_path = Path(file_path).resolve()
            if not blob_path.exists():
                raise CorruptedCacheException(f"Blob missing (broken symlink): {blob_path}")

            if blob_path not in blob_stats:
                blob_stats[blob_path] = blob_path.stat()

            cached_files.add(
                CachedFileInfo(
                    file_name=file_path.name,
                    file_path=file_path,
                    size_on_disk=blob_stats[blob_path].st_size,
                    blob_path=blob_path,
                    blob_last_accessed=blob_stats[blob_path].st_atime,
                    blob_last_modified=blob_stats[blob_path].st_mtime,
                )
            )

        # Last modified is either the last modified blob file or the revision folder
        # itself if it is empty
        if len(cached_files) > 0:
            revision_last_modified = max(blob_stats[file.blob_path].st_mtime for file in cached_files)
        else:
            revision_last_modified = revision_path.stat().st_mtime

        cached_revisions.add(
            CachedRevisionInfo(
                commit_hash=revision_path.name,
                files=frozenset(cached_files),
                refs=frozenset(refs_by_hash.pop(revision_path.name, set())),
                size_on_disk=sum(
                    blob_stats[blob_path].st_size for blob_path in set(file.blob_path for file in cached_files)
                ),
                snapshot_path=revision_path,
                last_modified=revision_last_modified,
            )
        )

    # Check that all refs referred to an existing revision
    if len(refs_by_hash) > 0:
        raise CorruptedCacheException(
            f"Reference(s) refer to missing commit hashes: {dict(refs_by_hash)} ({repo_path})."
        )

    # Last modified is either the last modified blob file or the repo folder itself if
    # no blob files have been found. Same for last accessed.
    if len(blob_stats) > 0:
        repo_last_accessed = max(stat.st_atime for stat in blob_stats.values())
        repo_last_modified = max(stat.st_mtime for stat in blob_stats.values())
    else:
        repo_stats = repo_path.stat()
        repo_last_accessed = repo_stats.st_atime
        repo_last_modified = repo_stats.st_mtime

    # Build and return frozen structure
    return CachedRepoInfo(
        nb_files=len(blob_stats),
        repo_id=repo_id,
        repo_path=repo_path,
        repo_type=repo_type,  # type: ignore
        revisions=frozenset(cached_revisions),
        size_on_disk=sum(stat.st_size for stat in blob_stats.values()),
        last_accessed=repo_last_accessed,
        last_modified=repo_last_modified,
    )
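

# For reference, the on-disk layout this function expects for a single cached
# repo (blob file names are content hashes; snapshot entries are symlinks):
#
#     models--t5-small/
#     ├── blobs/
#     │   └── <blob files>
#     ├── refs/
#     │   └── main                      # text file containing a commit hash
#     └── snapshots/
#         └── <commit_hash>/
#             └── config.json -> ../../blobs/<blob>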


def _format_size(num: int) -> str:
    """Format size in bytes into a human-readable string.

    Taken from https://stackoverflow.com/a/1094933
    """
    num_f = float(num)
    for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]:
        if abs(num_f) < 1000.0:
            return f"{num_f:3.1f}{unit}"
        num_f /= 1000.0
    return f"{num_f:.1f}Y"


_TIMESINCE_CHUNKS = (
    # Label, divider, max value
    ("second", 1, 60),
    ("minute", 60, 60),
    ("hour", 60 * 60, 24),
    ("day", 60 * 60 * 24, 6),
    ("week", 60 * 60 * 24 * 7, 6),
    ("month", 60 * 60 * 24 * 30, 11),
    ("year", 60 * 60 * 24 * 365, None),
)


def _format_timesince(ts: float) -> str:
    """Format timestamp in seconds into a human-readable string, relative to now.

    Vaguely inspired by Django's `timesince` formatter.
    """
    delta = time.time() - ts
    if delta < 20:
        return "a few seconds ago"
    for label, divider, max_value in _TIMESINCE_CHUNKS:  # noqa: B007
        value = round(delta / divider)
        if max_value is not None and value <= max_value:
            break
    return f"{value} {label}{'s' if value > 1 else ''} ago"


def _try_delete_path(path: Path, path_type: str) -> None:
    """Try to delete a local file or folder.

    If the path does not exist, the error is logged as a warning and then ignored.

    Args:
        path (`Path`)
            Path to delete. Can be a file or a folder.
        path_type (`str`)
            What path are we deleting? Only for logging purposes. Example: "snapshot".
    """
    logger.info(f"Delete {path_type}: {path}")
    try:
        if path.is_file():
            os.remove(path)
        else:
            shutil.rmtree(path)
    except FileNotFoundError:
        logger.warning(f"Couldn't delete {path_type}: file not found ({path})", exc_info=True)
    except PermissionError:
        logger.warning(f"Couldn't delete {path_type}: permission denied ({path})", exc_info=True)