"""Contains utilities to manage the HF cache directory.""" |
|
import os |
|
import shutil |
|
import time |
|
from collections import defaultdict |
|
from dataclasses import dataclass |
|
from pathlib import Path |
|
from typing import Dict, FrozenSet, List, Literal, Optional, Set, Union |
|
|
|
from ..constants import HF_HUB_CACHE |
|
from . import logging |
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
REPO_TYPE_T = Literal["model", "dataset", "space"] |
|
|
|
|
|
class CacheNotFound(Exception):
    """Exception thrown when the Hugging Face cache is not found."""

    cache_dir: Union[str, Path]

    def __init__(self, msg: str, cache_dir: Union[str, Path], *args, **kwargs):
        super().__init__(msg, *args, **kwargs)
        self.cache_dir = cache_dir


class CorruptedCacheException(Exception):
    """Exception for any unexpected structure in the Hugging Face cache-system."""


@dataclass(frozen=True)
class CachedFileInfo:
    """Frozen data structure holding information about a single cached file.

    Args:
        file_name (`str`):
            Name of the file. Example: `config.json`.
        file_path (`Path`):
            Path of the file in the `snapshots` directory. The file path is a symlink
            referring to a blob in the `blobs` folder.
        blob_path (`Path`):
            Path of the blob file. This is equivalent to `file_path.resolve()`.
        size_on_disk (`int`):
            Size of the blob file in bytes.
        blob_last_accessed (`float`):
            Timestamp of the last time the blob file has been accessed (from any
            revision).
        blob_last_modified (`float`):
            Timestamp of the last time the blob file has been modified/created.

    <Tip warning={true}>

    `blob_last_accessed` and `blob_last_modified` reliability can depend on the OS you
    are using. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    for more details.

    </Tip>
    """

    file_name: str
    file_path: Path
    blob_path: Path
    size_on_disk: int

    blob_last_accessed: float
    blob_last_modified: float

    @property
    def blob_last_accessed_str(self) -> str:
        """
        (property) Timestamp of the last time the blob file has been accessed (from any
        revision), returned as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.blob_last_accessed)

    @property
    def blob_last_modified_str(self) -> str:
        """
        (property) Timestamp of the last time the blob file has been modified, returned
        as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.blob_last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Size of the blob file as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)


@dataclass(frozen=True)
class CachedRevisionInfo:
    """Frozen data structure holding information about a revision.

    A revision corresponds to a folder in the `snapshots` folder and is populated with
    the exact tree structure as the repo on the Hub but contains only symlinks. A
    revision can be either referenced by 1 or more `refs` or be "detached" (no refs).

    Args:
        commit_hash (`str`):
            Hash of the revision (unique).
            Example: `"9338f7b671827df886678df2bdd7cc7b4f36dffd"`.
        snapshot_path (`Path`):
            Path to the revision directory in the `snapshots` folder. It contains the
            exact tree structure as the repo on the Hub.
        files (`FrozenSet[CachedFileInfo]`):
            Set of [`~CachedFileInfo`] describing all files contained in the snapshot.
        refs (`FrozenSet[str]`):
            Set of `refs` pointing to this revision. If the revision has no `refs`, it
            is considered detached.
            Example: `{"main", "2.4.0"}` or `{"refs/pr/1"}`.
        size_on_disk (`int`):
            Sum of the blob file sizes that are symlink-ed by the revision.
        last_modified (`float`):
            Timestamp of the last time the revision has been created/modified.

    <Tip warning={true}>

    `last_accessed` cannot be determined correctly on a single revision as blob files
    are shared across revisions.

    </Tip>

    <Tip warning={true}>

    `size_on_disk` is not necessarily the sum of all file sizes because of possible
    duplicated files. Besides, only blobs are taken into account, not the (negligible)
    size of folders and symlinks.

    </Tip>
    """

    commit_hash: str
    snapshot_path: Path
    size_on_disk: int
    files: FrozenSet[CachedFileInfo]
    refs: FrozenSet[str]

    last_modified: float

    @property
    def last_modified_str(self) -> str:
        """
        (property) Timestamp of the last time the revision has been modified, returned
        as a human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of the blob file sizes as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    @property
    def nb_files(self) -> int:
        """
        (property) Total number of files in the revision.
        """
        return len(self.files)


@dataclass(frozen=True)
class CachedRepoInfo:
    """Frozen data structure holding information about a cached repository.

    Args:
        repo_id (`str`):
            Repo id of the repo on the Hub. Example: `"google/fleurs"`.
        repo_type (`Literal["dataset", "model", "space"]`):
            Type of the cached repo.
        repo_path (`Path`):
            Local path to the cached repo.
        size_on_disk (`int`):
            Sum of the blob file sizes in the cached repo.
        nb_files (`int`):
            Total number of blob files in the cached repo.
        revisions (`FrozenSet[CachedRevisionInfo]`):
            Set of [`~CachedRevisionInfo`] describing all revisions cached in the repo.
        last_accessed (`float`):
            Timestamp of the last time a blob file of the repo has been accessed.
        last_modified (`float`):
            Timestamp of the last time a blob file of the repo has been modified/created.

    <Tip warning={true}>

    `size_on_disk` is not necessarily the sum of all revisions sizes because of
    duplicated files. Besides, only blobs are taken into account, not the (negligible)
    size of folders and symlinks.

    </Tip>

    <Tip warning={true}>

    `last_accessed` and `last_modified` reliability can depend on the OS you are using.
    See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    for more details.

    </Tip>
    """

    repo_id: str
    repo_type: REPO_TYPE_T
    repo_path: Path
    size_on_disk: int
    nb_files: int
    revisions: FrozenSet[CachedRevisionInfo]

    last_accessed: float
    last_modified: float

    @property
    def last_accessed_str(self) -> str:
        """
        (property) Last time a blob file of the repo has been accessed, returned as a
        human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_accessed)

    @property
    def last_modified_str(self) -> str:
        """
        (property) Last time a blob file of the repo has been modified, returned as a
        human-readable string.

        Example: "2 weeks ago".
        """
        return _format_timesince(self.last_modified)

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of the blob file sizes as a human-readable string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    @property
    def refs(self) -> Dict[str, CachedRevisionInfo]:
        """
        (property) Mapping between `refs` and revision data structures.
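
        Example, assuming `repo` is a [`~CachedRepoInfo`] obtained from a scan
        (hypothetical usage):

        ```py
        >>> repo.refs["main"].commit_hash  # revision currently pointed to by "main"
        ```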
|
""" |
|
return {ref: revision for revision in self.revisions for ref in revision.refs} |
|
|
|
|
|
@dataclass(frozen=True)
class DeleteCacheStrategy:
    """Frozen data structure holding the strategy to delete cached revisions.

    This object is not meant to be instantiated programmatically but to be returned by
    [`~utils.HFCacheInfo.delete_revisions`]. See documentation for usage example.

    Args:
        expected_freed_size (`int`):
            Expected freed size once strategy is executed.
        blobs (`FrozenSet[Path]`):
            Set of blob file paths to be deleted.
        refs (`FrozenSet[Path]`):
            Set of reference file paths to be deleted.
        repos (`FrozenSet[Path]`):
            Set of entire repo paths to be deleted.
        snapshots (`FrozenSet[Path]`):
            Set of snapshots to be deleted (directory of symlinks).
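
    Example, mirroring the usage shown in [`~utils.HFCacheInfo.delete_revisions`]
    (the revision hash is a placeholder):

    ```py
    >>> from huggingface_hub import scan_cache_dir
    >>> strategy = scan_cache_dir().delete_revisions("<revision_hash>")
    >>> print(f"Will free {strategy.expected_freed_size_str}.")  # dry run
    >>> strategy.execute()  # irreversible
    ```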
|
""" |
|
|
|
expected_freed_size: int |
|
blobs: FrozenSet[Path] |
|
refs: FrozenSet[Path] |
|
repos: FrozenSet[Path] |
|
snapshots: FrozenSet[Path] |
|
|
|
@property |
|
def expected_freed_size_str(self) -> str: |
|
""" |
|
(property) Expected size that will be freed as a human-readable string. |
|
|
|
Example: "42.2K". |
|
""" |
|
return _format_size(self.expected_freed_size) |
|
|
|
    def execute(self) -> None:
        """Execute the defined strategy.

        <Tip warning={true}>

        If this method is interrupted, the cache might get corrupted. Deletion order is
        implemented so that references and symlinks are deleted before the actual blob
        files.

        </Tip>

        <Tip warning={true}>

        This method is irreversible. If executed, cached files are erased and must be
        downloaded again.

        </Tip>
        """
        for path in self.repos:
            _try_delete_path(path, path_type="repo")

        # Delete snapshot directories (symlinks to the blobs)
        for path in self.snapshots:
            _try_delete_path(path, path_type="snapshot")

        # Delete ref files
        for path in self.refs:
            _try_delete_path(path, path_type="ref")

        # Delete blob files last
        for path in self.blobs:
            _try_delete_path(path, path_type="blob")

        logger.info(f"Cache deletion done. Saved {self.expected_freed_size_str}.")


@dataclass(frozen=True)
class HFCacheInfo:
    """Frozen data structure holding information about the entire cache-system.

    This data structure is returned by [`scan_cache_dir`] and is immutable.

    Args:
        size_on_disk (`int`):
            Sum of all valid repo sizes in the cache-system.
        repos (`FrozenSet[CachedRepoInfo]`):
            Set of [`~CachedRepoInfo`] describing all valid cached repos found on the
            cache-system while scanning.
        warnings (`List[CorruptedCacheException]`):
            List of [`~CorruptedCacheException`] that occurred while scanning the cache.
            Those exceptions are captured so that the scan can continue. Corrupted repos
            are skipped from the scan.

    <Tip warning={true}>

    Here `size_on_disk` is equal to the sum of all repo sizes (only blobs). However if
    some cached repos are corrupted, their sizes are not taken into account.

    </Tip>
    """

    size_on_disk: int
    repos: FrozenSet[CachedRepoInfo]
    warnings: List[CorruptedCacheException]

    @property
    def size_on_disk_str(self) -> str:
        """
        (property) Sum of all valid repo sizes in the cache-system as a human-readable
        string.

        Example: "42.2K".
        """
        return _format_size(self.size_on_disk)

    def delete_revisions(self, *revisions: str) -> DeleteCacheStrategy:
        """Prepare the strategy to delete one or more revisions cached locally.

        Input revisions can be any revision hash. If a revision hash is not found in the
        local cache, a warning is logged but no error is raised. Revisions can be from
        different cached repos since hashes are unique across repos.

        Examples:
        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> cache_info = scan_cache_dir()
        >>> delete_strategy = cache_info.delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa"
        ... )
        >>> print(f"Will free {delete_strategy.expected_freed_size_str}.")
        Will free 7.9K.
        >>> delete_strategy.execute()
        Cache deletion done. Saved 7.9K.
        ```

        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> scan_cache_dir().delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
        ...     "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
        ...     "6c0e6080953db56375760c0471a8c5f2929baf11",
        ... ).execute()
        Cache deletion done. Saved 8.6G.
        ```

        <Tip warning={true}>

        `delete_revisions` returns a [`~utils.DeleteCacheStrategy`] object that needs to
        be executed. The [`~utils.DeleteCacheStrategy`] is not meant to be modified but
        allows having a dry run before actually executing the deletion.

        </Tip>
        """
        hashes_to_delete: Set[str] = set(revisions)

        repos_with_revisions: Dict[CachedRepoInfo, Set[CachedRevisionInfo]] = defaultdict(set)

        for repo in self.repos:
            for revision in repo.revisions:
                if revision.commit_hash in hashes_to_delete:
                    repos_with_revisions[repo].add(revision)
                    hashes_to_delete.remove(revision.commit_hash)

        if len(hashes_to_delete) > 0:
            logger.warning(f"Revision(s) not found - cannot delete them: {', '.join(hashes_to_delete)}")

        delete_strategy_blobs: Set[Path] = set()
        delete_strategy_refs: Set[Path] = set()
        delete_strategy_repos: Set[Path] = set()
        delete_strategy_snapshots: Set[Path] = set()
        delete_strategy_expected_freed_size = 0

        for affected_repo, revisions_to_delete in repos_with_revisions.items():
            other_revisions = affected_repo.revisions - revisions_to_delete

            # If all revisions of the repo are deleted, delete the entire cached repo.
            if len(other_revisions) == 0:
                delete_strategy_repos.add(affected_repo.repo_path)
                delete_strategy_expected_freed_size += affected_repo.size_on_disk
                continue

            # Only some revisions are deleted: keep refs, snapshots and blobs that are
            # still used by the remaining revisions.
            for revision_to_delete in revisions_to_delete:
                # Snapshot directory
                delete_strategy_snapshots.add(revision_to_delete.snapshot_path)

                # Ref files
                for ref in revision_to_delete.refs:
                    delete_strategy_refs.add(affected_repo.repo_path / "refs" / ref)

                # Blob files, only if not shared with a remaining revision
                for file in revision_to_delete.files:
                    if file.blob_path not in delete_strategy_blobs:
                        is_file_alone = True
                        for revision in other_revisions:
                            for rev_file in revision.files:
                                if file.blob_path == rev_file.blob_path:
                                    is_file_alone = False
                                    break
                            if not is_file_alone:
                                break

                        # Blob is not referenced by any remaining revision -> delete it.
                        if is_file_alone:
                            delete_strategy_blobs.add(file.blob_path)
                            delete_strategy_expected_freed_size += file.size_on_disk

        # Return the strategy instead of executing it, to allow a dry run.
        return DeleteCacheStrategy(
            blobs=frozenset(delete_strategy_blobs),
            refs=frozenset(delete_strategy_refs),
            repos=frozenset(delete_strategy_repos),
            snapshots=frozenset(delete_strategy_snapshots),
            expected_freed_size=delete_strategy_expected_freed_size,
        )


def scan_cache_dir(cache_dir: Optional[Union[str, Path]] = None) -> HFCacheInfo:
    """Scan the entire HF cache-system and return a [`~HFCacheInfo`] structure.

    Use `scan_cache_dir` in order to programmatically scan your cache-system. The cache
    will be scanned repo by repo. If a repo is corrupted, a [`~CorruptedCacheException`]
    will be thrown internally but captured and returned in the [`~HFCacheInfo`]
    structure. Only valid repos get a proper report.

    ```py
    >>> from huggingface_hub import scan_cache_dir

    >>> hf_cache_info = scan_cache_dir()
    HFCacheInfo(
        size_on_disk=3398085269,
        repos=frozenset({
            CachedRepoInfo(
                repo_id='t5-small',
                repo_type='model',
                repo_path=PosixPath(...),
                size_on_disk=970726914,
                nb_files=11,
                revisions=frozenset({
                    CachedRevisionInfo(
                        commit_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5',
                        size_on_disk=970726339,
                        snapshot_path=PosixPath(...),
                        files=frozenset({
                            CachedFileInfo(
                                file_name='config.json',
                                size_on_disk=1197,
                                file_path=PosixPath(...),
                                blob_path=PosixPath(...),
                            ),
                            CachedFileInfo(...),
                            ...
                        }),
                    ),
                    CachedRevisionInfo(...),
                    ...
                }),
            ),
            CachedRepoInfo(...),
            ...
        }),
        warnings=[
            CorruptedCacheException("Snapshots dir doesn't exist in cached repo: ..."),
            CorruptedCacheException(...),
            ...
        ],
    )
    ```

    You can also print a detailed report directly from the `huggingface-cli` using:
    ```text
    > huggingface-cli scan-cache
    REPO ID                     REPO TYPE SIZE ON DISK NB FILES REFS                LOCAL PATH
    --------------------------- --------- ------------ -------- ------------------- -------------------------------------------------------------------------
    glue                        dataset         116.3K       15 1.17.0, main, 2.4.0 /Users/lucain/.cache/huggingface/hub/datasets--glue
    google/fleurs               dataset          64.9M        6 main, refs/pr/1     /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs
    Jean-Baptiste/camembert-ner model           441.0M        7 main                /Users/lucain/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner
    bert-base-cased             model             1.9G       13 main                /Users/lucain/.cache/huggingface/hub/models--bert-base-cased
    t5-base                     model            10.1K        3 main                /Users/lucain/.cache/huggingface/hub/models--t5-base
    t5-small                    model           970.7M       11 refs/pr/1, main     /Users/lucain/.cache/huggingface/hub/models--t5-small

    Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G.
    Got 1 warning(s) while scanning. Use -vvv to print details.
    ```

    Args:
        cache_dir (`str` or `Path`, `optional`):
            Cache directory to scan. Defaults to the default HF cache directory.

    <Tip warning={true}>

    Raises:

    `CacheNotFound`
      If the cache directory does not exist.

    [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
      If the cache directory is a file, instead of a directory.

    </Tip>

    Returns: a [`~HFCacheInfo`] object.
    """
    if cache_dir is None:
        cache_dir = HF_HUB_CACHE

    cache_dir = Path(cache_dir).expanduser().resolve()
    if not cache_dir.exists():
        raise CacheNotFound(
            f"Cache directory not found: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.",
            cache_dir=cache_dir,
        )

    if cache_dir.is_file():
        raise ValueError(
            f"Scan cache expects a directory but found a file: {cache_dir}. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable."
        )

    repos: Set[CachedRepoInfo] = set()
    warnings: List[CorruptedCacheException] = []
    for repo_path in cache_dir.iterdir():
        if repo_path.name == ".locks":  # skip the `.locks` folder used for file locks
            continue
        try:
            repos.add(_scan_cached_repo(repo_path))
        except CorruptedCacheException as e:
            warnings.append(e)

    return HFCacheInfo(
        repos=frozenset(repos),
        size_on_disk=sum(repo.size_on_disk for repo in repos),
        warnings=warnings,
    )


def _scan_cached_repo(repo_path: Path) -> CachedRepoInfo:
    """Scan a single cache repo and return information about it.

    Any unexpected behavior will raise a [`~CorruptedCacheException`].
    """
    if not repo_path.is_dir():
        raise CorruptedCacheException(f"Repo path is not a directory: {repo_path}")

    if "--" not in repo_path.name:
        raise CorruptedCacheException(f"Repo path is not a valid HuggingFace cache directory: {repo_path}")

    repo_type, repo_id = repo_path.name.split("--", maxsplit=1)
    repo_type = repo_type[:-1]  # "models" -> "model"
    repo_id = repo_id.replace("--", "/")  # e.g. "google--fleurs" -> "google/fleurs"

    if repo_type not in {"dataset", "model", "space"}:
        raise CorruptedCacheException(
            f"Repo type must be `dataset`, `model` or `space`, found `{repo_type}` ({repo_path})."
        )

    blob_stats: Dict[Path, os.stat_result] = {}  # blob path -> stat result (computed once per blob)

    snapshots_path = repo_path / "snapshots"
    refs_path = repo_path / "refs"

    if not snapshots_path.exists() or not snapshots_path.is_dir():
        raise CorruptedCacheException(f"Snapshots dir doesn't exist in cached repo: {snapshots_path}")

    # Scan over `refs` directory: key is the commit hash, value is the set of refs
    # pointing to it.
    refs_by_hash: Dict[str, Set[str]] = defaultdict(set)
    if refs_path.exists():
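        # Example of a `refs` directory layout:
        # ── refs
        #     ├── main
        #     └── refs
        #         └── pr
        #             └── 1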
        if refs_path.is_file():
            raise CorruptedCacheException(f"Refs directory cannot be a file: {refs_path}")

        for ref_path in refs_path.glob("**/*"):
            # glob("**/*") iterates over both files and directories -> skip directories
            if ref_path.is_dir():
                continue

            ref_name = str(ref_path.relative_to(refs_path))
            with ref_path.open() as f:
                commit_hash = f.read()

            refs_by_hash[commit_hash].add(ref_name)

    # Scan snapshots directory
    cached_revisions: Set[CachedRevisionInfo] = set()
    for revision_path in snapshots_path.iterdir():
        if revision_path.is_file():
            raise CorruptedCacheException(f"Snapshots folder corrupted. Found a file: {revision_path}")

        cached_files = set()
        for file_path in revision_path.glob("**/*"):
            # glob("**/*") iterates over both files and directories -> skip directories
            if file_path.is_dir():
                continue

            blob_path = Path(file_path).resolve()
            if not blob_path.exists():
                raise CorruptedCacheException(f"Blob missing (broken symlink): {blob_path}")

            if blob_path not in blob_stats:
                blob_stats[blob_path] = blob_path.stat()

            cached_files.add(
                CachedFileInfo(
                    file_name=file_path.name,
                    file_path=file_path,
                    size_on_disk=blob_stats[blob_path].st_size,
                    blob_path=blob_path,
                    blob_last_accessed=blob_stats[blob_path].st_atime,
                    blob_last_modified=blob_stats[blob_path].st_mtime,
                )
            )

        # Last modified is the most recent blob file, or the revision folder itself if
        # the revision is empty.
        if len(cached_files) > 0:
            revision_last_modified = max(blob_stats[file.blob_path].st_mtime for file in cached_files)
        else:
            revision_last_modified = revision_path.stat().st_mtime

        cached_revisions.add(
            CachedRevisionInfo(
                commit_hash=revision_path.name,
                files=frozenset(cached_files),
                refs=frozenset(refs_by_hash.pop(revision_path.name, set())),
                size_on_disk=sum(
                    blob_stats[blob_path].st_size for blob_path in set(file.blob_path for file in cached_files)
                ),
                snapshot_path=revision_path,
                last_modified=revision_last_modified,
            )
        )

    # Every hash should have been popped while scanning snapshots; leftovers are refs
    # pointing to commits that have no snapshot.
    if len(refs_by_hash) > 0:
        raise CorruptedCacheException(
            f"Reference(s) refer to missing commit hashes: {dict(refs_by_hash)} ({repo_path})."
        )

    # Repo timestamps come from the blob files, or from the repo folder itself if no
    # blob file has been found.
    if len(blob_stats) > 0:
        repo_last_accessed = max(stat.st_atime for stat in blob_stats.values())
        repo_last_modified = max(stat.st_mtime for stat in blob_stats.values())
    else:
        repo_stats = repo_path.stat()
        repo_last_accessed = repo_stats.st_atime
        repo_last_modified = repo_stats.st_mtime

    return CachedRepoInfo(
        nb_files=len(blob_stats),
        repo_id=repo_id,
        repo_path=repo_path,
        repo_type=repo_type,
        revisions=frozenset(cached_revisions),
        size_on_disk=sum(stat.st_size for stat in blob_stats.values()),
        last_accessed=repo_last_accessed,
        last_modified=repo_last_modified,
    )


def _format_size(num: int) -> str:
    """Format size in bytes into a human-readable string.

    Taken from https://stackoverflow.com/a/1094933
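
    Example (illustrative):

    ```py
    >>> _format_size(3398085269)
    '3.4G'
    ```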
|
""" |
|
num_f = float(num) |
|
for unit in ["", "K", "M", "G", "T", "P", "E", "Z"]: |
|
if abs(num_f) < 1000.0: |
|
return f"{num_f:3.1f}{unit}" |
|
num_f /= 1000.0 |
|
return f"{num_f:.1f}Y" |
|
|
|
|
|
_TIMESINCE_CHUNKS = ( |
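    # label, divider, max value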
    ("second", 1, 60),
    ("minute", 60, 60),
    ("hour", 60 * 60, 24),
    ("day", 60 * 60 * 24, 6),
    ("week", 60 * 60 * 24 * 7, 6),
    ("month", 60 * 60 * 24 * 30, 11),
    ("year", 60 * 60 * 24 * 365, None),
)


def _format_timesince(ts: float) -> str:
    """Format timestamp in seconds into a human-readable string, relative to now.

    Vaguely inspired by Django's `timesince` formatter.
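
    Example (illustrative, as the output depends on the current time):

    ```py
    >>> _format_timesince(time.time() - 15 * 24 * 3600)
    '2 weeks ago'
    ```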
|
""" |
|
delta = time.time() - ts |
|
if delta < 20: |
|
return "a few seconds ago" |
|
for label, divider, max_value in _TIMESINCE_CHUNKS: |
|
value = round(delta / divider) |
|
if max_value is not None and value <= max_value: |
|
break |
|
return f"{value} {label}{'s' if value > 1 else ''} ago" |
|
|
|
|
|
def _try_delete_path(path: Path, path_type: str) -> None:
    """Try to delete a local file or folder.

    If the path does not exist, the error is logged as a warning and then ignored.

    Args:
        path (`Path`):
            Path to delete. Can be a file or a folder.
        path_type (`str`):
            What is being deleted. Used only for logging purposes. Example: "snapshot".
    """
    logger.info(f"Delete {path_type}: {path}")
    try:
        if path.is_file():
            os.remove(path)
        else:
            shutil.rmtree(path)
    except FileNotFoundError:
        logger.warning(f"Couldn't delete {path_type}: file not found ({path})", exc_info=True)
    except PermissionError:
        logger.warning(f"Couldn't delete {path_type}: permission denied ({path})", exc_info=True)