from __future__ import annotations

import os
import pickle
import time
from typing import TYPE_CHECKING

from fsspec.utils import atomic_write

try:
    import ujson as json
except ImportError:
    if not TYPE_CHECKING:
        import json

if TYPE_CHECKING:
    from typing import Any, Dict, Iterator, Literal

    from typing_extensions import TypeAlias

    from .cached import CachingFileSystem

    Detail: TypeAlias = Dict[str, Any]


class CacheMetadata:
    """Cache metadata.

    All reading and writing of cache metadata is performed by this class;
    accessing the cached files and blocks is not.

    Metadata is stored in a single file per storage directory in JSON format.
    For backward compatibility, also reads metadata stored in pickle format,
    which is converted to JSON when next saved.
    """

    def __init__(self, storage: list[str]):
        """

        Parameters
        ----------
        storage: list[str]
            Directories containing cached files, must be at least one. Metadata
            is stored in the last of these directories by convention.
        """
        if not storage:
            raise ValueError("CacheMetadata expects at least one storage location")

        self._storage = storage
        self.cached_files: list[Detail] = [{}]

        # Private attribute to force saving of metadata in pickle format rather than
        # JSON for use in tests to confirm can read both pickle and JSON formats.
        self._force_save_pickle = False

    def _load(self, fn: str) -> Detail:
        """Low-level function to load metadata from specific file"""
        try:
            with open(fn, "r") as f:
                loaded = json.load(f)
        except ValueError:
            with open(fn, "rb") as f:
                loaded = pickle.load(f)
        for c in loaded.values():
            if isinstance(c.get("blocks"), list):
                c["blocks"] = set(c["blocks"])
        return loaded

    def _save(self, metadata_to_save: Detail, fn: str) -> None:
        """Low-level function to save metadata to specific file"""
        if self._force_save_pickle:
            with atomic_write(fn) as f:
                pickle.dump(metadata_to_save, f)
        else:
            with atomic_write(fn, mode="w") as f:
                json.dump(metadata_to_save, f)
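
    # Roughly, the on-disk JSON maps each remote path to its detail dict, with
    # block sets stored as lists (example values below are assumed, not real):
    #
    #     {"s3://bucket/key": {"fn": "abc123", "blocks": [0, 1],
    #                          "time": 1700000000.0, "uid": "..."}}
    #
    # ``save`` converts sets to lists before calling ``_save``; ``_load``
    # converts them back to sets.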

    def _scan_locations(
        self, writable_only: bool = False
    ) -> Iterator[tuple[str, str, bool]]:
        """Yield locations (filenames) where metadata is stored, and whether
        writable or not.

        Parameters
        ----------
        writable_only: bool
            Set to True to only yield writable locations.

        Returns
        -------
        Yields (str, str, bool)
        """
        n = len(self._storage)
        for i, storage in enumerate(self._storage):
            writable = i == n - 1
            if writable_only and not writable:
                continue
            yield os.path.join(storage, "cache"), storage, writable
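
    # For example (hypothetical storage list): with storage ["/a", "/b"] this
    # yields ("/a/cache", "/a", False) and then ("/b/cache", "/b", True); only
    # the last location is writable.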

    def check_file(
        self, path: str, cfs: CachingFileSystem | None
    ) -> Literal[False] | tuple[Detail, str]:
        """If path is in cache return its details, otherwise return ``False``.

        If the optional CachingFileSystem is specified then it is used to
        perform extra checks to reject possible matches, such as if they are
        too old.
        """
        for (fn, base, _), cache in zip(self._scan_locations(), self.cached_files):
            if path not in cache:
                continue
            detail = cache[path].copy()

            if cfs is not None:
                if cfs.check_files and detail["uid"] != cfs.fs.ukey(path):
                    # Wrong file as determined by hash of file properties
                    continue

                if cfs.expiry and time.time() - detail["time"] > cfs.expiry:
                    # Cached file has expired
                    continue

            fn = os.path.join(base, detail["fn"])
            if os.path.exists(fn):
                return detail, fn
        return False
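
    # Sketch of caller-side handling (hypothetical remote path):
    #
    #     result = meta.check_file("s3://bucket/key", None)
    #     if result is False:
    #         ...  # not cached: fetch, then meta.update_file(...) and meta.save()
    #     else:
    #         detail, cached_fn = result  # open cached_fn locally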

    def clear_expired(self, expiry_time: int) -> tuple[list[str], bool]:
        """Remove expired metadata from the cache.

        Returns names of files corresponding to expired metadata and a boolean
        flag indicating whether the writable cache is empty. Caller is
        responsible for deleting the expired files.
        """
        expired_files = []
        for path, detail in self.cached_files[-1].copy().items():
            if time.time() - detail["time"] > expiry_time:
                fn = detail.get("fn", "")
                if not fn:
                    raise RuntimeError(
                        f"Cache metadata does not contain 'fn' for {path}"
                    )
                fn = os.path.join(self._storage[-1], fn)
                expired_files.append(fn)
                self.cached_files[-1].pop(path)

        if self.cached_files[-1]:
            cache_path = os.path.join(self._storage[-1], "cache")
            self._save(self.cached_files[-1], cache_path)

        writable_cache_empty = not self.cached_files[-1]
        return expired_files, writable_cache_empty

    def load(self) -> None:
        """Load all metadata from disk and store in ``self.cached_files``"""
        cached_files = []
        for fn, _, _ in self._scan_locations():
            if os.path.exists(fn):
                # TODO: consolidate blocks here
                cached_files.append(self._load(fn))
            else:
                cached_files.append({})
        self.cached_files = cached_files or [{}]

    def on_close_cached_file(self, f: Any, path: str) -> None:
        """Perform side-effect actions on closing a cached file.

        The actual closing of the file is the responsibility of the caller.
        """
        # File must be writable, so in self.cached_files[-1]
        c = self.cached_files[-1][path]
        if c["blocks"] is not True and len(c["blocks"]) * f.blocksize >= f.size:
            c["blocks"] = True

    def pop_file(self, path: str) -> str | None:
        """Remove metadata of cached file.

        If path is in the cache, return the filename of the cached file,
        otherwise return ``None``. Caller is responsible for deleting the
        cached file.
        """
        details = self.check_file(path, None)
        if not details:
            return None
        _, fn = details
        if fn.startswith(self._storage[-1]):
            self.cached_files[-1].pop(path)
            self.save()
        else:
            raise PermissionError(
                "Can only delete cached file in last, writable cache location"
            )
        return fn

    def save(self) -> None:
        """Save metadata to disk"""
        for (fn, _, writable), cache in zip(self._scan_locations(), self.cached_files):
            if not writable:
                continue

            if os.path.exists(fn):
                cached_files = self._load(fn)
                for k, c in cached_files.items():
                    if k in cache:
                        if c["blocks"] is True or cache[k]["blocks"] is True:
                            c["blocks"] = True
                        else:
                            # self.cached_files[*][*]["blocks"] must continue to
                            # point to the same set object so that updates
                            # performed by MMapCache are propagated back to
                            # self.cached_files.
                            blocks = cache[k]["blocks"]
                            blocks.update(c["blocks"])
                            c["blocks"] = blocks
                        c["time"] = max(c["time"], cache[k]["time"])
                        c["uid"] = cache[k]["uid"]

                # Files can be added to cache after it was written once
                for k, c in cache.items():
                    if k not in cached_files:
                        cached_files[k] = c
            else:
                cached_files = cache

            cache = {k: v.copy() for k, v in cached_files.items()}
            for c in cache.values():
                if isinstance(c["blocks"], set):
                    c["blocks"] = list(c["blocks"])

            self._save(cache, fn)
            self.cached_files[-1] = cached_files
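
    # Merge behaviour above, illustrated with assumed values: if the on-disk
    # metadata records blocks {0, 1} for a path and the in-memory copy records
    # {1, 2}, the saved entry keeps the in-memory set object, updated in place
    # to {0, 1, 2}; a value of True (fully cached) on either side wins outright.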

    def update_file(self, path: str, detail: Detail) -> None:
        """Update metadata for specific file in memory, do not save"""
        self.cached_files[-1][path] = detail