from __future__ import annotations

import io
import json
import logging
import os
import threading
import warnings
import weakref
from errno import ESPIPE
from glob import has_magic
from hashlib import sha256
from typing import Any, ClassVar, Dict, Tuple

from .callbacks import DEFAULT_CALLBACK
from .config import apply_config, conf
from .dircache import DirCache
from .transaction import Transaction
from .utils import (
    _unstrip_protocol,
    glob_translate,
    isfilelike,
    other_paths,
    read_block,
    stringify_path,
    tokenize,
)

logger = logging.getLogger("fsspec")

def make_instance(cls, args, kwargs):
    return cls(*args, **kwargs)


class _Cached(type):
    """
    Metaclass for caching file system instances.

    Notes
    -----
    Instances are cached according to

    * The values of the class attributes listed in `_extra_tokenize_attributes`
    * The arguments passed to ``__init__``.

    This creates an additional reference to the filesystem, which prevents the
    filesystem from being garbage collected when all *user* references go away.
    A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
    be made for a filesystem instance to be garbage collected.
    """

    def __init__(cls, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Note: we intentionally create a reference here, to avoid garbage
        # collecting instances when all other references are gone. To really
        # delete a FileSystem, the cache must be cleared.
        if conf.get("weakref_instance_cache"):  # pragma: no cover
            # debug option for analysing fork/spawn conditions
            cls._cache = weakref.WeakValueDictionary()
        else:
            cls._cache = {}
        cls._pid = os.getpid()

    def __call__(cls, *args, **kwargs):
        kwargs = apply_config(cls, kwargs)
        extra_tokens = tuple(
            getattr(cls, attr, None) for attr in cls._extra_tokenize_attributes
        )
        token = tokenize(
            cls, cls._pid, threading.get_ident(), *args, *extra_tokens, **kwargs
        )
        skip = kwargs.pop("skip_instance_cache", False)
        if os.getpid() != cls._pid:
            cls._cache.clear()
            cls._pid = os.getpid()
        if not skip and cls.cachable and token in cls._cache:
            cls._latest = token
            return cls._cache[token]
        else:
            obj = super().__call__(*args, **kwargs)
            # Setting _fs_token here causes some static linters to complain.
            obj._fs_token_ = token
            obj.storage_args = args
            obj.storage_options = kwargs
            if obj.async_impl and obj.mirror_sync_methods:
                from .asyn import mirror_sync_methods

                mirror_sync_methods(obj)

            if cls.cachable and not skip:
                cls._latest = token
                cls._cache[token] = obj
            return obj
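

# Usage sketch (illustrative only, not executed here): because of the
# metaclass above, constructing a filesystem twice with identical arguments
# returns the cached instance. Assumes the bundled "memory" implementation
# is registered, as it normally is in fsspec.
#
#   >>> import fsspec
#   >>> fs1 = fsspec.filesystem("memory")
#   >>> fs2 = fsspec.filesystem("memory")
#   >>> fs1 is fs2
#   True
#   >>> fs3 = fsspec.filesystem("memory", skip_instance_cache=True)
#   >>> fs1 is fs3
#   False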


class AbstractFileSystem(metaclass=_Cached):
    """
    An abstract super-class for pythonic file-systems

    Implementations are expected to be compatible with or, better, subclass
    from here.
    """

    cachable = True  # this class can be cached, instances reused
    _cached = False
    blocksize = 2**22
    sep = "/"
    protocol: ClassVar[str | tuple[str, ...]] = "abstract"
    _latest = None
    async_impl = False
    mirror_sync_methods = False
    root_marker = ""  # For some FSs, may require leading '/' or other character
    transaction_type = Transaction

    #: Extra *class attributes* that should be considered when hashing.
    _extra_tokenize_attributes = ()

    # Set by _Cached metaclass
    storage_args: Tuple[Any, ...]
    storage_options: Dict[str, Any]

    def __init__(self, *args, **storage_options):
        """Create and configure file-system instance

        Instances may be cachable, so if similar enough arguments are seen
        a new instance is not required. The token attribute exists to allow
        implementations to cache instances if they wish.

        A reasonable default should be provided if there are no arguments.

        Subclasses should call this method.

        Parameters
        ----------
        use_listings_cache, listings_expiry_time, max_paths:
            passed to ``DirCache``, if the implementation supports
            directory listing caching. Pass use_listings_cache=False
            to disable such caching.
        skip_instance_cache: bool
            If this is a cachable implementation, pass True here to force
            creating a new instance even if a matching instance exists, and prevent
            storing this instance.
        asynchronous: bool
        loop: asyncio-compatible IOLoop or None
        """
        if self._cached:
            # reusing instance, don't change
            return
        self._cached = True
        self._intrans = False
        self._transaction = None
        self._invalidated_caches_in_transaction = []
        self.dircache = DirCache(**storage_options)

        if storage_options.pop("add_docs", None):
            warnings.warn("add_docs is no longer supported.", FutureWarning)

        if storage_options.pop("add_aliases", None):
            warnings.warn("add_aliases has been removed.", FutureWarning)
        # This is set in _Cached
        self._fs_token_ = None

    @property
    def fsid(self):
        """Persistent filesystem id that can be used to compare filesystems
        across sessions.
        """
        raise NotImplementedError

    @property
    def _fs_token(self):
        return self._fs_token_

    def __dask_tokenize__(self):
        return self._fs_token

    def __hash__(self):
        return int(self._fs_token, 16)

    def __eq__(self, other):
        return isinstance(other, type(self)) and self._fs_token == other._fs_token

    def __reduce__(self):
        return make_instance, (type(self), self.storage_args, self.storage_options)

    @classmethod
    def _strip_protocol(cls, path):
        """Turn path from fully-qualified to file-system-specific

        May require FS-specific handling, e.g., for relative paths or links.
        """
        if isinstance(path, list):
            return [cls._strip_protocol(p) for p in path]
        path = stringify_path(path)
        protos = (cls.protocol,) if isinstance(cls.protocol, str) else cls.protocol
        for protocol in protos:
            if path.startswith(protocol + "://"):
                path = path[len(protocol) + 3 :]
            elif path.startswith(protocol + "::"):
                path = path[len(protocol) + 2 :]
        path = path.rstrip("/")
        # use of root_marker to make minimum required path, e.g., "/"
        return path or cls.root_marker

    def unstrip_protocol(self, name: str) -> str:
        """Format FS-specific path to generic, including protocol"""
        protos = (self.protocol,) if isinstance(self.protocol, str) else self.protocol
        for protocol in protos:
            if name.startswith(f"{protocol}://"):
                return name
        return f"{protos[0]}://{name}"
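
    # Usage sketch (illustrative, derived directly from the code above, using
    # this base class's own "abstract" protocol):
    #
    #   >>> AbstractFileSystem._strip_protocol("abstract://a/b/")
    #   'a/b'
    #   >>> AbstractFileSystem().unstrip_protocol("a/b")
    #   'abstract://a/b'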

    @staticmethod
    def _get_kwargs_from_urls(path):
        """If kwargs can be encoded in the paths, extract them here

        This should happen before instantiation of the class; incoming paths
        then should be amended to strip the options in methods.

        Examples may look like an sftp path "sftp://user@host:/my/path", where
        the user and host should become kwargs and later get stripped.
        """
        # by default, nothing happens
        return {}

    @classmethod
    def current(cls):
        """Return the most recently instantiated FileSystem

        If no instance has been created, then create one with defaults
        """
        if cls._latest in cls._cache:
            return cls._cache[cls._latest]
        return cls()

    @property
    def transaction(self):
        """A context within which files are committed together upon exit

        Requires the file class to implement `.commit()` and `.discard()`
        for the normal and exception cases.
        """
        if self._transaction is None:
            self._transaction = self.transaction_type(self)
        return self._transaction

    def start_transaction(self):
        """Begin write transaction for deferring files, non-context version"""
        self._intrans = True
        self._transaction = self.transaction_type(self)
        return self.transaction

    def end_transaction(self):
        """Finish write transaction, non-context version"""
        self.transaction.complete()
        self._transaction = None
        # The invalid cache must be cleared after the transaction is completed.
        for path in self._invalidated_caches_in_transaction:
            self.invalidate_cache(path)
        self._invalidated_caches_in_transaction.clear()

    def invalidate_cache(self, path=None):
        """
        Discard any cached directory information

        Parameters
        ----------
        path: string or None
            If None, clear all cached listings; otherwise, clear listings at
            or under the given path.
        """
        # Not necessary to implement; the backend may have no cache. But if it
        # does, your subclass should call this parent-class method to ensure
        # caches are expired correctly after transactions.
        # See the implementation of FTPFileSystem in ftp.py
        if self._intrans:
            self._invalidated_caches_in_transaction.append(path)

    def mkdir(self, path, create_parents=True, **kwargs):
        """
        Create directory entry at path

        For systems that don't have true directories, may create an entry for
        this instance only and not touch the real filesystem

        Parameters
        ----------
        path: str
            location
        create_parents: bool
            if True, this is equivalent to ``makedirs``
        kwargs:
            may be permissions, etc.
        """
        pass  # not necessary to implement, may not have directories

    def makedirs(self, path, exist_ok=False):
        """Recursively make directories

        Creates directory at path and any intervening required directories.
        Raises exception if, for instance, the path already exists but is a
        file.

        Parameters
        ----------
        path: str
            leaf directory name
        exist_ok: bool (False)
            If False, will error if the target already exists
        """
        pass  # not necessary to implement, may not have directories

    def rmdir(self, path):
        """Remove a directory, if empty"""
        pass  # not necessary to implement, may not have directories

    def ls(self, path, detail=True, **kwargs):
        """List objects at path.

        This should include subdirectories and files at that location. The
        difference between a file and a directory must be clear when details
        are requested.

        The specific keys, or perhaps a FileInfo class, or similar, is TBD,
        but must be consistent across implementations.
        Must include:

        - full path to the entry (without protocol)
        - size of the entry, in bytes. If the value cannot be determined, will
          be ``None``.
        - type of entry, "file", "directory" or other

        Additional information
        may be present, appropriate to the file-system, e.g., generation,
        checksum, etc.

        May use refresh=True|False to allow use of self._ls_from_cache to
        check for a saved listing and avoid calling the backend. This would be
        common where listing may be expensive.

        Parameters
        ----------
        path: str
        detail: bool
            if True, gives a list of dictionaries, where each is the same as
            the result of ``info(path)``. If False, gives a list of paths
            (str).
        kwargs: may have additional backend-specific options, such as version
            information

        Returns
        -------
        List of strings if detail is False, or list of directory information
        dicts if detail is True.
        """
        raise NotImplementedError

    def _ls_from_cache(self, path):
        """Check cache for listing

        Returns listing, if found (may be empty list for a directory that
        exists but contains nothing), None if not in cache.
        """
        parent = self._parent(path)
        try:
            return self.dircache[path.rstrip("/")]
        except KeyError:
            pass
        try:
            files = [
                f
                for f in self.dircache[parent]
                if f["name"] == path
                or (f["name"] == path.rstrip("/") and f["type"] == "directory")
            ]
            if len(files) == 0:
                # parent dir was listed but did not contain this file
                raise FileNotFoundError(path)
            return files
        except KeyError:
            pass

    def walk(self, path, maxdepth=None, topdown=True, on_error="omit", **kwargs):
        """Return all files below path

        List all files, recursing into subdirectories; output is iterator-style,
        like ``os.walk()``. For a simple list of files, ``find()`` is available.

        When topdown is True, the caller can modify the dirnames list in-place
        (perhaps using del or slice assignment), and walk() will only recurse
        into the subdirectories whose names remain in dirnames; this can be
        used to prune the search, impose a specific order of visiting, or even
        to inform walk() about directories the caller creates or renames before
        it resumes walk() again. Modifying dirnames when topdown is False has
        no effect. (see os.walk)

        Note that the "files" yielded will include anything that is not
        a directory, such as links.

        Parameters
        ----------
        path: str
            Root to recurse into
        maxdepth: int
            Maximum recursion depth. None means limitless, but not recommended
            on link-based file-systems.
        topdown: bool (True)
            Whether to walk the directory tree from the top downwards or from
            the bottom upwards.
        on_error: "omit", "raise", a callable
            if omit (default), path with exception will simply be empty;
            if raise, an underlying exception will be raised;
            if callable, it will be called with a single OSError instance as argument
        kwargs: passed to ``ls``
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        path = self._strip_protocol(path)
        full_dirs = {}
        dirs = {}
        files = {}

        detail = kwargs.pop("detail", False)
        try:
            listing = self.ls(path, detail=True, **kwargs)
        except (FileNotFoundError, OSError) as e:
            if on_error == "raise":
                raise
            elif callable(on_error):
                on_error(e)
            if detail:
                return path, {}, {}
            return path, [], []

        for info in listing:
            # each info name must be at least [path]/part , but here
            # we check also for names like [path]/part/
            pathname = info["name"].rstrip("/")
            name = pathname.rsplit("/", 1)[-1]
            if info["type"] == "directory" and pathname != path:
                # do not include "self" path
                full_dirs[name] = pathname
                dirs[name] = info
            elif pathname == path:
                # file-like with same name as given path
                files[""] = info
            else:
                files[name] = info

        if not detail:
            dirs = list(dirs)
            files = list(files)

        if topdown:
            # Yield before recursion if walking top down
            yield path, dirs, files

        if maxdepth is not None:
            maxdepth -= 1
            if maxdepth < 1:
                if not topdown:
                    yield path, dirs, files
                return

        for d in dirs:
            yield from self.walk(
                full_dirs[d],
                maxdepth=maxdepth,
                detail=detail,
                topdown=topdown,
                **kwargs,
            )

        if not topdown:
            # Yield after recursion if walking bottom up
            yield path, dirs, files
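
    # Usage sketch (illustrative, assuming the "memory" implementation and
    # that writing a nested path creates intermediate directories implicitly):
    #
    #   >>> import fsspec
    #   >>> fs = fsspec.filesystem("memory")
    #   >>> fs.pipe_file("/root/sub/data.bin", b"\x00")
    #   >>> [(p, ds, fls) for p, ds, fls in fs.walk("/root")]
    #   [('/root', ['sub'], []), ('/root/sub', [], ['data.bin'])]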

    def find(self, path, maxdepth=None, withdirs=False, detail=False, **kwargs):
        """List all files below path.

        Like posix ``find`` command without conditions

        Parameters
        ----------
        path : str
        maxdepth: int or None
            If not None, the maximum number of levels to descend
        withdirs: bool
            Whether to include directory paths in the output. This is True
            when used by glob, but users usually only want files.
        kwargs are passed to ``ls``.
        """
        # TODO: allow equivalent of -name parameter
        path = self._strip_protocol(path)
        out = {}

        # Add the root directory if withdirs is requested
        # This is needed for posix glob compliance
        if withdirs and path != "" and self.isdir(path):
            out[path] = self.info(path)

        for _, dirs, files in self.walk(path, maxdepth, detail=True, **kwargs):
            if withdirs:
                files.update(dirs)
            out.update({info["name"]: info for name, info in files.items()})
        if not out and self.isfile(path):
            # walk works on directories, but find should also return [path]
            # when path happens to be a file
            out[path] = {}
        names = sorted(out)
        if not detail:
            return names
        else:
            return {name: out[name] for name in names}
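
    # Usage sketch (illustrative, same assumptions as above):
    #
    #   >>> fs.find("/root")
    #   ['/root/sub/data.bin']
    #   >>> fs.find("/root", withdirs=True)
    #   ['/root', '/root/sub', '/root/sub/data.bin']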

    def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
        """Space used by files and optionally directories within a path

        Directory size does not include the size of its contents.

        Parameters
        ----------
        path: str
        total: bool
            Whether to sum all the file sizes
        maxdepth: int or None
            Maximum number of directory levels to descend, None for unlimited.
        withdirs: bool
            Whether to include directory paths in the output.
        kwargs: passed to ``find``

        Returns
        -------
        Dict of {path: size} if total=False, or int otherwise, where numbers
        refer to bytes used.
        """
        sizes = {}
        if withdirs and self.isdir(path):
            # Include top-level directory in output
            info = self.info(path)
            sizes[info["name"]] = info["size"]
        for f in self.find(path, maxdepth=maxdepth, withdirs=withdirs, **kwargs):
            info = self.info(f)
            sizes[info["name"]] = info["size"]
        if total:
            return sum(sizes.values())
        else:
            return sizes

    def glob(self, path, maxdepth=None, **kwargs):
        """
        Find files by glob-matching.

        If the path ends with '/', only folders are returned.

        We support ``"**"``, ``"?"`` and ``"[..]"``. We do not support ^ for
        pattern negation.

        The `maxdepth` option is applied on the first `**` found in the path.

        kwargs are passed to ``ls``.
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")
        import re

        seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
        ends_with_sep = path.endswith(seps)  # _strip_protocol strips trailing slash
        path = self._strip_protocol(path)
        append_slash_to_dirname = ends_with_sep or path.endswith(
            tuple(sep + "**" for sep in seps)
        )
        idx_star = path.find("*") if path.find("*") >= 0 else len(path)
        idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
        idx_brace = path.find("[") if path.find("[") >= 0 else len(path)

        min_idx = min(idx_star, idx_qmark, idx_brace)

        detail = kwargs.pop("detail", False)

        if not has_magic(path):
            if self.exists(path, **kwargs):
                if not detail:
                    return [path]
                else:
                    return {path: self.info(path, **kwargs)}
            else:
                if not detail:
                    return []  # glob of non-existent returns empty
                else:
                    return {}
        elif "/" in path[:min_idx]:
            min_idx = path[:min_idx].rindex("/")
            root = path[: min_idx + 1]
            depth = path[min_idx + 1 :].count("/") + 1
        else:
            root = ""
            depth = path[min_idx + 1 :].count("/") + 1

        if "**" in path:
            if maxdepth is not None:
                idx_double_stars = path.find("**")
                depth_double_stars = path[idx_double_stars:].count("/") + 1
                depth = depth - depth_double_stars + maxdepth
            else:
                depth = None

        allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)

        pattern = glob_translate(path + ("/" if ends_with_sep else ""))
        pattern = re.compile(pattern)

        out = {
            p: info
            for p, info in sorted(allpaths.items())
            if pattern.match(
                (
                    p + "/"
                    if append_slash_to_dirname and info["type"] == "directory"
                    else p
                )
            )
        }

        if detail:
            return out
        else:
            return list(out)
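
    # Usage sketch (illustrative, same assumptions as above):
    #
    #   >>> fs.glob("/root/**/*.bin")
    #   ['/root/sub/data.bin']
    #   >>> fs.glob("/root/*")  # one level only; matches the directory too
    #   ['/root/sub']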

    def exists(self, path, **kwargs):
        """Is there a file at the given path"""
        try:
            self.info(path, **kwargs)
            return True
        except:  # noqa: E722
            # any exception allowed bar FileNotFoundError?
            return False

    def lexists(self, path, **kwargs):
        """If there is a file at the given path (including
        broken links)"""
        return self.exists(path)

    def info(self, path, **kwargs):
        """Give details of entry at path

        Returns a single dictionary, with exactly the same information as ``ls``
        would with ``detail=True``.

        The default implementation calls ls and could be overridden by a
        shortcut. kwargs are passed on to ``ls()``.

        Some file systems might not be able to measure the file's size, in
        which case, the returned dict will include ``'size': None``.

        Returns
        -------
        dict with keys: name (full path in the FS), size (in bytes), type (file,
        directory, or something else) and other FS-specific keys.
        """
        path = self._strip_protocol(path)
        out = self.ls(self._parent(path), detail=True, **kwargs)
        out = [o for o in out if o["name"].rstrip("/") == path]
        if out:
            return out[0]
        out = self.ls(path, detail=True, **kwargs)
        path = path.rstrip("/")
        out1 = [o for o in out if o["name"].rstrip("/") == path]
        if len(out1) == 1:
            if "size" not in out1[0]:
                out1[0]["size"] = None
            return out1[0]
        elif len(out1) > 1 or out:
            return {"name": path, "size": 0, "type": "directory"}
        else:
            raise FileNotFoundError(path)

    def checksum(self, path):
        """Unique value for current version of file

        If the checksum is the same from one moment to another, the contents
        are guaranteed to be the same. If the checksum changes, the contents
        *might* have changed.

        This should normally be overridden; default will probably capture
        creation/modification timestamp (which would be good) or maybe
        access timestamp (which would be bad)
        """
        return int(tokenize(self.info(path)), 16)

    def size(self, path):
        """Size in bytes of file"""
        return self.info(path).get("size", None)

    def sizes(self, paths):
        """Size in bytes of each file in a list of paths"""
        return [self.size(p) for p in paths]

    def isdir(self, path):
        """Is this entry directory-like?"""
        try:
            return self.info(path)["type"] == "directory"
        except OSError:
            return False

    def isfile(self, path):
        """Is this entry file-like?"""
        try:
            return self.info(path)["type"] == "file"
        except:  # noqa: E722
            return False

    def read_text(self, path, encoding=None, errors=None, newline=None, **kwargs):
        """Get the contents of the file as a string.

        Parameters
        ----------
        path: str
            URL of file on this filesystem
        encoding, errors, newline: same as `open`.
        """
        with self.open(
            path,
            mode="r",
            encoding=encoding,
            errors=errors,
            newline=newline,
            **kwargs,
        ) as f:
            return f.read()

    def write_text(
        self, path, value, encoding=None, errors=None, newline=None, **kwargs
    ):
        """Write the text to the given file.

        An existing file will be overwritten.

        Parameters
        ----------
        path: str
            URL of file on this filesystem
        value: str
            Text to write.
        encoding, errors, newline: same as `open`.
        """
        with self.open(
            path,
            mode="w",
            encoding=encoding,
            errors=errors,
            newline=newline,
            **kwargs,
        ) as f:
            return f.write(value)

    def cat_file(self, path, start=None, end=None, **kwargs):
        """Get the content of a file

        Parameters
        ----------
        path: URL of file on this filesystem
        start, end: int
            Bytes limits of the read. If negative, backwards from end,
            like usual python slices. Either can be None for start or
            end of file, respectively
        kwargs: passed to ``open()``.
        """
        # explicitly set buffering off?
        with self.open(path, "rb", **kwargs) as f:
            if start is not None:
                if start >= 0:
                    f.seek(start)
                else:
                    f.seek(max(0, f.size + start))
            if end is not None:
                if end < 0:
                    end = f.size + end
                return f.read(end - f.tell())
            return f.read()
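
    # Usage sketch (illustrative, same assumptions as above; the results
    # follow directly from the seek/read logic in the method):
    #
    #   >>> fs.pipe_file("/bytes.bin", b"0123456789")
    #   >>> fs.cat_file("/bytes.bin", start=2, end=5)
    #   b'234'
    #   >>> fs.cat_file("/bytes.bin", start=-3)
    #   b'789'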

    def pipe_file(self, path, value, **kwargs):
        """Set the bytes of given file"""
        with self.open(path, "wb", **kwargs) as f:
            f.write(value)

    def pipe(self, path, value=None, **kwargs):
        """Put value into path

        (counterpart to ``cat``)

        Parameters
        ----------
        path: string or dict(str, bytes)
            If a string, a single remote location to put ``value`` bytes; if a dict,
            a mapping of {path: bytesvalue}.
        value: bytes, optional
            If using a single path, these are the bytes to put there. Ignored if
            ``path`` is a dict
        """
        if isinstance(path, str):
            self.pipe_file(self._strip_protocol(path), value, **kwargs)
        elif isinstance(path, dict):
            for k, v in path.items():
                self.pipe_file(self._strip_protocol(k), v, **kwargs)
        else:
            raise ValueError("path must be str or dict")
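
    # Usage sketch (illustrative, same assumptions as above):
    #
    #   >>> fs.pipe({"/x.txt": b"one", "/y.txt": b"two"})
    #   >>> fs.cat_file("/y.txt")
    #   b'two'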

    def cat_ranges(
        self, paths, starts, ends, max_gap=None, on_error="return", **kwargs
    ):
        """Get the contents of byte ranges from one or more files

        Parameters
        ----------
        paths: list
            A list of paths on this filesystem
        starts, ends: int or list
            Bytes limits of the read. If using a single int, the same value will be
            used to read all the specified files.
        """
        if max_gap is not None:
            raise NotImplementedError
        if not isinstance(paths, list):
            raise TypeError
        if not isinstance(starts, list):
            starts = [starts] * len(paths)
        if not isinstance(ends, list):
            ends = [ends] * len(paths)
        if len(starts) != len(paths) or len(ends) != len(paths):
            raise ValueError
        out = []
        for p, s, e in zip(paths, starts, ends):
            try:
                out.append(self.cat_file(p, s, e))
            except Exception as e:
                if on_error == "return":
                    out.append(e)
                else:
                    raise
        return out

    def cat(self, path, recursive=False, on_error="raise", **kwargs):
        """Fetch (potentially multiple) paths' contents

        Parameters
        ----------
        recursive: bool
            If True, assume the path(s) are directories, and get all the
            contained files
        on_error : "raise", "omit", "return"
            If raise, an underlying exception will be raised (converted to KeyError
            if the type is in self.missing_exceptions); if omit, keys with exception
            will simply not be included in the output; if "return", all keys are
            included in the output, but the value will be bytes or an exception
            instance.
        kwargs: passed to cat_file

        Returns
        -------
        dict of {path: contents} if there are multiple paths
        or the path has been otherwise expanded
        """
        paths = self.expand_path(path, recursive=recursive)
        if (
            len(paths) > 1
            or isinstance(path, list)
            or paths[0] != self._strip_protocol(path)
        ):
            out = {}
            for path in paths:
                try:
                    out[path] = self.cat_file(path, **kwargs)
                except Exception as e:
                    if on_error == "raise":
                        raise
                    if on_error == "return":
                        out[path] = e
            return out
        else:
            return self.cat_file(paths[0], **kwargs)
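
    # Usage sketch (illustrative, same assumptions as above): a list of paths
    # (or a glob) returns a dict, while a single concrete path returns bytes.
    #
    #   >>> fs.cat("/x.txt")
    #   b'one'
    #   >>> fs.cat(["/x.txt", "/y.txt"])
    #   {'/x.txt': b'one', '/y.txt': b'two'}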

    def get_file(
        self, rpath, lpath, callback=DEFAULT_CALLBACK, outfile=None, **kwargs
    ):
        """Copy single remote file to local"""
        from .implementations.local import LocalFileSystem

        if isfilelike(lpath):
            outfile = lpath
        elif self.isdir(rpath):
            os.makedirs(lpath, exist_ok=True)
            return None

        fs = LocalFileSystem(auto_mkdir=True)
        fs.makedirs(fs._parent(lpath), exist_ok=True)

        with self.open(rpath, "rb", **kwargs) as f1:
            if outfile is None:
                outfile = open(lpath, "wb")

            try:
                callback.set_size(getattr(f1, "size", None))
                data = True
                while data:
                    data = f1.read(self.blocksize)
                    segment_len = outfile.write(data)
                    if segment_len is None:
                        segment_len = len(data)
                    callback.relative_update(segment_len)
            finally:
                if not isfilelike(lpath):
                    outfile.close()

    def get(
        self,
        rpath,
        lpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) to local.

        Copies a specific file or tree of files (if recursive=True). If lpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within. Can submit a list of paths, which may be glob-patterns
        and will be expanded.

        Calls get_file for each source.
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            from .implementations.local import (
                LocalFileSystem,
                make_path_posix,
                trailing_sep,
            )

            source_is_str = isinstance(rpath, str)
            rpaths = self.expand_path(rpath, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                rpaths = [p for p in rpaths if not (trailing_sep(p) or self.isdir(p))]
                if not rpaths:
                    return

            if isinstance(lpath, str):
                lpath = make_path_posix(lpath)

            source_is_file = len(rpaths) == 1
            dest_is_dir = isinstance(lpath, str) and (
                trailing_sep(lpath) or LocalFileSystem().isdir(lpath)
            )

            exists = source_is_str and (
                (has_magic(rpath) and source_is_file)
                or (not has_magic(rpath) and dest_is_dir and not trailing_sep(rpath))
            )
            lpaths = other_paths(
                rpaths,
                lpath,
                exists=exists,
                flatten=not source_is_str,
            )

        callback.set_size(len(lpaths))
        for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
            with callback.branched(rpath, lpath) as child:
                self.get_file(rpath, lpath, callback=child, **kwargs)

    def put_file(self, lpath, rpath, callback=DEFAULT_CALLBACK, **kwargs):
        """Copy single file to remote"""
        if os.path.isdir(lpath):
            self.makedirs(rpath, exist_ok=True)
            return None

        with open(lpath, "rb") as f1:
            size = f1.seek(0, 2)
            callback.set_size(size)
            f1.seek(0)

            self.mkdirs(self._parent(os.fspath(rpath)), exist_ok=True)
            with self.open(rpath, "wb", **kwargs) as f2:
                while f1.tell() < size:
                    data = f1.read(self.blocksize)
                    segment_len = f2.write(data)
                    if segment_len is None:
                        segment_len = len(data)
                    callback.relative_update(segment_len)

    def put(
        self,
        lpath,
        rpath,
        recursive=False,
        callback=DEFAULT_CALLBACK,
        maxdepth=None,
        **kwargs,
    ):
        """Copy file(s) from local.

        Copies a specific file or tree of files (if recursive=True). If rpath
        ends with a "/", it will be assumed to be a directory, and target files
        will go within.

        Calls put_file for each source.
        """
        if isinstance(lpath, list) and isinstance(rpath, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            rpaths = rpath
            lpaths = lpath
        else:
            from .implementations.local import (
                LocalFileSystem,
                make_path_posix,
                trailing_sep,
            )

            source_is_str = isinstance(lpath, str)
            if source_is_str:
                lpath = make_path_posix(lpath)
            fs = LocalFileSystem()
            lpaths = fs.expand_path(lpath, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                lpaths = [p for p in lpaths if not (trailing_sep(p) or fs.isdir(p))]
                if not lpaths:
                    return

            source_is_file = len(lpaths) == 1
            dest_is_dir = isinstance(rpath, str) and (
                trailing_sep(rpath) or self.isdir(rpath)
            )

            rpath = (
                self._strip_protocol(rpath)
                if isinstance(rpath, str)
                else [self._strip_protocol(p) for p in rpath]
            )
            exists = source_is_str and (
                (has_magic(lpath) and source_is_file)
                or (not has_magic(lpath) and dest_is_dir and not trailing_sep(lpath))
            )
            rpaths = other_paths(
                lpaths,
                rpath,
                exists=exists,
                flatten=not source_is_str,
            )

        callback.set_size(len(rpaths))
        for lpath, rpath in callback.wrap(zip(lpaths, rpaths)):
            with callback.branched(lpath, rpath) as child:
                self.put_file(lpath, rpath, callback=child, **kwargs)
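
    # Usage sketch (illustrative; the local paths here are hypothetical):
    #
    #   >>> fs.put("/tmp/local.bin", "/remote/")             # upload one file
    #   >>> fs.get("/remote/", "/tmp/out/", recursive=True)  # download a tree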

    def head(self, path, size=1024):
        """Get the first ``size`` bytes from file"""
        with self.open(path, "rb") as f:
            return f.read(size)

    def tail(self, path, size=1024):
        """Get the last ``size`` bytes from file"""
        with self.open(path, "rb") as f:
            f.seek(max(-size, -f.size), 2)
            return f.read()

    def cp_file(self, path1, path2, **kwargs):
        raise NotImplementedError

    def copy(
        self, path1, path2, recursive=False, maxdepth=None, on_error=None, **kwargs
    ):
        """Copy within two locations in the filesystem

        on_error : "raise", "ignore"
            If raise, any not-found exceptions will be raised; if ignore any
            not-found exceptions will cause the path to be skipped; defaults to
            raise unless recursive is true, where the default is ignore
        """
        if on_error is None and recursive:
            on_error = "ignore"
        elif on_error is None:
            on_error = "raise"

        if isinstance(path1, list) and isinstance(path2, list):
            # No need to expand paths when both source and destination
            # are provided as lists
            paths1 = path1
            paths2 = path2
        else:
            from .implementations.local import trailing_sep

            source_is_str = isinstance(path1, str)
            paths1 = self.expand_path(path1, recursive=recursive, maxdepth=maxdepth)
            if source_is_str and (not recursive or maxdepth is not None):
                # Non-recursive glob does not copy directories
                paths1 = [p for p in paths1 if not (trailing_sep(p) or self.isdir(p))]
                if not paths1:
                    return

            source_is_file = len(paths1) == 1
            dest_is_dir = isinstance(path2, str) and (
                trailing_sep(path2) or self.isdir(path2)
            )

            exists = source_is_str and (
                (has_magic(path1) and source_is_file)
                or (not has_magic(path1) and dest_is_dir and not trailing_sep(path1))
            )
            paths2 = other_paths(
                paths1,
                path2,
                exists=exists,
                flatten=not source_is_str,
            )

        for p1, p2 in zip(paths1, paths2):
            try:
                self.cp_file(p1, p2, **kwargs)
            except FileNotFoundError:
                if on_error == "raise":
                    raise

    def expand_path(self, path, recursive=False, maxdepth=None, **kwargs):
        """Turn one or more globs or directories into a list of all matching paths
        to files or directories.

        kwargs are passed to ``glob`` or ``find``, which may in turn call ``ls``
        """
        if maxdepth is not None and maxdepth < 1:
            raise ValueError("maxdepth must be at least 1")

        if isinstance(path, (str, os.PathLike)):
            out = self.expand_path([path], recursive, maxdepth)
        else:
            out = set()
            path = [self._strip_protocol(p) for p in path]
            for p in path:
                if has_magic(p):
                    bit = set(self.glob(p, maxdepth=maxdepth, **kwargs))
                    out |= bit
                    if recursive:
                        # glob call above expanded one depth so if maxdepth is defined
                        # then decrement it in expand_path call below. If it is zero
                        # after decrementing then avoid expand_path call.
                        if maxdepth is not None and maxdepth <= 1:
                            continue
                        out |= set(
                            self.expand_path(
                                list(bit),
                                recursive=recursive,
                                maxdepth=maxdepth - 1 if maxdepth is not None else None,
                                **kwargs,
                            )
                        )
                    continue
                elif recursive:
                    rec = set(
                        self.find(
                            p, maxdepth=maxdepth, withdirs=True, detail=False, **kwargs
                        )
                    )
                    out |= rec
                if p not in out and (recursive is False or self.exists(p)):
                    # should only check once, for the root
                    out.add(p)
        if not out:
            raise FileNotFoundError(path)
        return sorted(out)
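
    # Usage sketch (illustrative, same assumptions as above):
    #
    #   >>> fs.expand_path("/root", recursive=True)
    #   ['/root', '/root/sub', '/root/sub/data.bin']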

    def mv(self, path1, path2, recursive=False, maxdepth=None, **kwargs):
        """Move file(s) from one location to another"""
        if path1 == path2:
            logger.debug(
                "%s mv: The paths are the same, so no files were moved.", self
            )
        else:
            # explicitly raise exception to prevent data corruption
            self.copy(
                path1, path2, recursive=recursive, maxdepth=maxdepth, on_error="raise"
            )
            self.rm(path1, recursive=recursive)

    def rm_file(self, path):
        """Delete a file"""
        self._rm(path)

    def _rm(self, path):
        """Delete one file"""
        # this is the old name for the method, prefer rm_file
        raise NotImplementedError

    def rm(self, path, recursive=False, maxdepth=None):
        """Delete files.

        Parameters
        ----------
        path: str or list of str
            File(s) to delete.
        recursive: bool
            If file(s) are directories, recursively delete contents and then
            also remove the directory
        maxdepth: int or None
            Depth to pass to walk for finding files to delete, if recursive.
            If None, there will be no limit and infinite recursion may be
            possible.
        """
        path = self.expand_path(path, recursive=recursive, maxdepth=maxdepth)
        for p in reversed(path):
            self.rm_file(p)

    @classmethod
    def _parent(cls, path):
        path = cls._strip_protocol(path)
        if "/" in path:
            parent = path.rsplit("/", 1)[0].lstrip(cls.root_marker)
            return cls.root_marker + parent
        else:
            return cls.root_marker

    def _open(
        self,
        path,
        mode="rb",
        block_size=None,
        autocommit=True,
        cache_options=None,
        **kwargs,
    ):
        """Return raw bytes-mode file-like from the file-system"""
        return AbstractBufferedFile(
            self,
            path,
            mode,
            block_size,
            autocommit,
            cache_options=cache_options,
            **kwargs,
        )

    def open(
        self,
        path,
        mode="rb",
        block_size=None,
        cache_options=None,
        compression=None,
        **kwargs,
    ):
        """
        Return a file-like object from the filesystem

        The resultant instance must function correctly in a context ``with``
        block.

        Parameters
        ----------
        path: str
            Target file
        mode: str like 'rb', 'w'
            See builtin ``open()``
        block_size: int
            Some indication of buffering - this is a value in bytes
        cache_options : dict, optional
            Extra arguments to pass through to the cache.
        compression: string or None
            If given, open file using compression codec. Can either be a compression
            name (a key in ``fsspec.compression.compr``) or "infer" to guess the
            compression from the filename suffix.
        encoding, errors, newline: passed on to TextIOWrapper for text mode
        """
        import io

        path = self._strip_protocol(path)
        if "b" not in mode:
            mode = mode.replace("t", "") + "b"

            text_kwargs = {
                k: kwargs.pop(k)
                for k in ["encoding", "errors", "newline"]
                if k in kwargs
            }
            return io.TextIOWrapper(
                self.open(
                    path,
                    mode,
                    block_size=block_size,
                    cache_options=cache_options,
                    compression=compression,
                    **kwargs,
                ),
                **text_kwargs,
            )
        else:
            ac = kwargs.pop("autocommit", not self._intrans)
            f = self._open(
                path,
                mode=mode,
                block_size=block_size,
                autocommit=ac,
                cache_options=cache_options,
                **kwargs,
            )
            if compression is not None:
                from fsspec.compression import compr
                from fsspec.core import get_compression

                compression = get_compression(path, compression)
                compress = compr[compression]
                f = compress(f, mode=mode[0])

            if not ac and "r" not in mode:
                self.transaction.files.append(f)
            return f
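
    # Usage sketch (illustrative, same assumptions as above): text mode wraps
    # the binary file in a TextIOWrapper, and compression can be inferred from
    # the suffix.
    #
    #   >>> with fs.open("/notes.txt", "w") as f:
    #   ...     _ = f.write("hello")
    #   >>> with fs.open("/notes.txt.gz", "wb", compression="infer") as f:
    #   ...     _ = f.write(b"hello")
    #   >>> fs.open("/notes.txt.gz", "rb", compression="infer").read()
    #   b'hello'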

    def touch(self, path, truncate=True, **kwargs):
        """Create empty file, or update timestamp

        Parameters
        ----------
        path: str
            file location
        truncate: bool
            If True, always set file size to 0; if False, update timestamp and
            leave file unchanged, if backend allows this
        """
        if truncate or not self.exists(path):
            with self.open(path, "wb", **kwargs):
                pass
        else:
            raise NotImplementedError  # update timestamp, if possible

    def ukey(self, path):
        """Hash of file properties, to tell if it has changed"""
        return sha256(str(self.info(path)).encode()).hexdigest()

    def read_block(self, fn, offset, length, delimiter=None):
        """Read a block of bytes from the given file

        Starting at ``offset`` of the file, read ``length`` bytes. If
        ``delimiter`` is set then we ensure that the read starts and stops at
        delimiter boundaries that follow the locations ``offset`` and ``offset
        + length``. If ``offset`` is zero then we start at zero. The
        bytestring returned WILL include the end delimiter string.

        If offset+length is beyond the eof, reads to eof.

        Parameters
        ----------
        fn: string
            Path to filename
        offset: int
            Byte offset to start read
        length: int
            Number of bytes to read. If None, read to end.
        delimiter: bytes (optional)
            Ensure reading starts and stops at delimiter bytestring

        Examples
        --------
        >>> fs.read_block('data/file.csv', 0, 13)  # doctest: +SKIP
        b'Alice, 100\\nBo'
        >>> fs.read_block('data/file.csv', 0, 13, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\n'

        Use ``length=None`` to read to the end of the file.
        >>> fs.read_block('data/file.csv', 0, None, delimiter=b'\\n')  # doctest: +SKIP
        b'Alice, 100\\nBob, 200\\nCharlie, 300'

        See Also
        --------
        :func:`fsspec.utils.read_block`
        """
        with self.open(fn, "rb") as f:
            size = f.size
            if length is None:
                length = size
            if size is not None and offset + length > size:
                length = size - offset
            return read_block(f, offset, length, delimiter)

    def to_json(self, *, include_password: bool = True) -> str:
        """
        JSON representation of this filesystem instance.

        Parameters
        ----------
        include_password: bool, default True
            Whether to include the password (if any) in the output.

        Returns
        -------
        JSON string with keys ``cls`` (the python location of this class),
        protocol (text name of this class's protocol, first one in case of
        multiple), ``args`` (positional args, usually empty), and all other
        keyword arguments as their own keys.

        Warnings
        --------
        Serialized filesystems may contain sensitive information which have been
        passed to the constructor, such as passwords and tokens. Make sure you
        store and send them in a secure environment!
        """
        from .json import FilesystemJSONEncoder

        return json.dumps(
            self,
            cls=type(
                "_FilesystemJSONEncoder",
                (FilesystemJSONEncoder,),
                {"include_password": include_password},
            ),
        )

    @staticmethod
    def from_json(blob: str) -> AbstractFileSystem:
        """
        Recreate a filesystem instance from JSON representation.

        See ``.to_json()`` for the expected structure of the input.

        Parameters
        ----------
        blob: str

        Returns
        -------
        file system instance, not necessarily of this particular class.

        Warnings
        --------
        This can import arbitrary modules (as determined by the ``cls`` key).
        Make sure you haven't installed any modules that may execute malicious code
        at import time.
        """
        from .json import FilesystemJSONDecoder

        return json.loads(blob, cls=FilesystemJSONDecoder)
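
    # Usage sketch (illustrative, same assumptions as above): round-tripping
    # through JSON should yield the cached instance, since reconstruction goes
    # through the same ``_Cached`` machinery.
    #
    #   >>> blob = fs.to_json()
    #   >>> AbstractFileSystem.from_json(blob) is fs
    #   True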

    def to_dict(self, *, include_password: bool = True) -> Dict[str, Any]:
        """
        JSON-serializable dictionary representation of this filesystem instance.

        Parameters
        ----------
        include_password: bool, default True
            Whether to include the password (if any) in the output.

        Returns
        -------
        Dictionary with keys ``cls`` (the python location of this class),
        protocol (text name of this class's protocol, first one in case of
        multiple), ``args`` (positional args, usually empty), and all other
        keyword arguments as their own keys.

        Warnings
        --------
        Serialized filesystems may contain sensitive information which have been
        passed to the constructor, such as passwords and tokens. Make sure you
        store and send them in a secure environment!
        """
        from .json import FilesystemJSONEncoder

        json_encoder = FilesystemJSONEncoder()

        cls = type(self)
        proto = self.protocol

        storage_options = dict(self.storage_options)
        if not include_password:
            storage_options.pop("password", None)

        return dict(
            cls=f"{cls.__module__}:{cls.__name__}",
            protocol=proto[0] if isinstance(proto, (tuple, list)) else proto,
            args=json_encoder.make_serializable(self.storage_args),
            **json_encoder.make_serializable(storage_options),
        )

    @staticmethod
    def from_dict(dct: Dict[str, Any]) -> AbstractFileSystem:
        """
        Recreate a filesystem instance from dictionary representation.

        See ``.to_dict()`` for the expected structure of the input.

        Parameters
        ----------
        dct: Dict[str, Any]

        Returns
        -------
        file system instance, not necessarily of this particular class.

        Warnings
        --------
        This can import arbitrary modules (as determined by the ``cls`` key).
        Make sure you haven't installed any modules that may execute malicious code
        at import time.
        """
        from .json import FilesystemJSONDecoder

        json_decoder = FilesystemJSONDecoder()

        dct = dict(dct)  # Defensive copy

        cls = FilesystemJSONDecoder.try_resolve_fs_cls(dct)
        if cls is None:
            raise ValueError("Not a serialized AbstractFileSystem")

        dct.pop("cls", None)
        dct.pop("protocol", None)

        return cls(
            *json_decoder.unmake_serializable(dct.pop("args", ())),
            **json_decoder.unmake_serializable(dct),
        )

    def _get_pyarrow_filesystem(self):
        """
        Make a version of the FS instance which will be acceptable to pyarrow
        """
        # all instances already also derive from pyarrow
        return self

    def get_mapper(self, root="", check=False, create=False, missing_exceptions=None):
        """Create key/value store based on this file-system

        Makes a MutableMapping interface to the FS at the given root path.
        See ``fsspec.mapping.FSMap`` for further details.
        """
        from .mapping import FSMap

        return FSMap(
            root,
            self,
            check=check,
            create=create,
            missing_exceptions=missing_exceptions,
        )
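
    # Usage sketch (illustrative, same assumptions as above): the mapper is a
    # MutableMapping of str keys to bytes values, rooted at the given path.
    #
    #   >>> m = fs.get_mapper("/root")
    #   >>> m["sub/data.bin"]
    #   b'\x00'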

    @classmethod
    def clear_instance_cache(cls):
        """
        Clear the cache of filesystem instances.

        Notes
        -----
        Unless overridden by setting the ``cachable`` class attribute to False,
        the filesystem class stores a reference to newly created instances. This
        prevents Python's normal rules around garbage collection from working,
        since the instances refcount will not drop to zero until
        ``clear_instance_cache`` is called.
        """
        cls._cache.clear()

    def created(self, path):
        """Return the created timestamp of a file as a datetime.datetime"""
        raise NotImplementedError

    def modified(self, path):
        """Return the modified timestamp of a file as a datetime.datetime"""
        raise NotImplementedError

    # ------------------------------------------------------------------------
    # Aliases

    def read_bytes(self, path, start=None, end=None, **kwargs):
        """Alias of `AbstractFileSystem.cat_file`."""
        return self.cat_file(path, start=start, end=end, **kwargs)

    def write_bytes(self, path, value, **kwargs):
        """Alias of `AbstractFileSystem.pipe_file`."""
        self.pipe_file(path, value, **kwargs)

    def makedir(self, path, create_parents=True, **kwargs):
        """Alias of `AbstractFileSystem.mkdir`."""
        return self.mkdir(path, create_parents=create_parents, **kwargs)

    def mkdirs(self, path, exist_ok=False):
        """Alias of `AbstractFileSystem.makedirs`."""
        return self.makedirs(path, exist_ok=exist_ok)

    def listdir(self, path, detail=True, **kwargs):
        """Alias of `AbstractFileSystem.ls`."""
        return self.ls(path, detail=detail, **kwargs)

    def cp(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.copy`."""
        return self.copy(path1, path2, **kwargs)

    def move(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.mv`."""
        return self.mv(path1, path2, **kwargs)

    def stat(self, path, **kwargs):
        """Alias of `AbstractFileSystem.info`."""
        return self.info(path, **kwargs)

    def disk_usage(self, path, total=True, maxdepth=None, **kwargs):
        """Alias of `AbstractFileSystem.du`."""
        return self.du(path, total=total, maxdepth=maxdepth, **kwargs)

    def rename(self, path1, path2, **kwargs):
        """Alias of `AbstractFileSystem.mv`."""
        return self.mv(path1, path2, **kwargs)

    def delete(self, path, recursive=False, maxdepth=None):
        """Alias of `AbstractFileSystem.rm`."""
        return self.rm(path, recursive=recursive, maxdepth=maxdepth)

    def upload(self, lpath, rpath, recursive=False, **kwargs):
        """Alias of `AbstractFileSystem.put`."""
        return self.put(lpath, rpath, recursive=recursive, **kwargs)

    def download(self, rpath, lpath, recursive=False, **kwargs):
        """Alias of `AbstractFileSystem.get`."""
        return self.get(rpath, lpath, recursive=recursive, **kwargs)

    def sign(self, path, expiration=100, **kwargs):
        """Create a signed URL representing the given path

        Some implementations allow temporary URLs to be generated, as a
        way of delegating credentials.

        Parameters
        ----------
        path : str
            The path on the filesystem
        expiration : int
            Number of seconds to enable the URL for (if supported)

        Returns
        -------
        URL : str
            The signed URL

        Raises
        ------
        NotImplementedError : if method is not implemented for a filesystem
        """
        raise NotImplementedError("Sign is not implemented for this filesystem")

    def _isfilestore(self):
        # Originally inherited from pyarrow DaskFileSystem. Keeping this
        # here for backwards compatibility as long as pyarrow uses its
        # legacy fsspec-compatible filesystems and thus accepts fsspec
        # filesystems as well
        return False


class AbstractBufferedFile(io.IOBase):
    """Convenient class to derive from to provide buffering

    In the case that the backend does not provide a pythonic file-like object
    already, this class contains much of the logic to build one. The only
    methods that need to be overridden are ``_upload_chunk``,
    ``_initiate_upload`` and ``_fetch_range``.
    """

    DEFAULT_BLOCK_SIZE = 5 * 2**20
    _details = None

    def __init__(
        self,
        fs,
        path,
        mode="rb",
        block_size="default",
        autocommit=True,
        cache_type="readahead",
        cache_options=None,
        size=None,
        **kwargs,
    ):
        """
        Template for files with buffered reading and writing

        Parameters
        ----------
        fs: instance of FileSystem
        path: str
            location in file-system
        mode: str
            Normal file modes. Currently only 'wb', 'ab' or 'rb'. Some file
            systems may be read-only, and some may not support append.
        block_size: int
            Buffer size for reading or writing, 'default' for class default
        autocommit: bool
            Whether to write to final destination; may only impact what
            happens when file is being closed.
        cache_type: {"readahead", "none", "mmap", "bytes"}, default "readahead"
            Caching policy in read mode. See the definitions in ``core``.
        cache_options : dict
            Additional options passed to the constructor for the cache specified
            by `cache_type`.
        size: int
            If given and in read mode, suppresses having to look up the file size
        kwargs:
            Gets stored as self.kwargs
        """
        from .core import caches

        self.path = path
        self.fs = fs
        self.mode = mode
        self.blocksize = (
            self.DEFAULT_BLOCK_SIZE if block_size in ["default", None] else block_size
        )
        self.loc = 0
        self.autocommit = autocommit
        self.end = None
        self.start = None
        self.closed = False

        if cache_options is None:
            cache_options = {}

        if "trim" in kwargs:
            warnings.warn(
                "Passing 'trim' to control the cache behavior has been deprecated. "
                "Specify it within the 'cache_options' argument instead.",
                FutureWarning,
            )
            cache_options["trim"] = kwargs.pop("trim")

        self.kwargs = kwargs

        if mode not in {"ab", "rb", "wb"}:
            raise NotImplementedError("File mode not supported")
        if mode == "rb":
            if size is not None:
                self.size = size
            else:
                self.size = self.details["size"]
            self.cache = caches[cache_type](
                self.blocksize, self._fetch_range, self.size, **cache_options
            )
        else:
            self.buffer = io.BytesIO()
            self.offset = None
            self.forced = False
            self.location = None

    @property
    def details(self):
        if self._details is None:
            self._details = self.fs.info(self.path)
        return self._details

    @details.setter
    def details(self, value):
        self._details = value
        self.size = value["size"]

    @property
    def full_name(self):
        return _unstrip_protocol(self.path, self.fs)

    @property
    def closed(self):
        # get around this attr being read-only in IOBase
        # use getattr here, since this can be called during del
        return getattr(self, "_closed", True)

    @closed.setter
    def closed(self, c):
        self._closed = c

    def __hash__(self):
        if "w" in self.mode:
            return id(self)
        else:
            return int(tokenize(self.details), 16)

    def __eq__(self, other):
        """Files are equal if they have the same checksum, only in read mode"""
        if self is other:
            return True
        return (
            isinstance(other, type(self))
            and self.mode == "rb"
            and other.mode == "rb"
            and hash(self) == hash(other)
        )

    def commit(self):
        """Move from temp to final destination"""

    def discard(self):
        """Throw away temporary file"""

    def info(self):
        """File information about this path"""
        if "r" in self.mode:
            return self.details
        else:
            raise ValueError("Info not available while writing")

    def tell(self):
        """Current file location"""
        return self.loc

    def seek(self, loc, whence=0):
        """Set current file location

        Parameters
        ----------
        loc: int
            byte location
        whence: {0, 1, 2}
            from start of file, current location or end of file, resp.
        """
        loc = int(loc)
        if not self.mode == "rb":
            raise OSError(ESPIPE, "Seek only available in read mode")
        if whence == 0:
            nloc = loc
        elif whence == 1:
            nloc = self.loc + loc
        elif whence == 2:
            nloc = self.size + loc
        else:
            raise ValueError(f"invalid whence ({whence}, should be 0, 1 or 2)")
        if nloc < 0:
            raise ValueError("Seek before start of file")
        self.loc = nloc
        return self.loc

    def write(self, data):
        """
        Write data to buffer.

        Buffer only sent on flush() or if buffer is greater than
        or equal to blocksize.

        Parameters
        ----------
        data: bytes
            Set of bytes to be written.
        """
        if self.mode not in {"wb", "ab"}:
            raise ValueError("File not in write mode")
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if self.forced:
            raise ValueError("This file has been force-flushed, can only close")
        out = self.buffer.write(data)
        self.loc += out
        if self.buffer.tell() >= self.blocksize:
            self.flush()
        return out

    def flush(self, force=False):
        """
        Write buffered data to backend store.

        Writes the current buffer, if it is larger than the block-size, or if
        the file is being closed.

        Parameters
        ----------
        force: bool
            When closing, write the last block even if it is smaller than
            blocks are allowed to be. Disallows further writing to this file.
        """
        if self.closed:
            raise ValueError("Flush on closed file")
        if force and self.forced:
            raise ValueError("Force flush cannot be called more than once")
        if force:
            self.forced = True

        if self.mode not in {"wb", "ab"}:
            # no-op to flush on read-mode
            return

        if not force and self.buffer.tell() < self.blocksize:
            # Defer write on small block
            return

        if self.offset is None:
            # Initialize a multipart upload
            self.offset = 0
            try:
                self._initiate_upload()
            except:  # noqa: E722
                self.closed = True
                raise

        if self._upload_chunk(final=force) is not False:
            self.offset += self.buffer.seek(0, 2)
            self.buffer = io.BytesIO()
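
    # Lifecycle sketch (illustrative, for subclasses of this file type): data
    # accumulates in ``self.buffer`` until it reaches ``blocksize``; the first
    # flush calls ``_initiate_upload`` and each flush then sends one chunk via
    # ``_upload_chunk``. ``some_fs`` below is a hypothetical backend.
    #
    #   >>> f = some_fs.open("remote/big.bin", "wb")
    #   >>> f.write(b"x" * 100)   # buffered locally, below blocksize
    #   100
    #   >>> f.close()             # force-flushes the final (small) block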

    def _upload_chunk(self, final=False):
        """Write one part of a multi-block file upload

        Parameters
        ----------
        final: bool
            This is the last block, so should complete file, if
            self.autocommit is True.
        """
        # may not yet have been initialized, may need to call _initiate_upload

    def _initiate_upload(self):
        """Create remote file/upload"""
        pass

    def _fetch_range(self, start, end):
        """Get the specified set of bytes from remote"""
        raise NotImplementedError

    def read(self, length=-1):
        """
        Return data from cache, or fetch pieces as necessary

        Parameters
        ----------
        length: int (-1)
            Number of bytes to read; if <0, all remaining bytes.
        """
        length = -1 if length is None else int(length)
        if self.mode != "rb":
            raise ValueError("File not in read mode")
        if length < 0:
            length = self.size - self.loc
        if self.closed:
            raise ValueError("I/O operation on closed file.")
        if length == 0:
            # don't even bother calling fetch
            return b""
        out = self.cache._fetch(self.loc, self.loc + length)

        logger.debug(
            "%s read: %i - %i %s",
            self,
            self.loc,
            self.loc + length,
            self.cache._log_stats(),
        )
        self.loc += len(out)
        return out

    def readinto(self, b):
        """mirrors builtin file's readinto method

        https://docs.python.org/3/library/io.html#io.RawIOBase.readinto
        """
        out = memoryview(b).cast("B")
        data = self.read(out.nbytes)
        out[: len(data)] = data
        return len(data)

    def readuntil(self, char=b"\n", blocks=None):
        """Return data between current position and first occurrence of char

        char is included in the output, except if the end of the file is
        encountered first.

        Parameters
        ----------
        char: bytes
            Thing to find
        blocks: None or int
            How much to read in each go. Defaults to file blocksize - which may
            mean a new read on every call.
        """
        out = []
        while True:
            start = self.tell()
            part = self.read(blocks or self.blocksize)
            if len(part) == 0:
                break
            found = part.find(char)
            if found > -1:
                out.append(part[: found + len(char)])
                self.seek(start + found + len(char))
                break
            out.append(part)
        return b"".join(out)

    def readline(self):
        """Read until first occurrence of newline character

        Note that, because of character encoding, this is not necessarily a
        true line ending.
        """
        return self.readuntil(b"\n")

    def __next__(self):
        out = self.readline()
        if out:
            return out
        raise StopIteration

    def __iter__(self):
        return self

    def readlines(self):
        """Return all data, split by the newline character"""
        data = self.read()
        lines = data.split(b"\n")
        out = [l + b"\n" for l in lines[:-1]]
        if data.endswith(b"\n"):
            return out
        else:
            return out + [lines[-1]]
        # return list(self) ???

    def readinto1(self, b):
        return self.readinto(b)

    def close(self):
        """Close file

        Finalizes writes, discards cache
        """
        if getattr(self, "_unclosable", False):
            return
        if self.closed:
            return
        if self.mode == "rb":
            self.cache = None
        else:
            if not self.forced:
                self.flush(force=True)

            if self.fs is not None:
                self.fs.invalidate_cache(self.path)
                self.fs.invalidate_cache(self.fs._parent(self.path))

        self.closed = True

    def readable(self):
        """Whether opened for reading"""
        return self.mode == "rb" and not self.closed

    def seekable(self):
        """Whether is seekable (only in read mode)"""
        return self.readable()

    def writable(self):
        """Whether opened for writing"""
        return self.mode in {"wb", "ab"} and not self.closed

    def __del__(self):
        if not self.closed:
            self.close()

    def __str__(self):
        return f"<File-like object {type(self.fs).__name__}, {self.path}>"

    __repr__ = __str__

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()