PTWZ's picture
Upload folder using huggingface_hub
f5f3483 verified
# Copyright 2024 The etils Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""`os.path` API backend."""
from __future__ import annotations
import abc
import contextlib
import functools
import glob as glob_lib
import os
import pathlib
import shutil
import stat as stat_lib
import typing
from typing import Callable, Iterator, NoReturn, Optional, Union
from etils.epath import stat_utils
from etils.epath.typing import PathLike # pylint: disable=g-importing-member
if typing.TYPE_CHECKING:
import fsspec
class Backend(abc.ABC):
  """Abstract backend class.

  Declares the filesystem operations that each concrete backend provides.
  Every method is abstract, so this class cannot be instantiated directly.
  """

  @abc.abstractmethod
  def open(
      self,
      path: PathLike,
      mode: str,
  ) -> typing.IO[Union[str, bytes]]:
    """`open`. Encoding should be utf-8."""
    raise NotImplementedError()

  @abc.abstractmethod
  def exists(self, path: PathLike) -> bool:
    """Returns whether `path` exists."""
    raise NotImplementedError()

  @abc.abstractmethod
  def isdir(self, path: PathLike) -> bool:
    """Returns whether `path` is a directory."""
    raise NotImplementedError()

  @abc.abstractmethod
  def listdir(self, path: PathLike) -> list[str]:
    """Returns the names of the entries contained in `path`."""
    raise NotImplementedError()

  @abc.abstractmethod
  def glob(self, path: PathLike) -> list[str]:
    """Returns the paths matching the glob pattern `path`."""
    raise NotImplementedError()

  @abc.abstractmethod
  def walk(
      self,
      top: PathLike,
      *,
      top_down: bool = True,
      on_error: Callable[[OSError], object] | None = None,
  ) -> Iterator[tuple[PathLike, list[str], list[str]]]:
    """Yields `(dirpath, dirnames, filenames)` tuples for the tree at `top`."""
    raise NotImplementedError()

  @abc.abstractmethod
  def makedirs(
      self,
      path: PathLike,
      *,
      exist_ok: bool = False,
      mode: Optional[int] = None,
  ) -> None:
    """Creates `path`, including the missing parent directories."""
    raise NotImplementedError()

  @abc.abstractmethod
  def mkdir(
      self,
      path: PathLike,
      *,
      exist_ok: bool = False,
      mode: Optional[int] = None,
  ) -> None:
    """Creates the single directory `path`."""
    raise NotImplementedError()

  @abc.abstractmethod
  def rmtree(self, path: PathLike) -> None:
    """Removes `path` recursively."""
    raise NotImplementedError()

  @abc.abstractmethod
  def remove(self, path: PathLike) -> None:
    """Removes `path`."""
    raise NotImplementedError()

  @abc.abstractmethod
  def rename(self, path: PathLike, dst: PathLike) -> None:
    """Renames `path` to `dst`."""
    raise NotImplementedError()

  @abc.abstractmethod
  def replace(self, path: PathLike, dst: PathLike) -> None:
    """Moves `path` to `dst`, overwriting `dst`."""
    raise NotImplementedError()

  @abc.abstractmethod
  def copy(self, path: PathLike, dst: PathLike, overwrite: bool) -> None:
    """Copies `path` to `dst`; `overwrite` allows replacing an existing `dst`."""
    raise NotImplementedError()

  @abc.abstractmethod
  def stat(self, path: PathLike) -> stat_utils.StatResult:
    """Returns the `StatResult` describing `path`."""
    raise NotImplementedError()
class _OsPathBackend(Backend):
  """`os.path` backend, implemented with `os`, `shutil`, `glob`, `pathlib`."""

  def open(
      self,
      path: PathLike,
      mode: str,
  ) -> typing.IO[Union[str, bytes]]:
    """Opens `path` with the builtin `open` (utf-8 in text mode)."""
    # `encoding=` is only valid for text mode.
    encoding = None if 'b' in mode else 'utf-8'
    return open(path, mode, encoding=encoding)

  def exists(self, path: PathLike) -> bool:
    """Returns `os.path.exists(path)`."""
    return os.path.exists(path)

  def isdir(self, path: PathLike) -> bool:
    """Returns `os.path.isdir(path)`."""
    return os.path.isdir(path)

  def listdir(self, path: PathLike) -> list[str]:
    """Lists the entries of `path`, skipping names ending with `~`."""
    # GFile filter backup files per default.
    return [p for p in os.listdir(path) if not p.endswith('~')]

  def glob(self, path: PathLike) -> list[str]:
    """Returns the paths matching the glob pattern `path`."""
    return glob_lib.glob(path)

  def walk(
      self,
      top: PathLike,
      *,
      top_down: bool = True,
      on_error: Callable[[OSError], object] | None = None,
  ) -> Iterator[tuple[PathLike, list[str], list[str]]]:
    """Yields `(dirpath, dirnames, filenames)` for the tree rooted at `top`."""
    if hasattr(pathlib.Path, 'walk'):  # Python 3.12
      yield from pathlib.Path(top).walk(top_down=top_down, on_error=on_error)
    else:  # Backward compatibility
      # Note that `os.walk` is inconsistent for `symlinks` (always marked as
      # filenames), but should be fine.
      yield from os.walk(top, topdown=top_down, onerror=on_error)

  def makedirs(
      self,
      path: PathLike,
      *,
      exist_ok: bool = False,
      mode: Optional[int] = None,
  ) -> None:
    """Creates `path` and its missing parents (`mode` defaults to 0o777)."""
    mode = 0o777 if mode is None else mode
    os.makedirs(path, exist_ok=exist_ok, mode=mode)

  def mkdir(
      self,
      path: PathLike,
      *,
      exist_ok: bool = False,
      mode: Optional[int] = None,
  ) -> None:
    """Creates the directory `path` (`mode` defaults to 0o777).

    Raises:
      FileExistsError: If `path` already exists, unless `exist_ok=True` and
        the existing `path` is a directory.
    """
    mode = 0o777 if mode is None else mode
    try:
      os.mkdir(path, mode=mode)
    except FileExistsError:
      # Only an already existing *directory* can be silently accepted;
      # an existing file is always an error.
      if not (exist_ok and self.isdir(path)):
        raise

  def rmtree(self, path: PathLike) -> None:
    """Removes `path` recursively; falls back to `remove` for non-directory."""
    try:
      shutil.rmtree(path)
    except NotADirectoryError:
      self.remove(path)

  def remove(self, path: PathLike) -> None:
    """Removes the file `path`, or the directory `path` through `os.rmdir`."""
    try:
      os.remove(path)
    except IsADirectoryError:
      os.rmdir(path)
    except PermissionError:
      # On Mac, `PermissionError` is raised instead of `IsADirectoryError`
      if self.isdir(path):
        os.rmdir(path)
      else:
        raise

  def rename(self, path: PathLike, dst: PathLike) -> None:
    """Renames `path` to `dst`.

    Raises:
      FileExistsError: If `dst` already exists.
    """
    if self.exists(dst):
      raise FileExistsError(
          f'Cannot rename {path}. Destination {dst} already exists.'
      )
    os.rename(path, dst)

  def replace(self, path: PathLike, dst: PathLike) -> None:
    """Moves `path` to `dst` with `os.replace`.

    Raises:
      IsADirectoryError: If `dst` is a directory.
    """
    if self.isdir(dst):
      raise IsADirectoryError(f'Cannot overwrite: {dst} is a directory')
    os.replace(path, dst)

  def copy(self, path: PathLike, dst: PathLike, overwrite: bool) -> None:
    """Copies the file `path` to `dst` with `shutil.copyfile`.

    Raises:
      FileExistsError: If `dst` exists and `overwrite` is false.
    """
    if not overwrite and self.exists(dst):
      raise FileExistsError(f'{dst} already exists. Cannot copy {path}.')
    shutil.copyfile(path, dst)

  def stat(self, path: PathLike) -> stat_utils.StatResult:
    """Returns the `StatResult` built from `os.stat(path)`.

    `owner` and `group` are `None` on Windows, and also when the uid / gid
    of the file has no entry in the password / group database.
    """
    st = os.stat(path)
    owner = None
    group = None
    if os.name != 'nt':
      # `grp` and `pwd` are imported lazily since the `nt` branch never
      # uses them.
      import grp  # pylint: disable=g-import-not-at-top
      import pwd  # pylint: disable=g-import-not-at-top

      # `getpwuid` / `getgrgid` raise `KeyError` when the id is unknown
      # (e.g. files owned by a uid which is not declared in the container).
      # This should not prevent to `stat` the file.
      try:
        owner = pwd.getpwuid(st.st_uid).pw_name
      except KeyError:
        owner = None
      try:
        group = grp.getgrgid(st.st_gid).gr_name
      except KeyError:
        group = None
    return stat_utils.StatResult(
        is_directory=stat_lib.S_ISDIR(st.st_mode),
        length=st.st_size,
        mtime=int(st.st_mtime),
        owner=owner,
        group=group,
        mode=st.st_mode,
    )
class _TfBackend(Backend):
  """TensorFlow backend, implemented on top of `tf.io.gfile`."""

  @property
  def tf(self):
    """The lazily imported `tensorflow` module."""
    try:
      import tensorflow  # pylint: disable=g-import-not-at-top  # pytype: disable=import-error
    except ImportError as err:
      raise ImportError(
          f'{err}. To use epath.Path with gs://, TensorFlow should be installed.'
      ) from None
    else:
      return tensorflow

  @property
  def gfile(self):
    """Shortcut for `tf.io.gfile`."""
    return self.tf.io.gfile

  @contextlib.contextmanager
  def open(
      self,
      path: PathLike,
      mode: str,
  ) -> Iterator[typing.IO[Union[str, bytes]]]:
    """Context manager yielding the `GFile` opened on `path`.

    A TF `NotFoundError` raised while the file is in use is converted to
    `FileNotFoundError`.
    """
    not_found_cls = self.tf.errors.NotFoundError
    with self.gfile.GFile(path, mode) as gfile_obj:  # pytype: disable=bad-return-type
      try:
        yield gfile_obj
      except not_found_cls as err:
        raise FileNotFoundError(err) from None

  def exists(self, path: PathLike) -> bool:
    """Returns `tf.io.gfile.exists(path)`."""
    return self.gfile.exists(path)

  def isdir(self, path: PathLike) -> bool:
    """Returns `tf.io.gfile.isdir(path)`."""
    return self.gfile.isdir(path)

  def listdir(self, path: PathLike) -> list[str]:
    """Returns `tf.io.gfile.listdir(path)`."""
    return self.gfile.listdir(path)

  def glob(self, path: PathLike) -> list[str]:
    """Returns `tf.io.gfile.glob(path)`."""
    return self.gfile.glob(path)

  def walk(
      self,
      top: PathLike,
      *,
      top_down: bool = True,
      on_error: Callable[[OSError], object] | None = None,
  ) -> Iterator[tuple[PathLike, list[str], list[str]]]:
    """Yields the items of `tf.io.gfile.walk(top)`."""
    yield from self.gfile.walk(top, topdown=top_down, onerror=on_error)

  def makedirs(
      self,
      path: PathLike,
      *,
      exist_ok: bool = False,
      mode: Optional[int] = None,
  ) -> None:
    """Creates `path` with `tf.io.gfile.makedirs`.

    Raises:
      NotImplementedError: If `mode` is neither `None` nor `0o777`.
      FileExistsError: If `path` exists and `exist_ok` is false, or if TF
        reports a `not a directory` failed precondition.
      OSError: For the other TF failed preconditions.
    """
    # tf.io.gfile do not support setting `mode=`
    if mode is not None and mode != 0o777:
      raise NotImplementedError(
          'makedirs with custom `mode=` not supported for tf.io.gfile backend.'
          ' Please open an issue.'
      )
    # TF do not have a `exist_ok=` kwargs, so have to first check existence.
    # This has performance impact but can be disabled with `exist_ok=True`.
    if not exist_ok and self.exists(path):
      raise FileExistsError(f'{path} already exists.')
    try:
      self.gfile.makedirs(path)
    except self.tf.errors.FailedPreconditionError as err:
      msg = str(err)
      exc_cls = FileExistsError if 'not a directory' in msg else OSError
      raise exc_cls(msg) from None

  def mkdir(
      self,
      path: PathLike,
      *,
      exist_ok: bool = False,
      mode: Optional[int] = None,
  ) -> None:
    """Creates `path` with `tf.io.gfile.mkdir`.

    Raises:
      NotImplementedError: If `mode` is neither `None` nor `0o777`.
      FileExistsError: If `path` exists and `exist_ok` is false, or if `path`
        is not a directory after the call.
      FileNotFoundError: If TF raises `NotFoundError`.
    """
    # tf.io.gfile do not support setting `mode=`
    if mode is not None and mode != 0o777:
      raise NotImplementedError(
          'mkdir with custom `mode=` not supported for tf.io.gfile backend.'
          ' Please open an issue.'
      )
    if not exist_ok and self.exists(path):
      raise FileExistsError(f'{path} already exists.')
    try:
      self.gfile.mkdir(path)
    except self.tf.errors.NotFoundError as err:
      raise FileNotFoundError(str(err)) from None
    # Only reached when `mkdir` succeeded.
    if not self.isdir(path):  # TF do not raises error for files
      raise FileExistsError(f'Cannot create dir. {path} is not a directory')

  def rmtree(self, path: PathLike) -> None:
    """Removes `path` recursively with `tf.io.gfile.rmtree`."""
    try:
      self.gfile.rmtree(path)
    except self.tf.errors.NotFoundError as err:
      raise FileNotFoundError(str(err)) from None

  def remove(self, path: PathLike) -> None:
    """Removes `path` with `tf.io.gfile.remove`."""
    try:
      self.gfile.remove(path)
    except self.tf.errors.FailedPreconditionError as err:  # Dir not empty
      raise OSError(str(err)) from None
    except self.tf.errors.NotFoundError as err:
      raise FileNotFoundError(str(err)) from None

  def rename(self, path: PathLike, dst: PathLike) -> None:
    """Renames `path` to `dst` with `tf.io.gfile.rename`."""
    try:
      self.gfile.rename(path, dst)
    except self.tf.errors.OpError as err:
      self._reraise_error(err)

  def replace(self, path: PathLike, dst: PathLike) -> None:
    """Renames `path` to `dst` with `overwrite=True`."""
    try:
      self.gfile.rename(path, dst, overwrite=True)
    except self.tf.errors.OpError as err:
      self._reraise_error(err)

  def copy(self, path: PathLike, dst: PathLike, overwrite: bool) -> None:
    """Copies `path` to `dst` with `tf.io.gfile.copy`.

    Raises:
      IsADirectoryError: If `overwrite` is true and `dst` is a directory.
    """
    if overwrite and self.isdir(dst):  # For consistency with rename, replace
      raise IsADirectoryError(
          f'Cannot copy {path}. Destination {dst} is a directory'
      ) from None
    try:
      self.gfile.copy(path, dst, overwrite=overwrite)
    except self.tf.errors.OpError as err:
      self._reraise_error(err)

  def _reraise_error(self, e) -> NoReturn:
    """Reraise the TF error.

    Maps the TF error `e` to the matching builtin `OSError` subclass. Has to
    be called from inside an `except` block: errors which are not mapped are
    propagated with a bare `raise`.

    Args:
      e: The TF error to convert.
    """
    msg = str(e)
    lowered_msg = msg.lower()
    tf_errors = self.tf.errors

    if isinstance(e, tf_errors.FailedPreconditionError):
      if 'not a directory' in lowered_msg:
        raise NotADirectoryError(msg) from None
      if 'is a directory' in lowered_msg:
        raise IsADirectoryError(msg) from None
      raise OSError(msg) from None

    if isinstance(e, tf_errors.AlreadyExistsError):
      if 'is a directory' in lowered_msg:
        raise IsADirectoryError(msg) from None
      raise FileExistsError(msg) from None

    if isinstance(e, tf_errors.NotFoundError):
      raise FileNotFoundError(msg) from None

    raise  # pylint: disable=misplaced-bare-raise

  def stat(self, path: PathLike) -> stat_utils.StatResult:
    """Returns the `StatResult` built from `tf.io.gfile.stat(path)`."""
    gfile_stat = self.gfile.stat(path)
    return stat_utils.StatResult(
        is_directory=gfile_stat.is_directory,
        length=gfile_stat.length,
        # Nanoseconds to seconds.
        mtime=gfile_stat.mtime_nsec // 1_000_000_000,
        owner=None,  # Not available.
        group=None,  # Not available.
        mode=None,
    )
class _FileSystemSpecBackend(Backend):
  """FileSystemSpec backend entirely relying on fsspec."""

  @functools.lru_cache()
  def _get_filesystem(self, name: str) -> fsspec.AbstractFileSystem:
    """Returns the `fsspec` filesystem registered as `name`.

    The result is cached by `functools.lru_cache` (keyed on `self` and
    `name`), so each filesystem is only created once.

    Args:
      name: Name of the fsspec filesystem (e.g. `'gcs'`, `'s3'`, `'file'`).

    Raises:
      ImportError: If `fsspec` is not installed.
    """
    try:
      import fsspec  # pylint: disable=g-import-not-at-top
    except ImportError as e:
      raise ImportError(
          'To use epath.Path with gs://, fsspec should be installed.'
      ) from e
    return fsspec.filesystem(name)

  def fs(self, path: PathLike) -> fsspec.AbstractFileSystem:
    """Returns the fsspec filesystem matching the prefix of `path`.

    `gs://` maps to `'gcs'`, `s3://` to `'s3'`, `az://` to `'az'` and
    everything else to `'file'`.
    """
    path = os.fspath(path)
    if path.startswith('gs://'):
      return self._get_filesystem('gcs')
    elif path.startswith('s3://'):
      return self._get_filesystem('s3')
    elif path.startswith('az://'):
      return self._get_filesystem('az')
    else:
      return self._get_filesystem('file')

  def open(self, path: PathLike, mode: str) -> typing.IO[Union[str, bytes]]:
    """Opens `path` through fsspec (utf-8 in text mode)."""
    if 'b' in mode:
      return self.fs(path).open(path, mode=mode)
    # Text mode: fsspec forwards `encoding=` to the `io.TextIOWrapper` it
    # creates. Without it, the encoding would depend on the locale.
    return self.fs(path).open(path, mode=mode, encoding='utf-8')

  def exists(self, path: PathLike) -> bool:
    """Returns whether `path` exists."""
    return self.fs(path).exists(path)

  def isdir(self, path: PathLike) -> bool:
    """Returns whether `path` is a directory."""
    return self.fs(path).isdir(path)

  def listdir(self, path: PathLike) -> list[str]:
    """Returns the basenames of the entries of `path`, skipping `~` files."""
    paths = self.fs(path).listdir(path, detail=False)
    return [os.path.basename(p) for p in paths if not p.endswith('~')]

  def glob(self, path: PathLike) -> list[str]:
    """Returns the paths matching `path`, prefixed by the protocol of `path`."""
    protocol = _get_protocol(path)
    return [protocol + p for p in self.fs(path).glob(path)]

  def walk(
      self,
      top: PathLike,
      *,
      top_down: bool = True,
      on_error: Callable[[OSError], object] | None = None,
  ) -> Iterator[tuple[PathLike, list[str], list[str]]]:
    """Yields the items of the fsspec `walk` of `top`."""
    if on_error is None:
      on_error = 'omit'  # default behavior for pathlib.Path.walk
    yield from self.fs(top).walk(  # pytype: disable=bad-return-type
        top,
        topdown=top_down,
        on_error=on_error,
        max_depth=None,
    )

  def makedirs(
      self,
      path: PathLike,
      *,
      exist_ok: bool = False,
      mode: Optional[int] = None,
  ) -> None:
    """Creates `path` and its missing parents.

    Raises:
      NotImplementedError: If `mode` is neither `None` nor `0o777`.
    """
    mode = 0o777 if mode is None else mode
    if mode != 0o777:
      # FileSystemSpec backend do not support setting `mode=`
      raise NotImplementedError(
          'makedirs with custom `mode=` not supported for FileSystemSpec'
          ' backend. Please open an issue.'
      )
    return self.fs(path).makedirs(path, exist_ok=exist_ok)

  def mkdir(
      self,
      path: PathLike,
      *,
      exist_ok: bool = False,
      mode: Optional[int] = None,
  ) -> None:
    """Creates the directory `path`, without creating the parents.

    Raises:
      NotImplementedError: If `mode` is neither `None` nor `0o777`.
      FileExistsError: If `path` already exists, unless `exist_ok=True` and
        the existing `path` is a directory.
    """
    mode = 0o777 if mode is None else mode
    if mode != 0o777:
      # FileSystemSpec backend do not support setting `mode=`
      raise NotImplementedError(
          'mkdir with custom `mode=` not supported for FileSystemSpec backend.'
          ' Please open an issue.'
      )
    try:
      return self.fs(path).mkdir(path, create_parents=False)
    except FileExistsError:
      if exist_ok and self.isdir(path):
        return
      raise FileExistsError(
          f'The operation failed because the specified {path=} already exists.'
      ) from None

  def rmtree(self, path: PathLike) -> None:
    """Removes `path` recursively."""
    return self.fs(path).rm(path, recursive=True)

  def remove(self, path: PathLike) -> None:
    """Removes `path`; falls back to `rmdir` when `rm` rejects it."""
    try:
      return self.fs(path).rm(path, recursive=False)
    except (IsADirectoryError, ValueError):
      return self.fs(path).rmdir(path)

  def rename(self, path: PathLike, dst: PathLike) -> None:
    """Renames `path` to `dst`.

    Raises:
      FileExistsError: If `dst` already exists.
      FileNotFoundError: If the parent folder of `dst` does not exist.
    """
    if self.exists(dst):
      raise FileExistsError(f'{dst} already exists. Cannot rename {path}.')
    # Check that `dst` is in an existing folder
    dst_folder = os.path.dirname(dst)
    if not self.exists(dst_folder):
      raise FileNotFoundError(f'folder {dst_folder} does not exist')
    # Stringify paths, because PosixPaths do not implement len:
    path, dst = os.fspath(path), os.fspath(dst)
    return self.fs(path).rename(path, dst, recursive=True)

  def replace(self, path: PathLike, dst: PathLike) -> None:
    """Moves `path` to `dst`, overwriting the file `dst` if it exists.

    Raises:
      IsADirectoryError: If `dst` is a directory.
      FileNotFoundError: If `path` or the parent folder of `dst` does not
        exist.
      NotADirectoryError: If `path` is a directory and `dst` exists.
    """
    if self.isdir(dst):
      raise IsADirectoryError(f'Cannot overwrite: {dst} is a directory')
    if not self.exists(path):
      raise FileNotFoundError(f'Cannot replace: path {path} does not exist')
    if self.isdir(path) and self.exists(dst):
      raise NotADirectoryError(
          f'Cannot replace a directory {path} by a file {dst}'
      )
    # Check that `dst` is in an existing folder
    dst_folder = os.path.dirname(dst)
    if not self.exists(dst_folder):
      raise FileNotFoundError(f'folder {dst_folder} does not exist')
    # Stringify paths, because PosixPaths do not implement len:
    path, dst = os.fspath(path), os.fspath(dst)
    return self.fs(path).rename(path, dst, recursive=True)

  def copy(self, path: PathLike, dst: PathLike, overwrite: bool) -> None:
    """Copies the file `path` to `dst`, streaming by chunks of 1 MB.

    Raises:
      FileExistsError: If `dst` exists and `overwrite` is false.
      IsADirectoryError: If `path` is a directory, or if `overwrite` is true
        and `dst` is a directory.
    """
    if not overwrite and self.exists(dst):
      raise FileExistsError(f'{dst} already exists. Cannot copy {path}.')
    is_dir_dst = self.isdir(dst)
    if self.isdir(path) and not is_dir_dst:
      raise IsADirectoryError(
          f'Cannot copy to {dst}. Path {path} is a directory'
      )
    if overwrite and is_dir_dst:
      raise IsADirectoryError(
          f'Cannot overwrite {path}. Destination {dst} is a directory'
      )
    # Stringify paths, because PosixPaths do not implement len:
    path, dst = os.fspath(path), os.fspath(dst)
    chunksize = 1024 * 1024  # 1 MB
    with self.open(path, mode='rb') as src_file:
      with self.open(dst, mode='wb') as dst_file:
        shutil.copyfileobj(src_file, dst_file, chunksize)

  def stat(self, path: PathLike) -> stat_utils.StatResult:
    """Returns the `StatResult` built from the fsspec `info` of `path`.

    Fields missing from `info` are set to `None`, except `mtime` which
    defaults to `0`.
    """
    info = self.fs(path).info(path)
    mtime = int(info.get('mtime', 0.0))
    return stat_utils.StatResult(
        is_directory=info.get('type') == 'directory',
        length=info.get('size'),
        mtime=mtime,
        owner=info.get('owner'),
        group=info.get('group'),
        mode=info.get('mode'),
    )
def _get_protocol(path: PathLike) -> str:
  """Returns the protocol prefix of `path` (e.g. `'gs://'`), or `''` if none."""
  scheme, separator, _ = os.fspath(path).partition('://')
  # `separator` is empty when `path` does not contain `://`.
  return scheme + separator if separator else ''
# Module-level backend instances: one shared instance per implementation.
tf_backend = _TfBackend()
os_backend = _OsPathBackend()
fsspec_backend = _FileSystemSpecBackend()