import logging
import tarfile

import fsspec
from fsspec.archive import AbstractArchiveFileSystem
from fsspec.compression import compr
from fsspec.utils import infer_compression

# Maps the tar header type flag to the type string stored in info dicts.
# Any flag not listed here is reported as "file" by the lookups below.
typemap = {b"0": "file", b"5": "directory"}

logger = logging.getLogger("tar")


class TarFileSystem(AbstractArchiveFileSystem):
    """Compressed Tar archives as a file-system (read-only)

    Supports the following formats:
    tar.gz, tar.bz2, tar.xz
    """

    root_marker = ""
    protocol = "tar"
    cachable = False

    def __init__(
        self,
        fo="",
        index_store=None,
        target_options=None,
        target_protocol=None,
        compression=None,
        **kwargs,
    ):
        """Open a tar archive and index its members.

        Parameters
        ----------
        fo : str or file-like
            If a string, it is opened with ``fsspec.open`` using
            ``target_protocol`` and ``target_options``. Anything else is used
            directly as the archive's file object.
        index_store : object, optional
            Stored on the instance as ``index_store``; not otherwise used by
            the code in this class yet (see the TODOs in ``_index``).
        target_options : dict, optional
            Extra keyword arguments forwarded to ``fsspec.open``.
        target_protocol : str, optional
            Passed to ``fsspec.open`` as ``protocol``.
        compression : str, optional
            Key into ``fsspec.compression.compr`` selecting the decompressor
            to wrap ``fo`` with. When ``None``, it is inferred from the file
            name if one can be determined.
        **kwargs
            Forwarded to the superclass constructor.
        """
        super().__init__(**kwargs)
        target_options = target_options or {}

        if isinstance(fo, str):
            self.of = fsspec.open(fo, protocol=target_protocol, **target_options)
            fo = self.of.open()  # keep the reference

        # Try to infer compression.
        if compression is None:
            name = None

            # Try different ways to get hold of the filename. `fo` might either
            # be a `fsspec.LocalFileOpener`, an `io.BufferedReader` or an
            # `fsspec.AbstractFileSystem` instance.
            try:
                # Amended io.BufferedReader or similar.
                # This uses a "protocol extension" where original filenames are
                # propagated to archive-like filesystems in order to let them
                # infer the right compression appropriately.
                if hasattr(fo, "original"):
                    name = fo.original

                # fsspec.LocalFileOpener
                elif hasattr(fo, "path"):
                    name = fo.path

                # io.BufferedReader
                elif hasattr(fo, "name"):
                    name = fo.name

                # fsspec.AbstractFileSystem
                elif hasattr(fo, "info"):
                    name = fo.info()["name"]

            except Exception as ex:
                logger.warning(
                    f"Unable to determine file name, not inferring compression: {ex}"
                )

            if name is not None:
                try:
                    compression = infer_compression(name)
                except (TypeError, AttributeError) as ex:
                    # `name` is not always a path: a file object created from
                    # a file descriptor reports the integer descriptor as its
                    # `.name`. Treat that like an unknown name instead of
                    # failing the whole constructor.
                    logger.warning(
                        f"Unable to determine file name, not inferring compression: {ex}"
                    )
                else:
                    logger.info(
                        f"Inferred compression {compression} from file name {name}"
                    )

        if compression is not None:
            # TODO: tarfile already implements compression with modes like "'r:gz'",
            #  but then would seek to offset in the file work?
            fo = compr[compression](fo)

        self._fo_ref = fo
        self.fo = fo  # the whole instance is a context
        self.tar = tarfile.TarFile(fileobj=self.fo)
        self.dir_cache = None

        self.index_store = index_store
        self.index = None
        self._index()

    def _index(self):
        """Build ``self.index`` by iterating over every member of the archive.

        The result maps each member name (trailing ``/`` removed) to a tuple
        ``(info, offset_data)``, where ``info`` is the member's
        ``TarInfo.get_info()`` dict with ``"type"`` translated through
        ``typemap`` and ``offset_data`` is the member's ``offset_data``.
        """
        # TODO: load and set saved index, if exists
        out = {}
        for ti in self.tar:
            info = ti.get_info()
            info["type"] = typemap.get(info["type"], "file")
            # Reuse the dict fetched above rather than calling get_info()
            # a second time; the "name" entry is untouched by the line above.
            name = info["name"].rstrip("/")
            out[name] = (info, ti.offset_data)

        self.index = out
        # TODO: save index to self.index_store here, if set

    def _get_dirs(self):
        """Populate ``self.dir_cache`` once; later calls return immediately.

        The cache first receives a zero-size ``"directory"`` entry for every
        name produced by ``self._all_dirnames`` from the archive's member
        names, then one entry per archive member, which overwrites any
        placeholder stored under the same name.
        """
        if self.dir_cache is not None:
            return

        # This enables ls to get directories as children as well as files
        self.dir_cache = {
            dirname: {"name": dirname, "size": 0, "type": "directory"}
            for dirname in self._all_dirnames(self.tar.getnames())
        }
        for member in self.tar.getmembers():
            info = member.get_info()
            info["name"] = info["name"].rstrip("/")
            info["type"] = typemap.get(info["type"], "file")
            self.dir_cache[info["name"]] = info

    def _open(self, path, mode="rb", **kwargs):
        """Return a file object for reading the archive member ``path``.

        Parameters
        ----------
        path : str
            Member name, as used for the keys of ``self.index``.
        mode : str
            Must be ``"rb"``.
        **kwargs
            Accepted but not used here.

        Raises
        ------
        ValueError
            If ``mode`` is not ``"rb"``, or the member is not indexed with
            type ``"file"``.
        KeyError
            If ``path`` is not present in ``self.index``.
        """
        if mode != "rb":
            raise ValueError("Read-only filesystem implementation")
        # Only the info dict is needed; the stored data offset is not used
        # because the member is read through tarfile itself.
        details, _ = self.index[path]
        if details["type"] != "file":
            raise ValueError("Can only handle regular files")
        return self.tar.extractfile(path)