Spaces:
Paused
Paused
import io | |
import json | |
import warnings | |
from .core import url_to_fs | |
from .utils import merge_offset_ranges | |
# Parquet-Specific Utilities for fsspec | |
# | |
# Most of the functions defined in this module are NOT | |
# intended for public consumption. The only exception | |
# to this is `open_parquet_file`, which should be used
# in place of `fs.open()` to open parquet-formatted files
# on remote file systems. | |
def open_parquet_file(
    path,
    mode="rb",
    fs=None,
    metadata=None,
    columns=None,
    row_groups=None,
    storage_options=None,
    strict=False,
    engine="auto",
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    **kwargs,
):
    """
    Return a file-like object for a single Parquet file.

    The specified parquet `engine` will be used to parse the
    footer metadata, and determine the required byte ranges
    from the file. The target path will then be opened with
    the "parts" (`KnownPartsOfAFile`) caching strategy.

    Note that this method is intended for usage with remote
    file systems, and is unlikely to improve parquet-read
    performance on local file systems.

    Parameters
    ----------
    path: str
        Target file path.
    mode: str, optional
        Mode option to be passed through to `fs.open`. Default is "rb".
    fs: AbstractFileSystem, optional
        Filesystem object to use for opening the file. If nothing is
        specified, an `AbstractFileSystem` object will be inferred.
    metadata: Any, optional
        Parquet metadata object. Object type must be supported
        by the backend parquet engine. For now, only the "fastparquet"
        engine supports an explicit `ParquetFile` metadata object.
        If a metadata object is supplied, the remote footer metadata
        will not need to be transferred into local memory.
    columns: list, optional
        List of all column names that may be read from the file.
        An empty list is not supported by the "parts" strategy; in that
        case the file is opened with a plain `fs.open` call.
    row_groups : list, optional
        List of all row-groups that may be read from the file. This
        may be a list of row-group indices (integers), or it may be
        a list of `RowGroup` metadata objects (if the "fastparquet"
        engine is used).
    storage_options : dict, optional
        Used to generate an `AbstractFileSystem` object if `fs` was
        not specified.
    strict : bool, optional
        Whether the resulting `KnownPartsOfAFile` cache should
        fetch reads that go beyond a known byte-range boundary.
        If `False` (the default), any read that ends outside a
        known part will be zero padded. Note that using
        `strict=True` may be useful for debugging.
    engine : str, default "auto"
        Parquet engine to use for metadata parsing. Allowed options
        include "fastparquet", "pyarrow", and "auto". The specified
        engine must be installed in the current environment. If
        "auto" is specified, and both engines are installed,
        "fastparquet" will take precedence over "pyarrow".
    max_gap : int, optional
        Neighboring byte ranges will only be merged when their
        inter-range gap is <= `max_gap`. Default is 64KB.
    max_block : int, optional
        Neighboring byte ranges will only be merged when the size of
        the aggregated range is <= `max_block`. Default is 256MB.
    footer_sample_size : int, optional
        Number of bytes to read from the end of the path to look
        for the footer metadata. If the sampled bytes do not contain
        the footer, a second read request will be required, and
        performance will suffer. Default is 1MB.
    **kwargs :
        Optional key-word arguments to pass to `fs.open`. A
        `cache_options` entry, if present, is merged with the
        `data` and `strict` options generated here.

    Returns
    -------
    The object returned by `fs.open` for the target path.
    """
    # Make sure we have an `AbstractFileSystem` object
    # to work with
    if fs is None:
        fs = url_to_fs(path, **(storage_options or {}))[0]

    # For now, `columns == []` not supported. Just use
    # the regular `open` command with `path` input. The caller's
    # keyword arguments are still forwarded so that they are not
    # silently dropped on this code path.
    if columns is not None and len(columns) == 0:
        return fs.open(path, mode=mode, **kwargs)

    # Set the engine
    engine = _set_engine(engine)

    # Fetch the known byte ranges needed to read
    # `columns` and/or `row_groups`
    data = _get_parquet_byte_ranges(
        [path],
        fs,
        metadata=metadata,
        columns=columns,
        row_groups=row_groups,
        engine=engine,
        max_gap=max_gap,
        max_block=max_block,
        footer_sample_size=footer_sample_size,
    )

    # Extract file name from `data` (fall back to `path`
    # when no byte ranges were collected at all)
    fn = next(iter(data)) if data else path

    # Call self.open with "parts" caching. The user's own
    # `cache_options` are copied so the caller's dict is not mutated.
    options = kwargs.pop("cache_options", {}).copy()
    return fs.open(
        fn,
        mode=mode,
        cache_type="parts",
        cache_options={
            **options,
            "data": data.get(fn, {}),
            "strict": strict,
        },
        **kwargs,
    )
def _get_parquet_byte_ranges(
    paths,
    fs,
    metadata=None,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
    footer_sample_size=1_000_000,
    engine="auto",
):
    """Collect the byte ranges needed for a column/row-group selection.

    Returns a dictionary keyed by path. Each value maps
    ``(start, stop)`` byte offsets to the bytes fetched for that range,
    and is intended for use as the `data` argument of the
    `KnownPartsOfAFile` caching strategy for that single path.
    """
    # Resolve an engine name into an engine object
    if isinstance(engine, str):
        engine = _set_engine(engine)

    # With an explicit metadata object there is no need to
    # transfer or parse the remote footers at all
    if metadata is not None:
        return _get_parquet_byte_ranges_from_metadata(
            metadata,
            fs,
            engine,
            columns=columns,
            row_groups=row_groups,
            max_gap=max_gap,
            max_block=max_block,
        )

    # Sizes of all target files
    file_sizes = fs.sizes(paths)

    # Accumulators for the output and the ranges to transfer
    result = {}
    data_paths, data_starts, data_ends = [], [], []

    if columns is None and row_groups is None:
        # No column or row-group selection: skip footer sampling and
        # transfer each file in full, in chunks of `max_block` bytes.
        for i, path in enumerate(paths):
            result[path] = {}
            size = file_sizes[i]
            for offset in range(0, size, max_block):
                data_paths.append(path)
                data_starts.append(offset)
                data_ends.append(min(offset + max_block, size))
        _transfer_ranges(fs, result, data_paths, data_starts, data_ends)
        # The header "magic" is already part of the transferred bytes
        return result

    # A selection was requested: sample the tail of each file
    # (`footer_sample_size` bytes, or the whole file if smaller).
    footer_ends = [file_sizes[i] for i, _ in enumerate(paths)]
    footer_starts = [max(0, end - footer_sample_size) for end in footer_ends]
    footer_samples = fs.cat_ranges(paths, footer_starts, footer_ends)

    # The footer length is read from the 4 bytes preceding the final
    # 4 bytes of each sample. Record any footer that starts before
    # the sampled region.
    missing_footer_starts = footer_starts.copy()
    large_footer = 0
    for i, _ in enumerate(paths):
        footer_size = int.from_bytes(footer_samples[i][-8:-4], "little")
        needed = footer_size + 8
        real_footer_start = file_sizes[i] - needed
        if real_footer_start < footer_starts[i]:
            missing_footer_starts[i] = real_footer_start
            large_footer = max(large_footer, needed)

    if large_footer:
        warnings.warn(
            f"Not enough data was used to sample the parquet footer. "
            f"Try setting footer_sample_size >= {large_footer}."
        )
        # Fetch the missing leading bytes and prepend them
        extra_blocks = fs.cat_ranges(paths, missing_footer_starts, footer_starts)
        for i, block in enumerate(extra_blocks):
            footer_samples[i] = block + footer_samples[i]
            footer_starts[i] = missing_footer_starts[i]

    # Work out the data ranges required for each path
    for i, path in enumerate(paths):
        if file_sizes[i] < max_block:
            # Small file: take everything before the footer sample in
            # one range (nothing to add if the sample is the whole file)
            if footer_starts[i] > 0:
                data_paths.append(path)
                data_starts.append(0)
                data_ends.append(footer_starts[i])
            continue

        # Let the engine pick the column-chunk ranges
        path_data_starts, path_data_ends = engine._parquet_byte_ranges(
            columns,
            row_groups=row_groups,
            footer=footer_samples[i],
            footer_start=footer_starts[i],
        )
        data_paths += [path] * len(path_data_starts)
        data_starts += path_data_starts
        data_ends += path_data_ends

    # Merge adjacent offset ranges
    data_paths, data_starts, data_ends = merge_offset_ranges(
        data_paths,
        data_starts,
        data_ends,
        max_gap=max_gap,
        max_block=max_block,
        sort=False,  # Should already be sorted
    )

    # Seed the output with the footer samples already in memory
    for i, path in enumerate(paths):
        footer_key = (footer_starts[i], footer_ends[i])
        result[path] = {footer_key: footer_samples[i]}

    # Transfer the data byte-ranges into local memory
    _transfer_ranges(fs, result, data_paths, data_starts, data_ends)

    # Make sure every path has a b"PAR1" header entry
    _add_header_magic(result)
    return result
def _get_parquet_byte_ranges_from_metadata(
    metadata,
    fs,
    engine,
    columns=None,
    row_groups=None,
    max_gap=64_000,
    max_block=256_000_000,
):
    """Variant of `_get_parquet_byte_ranges` driven by a metadata object.

    Used when an engine-specific `metadata` object is supplied, so the
    required byte ranges can be computed without first transferring the
    remote footer metadata. Returns a dictionary mapping each path to a
    ``{(start, stop): bytes}`` dictionary.
    """
    # Ask the engine for the (path, start, end) triples
    paths, starts, ends = engine._parquet_byte_ranges(
        columns,
        row_groups=row_groups,
        metadata=metadata,
    )

    # Coalesce neighboring ranges
    paths, starts, ends = merge_offset_ranges(
        paths,
        starts,
        ends,
        max_gap=max_gap,
        max_block=max_block,
        sort=False,  # Should be sorted
    )

    # One (initially empty) entry per distinct path, then pull the
    # byte ranges into local memory
    result = {fn: {} for fn in set(paths)}
    _transfer_ranges(fs, result, paths, starts, ends)

    # Add b"PAR1" to header
    _add_header_magic(result)
    return result
def _transfer_ranges(fs, blocks, paths, starts, ends): | |
# Use cat_ranges to gather the data byte_ranges | |
ranges = (paths, starts, ends) | |
for path, start, stop, data in zip(*ranges, fs.cat_ranges(*ranges)): | |
blocks[path][(start, stop)] = data | |
def _add_header_magic(data): | |
# Add b"PAR1" to file headers | |
for path in list(data.keys()): | |
add_magic = True | |
for k in data[path].keys(): | |
if k[0] == 0 and k[1] >= 4: | |
add_magic = False | |
break | |
if add_magic: | |
data[path][(0, 4)] = b"PAR1" | |
def _set_engine(engine_str): | |
# Define a list of parquet engines to try | |
if engine_str == "auto": | |
try_engines = ("fastparquet", "pyarrow") | |
elif not isinstance(engine_str, str): | |
raise ValueError( | |
"Failed to set parquet engine! " | |
"Please pass 'fastparquet', 'pyarrow', or 'auto'" | |
) | |
elif engine_str not in ("fastparquet", "pyarrow"): | |
raise ValueError(f"{engine_str} engine not supported by `fsspec.parquet`") | |
else: | |
try_engines = [engine_str] | |
# Try importing the engines in `try_engines`, | |
# and choose the first one that succeeds | |
for engine in try_engines: | |
try: | |
if engine == "fastparquet": | |
return FastparquetEngine() | |
elif engine == "pyarrow": | |
return PyarrowEngine() | |
except ImportError: | |
pass | |
# Raise an error if a supported parquet engine | |
# was not found | |
raise ImportError( | |
f"The following parquet engines are not installed " | |
f"in your python environment: {try_engines}." | |
f"Please install 'fastparquert' or 'pyarrow' to " | |
f"utilize the `fsspec.parquet` module." | |
) | |
class FastparquetEngine:
    """Byte-range helper backed by the `fastparquet` library."""

    # The purpose of the FastparquetEngine class is
    # to check if fastparquet can be imported (on initialization)
    # and to define a `_parquet_byte_ranges` method. In the
    # future, this class may also be used to define other
    # methods/logic that are specific to fastparquet.

    def __init__(self):
        # Raises ImportError when fastparquet is not installed
        import fastparquet as fp

        self.fp = fp

    def _row_group_filename(self, row_group, pf):
        """Return the file path that `pf` reports for `row_group`."""
        return pf.row_group_filename(row_group)

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        """Compute the byte ranges of the selected column chunks.

        Parameters
        ----------
        columns : list or None
            Column names to select, or None to select every column.
        row_groups : list, optional
            Row-group indices (integers) or row-group metadata objects.
            None selects every row-group of the metadata.
        metadata : optional
            Existing metadata object to use instead of parsing `footer`.
        footer : bytes, optional
            Footer bytes to parse when `metadata` is not given.
        footer_start : int, optional
            Offset at which the footer sample starts. Ranges starting at
            or beyond it are skipped and range ends are clipped to it.

        Returns
        -------
        tuple
            ``(data_paths, data_starts, data_ends)`` when `metadata`
            was supplied, otherwise ``(data_starts, data_ends)``.
        """
        # Initialize offset ranges and define ParquetFile metadata
        pf = metadata
        data_paths, data_starts, data_ends = [], [], []
        if pf is None:
            pf = self.fp.ParquetFile(io.BytesIO(footer))

        # Convert columns to a set and add any index columns
        # specified in the pandas metadata (just in case)
        column_set = None if columns is None else set(columns)
        if column_set is not None and hasattr(pf, "pandas_metadata"):
            md_index = [
                ind
                for ind in pf.pandas_metadata.get("index_columns", [])
                # Ignore RangeIndex information
                if not isinstance(ind, dict)
            ]
            column_set |= set(md_index)

        # Check if row_groups is a list of integers
        # or a list of row-group metadata
        if row_groups and not isinstance(row_groups[0], int):
            # Input row_groups contains row-group metadata
            row_group_indices = None
        else:
            # Input row_groups contains row-group indices
            row_group_indices = row_groups
            row_groups = pf.row_groups

        # Loop through column chunks to add required byte ranges
        for r, row_group in enumerate(row_groups):
            # Skip this row-group if we are targeting
            # specific row-groups
            if row_group_indices is not None and r not in row_group_indices:
                continue

            # Find the target parquet-file path for `row_group`
            fn = self._row_group_filename(row_group, pf)

            for column in row_group.columns:
                name = column.meta_data.path_in_schema[0]
                # Skip this column if we are targeting
                # specific columns
                if column_set is not None and name not in column_set:
                    continue

                # A column chunk starts at its dictionary page when
                # that offset is set, otherwise at its first data page
                file_offset0 = column.meta_data.dictionary_page_offset
                if file_offset0 is None:
                    file_offset0 = column.meta_data.data_page_offset
                num_bytes = column.meta_data.total_compressed_size
                if footer_start is None or file_offset0 < footer_start:
                    data_paths.append(fn)
                    data_starts.append(file_offset0)
                    data_ends.append(
                        min(
                            file_offset0 + num_bytes,
                            footer_start or (file_offset0 + num_bytes),
                        )
                    )

        # Test identity rather than truthiness: a supplied metadata
        # object that evaluates as falsy (e.g. one defining `__len__`
        # that holds no row-groups) must still yield the 3-tuple form.
        if metadata is not None:
            # The metadata in this call may map to multiple
            # file paths. Need to include `data_paths`
            return data_paths, data_starts, data_ends
        return data_starts, data_ends
class PyarrowEngine:
    """Byte-range helper backed by `pyarrow.parquet`."""

    # Constructing this class verifies that pyarrow can be imported,
    # and the class supplies a `_parquet_byte_ranges` method. Other
    # pyarrow-specific methods/logic may be added here in the future.

    def __init__(self):
        # Raises ImportError when pyarrow is not installed
        import pyarrow.parquet as pq

        self.pq = pq

    def _row_group_filename(self, row_group, metadata):
        # Not available for this engine
        raise NotImplementedError

    def _parquet_byte_ranges(
        self,
        columns,
        row_groups=None,
        metadata=None,
        footer=None,
        footer_start=None,
    ):
        """Return ``(starts, ends)`` for the selected column chunks.

        The footer bytes are parsed with `ParquetFile`. Only chunks that
        start before `footer_start` are reported, and each range end is
        clipped to `footer_start`. A `metadata` argument is rejected
        with `ValueError`.
        """
        if metadata is not None:
            raise ValueError("metadata input not supported for PyarrowEngine")

        starts, ends = [], []
        file_md = self.pq.ParquetFile(io.BytesIO(footer)).metadata

        # Build the set of wanted column names, extended with any index
        # columns listed in the pandas metadata (just in case)
        wanted = None
        if columns is not None:
            wanted = set(columns)
            schema_md = file_md.schema.to_arrow_schema().metadata
            if schema_md is not None and b"pandas" in schema_md:
                pandas_md = json.loads(schema_md[b"pandas"].decode("utf8"))
                wanted.update(
                    ind
                    for ind in pandas_md.get("index_columns", [])
                    # Ignore RangeIndex information
                    if not isinstance(ind, dict)
                )

        # Walk every column chunk of every targeted row-group
        for rg_index in range(file_md.num_row_groups):
            if row_groups is not None and rg_index not in row_groups:
                # Not one of the requested row-groups
                continue
            rg_md = file_md.row_group(rg_index)
            for col_index in range(rg_md.num_columns):
                chunk = rg_md.column(col_index)
                full_name = chunk.path_in_schema
                # The part before the first "." is also matched
                # against the requested column names
                top_level = full_name.split(".")[0]
                if wanted is not None and not (
                    full_name in wanted or top_level in wanted
                ):
                    continue

                # Start at the dictionary page when its offset is set,
                # otherwise at the first data page
                offset = chunk.dictionary_page_offset
                if offset is None:
                    offset = chunk.data_page_offset
                size = chunk.total_compressed_size
                if offset < footer_start:
                    starts.append(offset)
                    ends.append(min(offset + size, footer_start))

        return starts, ends