|
from __future__ import annotations |
|
|
|
import os |
|
|
|
import pytest |
|
|
|
from pandas.compat._optional import VERSIONS |
|
|
|
from pandas import ( |
|
read_csv, |
|
read_table, |
|
) |
|
import pandas._testing as tm |
|
|
|
|
|
class BaseParser:
    """
    Wrapper around ``read_csv`` / ``read_table`` that pins parser options.

    Every read helper merges this parser's ``engine`` and ``low_memory``
    class attributes into the caller's keyword arguments before delegating
    to the corresponding pandas reader. Subclasses override the class
    attributes to describe a concrete parser configuration.
    """

    # Value forced into the ``engine`` keyword of every read call.
    engine: str | None = None
    # Value forced into the ``low_memory`` keyword of every read call.
    low_memory = True
    # ``float_precision`` values to combine with this parser when building
    # (parser, precision) fixture parameters; empty on the base class.
    float_precision_choices: list[str | None] = []

    def update_kwargs(self, kwargs):
        """
        Return a copy of ``kwargs`` with this parser's options applied.

        The mapping passed in is not modified. Any ``engine`` or
        ``low_memory`` entry already present is overwritten with the
        parser's own values.
        """
        kwargs = kwargs.copy()
        kwargs.update({"engine": self.engine, "low_memory": self.low_memory})

        return kwargs

    def read_csv(self, *args, **kwargs):
        """
        Call ``read_csv`` with this parser's options merged into ``kwargs``.
        """
        kwargs = self.update_kwargs(kwargs)
        return read_csv(*args, **kwargs)

    def read_csv_check_warnings(
        self,
        warn_type: type[Warning],
        warn_msg: str,
        *args,
        raise_on_extra_warnings=True,
        check_stacklevel: bool = True,
        **kwargs,
    ):
        """
        Call ``read_csv`` inside ``tm.assert_produces_warning``.

        Parameters
        ----------
        warn_type : type[Warning]
            Warning class handed to ``tm.assert_produces_warning``.
        warn_msg : str
            Passed as ``match`` to ``tm.assert_produces_warning``.
        *args
            Positional arguments forwarded to ``read_csv``.
        raise_on_extra_warnings : bool, default True
            Forwarded to ``tm.assert_produces_warning``.
        check_stacklevel : bool, default True
            Forwarded to ``tm.assert_produces_warning``.
        **kwargs
            Keyword arguments forwarded to ``read_csv`` after the parser's
            options have been merged in.

        Returns
        -------
        Whatever ``read_csv`` returns.
        """
        # The parser options are merged outside the warning context, so only
        # the ``read_csv`` call itself runs under the warning assertion.
        kwargs = self.update_kwargs(kwargs)
        with tm.assert_produces_warning(
            warn_type,
            match=warn_msg,
            raise_on_extra_warnings=raise_on_extra_warnings,
            check_stacklevel=check_stacklevel,
        ):
            return read_csv(*args, **kwargs)

    def read_table(self, *args, **kwargs):
        """
        Call ``read_table`` with this parser's options merged into ``kwargs``.
        """
        kwargs = self.update_kwargs(kwargs)
        return read_table(*args, **kwargs)

    def read_table_check_warnings(
        self,
        warn_type: type[Warning],
        warn_msg: str,
        *args,
        raise_on_extra_warnings=True,
        check_stacklevel: bool = True,
        **kwargs,
    ):
        """
        Call ``read_table`` inside ``tm.assert_produces_warning``.

        Mirrors ``read_csv_check_warnings``, including the keyword-only
        ``check_stacklevel`` option.

        Parameters
        ----------
        warn_type : type[Warning]
            Warning class handed to ``tm.assert_produces_warning``.
        warn_msg : str
            Passed as ``match`` to ``tm.assert_produces_warning``.
        *args
            Positional arguments forwarded to ``read_table``.
        raise_on_extra_warnings : bool, default True
            Forwarded to ``tm.assert_produces_warning``.
        check_stacklevel : bool, default True
            Forwarded to ``tm.assert_produces_warning``.
        **kwargs
            Keyword arguments forwarded to ``read_table`` after the parser's
            options have been merged in.

        Returns
        -------
        Whatever ``read_table`` returns.
        """
        # The parser options are merged outside the warning context, so only
        # the ``read_table`` call itself runs under the warning assertion.
        kwargs = self.update_kwargs(kwargs)
        with tm.assert_produces_warning(
            warn_type,
            match=warn_msg,
            raise_on_extra_warnings=raise_on_extra_warnings,
            check_stacklevel=check_stacklevel,
        ):
            return read_table(*args, **kwargs)
|
|
|
|
|
class CParser(BaseParser):
    """
    Parser configuration that selects the ``"c"`` engine.
    """

    float_precision_choices = [None, "high", "round_trip"]
    engine = "c"
|
|
|
|
|
class CParserHighMemory(CParser):
    """
    ``CParser`` variant with ``low_memory`` switched off.
    """

    low_memory = False
|
|
|
|
|
class CParserLowMemory(CParser):
    """
    ``CParser`` variant with ``low_memory`` explicitly switched on.
    """

    low_memory = True
|
|
|
|
|
class PythonParser(BaseParser):
    """
    Parser configuration that selects the ``"python"`` engine.
    """

    float_precision_choices = [None]
    engine = "python"
|
|
|
|
|
class PyArrowParser(BaseParser):
    """
    Parser configuration that selects the ``"pyarrow"`` engine.
    """

    float_precision_choices = [None]
    engine = "pyarrow"
|
|
|
|
|
@pytest.fixture
def csv_dir_path(datapath):
    """
    Directory holding the data files used by the parser tests.

    Resolved through the ``datapath`` fixture from the path components
    ``io/parser/data``.
    """
    path_parts = ("io", "parser", "data")
    return datapath(*path_parts)
|
|
|
|
|
@pytest.fixture
def csv1(datapath):
    """
    Path to the ``test1.csv`` data file used by the parser tests.
    """
    csv_directory = datapath("io", "data", "csv")
    return os.path.join(csv_directory, "test1.csv")
|
|
|
|
|
# Module-private aliases for the parser classes used to parametrize fixtures.
_cParserHighMemory = CParserHighMemory
_cParserLowMemory = CParserLowMemory
_pythonParser = PythonParser
_pyarrowParser = PyArrowParser

# Per-engine parameter lists, each followed by its matching test ids.
_c_parsers_only = [_cParserHighMemory, _cParserLowMemory]
_c_parser_ids = ["c_high", "c_low"]

_py_parsers_only = [_pythonParser]
_py_parser_ids = ["python"]

# The pyarrow parser is wrapped in ``pytest.param`` so that it carries the
# ``single_cpu`` mark.
_pyarrow_parsers_only = [pytest.param(_pyarrowParser, marks=pytest.mark.single_cpu)]
_pyarrow_parsers_ids = ["pyarrow"]

# Combined lists; parsers and ids are concatenated in the same engine order
# (C, Python, pyarrow) so that they line up position by position.
_all_parsers = _c_parsers_only + _py_parsers_only + _pyarrow_parsers_only
_all_parser_ids = _c_parser_ids + _py_parser_ids + _pyarrow_parsers_ids
|
|
|
|
|
@pytest.fixture(params=_all_parsers, ids=_all_parser_ids)
def all_parsers(request):
    """
    Fixture providing an instance of each CSV parser.

    For the pyarrow engine the test is skipped when pyarrow cannot be
    imported at the minimum version recorded in ``VERSIONS``; otherwise
    pyarrow's CPU count is set to 1 before the parser is returned.
    """
    parser = request.param()
    if parser.engine != "pyarrow":
        return parser

    # ``importorskip`` hands back the imported module, so it can be used
    # directly instead of importing pyarrow a second time.
    pa = pytest.importorskip("pyarrow", VERSIONS["pyarrow"])
    # NOTE(review): restricting pyarrow to one CPU is presumably meant to
    # keep test runs stable -- confirm.
    pa.set_cpu_count(1)
    return parser
|
|
|
|
|
@pytest.fixture(params=_c_parsers_only, ids=_c_parser_ids)
def c_parser_only(request):
    """
    Fixture providing an instance of each parser that uses the C engine.
    """
    parser_cls = request.param
    return parser_cls()
|
|
|
|
|
@pytest.fixture(params=_py_parsers_only, ids=_py_parser_ids)
def python_parser_only(request):
    """
    Fixture providing an instance of each parser that uses the Python engine.
    """
    parser_cls = request.param
    return parser_cls()
|
|
|
|
|
@pytest.fixture(params=_pyarrow_parsers_only, ids=_pyarrow_parsers_ids)
def pyarrow_parser_only(request):
    """
    Fixture providing an instance of each parser that uses the pyarrow engine.
    """
    parser_cls = request.param
    return parser_cls()
|
|
|
|
|
def _get_all_parser_float_precision_combinations():
    """
    Build every allowed (parser, float precision) pairing with its id.

    Returns
    -------
    dict
        ``{"params": [...], "ids": [...]}`` where each param is a
        ``pytest.param`` wrapping ``(parser_instance, precision)`` and each
        id has the form ``"<parser id>-<precision>"``.
    """
    combos = {"params": [], "ids": []}
    for entry, parser_id in zip(_all_parsers, _all_parser_ids):
        # Entries built with ``pytest.param`` expose the wrapped parser class
        # through ``values``; plain entries are the class itself.
        parser_cls = entry.values[0] if hasattr(entry, "values") else entry
        # The mark depends only on the parser, so it is computed once per
        # parser; pyarrow combinations carry ``single_cpu``.
        marks = pytest.mark.single_cpu if parser_cls.engine == "pyarrow" else ()
        for precision in parser_cls.float_precision_choices:
            combos["params"].append(
                pytest.param((parser_cls(), precision), marks=marks)
            )
            combos["ids"].append(f"{parser_id}-{precision}")

    return combos
|
|
|
|
|
# Evaluate the combinations once so that ``params`` and ``ids`` are taken from
# the same result instead of two separate calls.
_all_parser_float_precision_combinations = (
    _get_all_parser_float_precision_combinations()
)


@pytest.fixture(
    params=_all_parser_float_precision_combinations["params"],
    ids=_all_parser_float_precision_combinations["ids"],
)
def all_parsers_all_precisions(request):
    """
    Fixture for all allowable combinations of parser and float precision.

    Each value is the ``(parser, precision)`` tuple produced by
    ``_get_all_parser_float_precision_combinations``.
    """
    return request.param
|
|
|
|
|
# Integer suffixes of the UTF encodings covered by the ``utf_value`` fixture.
_utf_values = [8, 16, 32]

# Building blocks for the encoding-name templates.
_encoding_seps = ["", "-", "_"]
_encoding_prefixes = ["utf", "UTF"]

# Templates such as "utf-{0}": the separator varies in the outer loop and the
# prefix in the inner one. The ``{0}`` placeholder is left for the caller to
# fill (presumably with one of ``_utf_values`` -- verify against the tests).
_encoding_fmts = [
    prefix + sep + "{0}" for sep in _encoding_seps for prefix in _encoding_prefixes
]
|
|
|
|
|
@pytest.fixture(params=_utf_values)
def utf_value(request):
    """
    Fixture providing each integer in ``_utf_values`` for a UTF encoding.
    """
    value = request.param
    return value
|
|
|
|
|
@pytest.fixture(params=_encoding_fmts)
def encoding_fmt(request):
    """
    Fixture providing each UTF encoding format string in ``_encoding_fmts``.
    """
    fmt = request.param
    return fmt
|
|
|
|
|
@pytest.fixture(
    params=[
        # Inputs whose expected result is a float.
        ("-1,0", -1.0),
        ("-1,2e0", -1.2),
        ("-1e0", -1.0),
        ("+1e0", 1.0),
        ("+1e+0", 1.0),
        ("+1e-1", 0.1),
        ("+,1e1", 1.0),
        ("+1,e0", 1.0),
        ("-,1e1", -1.0),
        ("-1,e0", -1.0),
        ("0,1", 0.1),
        ("1,", 1.0),
        (",1", 0.1),
        ("-,1", -0.1),
        ("1_,", 1.0),
        ("1_234,56", 1234.56),
        ("1_234,56e0", 1234.56),
        # Inputs whose expected result is the unchanged input string.
        *(
            (text, text)
            for text in (
                "_",
                "-_",
                "-_1",
                "-_1e0",
                "_1",
                "_1,",
                "_1,_",
                "_1e0",
                "1,2e_1",
                "1,2e1_0",
                "1,_2",
                ",1__2",
                ",1e",
                "-,1e",
                "1_000,000_000",
                "1,e1_2",
                "e11,2",
                "1e11,2",
                "1,2,2",
                "1,2_1",
                "1,2e-10e1",
                "--1,2",
                "1a_2,1",
            )
        ),
        # Upper-case exponent marker; expected result is a float again.
        ("1,2E-1", 0.12),
        ("1,2E1", 12.0),
    ]
)
def numeric_decimal(request):
    """
    Fixture of ``(value to read, expected result)`` pairs for numeric parsing.

    The first entry of each pair is the text to read and the second is the
    expected result: a float for the formats that should be recognized, or
    the original string for those that should not.
    """
    return request.param
|
|
|
|
|
@pytest.fixture
def pyarrow_xfail(request):
    """
    Fixture that marks a test as xfail if the engine is pyarrow.

    Use if failure is due to unsupported keywords or inconsistent results.
    The parser is taken from ``all_parsers`` or, failing that, from
    ``all_parsers_all_precisions``; tests using neither are left untouched.
    """
    requested = request.fixturenames
    if "all_parsers" in requested:
        parser = request.getfixturevalue("all_parsers")
    elif "all_parsers_all_precisions" in requested:
        # This fixture yields (parser, precision); only the parser matters.
        parser = request.getfixturevalue("all_parsers_all_precisions")[0]
    else:
        return

    if parser.engine != "pyarrow":
        return

    request.applymarker(pytest.mark.xfail(reason="pyarrow doesn't support this."))
|
|
|
|
|
@pytest.fixture
def pyarrow_skip(request):
    """
    Fixture that skips a test if the engine is pyarrow.

    Use if failure is due to a parsing failure from pyarrow.csv.read_csv.
    The parser is taken from ``all_parsers`` or, failing that, from
    ``all_parsers_all_precisions``; tests using neither are left untouched.
    """
    requested = request.fixturenames
    if "all_parsers" in requested:
        parser = request.getfixturevalue("all_parsers")
    elif "all_parsers_all_precisions" in requested:
        # This fixture yields (parser, precision); only the parser matters.
        parser = request.getfixturevalue("all_parsers_all_precisions")[0]
    else:
        return

    if parser.engine != "pyarrow":
        return

    pytest.skip(reason="https://github.com/apache/arrow/issues/38676")
|
|