Spaces:

arxify
/

RVC-beta-v2-0618

Running

App Files Files Community

RVC-beta-v2-0618 / runtime /Lib /site-packages /altair /utils /tests /test_utils.py

arxify

Upload folder using huggingface_hub

ba2f5d6 about 2 years ago

raw

history blame contribute delete

6.55 kB

	import pytest
	import warnings
	import json

	import numpy as np
	import pandas as pd

	from .. import infer_vegalite_type, sanitize_dataframe


	def test_infer_vegalite_type():
	    """Check the type string infer_vegalite_type returns for sample inputs.

	    Numeric arrays (including an integer series with one value set to None)
	    are expected to map to "quantitative", booleans and strings to
	    "nominal", and date / timedelta ranges to "temporal".  An empty list is
	    expected to map to "nominal"; when pytest offers ``warns`` the call must
	    additionally emit a UserWarning.
	    """

	    def expect_type(values, expected):
	        assert infer_vegalite_type(values) == expected

	    array_like_cases = (
	        (np.arange(5, dtype=float), "quantitative"),
	        (np.arange(5, dtype=int), "quantitative"),
	        (np.zeros(5, dtype=bool), "nominal"),
	        (pd.date_range("2012", "2013"), "temporal"),
	        (pd.timedelta_range(365, periods=12), "temporal"),
	    )
	    for values, expected in array_like_cases:
	        expect_type(values, expected)

	    # An integer series whose first entry has been replaced by a null.
	    with_null = pd.Series(np.random.randint(10, size=10))
	    with_null[0] = None
	    expect_type(with_null, "quantitative")
	    expect_type(["a", "b", "c"], "nominal")

	    if not hasattr(pytest, "warns"):
	        # Older pytest: just silence whatever warning the empty input raises.
	        with warnings.catch_warnings():
	            warnings.filterwarnings("ignore")
	            expect_type([], "nominal")
	    else:  # pytest.warns was added in pytest 2.8
	        with pytest.warns(UserWarning):
	            expect_type([], "nominal")


	def test_sanitize_dataframe():
	    """Round-trip a mixed-dtype dataframe through sanitize_dataframe and JSON.

	    The sanitized frame is serialized with ``json.dumps``, read back with
	    ``pd.read_json``, cast back to the original dtypes and then compared
	    with the original frame.
	    """
	    # create a dataframe with various types
	    columns = {
	        "s": list("abcde"),
	        "f": np.arange(5, dtype=float),
	        "i": np.arange(5, dtype=int),
	        "b": np.array([True, False, True, True, False]),
	        "d": pd.date_range("2012-01-01", periods=5, freq="H"),
	        "c": pd.Series(list("ababc"), dtype="category"),
	        "c2": pd.Series([1, "A", 2.5, "B", None], dtype="category"),
	        "o": pd.Series([np.array(i) for i in range(5)]),
	        "p": pd.date_range("2012-01-01", periods=5, freq="H").tz_localize("UTC"),
	    }
	    original = pd.DataFrame(columns)

	    # add some nulls: overwrite the first row of several columns
	    first_row_nulls = (
	        ("s", None),
	        ("f", np.nan),
	        ("d", pd.NaT),
	        ("o", np.array(np.nan)),
	    )
	    for name, null in first_row_nulls:
	        original.iloc[0, original.columns.get_loc(name)] = null

	    # JSON serialize. This will fail on non-sanitized dataframes
	    preview = ["s", "c2"]
	    print(original[preview])
	    cleaned = sanitize_dataframe(original)
	    print(cleaned[preview])
	    print(cleaned[preview].to_dict())
	    payload = json.dumps(cleaned.to_dict(orient="records"))
	    print(payload)

	    # Re-construct the dataframe, with columns re-ordered to match the original
	    restored = pd.read_json(payload)[original.columns]

	    # Re-apply original types
	    for name in original:
	        dtype = original[name].dtype
	        if not str(dtype).startswith("datetime"):
	            restored[name] = restored[name].astype(dtype)
	            continue
	        # astype(datetime) introduces time-zone issues:
	        # to_datetime() does not.
	        is_tz_aware = isinstance(dtype, pd.core.dtypes.dtypes.DatetimeTZDtype)
	        restored[name] = pd.to_datetime(restored[name], utc=is_tz_aware)

	    # pandas doesn't properly recognize np.array(np.nan), so change it here
	    original.iloc[0, original.columns.get_loc("o")] = np.nan
	    assert original.equals(restored)


	def test_sanitize_dataframe_colnames():
	    """Check how sanitize_dataframe handles non-string column names.

	    A frame with default integer column labels is expected to come back with
	    string labels, while a frame mixing an integer label with string labels
	    is expected to raise ValueError naming the offending column.
	    """
	    df = pd.DataFrame(np.arange(12).reshape(4, 3))

	    # Test that RangeIndex is converted to strings.
	    # ``all`` is essential: asserting the bare list comprehension would pass
	    # for any non-empty list, regardless of the isinstance results.
	    df = sanitize_dataframe(df)
	    assert all(isinstance(col, str) for col in df.columns)

	    # Test that non-string columns result in an error
	    df.columns = [4, "foo", "bar"]
	    with pytest.raises(ValueError) as err:
	        sanitize_dataframe(df)
	    assert str(err.value).startswith("Dataframe contains invalid column name: 4.")


	def test_sanitize_dataframe_timedelta():
	    """Expect sanitize_dataframe to reject a timedelta column with ValueError."""
	    deltas = pd.timedelta_range(start="1 day", periods=4)
	    frame = pd.DataFrame({"r": deltas})

	    with pytest.raises(ValueError) as excinfo:
	        sanitize_dataframe(frame)

	    # The message is expected to name the field and its timedelta type.
	    message = str(excinfo.value)
	    assert message.startswith('Field "r" has type "timedelta')


	def test_sanitize_dataframe_infs():
	    """Expect +/-inf and NaN to become None in an object-dtype column."""
	    raw_values = [0, 1, 2, np.inf, -np.inf, np.nan]
	    cleaned = sanitize_dataframe(pd.DataFrame({"x": raw_values}))

	    # The single column must be object dtype so that it can hold None.
	    assert list(cleaned.dtypes) == [object]

	    expected = [0, 1, 2, None, None, None]
	    assert list(cleaned["x"]) == expected


	@pytest.mark.skipif(
	    not hasattr(pd, "Int64Dtype"),
	    reason="Nullable integers not supported in pandas v{}".format(pd.__version__),
	)
	def test_sanitize_nullable_integers():
	    """Expect integer/float columns to sanitize to object columns.

	    Covers plain integers, nullable-integer extension dtypes ("UInt8" and
	    "Int64") and floats with None / inf entries.  Every sanitized column is
	    expected to have dtype ``object``, with missing and infinite values
	    replaced by None.
	    """
	    df = pd.DataFrame(
	        {
	            "int_np": [1, 2, 3, 4, 5],
	            "int64": pd.Series([1, 2, 3, None, 5], dtype="UInt8"),
	            "int64_nan": pd.Series([1, 2, 3, float("nan"), 5], dtype="Int64"),
	            "float": [1.0, 2.0, 3.0, 4, 5.0],
	            "float_null": [1, 2, None, 4, 5],
	            "float_inf": [1, 2, None, 4, (float("inf"))],
	        }
	    )

	    df_clean = sanitize_dataframe(df)

	    # DataFrame.items() is used instead of iteritems(), which was deprecated
	    # in pandas 1.5 and removed in pandas 2.0.
	    assert {col.dtype.name for _, col in df_clean.items()} == {"object"}

	    result_python = {col_name: list(col) for col_name, col in df_clean.items()}
	    assert result_python == {
	        "int_np": [1, 2, 3, 4, 5],
	        "int64": [1, 2, 3, None, 5],
	        "int64_nan": [1, 2, 3, None, 5],
	        "float": [1.0, 2.0, 3.0, 4.0, 5.0],
	        "float_null": [1.0, 2.0, None, 4.0, 5.0],
	        "float_inf": [1.0, 2.0, None, 4.0, None],
	    }


	@pytest.mark.skipif(
	    not hasattr(pd, "StringDtype"),
	    reason="dedicated String dtype not supported in pandas v{}".format(pd.__version__),
	)
	def test_sanitize_string_dtype():
	    """Expect string columns to sanitize to object columns.

	    Covers plain Python string lists and ``dtype="string"`` arrays, each with
	    and without a None entry.  Every sanitized column is expected to have
	    dtype ``object`` and to keep its values, with the missing entry as None.
	    """
	    df = pd.DataFrame(
	        {
	            "string_object": ["a", "b", "c", "d"],
	            "string_string": pd.array(["a", "b", "c", "d"], dtype="string"),
	            "string_object_null": ["a", "b", None, "d"],
	            "string_string_null": pd.array(["a", "b", None, "d"], dtype="string"),
	        }
	    )

	    df_clean = sanitize_dataframe(df)

	    # DataFrame.items() is used instead of iteritems(), which was deprecated
	    # in pandas 1.5 and removed in pandas 2.0.
	    assert {col.dtype.name for _, col in df_clean.items()} == {"object"}

	    result_python = {col_name: list(col) for col_name, col in df_clean.items()}
	    assert result_python == {
	        "string_object": ["a", "b", "c", "d"],
	        "string_string": ["a", "b", "c", "d"],
	        "string_object_null": ["a", "b", None, "d"],
	        "string_string_null": ["a", "b", None, "d"],
	    }


	@pytest.mark.skipif(
	    not hasattr(pd, "BooleanDtype"),
	    reason="Nullable boolean dtype not supported in pandas v{}".format(pd.__version__),
	)
	def test_sanitize_boolean_dtype():
	    """Expect nullable-boolean columns to sanitize to object columns.

	    Covers ``dtype="boolean"`` arrays that are partly null, entirely null and
	    fully populated.  Every sanitized column is expected to have dtype
	    ``object`` with True / False / None values.
	    """
	    df = pd.DataFrame(
	        {
	            "bool_none": pd.array([True, False, None], dtype="boolean"),
	            "none": pd.array([None, None, None], dtype="boolean"),
	            "bool": pd.array([True, False, True], dtype="boolean"),
	        }
	    )

	    df_clean = sanitize_dataframe(df)

	    # DataFrame.items() is used instead of iteritems(), which was deprecated
	    # in pandas 1.5 and removed in pandas 2.0.
	    assert {col.dtype.name for _, col in df_clean.items()} == {"object"}

	    result_python = {col_name: list(col) for col_name, col in df_clean.items()}
	    assert result_python == {
	        "bool_none": [True, False, None],
	        "none": [None, None, None],
	        "bool": [True, False, True],
	    }