arxify's picture
Upload folder using huggingface_hub
ba2f5d6
import os
import pytest
import pandas as pd
from toolz import pipe
from ..data import limit_rows, MaxRowsError, sample, to_values, to_json, to_csv
def _create_dataframe(N):
    """Return a DataFrame with N rows whose "x" and "y" columns both hold 0..N-1."""
    counter = range(N)
    return pd.DataFrame({"x": counter, "y": counter})
def _create_data_with_values(N):
    """Return a ``{"values": [...]}`` dict holding N records.

    Record ``i`` is ``{"x": i, "y": i + 1}``.
    """
    records = []
    for i in range(N):
        records.append({"x": i, "y": i + 1})
    return {"values": records}
def test_limit_rows():
    """Check limit_rows on both DataFrame and values-dict input.

    Below the limit the very same object must come back; above the limit
    MaxRowsError must be raised. Both the direct call and the keyword-only
    form used inside ``pipe`` are exercised.
    """
    frame = _create_dataframe(10)
    # 10 rows against a limit of 20: the input is handed back untouched.
    assert limit_rows(frame, max_rows=20) is frame
    with pytest.raises(MaxRowsError):
        pipe(frame, limit_rows(max_rows=5))

    values_dict = _create_data_with_values(10)
    assert pipe(values_dict, limit_rows(max_rows=20)) is values_dict
    with pytest.raises(MaxRowsError):
        limit_rows(values_dict, max_rows=5)
def test_sample():
    """Check the sample data transformer with ``n=`` and ``frac=`` selection.

    For each selection (10 of 20 rows), a DataFrame is sampled through
    ``pipe`` using the keyword-only form and a ``{"values": [...]}`` dict is
    sampled through a direct call; the container type must be preserved.
    """
    for selection in ({"n": 10}, {"frac": 0.5}):
        frame = _create_dataframe(20)
        sampled_frame = pipe(frame, sample(**selection))
        assert len(sampled_frame) == 10
        assert isinstance(sampled_frame, pd.DataFrame)

        values_dict = _create_data_with_values(20)
        sampled_dict = sample(values_dict, **selection)
        assert isinstance(sampled_dict, dict)
        assert "values" in sampled_dict
        assert len(sampled_dict["values"]) == 10
def test_to_values():
    """Check that to_values wraps a DataFrame's records under a "values" key."""
    frame = _create_dataframe(10)
    actual = pipe(frame, to_values)
    expected = {"values": frame.to_dict(orient="records")}
    assert actual == expected
def test_type_error():
    """Verify each transformer rejects input that is neither dict nor DataFrame."""
    transformers = [sample, limit_rows, to_values]
    for transformer in transformers:
        # A bare integer is not a supported data container.
        with pytest.raises(TypeError):
            pipe(0, transformer)
def test_dataframe_to_json():
    """Test to_json with dataframe input.

    - make certain the filename is deterministic
    - make certain the file contents match the data

    Every file reported by ``to_json`` is removed afterwards, even when the
    test body fails part-way through.
    """
    data = _create_dataframe(10)
    # Record each reported path as soon as it is known, so cleanup never
    # touches an unbound name (which would mask the real failure) and never
    # leaves a second file behind if the two calls disagree on the filename.
    written = []
    try:
        result1 = pipe(data, to_json)
        written.append(result1["url"])
        result2 = pipe(data, to_json)
        written.append(result2["url"])
        output = pd.read_json(result1["url"])
    finally:
        for path in set(written):
            if os.path.exists(path):
                os.remove(path)

    assert result1 == result2
    assert output.equals(data)
def test_dict_to_json():
    """Test to_json with dict input.

    - make certain the filename is deterministic
    - make certain the file contents match the data

    Every file reported by ``to_json`` is removed afterwards, even when the
    test body fails part-way through.
    """
    data = _create_data_with_values(10)
    # Record each reported path as soon as it is known, so cleanup never
    # touches an unbound name (which would mask the real failure) and never
    # leaves a second file behind if the two calls disagree on the filename.
    written = []
    try:
        result1 = pipe(data, to_json)
        written.append(result1["url"])
        result2 = pipe(data, to_json)
        written.append(result2["url"])
        output = pd.read_json(result1["url"]).to_dict(orient="records")
    finally:
        for path in set(written):
            if os.path.exists(path):
                os.remove(path)

    assert result1 == result2
    assert data == {"values": output}
def test_dataframe_to_csv():
    """Test to_csv with dataframe input.

    - make certain the filename is deterministic
    - make certain the file contents match the data

    Every file reported by ``to_csv`` is removed afterwards, even when the
    test body fails part-way through.
    """
    data = _create_dataframe(10)
    # Record each reported path as soon as it is known, so cleanup never
    # touches an unbound name (which would mask the real failure) and never
    # leaves a second file behind if the two calls disagree on the filename.
    written = []
    try:
        result1 = pipe(data, to_csv)
        written.append(result1["url"])
        result2 = pipe(data, to_csv)
        written.append(result2["url"])
        output = pd.read_csv(result1["url"])
    finally:
        for path in set(written):
            if os.path.exists(path):
                os.remove(path)

    assert result1 == result2
    assert output.equals(data)
def test_dict_to_csv():
    """Test to_csv with dict input.

    - make certain the filename is deterministic
    - make certain the file contents match the data

    Every file reported by ``to_csv`` is removed afterwards, even when the
    test body fails part-way through.
    """
    data = _create_data_with_values(10)
    # Record each reported path as soon as it is known, so cleanup never
    # touches an unbound name (which would mask the real failure) and never
    # leaves a second file behind if the two calls disagree on the filename.
    written = []
    try:
        result1 = pipe(data, to_csv)
        written.append(result1["url"])
        result2 = pipe(data, to_csv)
        written.append(result2["url"])
        output = pd.read_csv(result1["url"]).to_dict(orient="records")
    finally:
        for path in set(written):
            if os.path.exists(path):
                os.remove(path)

    assert result1 == result2
    assert data == {"values": output}