Spaces:

arxify
/

RVC-beta-v2-0618

Running

App Files Files Community

RVC-beta-v2-0618 / runtime /Lib /site-packages /altair /utils /data.py

arxify

Upload folder using huggingface_hub

ba2f5d6 almost 2 years ago

raw

history blame contribute delete

8.05 kB

	import json
	import os
	import random
	import hashlib
	import warnings

	import pandas as pd
	from toolz import curried
	from typing import Callable

	from .core import sanitize_dataframe
	from .core import sanitize_geo_interface
	from .deprecation import AltairDeprecationWarning
	from .plugin_registry import PluginRegistry


	# ==============================================================================
	# Data transformer registry
	# ==============================================================================
	# Type used to parametrise the plugin registry below: a data transformer is
	# typed simply as any callable.
	DataTransformerType = Callable


	class DataTransformerRegistry(PluginRegistry[DataTransformerType]):
	    """Plugin registry for data transformers with a ``consolidate_datasets`` flag."""

	    # Defined on the class, so the same dict object backs the property for
	    # every instance that does not shadow the attribute.
	    _global_settings = {"consolidate_datasets": True}

	    @property
	    def consolidate_datasets(self):
	        """Value currently stored under ``consolidate_datasets`` in the settings."""
	        settings = self._global_settings
	        return settings["consolidate_datasets"]

	    @consolidate_datasets.setter
	    def consolidate_datasets(self, value):
	        self._global_settings.update({"consolidate_datasets": value})


	# ==============================================================================
	# Data model transformers
	#
	# A data model transformer is a pure function that takes a dict or DataFrame
	# and returns a transformed version of a dict or DataFrame. The dict objects
	# will be the Data portion of the VegaLite schema. The idea is that a user can
	# pipe a sequence of these data transformers together to prepare the data before
	# it hits the renderer.
	#
	# In this version of Altair, renderers only deal with the dict form of a
	# VegaLite spec, after the Data model has been put into a schema compliant
	# form.
	#
	# A data model transformer has the following type signature:
	# DataModelType = Union[dict, pd.DataFrame]
	# DataModelTransformerType = Callable[[DataModelType, KwArgs], DataModelType]
	# ==============================================================================


	class MaxRowsError(Exception):
	    """Error signalling that a data model exceeds the permitted number of rows."""


	@curried.curry
	def limit_rows(data, max_rows=5000):
	    """Return ``data`` unchanged unless it holds more than ``max_rows`` rows.

	    The row count is taken from the ``features`` list of a GeoJSON
	    FeatureCollection, from the ``__geo_interface__`` mapping itself for any
	    other geo object, from the DataFrame, or from ``data["values"]`` for a
	    dict. A dict without a ``"values"`` key is returned without any check.

	    Passing ``max_rows=None`` disables the check. Raises MaxRowsError when the
	    limit is exceeded.
	    """
	    check_data_type(data)

	    if hasattr(data, "__geo_interface__"):
	        geo = data.__geo_interface__
	        rows = geo["features"] if geo["type"] == "FeatureCollection" else geo
	    elif isinstance(data, pd.DataFrame):
	        rows = data
	    elif isinstance(data, dict):
	        if "values" not in data:
	            # Nothing countable in this dict (e.g. url-based data).
	            return data
	        rows = data["values"]

	    over_limit = max_rows is not None and len(rows) > max_rows
	    if not over_limit:
	        return data

	    raise MaxRowsError(
	        "The number of rows in your dataset is greater "
	        "than the maximum allowed ({}). "
	        "For information on how to plot larger datasets "
	        "in Altair, see the documentation".format(max_rows)
	    )


	@curried.curry
	def sample(data, n=None, frac=None):
	    """Reduce the size of the data model by sampling without replacement.

	    Parameters:
	        data: a DataFrame, or a dict holding the rows under ``"values"``.
	        n: number of rows to keep. For dict data a falsy ``n`` defers to
	            ``frac`` when ``frac`` is given.
	        frac: fraction of the rows to keep; for dict data the resulting
	            count is truncated with ``int()``.

	    Returns:
	        ``data.sample(n=n, frac=frac)`` for a DataFrame, a new
	        ``{"values": [...]}`` dict for dict data, and ``None`` for any other
	        accepted input (a dict without ``"values"`` or a geo-interface
	        object), which this function does not sample.

	    Raises:
	        ValueError: if dict data is given and both ``n`` and ``frac`` are None.
	    """
	    check_data_type(data)
	    if isinstance(data, pd.DataFrame):
	        return data.sample(n=n, frac=frac)
	    if isinstance(data, dict) and "values" in data:
	        values = data["values"]
	        if not n:
	            if frac is not None:
	                n = int(frac * len(values))
	            elif n is None:
	                # Previously this fell through to ``None * len(values)`` and
	                # surfaced as an opaque TypeError.
	                raise ValueError(
	                    "frac cannot be None if n is None and data is a dictionary"
	                )
	        return {"values": random.sample(values, n)}
	    # Nothing to sample from; kept as the historical implicit result.
	    return None


	@curried.curry
	def to_json(
	    data,
	    prefix="altair-data",
	    extension="json",
	    filename="{prefix}-{hash}.{extension}",
	    urlpath="",
	):
	    """
	    Write the data model to a .json file and return a url based data model.

	    Parameters:
	        data: the data model to serialise (see ``_data_to_json_string``).
	        prefix, extension: values substituted into the ``filename`` template.
	        filename: template formatted with ``prefix``, ``hash`` (hash of the
	            serialised JSON string) and ``extension``; the result is the path
	            that is opened for writing.
	        urlpath: joined in front of the file name with ``os.path.join`` to
	            build the returned url.

	    Returns:
	        ``{"url": <joined path>, "format": {"type": "json"}}``.
	    """
	    data_json = _data_to_json_string(data)
	    data_hash = _compute_data_hash(data_json)
	    filename = filename.format(prefix=prefix, hash=data_hash, extension=extension)
	    # The hash is computed over the UTF-8 bytes of the string, so write the
	    # file as UTF-8 too instead of relying on the locale's default encoding.
	    with open(filename, "w", encoding="utf-8") as f:
	        f.write(data_json)
	    return {"url": os.path.join(urlpath, filename), "format": {"type": "json"}}


	@curried.curry
	def to_csv(
	    data,
	    prefix="altair-data",
	    extension="csv",
	    filename="{prefix}-{hash}.{extension}",
	    urlpath="",
	):
	    """Write the data model to a .csv file and return a url based data model.

	    Parameters:
	        data: the data model to serialise (see ``_data_to_csv_string``).
	        prefix, extension: values substituted into the ``filename`` template.
	        filename: template formatted with ``prefix``, ``hash`` (hash of the
	            serialised CSV string) and ``extension``; the result is the path
	            that is opened for writing.
	        urlpath: joined in front of the file name with ``os.path.join`` to
	            build the returned url.

	    Returns:
	        ``{"url": <joined path>, "format": {"type": "csv"}}``.
	    """
	    data_csv = _data_to_csv_string(data)
	    data_hash = _compute_data_hash(data_csv)
	    filename = filename.format(prefix=prefix, hash=data_hash, extension=extension)
	    # CSV text may contain non-ASCII characters. Writing with the locale's
	    # default encoding could fail or produce a file that does not match the
	    # UTF-8 bytes the hash was computed from, so the encoding is explicit.
	    with open(filename, "w", encoding="utf-8") as f:
	        f.write(data_csv)
	    return {"url": os.path.join(urlpath, filename), "format": {"type": "csv"}}


	@curried.curry
	def to_values(data):
	    """Convert the input into a ``{"values": ...}`` data model.

	    Geo-interface objects yield their sanitized ``__geo_interface__``,
	    DataFrames yield a list of row records, and dicts are passed through
	    as-is provided they already contain a ``"values"`` key (KeyError
	    otherwise).
	    """
	    check_data_type(data)
	    is_frame = isinstance(data, pd.DataFrame)

	    if hasattr(data, "__geo_interface__"):
	        # Geo-aware DataFrames are sanitized before their interface is read.
	        source = sanitize_dataframe(data) if is_frame else data
	        return {"values": sanitize_geo_interface(source.__geo_interface__)}

	    if is_frame:
	        records = sanitize_dataframe(data).to_dict(orient="records")
	        return {"values": records}

	    if isinstance(data, dict):
	        if "values" in data:
	            return data
	        raise KeyError("values expected in data dict, but not present.")


	def check_data_type(data):
	    """Validate the kind of ``data``.

	    Accepts a dict, a pandas DataFrame, or any object exposing a
	    ``__geo_interface__`` attribute; anything else raises TypeError.
	    """
	    supported = isinstance(data, (dict, pd.DataFrame)) or hasattr(
	        data, "__geo_interface__"
	    )
	    if supported:
	        return
	    raise TypeError(
	        "Expected dict, DataFrame or a __geo_interface__ attribute, got: {}".format(
	            type(data)
	        )
	    )


	# ==============================================================================
	# Private utilities
	# ==============================================================================


	def _compute_data_hash(data_str):
	    """Return the hexadecimal MD5 digest of ``data_str`` (default str encoding)."""
	    hasher = hashlib.md5()
	    hasher.update(data_str.encode())
	    return hasher.hexdigest()


	def _data_to_json_string(data):
	    """Serialise a supported data model to a JSON string.

	    Geo-interface objects are dumped from their sanitized interface,
	    DataFrames as a list of records, and dicts from their ``"values"`` entry
	    with sorted keys (KeyError if that entry is missing).
	    """
	    check_data_type(data)

	    if hasattr(data, "__geo_interface__"):
	        source = sanitize_dataframe(data) if isinstance(data, pd.DataFrame) else data
	        geo = sanitize_geo_interface(source.__geo_interface__)
	        return json.dumps(geo)

	    if isinstance(data, pd.DataFrame):
	        frame = sanitize_dataframe(data)
	        return frame.to_json(orient="records", double_precision=15)

	    if isinstance(data, dict):
	        if "values" in data:
	            return json.dumps(data["values"], sort_keys=True)
	        raise KeyError("values expected in data dict, but not present.")

	    raise NotImplementedError(
	        "to_json only works with data expressed as " "a DataFrame or as a dict"
	    )


	def _data_to_csv_string(data):
	    """Serialise a supported data model to a CSV string without an index column.

	    DataFrames are sanitized first; dicts are converted through a DataFrame
	    built from their ``"values"`` entry (KeyError if it is missing).
	    Geo-interface objects are rejected with NotImplementedError.
	    """
	    check_data_type(data)

	    if hasattr(data, "__geo_interface__"):
	        raise NotImplementedError(
	            "to_csv does not work with data that "
	            "contains the __geo_interface__ attribute"
	        )

	    if isinstance(data, pd.DataFrame):
	        return sanitize_dataframe(data).to_csv(index=False)

	    if isinstance(data, dict):
	        if "values" in data:
	            frame = pd.DataFrame.from_dict(data["values"])
	            return frame.to_csv(index=False)
	        raise KeyError("values expected in data dict, but not present")

	    raise NotImplementedError(
	        "to_csv only works with data expressed as " "a DataFrame or as a dict"
	    )


	def pipe(data, *funcs):
	    """
	    Pipe a value through a sequence of functions

	    Deprecated: use toolz.curried.pipe() instead.

	    Emits an AltairDeprecationWarning and then forwards ``data`` and
	    ``funcs`` unchanged to ``toolz.curried.pipe``, returning its result.
	    """
	    warnings.warn(
	        "alt.pipe() is deprecated, and will be removed in a future release. "
	        "Use toolz.curried.pipe() instead.",
	        AltairDeprecationWarning,
	        # Attribute the warning to the caller's line, not to this wrapper.
	        stacklevel=2,
	    )
	    return curried.pipe(data, *funcs)


	def curry(*args, **kwargs):
	    """Curry a callable function

	    Deprecated: use toolz.curried.curry() instead.

	    Emits an AltairDeprecationWarning and then forwards every positional and
	    keyword argument unchanged to ``toolz.curried.curry``, returning its
	    result.
	    """
	    warnings.warn(
	        "alt.curry() is deprecated, and will be removed in a future release. "
	        "Use toolz.curried.curry() instead.",
	        AltairDeprecationWarning,
	        # Attribute the warning to the caller's line, not to this wrapper.
	        stacklevel=2,
	    )
	    # The previous ``(args, *kwargs)`` signature could not accept keyword
	    # arguments at all, so they were never forwarded.
	    return curried.curry(*args, **kwargs)