|
from __future__ import annotations |
|
|
|
from typing import ( |
|
TYPE_CHECKING, |
|
Any, |
|
Literal, |
|
final, |
|
) |
|
|
|
import numpy as np |
|
|
|
from pandas.core.dtypes.common import ( |
|
is_integer, |
|
is_list_like, |
|
) |
|
from pandas.core.dtypes.generic import ( |
|
ABCDataFrame, |
|
ABCIndex, |
|
) |
|
from pandas.core.dtypes.missing import ( |
|
isna, |
|
remove_na_arraylike, |
|
) |
|
|
|
from pandas.io.formats.printing import pprint_thing |
|
from pandas.plotting._matplotlib.core import ( |
|
LinePlot, |
|
MPLPlot, |
|
) |
|
from pandas.plotting._matplotlib.groupby import ( |
|
create_iter_data_given_by, |
|
reformat_hist_y_given_by, |
|
) |
|
from pandas.plotting._matplotlib.misc import unpack_single_str_list |
|
from pandas.plotting._matplotlib.tools import ( |
|
create_subplots, |
|
flatten_axes, |
|
maybe_adjust_figure, |
|
set_ticks_props, |
|
) |
|
|
|
if TYPE_CHECKING: |
|
from matplotlib.axes import Axes |
|
from matplotlib.figure import Figure |
|
|
|
from pandas._typing import PlottingOrientation |
|
|
|
from pandas import ( |
|
DataFrame, |
|
Series, |
|
) |
|
|
|
|
|
class HistPlot(LinePlot):
    """
    Histogram plot.

    Derives from ``LinePlot`` but is initialised through ``MPLPlot.__init__``
    directly. Bin edges are resolved once in ``__init__`` and then handed to
    every ``Axes.hist`` call (one set of edges per group when ``by`` is set).
    """

    @property
    def _kind(self) -> Literal["hist", "kde"]:
        return "hist"

    def __init__(
        self,
        data,
        bins: int | np.ndarray | list[np.ndarray] = 10,
        bottom: int | np.ndarray = 0,
        *,
        range=None,
        weights=None,
        **kwargs,
    ) -> None:
        """
        Parameters
        ----------
        data : object
            Forwarded unchanged to ``MPLPlot.__init__``.
        bins : int, ndarray or list of ndarray, default 10
            An integer is expanded into explicit bin edges by
            ``_adjust_bins``; anything else is stored as given.
        bottom : int or array-like, default 0
            Baseline of the bars. List-likes are converted to ``np.ndarray``.
        range : optional
            Passed as ``range`` to ``np.histogram`` when edges are computed.
        weights : optional
            Per-observation weights, see ``_get_column_weights``.
        **kwargs
            Forwarded to ``MPLPlot.__init__``; ``xlabel`` and ``ylabel`` are
            additionally remembered on the instance.
        """
        # List-like baselines become arrays so that ``_plot`` can add the
        # stacked values to them with ``+``.
        self.bottom = np.array(bottom) if is_list_like(bottom) else bottom

        self._bin_range = range
        self.weights = weights

        self.xlabel = kwargs.get("xlabel")
        self.ylabel = kwargs.get("ylabel")

        # Deliberately bypass LinePlot.__init__ and call the MPLPlot
        # initialiser directly.
        MPLPlot.__init__(self, data, **kwargs)

        # ``_adjust_bins`` reads self.by / self.data / self.columns, so it
        # runs after the base initialiser (presumably where those are set).
        self.bins = self._adjust_bins(bins)

    def _adjust_bins(self, bins: int | np.ndarray | list[np.ndarray]):
        """
        Turn an integer bin count into explicit bin edges.

        Non-integer ``bins`` are returned untouched. With ``by`` set, a list
        with one edge array per group is returned; otherwise a single edge
        array computed from the whole data.
        """
        if not is_integer(bins):
            return bins

        if self.by is None:
            return self._calculate_bins(self.data, bins)

        group_keys = unpack_single_str_list(self.by)
        per_group = self.data.groupby(group_keys)[self.columns]
        return [self._calculate_bins(frame, bins) for _, frame in per_group]

    def _calculate_bins(self, data: Series | DataFrame, bins) -> np.ndarray:
        """Return the bin edges ``np.histogram`` produces for ``data``."""
        numeric = data.infer_objects(copy=False)._get_numeric_data()
        flat = np.ravel(numeric)
        # Missing values are dropped before the edges are computed.
        flat = flat[~isna(flat)]

        _, edges = np.histogram(flat, bins=bins, range=self._bin_range)
        return edges

    @classmethod
    def _plot(
        cls,
        ax: Axes,
        y: np.ndarray,
        style=None,
        bottom: int | np.ndarray = 0,
        column_num: int = 0,
        stacking_id=None,
        *,
        bins,
        **kwds,
    ):
        """
        Draw one histogram on ``ax`` and return the patches from ``ax.hist``.

        ``style`` is accepted but not forwarded to ``ax.hist``. ``kwds`` must
        contain ``"label"``, which is handed to ``_get_stacked_values``.
        """
        n_bars = len(bins) - 1
        if column_num == 0:
            cls._initialize_stacker(ax, stacking_id, n_bars)

        stacked = cls._get_stacked_values(
            ax, stacking_id, np.zeros(n_bars), kwds["label"]
        )
        bottom = bottom + stacked

        counts, _, patches = ax.hist(y, bins=bins, bottom=bottom, **kwds)
        cls._update_stacker(ax, stacking_id, counts)
        return patches

    def _make_plot(self, fig: Figure) -> None:
        """Draw one histogram per (label, values) pair from ``_iter_data``."""
        colors = self._get_colors()
        stacking_id = self._get_stacking_id()
        grouped = self.by is not None

        # With ``by`` the data to iterate over is rebuilt by
        # ``create_iter_data_given_by``; otherwise ``self.data`` is used as is.
        if grouped:
            data = create_iter_data_given_by(self.data, self._kind)
        else:
            data = self.data

        for idx, (raw_label, values) in enumerate(self._iter_data(data=data)):
            ax = self._get_ax(idx)

            kwds = self.kwds.copy()
            if self.color is not None:
                kwds["color"] = self.color

            label = self._mark_right_label(pprint_thing(raw_label), index=idx)
            kwds["label"] = label

            style, kwds = self._apply_style_colors(colors, kwds, idx, label)
            if style is not None:
                kwds["style"] = style

            self._make_plot_keywords(kwds, values)

            if grouped:
                # ``bins`` holds one entry per group here; pick the one for
                # this iteration, label by the plotted columns and drop the
                # colour keyword.
                kwds["bins"] = kwds["bins"][idx]
                kwds["label"] = self.columns
                kwds.pop("color")

            if self.weights is not None:
                kwds["weights"] = type(self)._get_column_weights(
                    self.weights, idx, values
                )

            values = reformat_hist_y_given_by(values, self.by)

            artists = self._plot(
                ax, values, column_num=idx, stacking_id=stacking_id, **kwds
            )

            if grouped:
                # The subplot title carries the label of this iteration.
                ax.set_title(pprint_thing(label))

            self._append_legend_handles_labels(artists[0], label)

    def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None:
        """
        Add the histogram-specific keywords to ``kwds`` in place.

        ``y`` is not used here; subclasses overriding this hook may need it.
        """
        kwds.update(bottom=self.bottom, bins=self.bins)

    @final
    @staticmethod
    def _get_column_weights(weights, i: int, y):
        """
        Select the weights that belong to the ``i``-th plotted column.

        ``None`` is returned unchanged. One-dimensional weights, and weights
        whose last axis has length 1, are used as given; anything else is
        indexed with ``[:, i]``. Finally the entries at positions where ``y``
        is missing are removed.

        Raises
        ------
        ValueError
            If ``weights[:, i]`` raises ``IndexError``.
        """
        if weights is None:
            return weights

        is_multi_column = np.ndim(weights) != 1 and np.shape(weights)[-1] != 1
        if is_multi_column:
            try:
                weights = weights[:, i]
            except IndexError as err:
                raise ValueError(
                    "weights must have the same shape as data, "
                    "or be a single column"
                ) from err

        return weights[~isna(y)]

    def _post_plot_logic(self, ax: Axes, data) -> None:
        """
        Label the axes; the counting axis falls back to ``"Frequency"``.

        For horizontal histograms the x axis is the counting axis, otherwise
        the y axis is.
        """
        xlabel, ylabel = self.xlabel, self.ylabel
        if self.orientation == "horizontal":
            if xlabel is None:
                xlabel = "Frequency"
        elif ylabel is None:
            ylabel = "Frequency"

        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)

    @property
    def orientation(self) -> PlottingOrientation:
        """``"horizontal"`` if requested through ``kwds``, else ``"vertical"``."""
        requested = self.kwds.get("orientation", None)
        return "horizontal" if requested == "horizontal" else "vertical"
|
|
|
|
|
class KdePlot(HistPlot):
    """
    Kernel density estimate plot.

    Reuses the ``HistPlot`` plotting loop but replaces the per-column drawing
    step with a Gaussian KDE evaluated on a grid of points.
    """

    @property
    def _kind(self) -> Literal["kde"]:
        return "kde"

    @property
    def orientation(self) -> Literal["vertical"]:
        """KDE plots are always reported as vertical."""
        return "vertical"

    def __init__(
        self, data, bw_method=None, ind=None, *, weights=None, **kwargs
    ) -> None:
        """
        Parameters
        ----------
        data : object
            Forwarded unchanged to ``MPLPlot.__init__``.
        bw_method : optional
            Passed as ``bw_method`` to ``scipy.stats.gaussian_kde``.
        ind : None, int or array-like, optional
            Evaluation points, see ``_get_ind``.
        weights : optional
            Per-observation weights, passed to ``gaussian_kde`` by ``_plot``.
        **kwargs
            Forwarded to ``MPLPlot.__init__``.
        """
        # Deliberately bypass the HistPlot/LinePlot initialisers and call the
        # MPLPlot initialiser directly.
        MPLPlot.__init__(self, data, **kwargs)
        self.bw_method = bw_method
        self.ind = ind
        self.weights = weights

    @staticmethod
    def _get_ind(y: np.ndarray, ind):
        """
        Resolve the points at which the density is evaluated.

        ``None`` yields 1000 evenly spaced points, an integer yields that many
        points; in both cases the grid spans the NaN-ignoring data range
        extended by half of that range on each side. Any other ``ind`` is
        returned unchanged.
        """
        if ind is None or is_integer(ind):
            n_points = 1000 if ind is None else ind
            # nanmin/nanmax ignore missing values.
            low, high = np.nanmin(y), np.nanmax(y)
            sample_range = high - low
            ind = np.linspace(
                low - 0.5 * sample_range,
                high + 0.5 * sample_range,
                n_points,
            )
        return ind

    @classmethod
    def _plot(
        cls,
        ax: Axes,
        y: np.ndarray,
        style=None,
        bw_method=None,
        ind=None,
        column_num=None,
        stacking_id: int | None = None,
        *,
        weights=None,
        **kwds,
    ):
        """
        Fit a Gaussian KDE to ``y`` and draw it on ``ax`` as a line.

        Parameters
        ----------
        ax : Axes
            Target axes.
        y : np.ndarray
            Observations; missing values are removed before fitting.
        style : optional
            Forwarded to ``MPLPlot._plot``.
        bw_method : optional
            Forwarded to ``gaussian_kde``.
        ind : array-like
            Points at which the fitted density is evaluated.
        column_num, stacking_id : optional
            Accepted for signature compatibility; not used here.
        weights : array-like, optional
            Weights of the observations, forwarded to ``gaussian_kde``. It is
            consumed here so that it does not leak into the line-drawing call
            through ``**kwds``.
        **kwds
            Forwarded to ``MPLPlot._plot``.

        Returns
        -------
        Whatever ``MPLPlot._plot`` returns for the drawn density curve.
        """
        from scipy.stats import gaussian_kde

        y = remove_na_arraylike(y)
        gkde = gaussian_kde(y, bw_method=bw_method, weights=weights)

        density = gkde.evaluate(ind)
        return MPLPlot._plot(ax, ind, density, style=style, **kwds)

    def _make_plot_keywords(self, kwds: dict[str, Any], y: np.ndarray) -> None:
        """Add the KDE-specific keywords (``bw_method``, ``ind``) to ``kwds``."""
        kwds["bw_method"] = self.bw_method
        kwds["ind"] = type(self)._get_ind(y, ind=self.ind)

    def _post_plot_logic(self, ax: Axes, data) -> None:
        """Label the y axis ``"Density"``."""
        ax.set_ylabel("Density")
|
|
|
|
|
def _grouped_plot(
    plotf,
    data: Series | DataFrame,
    column=None,
    by=None,
    numeric_only: bool = True,
    figsize: tuple[float, float] | None = None,
    sharex: bool = True,
    sharey: bool = True,
    layout=None,
    rot: float = 0,
    ax=None,
    **kwargs,
):
    """
    Call ``plotf`` once per group of ``data.groupby(by)``, one subplot each.

    Parameters
    ----------
    plotf : callable
        Invoked as ``plotf(group, ax, **kwargs)`` for every group.
    data : Series or DataFrame
        Object to group.
    column : optional
        If given, the grouped object is restricted with ``grouped[column]``.
    by : optional
        Grouping key(s) handed to ``data.groupby``.
    numeric_only : bool, default True
        When a group is a DataFrame, reduce it with ``_get_numeric_data()``
        before plotting.
    figsize, sharex, sharey, layout, ax
        Forwarded to ``create_subplots``.
    rot : float, default 0
        Accepted but not used in this function.
    **kwargs
        Forwarded to ``plotf``.

    Returns
    -------
    tuple
        ``(fig, axes)`` as returned by ``create_subplots``.

    Raises
    ------
    ValueError
        If ``figsize == "default"``.
    """
    if figsize == "default":
        raise ValueError(
            "figsize='default' is no longer supported. "
            "Specify figure size by tuple instead"
        )

    groups = data.groupby(by)
    if column is not None:
        groups = groups[column]

    fig, axes = create_subplots(
        naxes=len(groups),
        figsize=figsize,
        sharex=sharex,
        sharey=sharey,
        ax=ax,
        layout=layout,
    )
    flat_axes = flatten_axes(axes)

    for position, (key, chunk) in enumerate(groups):
        target = flat_axes[position]
        if numeric_only and isinstance(chunk, ABCDataFrame):
            chunk = chunk._get_numeric_data()
        plotf(chunk, target, **kwargs)
        # Each subplot is titled with its group key.
        target.set_title(pprint_thing(key))

    return fig, axes
|
|
|
|
|
def _grouped_hist(
    data: Series | DataFrame,
    column=None,
    by=None,
    ax=None,
    bins: int = 50,
    figsize: tuple[float, float] | None = None,
    layout=None,
    sharex: bool = False,
    sharey: bool = False,
    rot: float = 90,
    grid: bool = True,
    xlabelsize: int | None = None,
    xrot=None,
    ylabelsize: int | None = None,
    yrot=None,
    legend: bool = False,
    **kwargs,
):
    """
    Draw one histogram per group of ``data``.

    Parameters
    ----------
    data : Series/DataFrame
    column : object, optional
        Column selection applied to the grouped data.
    by : object, optional
        Grouping key(s).
    ax : axes, optional
    bins : int, default 50
    figsize : tuple, optional
    layout : optional
    sharex : bool, default False
    sharey : bool, default False
    rot : float, default 90
        Used as the x tick rotation when ``xrot`` is None.
    grid : bool, default True
        Accepted but not referenced in this function body.
    xlabelsize : int, optional
    xrot : optional
    ylabelsize : int, optional
    yrot : optional
    legend : bool, default False
        If True, a label is derived from ``data``/``column`` and a legend is
        drawn on every subplot; ``label`` must then not be in ``kwargs``.
    kwargs : dict, keyword arguments passed to matplotlib.Axes.hist

    Returns
    -------
    The axes object returned by ``_grouped_plot``.
    """
    if legend:
        assert "label" not in kwargs
        if data.ndim == 1:
            kwargs["label"] = data.name
        else:
            kwargs["label"] = data.columns if column is None else column

    def plot_group(group, ax) -> None:
        # Missing values are dropped before the group is histogrammed.
        ax.hist(group.dropna().values, bins=bins, **kwargs)
        if legend:
            ax.legend()

    # ``rot`` only serves as the fallback for the x tick rotation.
    if xrot is None:
        xrot = rot

    fig, axes = _grouped_plot(
        plot_group,
        data,
        column=column,
        by=by,
        sharex=sharex,
        sharey=sharey,
        ax=ax,
        figsize=figsize,
        layout=layout,
        rot=rot,
    )

    set_ticks_props(
        axes,
        xlabelsize=xlabelsize,
        xrot=xrot,
        ylabelsize=ylabelsize,
        yrot=yrot,
    )
    maybe_adjust_figure(
        fig,
        bottom=0.15,
        top=0.9,
        left=0.1,
        right=0.9,
        hspace=0.5,
        wspace=0.3,
    )
    return axes
|
|
|
|
|
def hist_series(
    self: Series,
    by=None,
    ax=None,
    grid: bool = True,
    xlabelsize: int | None = None,
    xrot=None,
    ylabelsize: int | None = None,
    yrot=None,
    figsize: tuple[float, float] | None = None,
    bins: int = 10,
    legend: bool = False,
    **kwds,
):
    """
    Draw a histogram of a Series, optionally split into groups.

    Parameters
    ----------
    self : Series
        Values to plot; missing values are dropped.
    by : object, optional
        If given, plotting is delegated to ``_grouped_hist``.
    ax : matplotlib Axes, optional
        Target axes. Without ``by`` it must belong to the figure in use.
    grid : bool, default True
        Passed to ``ax.grid`` (ungrouped) or to ``_grouped_hist``.
    xlabelsize, xrot, ylabelsize, yrot : optional
        Tick label properties forwarded to ``set_ticks_props``.
    figsize : tuple of float, optional
        Figure size in inches.
    bins : int, default 10
        Passed to ``Axes.hist``.
    legend : bool, default False
        Label the bars with ``self.name`` and draw a legend.
    **kwds
        Forwarded to ``Axes.hist`` / ``_grouped_hist``. Without ``by``, a
        ``figure`` entry is popped and used as the figure to draw on.

    Returns
    -------
    A single Axes when the result is a one-element 1-D array, otherwise the
    axes object as produced.

    Raises
    ------
    ValueError
        If both ``legend`` and ``label`` are given, if ``layout`` is given
        without ``by``, or if ``figure`` is given together with ``by``.
    AssertionError
        If ``ax`` does not belong to the figure in use.
    """
    import matplotlib.pyplot as plt

    if legend and "label" in kwds:
        raise ValueError("Cannot use both legend and label")

    if by is None:
        if kwds.get("layout", None) is not None:
            raise ValueError("The 'layout' keyword is not supported when 'by' is None")

        # Only fall back to the current / a new pyplot figure when the caller
        # did not supply one. Building the fallback unconditionally (as a
        # ``dict.pop`` default) would open a stray empty figure whenever
        # ``figure`` is passed while pyplot has no figures yet.
        if "figure" in kwds:
            fig = kwds.pop("figure")
        elif plt.get_fignums():
            fig = plt.gcf()
        else:
            fig = plt.figure(figsize=figsize)

        if figsize is not None and tuple(figsize) != tuple(fig.get_size_inches()):
            fig.set_size_inches(*figsize, forward=True)
        if ax is None:
            ax = fig.gca()
        elif ax.get_figure() != fig:
            raise AssertionError("passed axis not bound to passed figure")

        values = self.dropna().values
        if legend:
            kwds["label"] = self.name
        ax.hist(values, bins=bins, **kwds)
        if legend:
            ax.legend()
        ax.grid(grid)
        axes = np.array([ax])

        set_ticks_props(
            axes,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
        )

    else:
        if "figure" in kwds:
            raise ValueError(
                "Cannot pass 'figure' when using the "
                "'by' argument, since a new 'Figure' instance will be created"
            )
        axes = _grouped_hist(
            self,
            by=by,
            ax=ax,
            grid=grid,
            figsize=figsize,
            bins=bins,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            legend=legend,
            **kwds,
        )

    # Unwrap a one-element 1-D array so a single plot yields a bare Axes.
    if hasattr(axes, "ndim"):
        if axes.ndim == 1 and len(axes) == 1:
            return axes[0]
    return axes
|
|
|
|
|
def hist_frame(
    data: DataFrame,
    column=None,
    by=None,
    grid: bool = True,
    xlabelsize: int | None = None,
    xrot=None,
    ylabelsize: int | None = None,
    yrot=None,
    ax=None,
    sharex: bool = False,
    sharey: bool = False,
    figsize: tuple[float, float] | None = None,
    layout=None,
    bins: int = 10,
    legend: bool = False,
    **kwds,
):
    """
    Draw one histogram per column of a DataFrame.

    Parameters
    ----------
    data : DataFrame
        Frame to plot. Without ``by`` only numeric and datetime columns are
        kept (timedelta columns are excluded).
    column : optional
        Column(s) to restrict ``data`` to; a scalar is wrapped in a list.
    by : object, optional
        If given, plotting is delegated to ``_grouped_hist``.
    grid : bool, default True
        Passed to ``ax.grid`` for every subplot.
    xlabelsize, xrot, ylabelsize, yrot : optional
        Tick label properties forwarded to ``set_ticks_props``.
    ax, sharex, sharey, figsize, layout
        Forwarded to ``create_subplots`` (or ``_grouped_hist``).
    bins : int, default 10
        Passed to ``Axes.hist``.
    legend : bool, default False
        Label each histogram with its column name and draw a legend.
    **kwds
        Forwarded to ``Axes.hist`` (or ``_grouped_hist``).

    Returns
    -------
    The axes object from ``create_subplots`` (or ``_grouped_hist``).

    Raises
    ------
    ValueError
        If both ``legend`` and ``label`` are given, or if no plottable
        column remains after the dtype selection.
    """
    if legend and "label" in kwds:
        raise ValueError("Cannot use both legend and label")

    if by is not None:
        return _grouped_hist(
            data,
            column=column,
            by=by,
            ax=ax,
            grid=grid,
            figsize=figsize,
            sharex=sharex,
            sharey=sharey,
            layout=layout,
            bins=bins,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            legend=legend,
            **kwds,
        )

    if column is not None:
        if not isinstance(column, (list, np.ndarray, ABCIndex)):
            column = [column]
        data = data[column]

    # Keep only the dtypes a histogram is drawn for.
    data = data.select_dtypes(
        include=(np.number, "datetime64", "datetimetz"), exclude="timedelta"
    )
    plot_columns = data.columns
    naxes = len(plot_columns)

    if naxes == 0:
        raise ValueError(
            "hist method requires numerical or datetime columns, nothing to plot."
        )

    fig, axes = create_subplots(
        naxes=naxes,
        ax=ax,
        squeeze=False,
        sharex=sharex,
        sharey=sharey,
        figsize=figsize,
        layout=layout,
    )
    flat_axes = flatten_axes(axes)

    # Decided once, before the loop starts writing ``label`` into ``kwds``.
    label_each = legend and "label" not in kwds

    for position, col in enumerate(plot_columns):
        target = flat_axes[position]
        if label_each:
            kwds["label"] = col
        target.hist(data[col].dropna().values, bins=bins, **kwds)
        target.set_title(col)
        target.grid(grid)
        if legend:
            target.legend()

    set_ticks_props(
        axes,
        xlabelsize=xlabelsize,
        xrot=xrot,
        ylabelsize=ylabelsize,
        yrot=yrot,
    )
    maybe_adjust_figure(fig, wspace=0.3, hspace=0.3)

    return axes
|
|