import pandas as pd
import numpy as np
from distfit import distfit
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
from seaborn import countplot, histplot, boxplot, violinplot, kdeplot, move_legend
from typing import Optional, Union, Tuple, List
import numpy.typing as npt
from .._utils import SEQUENCE_LIKE, itr_plot, PlotMixin
from .._logr import _z_log
###################################################################
# Distribution visualization and best-fit analysis
class Dist(PlotMixin):
    """
    Visualizing and finding the best fit distribution for parametric, non-parametric
    and discrete distributions.

    Visualizations include the following plots (`Seaborn Module`):

    - Count plot
    - Histograms
    - Box plot
    - Violin plot
    - Kernel density estimation (kde) plot

    Notes
    -----
    - Categorical features are considered irrespective of their dtype, either string
      or numbers, users of this class are advised to explicitly specify whether the
      columns are categorical using ``cat_cols`` parameter as this will affect both
      grouping and plots.
    - If not explicitly specified then it is assumed that if features ``cols`` are
      categorical then ``target`` must be numeric and vice versa. This is only to
      dictate the direction of categorical grouping of numeric data.
    - Categories are preprocessed to highlight `rare` levels; only frequent levels are
      displayed as is while others are grouped into a single level called `rare`. This
      behavior can be controlled by ``top_n`` and ``rare_thresh`` parameters. For
      severely imbalanced datasets, ensure that ``rare_thresh`` parameter accounts for
      the minority class when plotting conditional distributions.
    - Missing values `NaN` in categorical features are not removed, rather,
      considered as a separate level called `missing`. If missing values happen to be
      also rare, then they will be labeled as `rare` instead of `missing`.
    - Keep in mind the ``frac`` parameter when using the preprocessed DataFrame out of
      this class.

    Parameters
    ----------
    df: pandas dataframe
        data source
    cols: sequence (lists, tuples, NumPy arrays or Pandas Base Index),
        column names of features to plot. Better to use homogeneous subsets, example:
        either all categorical or all numeric features; categorical subset can have
        multiple dtypes (object or numeric) depending on the nature of the feature
    target: str or None
        target column name
    cat_cols: Bool or None
        indicate whether ``cols`` are categorical in nature or not and in-turn the
        direction of categorical grouping of numeric data, for Example: for
        `binary target`, numeric ``cols`` are to be grouped by the categorical
        ``target`` and vice versa. If `None`, inferred automatically.
    rare_thresh, top_n: float, int
        max cardinality beyond which lvls are grouped and analysed as a single level;
        If ``rare_thresh`` = 0 or ``top_n`` < 2 then all levels are analysed as is,
        otherwise these levels are grouped as a single `rare` level.

        Notes
        -----
        - both are independent, for example considering categorical level to be
          `rare` (1% - 5%) can still result in a high cardinality categorical
          feature (> N levels)
        - missing values will be displayed as `rare` rather than `missing` if the
          new `missing` category is below ``rare_thresh``
    frac: float or None
        fraction of dataframe to use as a sample for analysis:

        - 0 < ``frac`` < 1 returns a random sample with size ``frac``.
        - ``frac`` = 1 returns shuffled dataframe.
        - ``frac`` > 1 up-sample the dataframe, sampling of the same row more
          than once.
    random_state: int
        for reproducibility, controls the random number generator for ``frac``
        parameter and `best_fit` method.
    figsize: tuple or None
        dimensions of matplotlib figure (width, height)
    n_rows: int
        number of rows in matplotlib subplot figure
    n_cols: int
        number of columns in matplotlib subplot figure
    silent: Bool
        solicit user input for continuation during iterative plotting. If `True`,
        plotting proceeds without user interaction.
    hide_p_bar: Bool
        triggers hiding progress bar (tqdm module); Default 'False'

    Attributes
    ----------
    z_df_: pandas dataframe
        preprocessed dataframe that was used internally
    z_freq_lvls_map: dict
        where keys are column(s) name(s) and values are frequent levels. Only
        applicable when ``cols`` are categorical
    xludd_feats: numpy array
        categorical column names excluded from the analysis being dominated by
        rare levels. Only applicable when ``cols`` are categorical
    """

    def __init__(self,
                 df: pd.DataFrame,
                 cols: SEQUENCE_LIKE,
                 target: Optional[str] = None,
                 cat_cols: Optional[bool] = None,
                 rare_thresh: float = 0.05,
                 top_n: int = 25,
                 frac: Optional[float] = None,
                 random_state: int = 45,
                 figsize: Optional[Tuple[int, int]] = None,
                 n_rows: Optional[int] = None,
                 n_cols: Optional[int] = None,
                 silent: bool = False,
                 hide_p_bar: bool = False):
        # input checks
        if not isinstance(cols, (list, tuple, np.ndarray, pd.Index)):
            raise TypeError("'cols' parameter accepts a sequence e.g: Lists, Tuples, NumPy Arrays or Pandas Base Index. "
                            f"However, '{type(cols)}' was received!")
        if target and not isinstance(target, str):
            raise TypeError("please pass 'target' column name as string")
        # Test each label against the column index on its own: stacking the labels
        # into a single NumPy array would coerce mixed label types (e.g. integer
        # column labels next to a string target) to str and report them as missing.
        requested = list(cols) + ([target] if target else [])
        if any(label not in df.columns for label in requested):
            raise KeyError("Missing columns! Please ensure that all columns to analyze are included in the DataFrame")
        if any(df[col].dtype.kind == 'M' for col in cols) and cat_cols:
            raise TypeError("Datetime columns are casted as categorical, Please ensure 'cat_cols' parameter is set to 'False'")

        # prepare data for preprocessing
        if frac:
            # sampling with replacement is only required when up-sampling
            self.z_df_ = df.sample(frac = frac, replace = frac > 1, random_state = random_state).copy()
        else:
            self.z_df_ = df.copy()
        self._ori_df = df.copy()  # to be used in `best_fit` and dtype checks. TODO: Alternative for this excess overhead
        self._cols = cols
        self._target = target
        self._cat_cols = cat_cols
        self._rare_thresh = rare_thresh
        self._top_n = top_n
        self._random_state = random_state
        self._figsize = figsize
        self._n_rows = n_rows
        self._n_cols = n_cols
        self._silent = silent
        self._hide_p_bar = hide_p_bar

        # preprocess categorical columns
        self._preprocess()

    @staticmethod
    def _order_by_agg(data: pd.DataFrame, by, value, funcs) -> pd.Index:
        """
        Return the levels of ``by`` sorted ascendingly by aggregate(s) of ``value``.

        Sorting uses the column labels produced by the aggregation rather than the
        ``funcs`` objects themselves, so callables (e.g. ``np.sum``, which yields a
        column labelled ``'sum'``) work as well as function names.
        """
        aggregated = data.groupby(by)[value].agg(funcs)
        if isinstance(aggregated, pd.Series):
            # a single function/name was passed instead of a list
            return aggregated.sort_values().index
        return aggregated.sort_values(list(aggregated.columns)).index

    @staticmethod
    def _lift_legend(ax) -> None:
        """Move the legend of ``ax`` above the plotting area in a compact 4-column layout."""
        move_legend(ax, 'center', bbox_to_anchor = (.5, 1.3), columnspacing = .4, ncol = 4,
                    labelspacing = 0.0, handletextpad = 0.0, handlelength = 1, fancybox = True,
                    shadow = True)

    @itr_plot
    def cp(self,
           stat: str = 'count',
           native_scale: bool = False,
           legend: Union[str, bool] = 'auto',
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize categorical feature distribution using Seaborn's Count Plot

        Note
        ----
        Categorical features plagued with rare levels (< rare_thresh) will be skipped;
        Only those having at least 2 frequent levels are plotted

        Parameters
        ----------
        stat: str
            One of 'count', 'percent', 'proportion' or 'probability'. Statistic to compute;
            when not 'count', bar heights will be normalized so that they sum to 100
            (for 'percent') or 1 (otherwise) across the plot.
        native_scale: bool
            When True, numeric or datetime values on the categorical axis will maintain
            their original scaling rather than being converted to fixed indices.
        legend: "auto", "brief", "full", or False
            How to draw the legend. If "brief", numeric `hue` and `size`
            variables will be represented with a sample of evenly spaced values.
            If "full", every group will get an entry in the legend. If "auto",
            choose between brief or full representation based on number of levels.
            If `False`, no legend data is added and no legend is drawn.
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Functions, must either work
            when passed a Series/DataFrame or when passed to Series/DataFrame.apply.
            Only applicable for conditional distributions between numeric and categorical
            variable, otherwise data is sorted ascendingly following the frequency(count)
            of categorical levels. e.g.:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(hue_agg)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max
            number of ticks. If the string 'auto', the number of bins will be
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply
            'nbins'.
        tight: bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits
            are only expanded using the margins; This does *not* set the margins to zero.
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired
        """
        # prepare data
        mask, feat, hue = self._grouping(self._col, self._target)

        # sort
        if hue:
            # non-object features are ranked by the requested aggregate(s),
            # object features by how many observations each hue level has
            funcs = hue_agg if mask[feat].dtype.kind != 'O' else ['count']
            order = self._order_by_agg(mask, hue, feat, funcs)
        elif self._cat_cols:
            order = mask[feat].value_counts().sort_values().index
        else:
            order = None

        # plot; the computed ordering applies to the hue levels when a hue is present
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        countplot(mask, x = feat, hue = hue, order = order if not hue else None,
                  hue_order = order if hue else None, stat = stat, native_scale = native_scale,
                  legend = legend, color = color, palette = palette if hue else None,
                  log_scale = log_scale, ax = ax)

        # decorate
        if hue and hue != feat:
            self._lift_legend(ax)
        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)

        # update iteration control
        self._n += 1

    @itr_plot
    def hs(self,
           bins: Union[str, int, List[int], npt.NDArray[np.int_]] = 'doane',
           stat: str = 'count',
           multiple: str = 'layer',
           element: str = 'bars',
           fill: bool = True,
           discrete: bool = False,
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize numeric features distribution using Seaborn's Hist Plot

        Parameters
        ----------
        bins: str, number, vector, or a pair of such values
            histogram bins, Generic bin parameter that can be:

            - the name of a reference rule,
            - the number of bins,
            - the breaks of the bins.

            Passed to :func:`numpy.histogram_bin_edges`.

            Notes
            -----
            `str` can be one of ['auto', 'fd', 'doane', 'scott',
            'stone', 'rice', 'sturges' or 'sqrt']
        stat: str
            Aggregate statistic to compute in each histogram bin.

            - **count**: show the number of observations in each bin
            - **frequency**: show the number of observations divided by the bin width
            - **probability** or **proportion**: normalize such that bar heights sum to 1
            - **percent**: normalize such that bar heights sum to 100
            - **density**: normalize such that the total area of the histogram equals 1
        multiple: {"layer", "dodge", "stack", "fill"}
            Approach to resolving multiple elements when semantic mapping creates subsets.
            Only relevant with univariate data.
        element: {"bars", "step", "poly"}
            Visual representation of the histogram statistic.
            Only relevant with univariate data.
        fill: bool
            If True, fill in the space under the histogram.
        discrete: bool
            If True, default to ``binwidth=1`` and draw the bars so that they are
            centered on their corresponding data points. This avoids "gaps" that may
            otherwise appear when using discrete (integer) data.
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Only applicable for conditional
            distributions between numeric and categorical variable, otherwise
            data is sorted ascendingly following the frequency(count) of categorical
            levels. e.g.:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(hue_agg)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max
            number of ticks. If the string 'auto', the number of bins will be
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply
            'nbins'.
        tight : bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits
            are only expanded using the margins; This does *not* set the margins to zero.
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired

        Note
        ----
        The choice of bins for computing and plotting a histogram can exert
        substantial influence on the insights that one is able to draw from the
        visualization. If the bins are too large, they may erase important features.
        On the other hand, bins that are too small may be dominated by random
        variability, obscuring the shape of the true underlying distribution. The
        default bin size is determined using a reference rule that depends on the
        sample size and variance. This works well in many cases, (i.e., with
        "well-behaved" data) but it fails in others. It is always good to try
        different bin sizes to be sure that you are not missing something important.
        This function allows you to specify bins in several different ways, such as
        by setting the total number of bins to use, the width of each bin, or the
        specific locations where the bins should break.
        """
        # prepare data
        mask, feat, hue = self._grouping(self._col, self._target)

        # sort: hue levels are ranked by the aggregate(s) of the non-object feature
        if hue and mask[feat].dtype.kind != 'O':
            order = self._order_by_agg(mask, hue, feat, hue_agg)
        else:
            order = None

        # plot
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        histplot(mask, x = feat, hue = hue, hue_order = order, bins = bins, stat = stat,
                 multiple = multiple, element = element, fill = fill, discrete = discrete,
                 color = color, palette = palette if hue else None, log_scale = log_scale, ax = ax)

        # decorate
        if hue:
            self._lift_legend(ax)
        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)

        # update iteration control
        self._n += 1

    @itr_plot
    def bo(self,
           hue: Optional[str] = None,
           fill: bool = True,
           showmeans: bool = True,
           meanprops: Optional[dict] = None,
           medianprops: Optional[dict] = None,
           whis: Union[float, Tuple[float, float]] = 1.5,
           fliersize: Optional[float] = None,
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize numeric features distribution using Seaborn's Box Plot

        Parameters
        ----------
        hue: str
            column name for additional layer of categorization.
        fill: bool
            If True, use a solid patch. Otherwise, draw as line art.
        showmeans: bool
            Show the arithmetic means.
        meanprops: dict
            Specifies the style of the mean.
        medianprops: dict
            Specifies the style of the median.
        whis: float or pair of floats
            Parameter that controls whisker length. If scalar, whiskers are drawn
            to the farthest datapoint within *whis * IQR* from the nearest hinge.
            If a tuple, it is interpreted as percentiles that whiskers represent.
        fliersize: float
            Size of the markers used to indicate outlier observations.
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Only applicable for conditional
            distributions between numeric and categorical variable, otherwise
            data is sorted ascendingly following the frequency(count) of categorical
            levels. e.g.:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(hue_agg)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max
            number of ticks. If the string 'auto', the number of bins will be
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply
            'nbins'.
        tight : bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits
            are only expanded using the margins; This does *not* set the margins to zero.
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired
        """
        # prepare data; `hue_` is the grouping variable resolved internally,
        # `hue` is the optional extra layer requested by the caller
        mask, feat, hue_ = self._grouping(self._col, self._target)

        # sort
        if hue_ and mask[feat].dtype.kind != 'O':
            order = self._order_by_agg(mask, hue_, feat, hue_agg)
        else:
            order = None

        # plot
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        mean_props = meanprops or dict(linewidth = 2, markeredgecolor = 'black', markerfacecolor = 'firebrick')
        median_props = medianprops or dict(linewidth = 1.5, color = 'black')
        boxplot(mask, x = feat, y = hue_, order = order, hue = hue or None, fill = fill, showmeans = showmeans,
                meanline = showmeans, meanprops = mean_props, medianprops = median_props, whis = whis,
                fliersize = fliersize, color = color, palette = palette if hue else None,
                log_scale = log_scale, ax = ax)

        # decorate
        if hue:
            self._lift_legend(ax)
        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)

        # update iteration control
        self._n += 1

    @itr_plot
    def vi(self,
           hue: Optional[str] = None,
           fill: bool = False,
           inner: Optional[str] = 'quart',
           split: bool = False,
           cut: float = 2,
           bw_method: Union[str, float] = 'scott',
           bw_adjust: float = 1,
           density_norm: str = 'area',
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize numeric features distribution using Seaborn's Violin Plot

        Parameters
        ----------
        hue: str
            column name for additional layer of categorization.
        fill: bool
            If True, use a solid patch. Otherwise, draw as line art.
        inner: {"box", "quart", "point", "stick", None}
            Representation of the data in the violin interior.
            One of the following:

            - **box**: draw a miniature box-and-whisker plot
            - **quart**: show the quartiles of the data
            - **point** or **stick**: show each observation
        split: bool
            Show an un-mirrored distribution, alternating sides when using `hue`.
        cut: float
            Distance, in units of bandwidth, to extend the density past extreme
            data points. Set to 0 to limit the violin within the data range.
        bw_method: {"scott", "silverman", float}
            Either the name of a reference rule or the scale factor to use when
            computing the kernel bandwidth. The actual kernel size will be
            determined by multiplying the scale factor by the standard deviation of
            the data within each group.
        bw_adjust: float
            Factor that scales the bandwidth to use more or less smoothing.
        density_norm: {"area", "count", "width"}
            Method that normalizes each density to determine the violin's width.
            If `area`, each violin will have the same area. If `count`, the width
            will be proportional to the number of observations. If `width`, each
            violin will have the same width.
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Only applicable for conditional
            distributions between numeric and categorical variable, otherwise
            data is sorted ascendingly following the frequency(count) of categorical
            levels. e.g.:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(hue_agg)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max
            number of ticks. If the string 'auto', the number of bins will be
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply
            'nbins'.
        tight: bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits
            are only expanded using the margins; This does *not* set the margins to zero.
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired
        """
        # prepare data; `hue_` is the grouping variable resolved internally,
        # `hue` is the optional extra layer requested by the caller
        mask, feat, hue_ = self._grouping(self._col, self._target)

        # sort
        if hue_ and mask[feat].dtype.kind != 'O':
            order = self._order_by_agg(mask, hue_, feat, hue_agg)
        else:
            order = None

        # plot
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        violinplot(mask, x = feat, y = hue_, order = order, hue = hue or None, inner = inner,
                   fill = fill, split = split, cut = cut, bw_method = bw_method,
                   bw_adjust = bw_adjust, density_norm = density_norm, color = color,
                   palette = palette if hue else None, log_scale = log_scale, ax = ax)

        # decorate
        if hue:
            self._lift_legend(ax)
        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)

        # update iteration control
        self._n += 1

    @itr_plot
    def kd(self,
           cut: float = 0,
           bw_method: Union[str, float] = 'scott',
           bw_adjust: float = 1,
           warn_singular: bool = False,
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize numeric features distribution using Seaborn's KDE Plot

        Parameters
        ----------
        cut: float
            Factor, for KDE plot, that determines how far to reach past
            extreme data points. Set to 0, truncate the plot at the data limits.
        bw_method: string, scalar, or callable
            The method used to calculate the estimator bandwidth. This can be
            'scott', 'silverman', a scalar constant or a callable. If a scalar,
            this will be used directly as `kde.factor`. If a callable, it should
            take a `gaussian_kde` instance as only parameter and return a scalar.
            See :class:`scipy.stats.gaussian_kde` for more details.
        bw_adjust: float
            Factor that multiplicatively scales the value chosen using
            ``bw_method``. Increasing will make the curve smoother. See Notes.
        warn_singular: bool
            If True, issue a warning when trying to estimate the density of data
            with zero variance
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Only applicable for conditional
            distributions between numeric and categorical variable, otherwise
            data is sorted ascendingly following the frequency(count) of categorical
            levels. Example:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(hue_agg)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max
            number of ticks. If the string 'auto', the number of bins will be
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply
            'nbins'.
        tight: bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits
            are only expanded using the margins; This does *not* set the margins to zero.
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired

        Notes
        -----
        The *bandwidth*, or standard deviation of the smoothing kernel, is an
        important parameter. Misspecification of the bandwidth can produce a
        distorted representation of the data. Much like the choice of bin width in a
        histogram, an over-smoothed curve can erase true features of a
        distribution, while an under-smoothed curve can create false features out of
        random variability. The rule-of-thumb that sets the default bandwidth works
        best when the true distribution is smooth, unimodal, and roughly bell-shaped.
        It is always a good idea to check the default behavior by using ``bw_adjust``
        to increase or decrease the amount of smoothing.

        Because the smoothing algorithm uses a Gaussian kernel, the estimated density
        curve can extend to values that do not make sense for a particular dataset.
        For example, the curve may be drawn over negative values when smoothing data
        that are naturally positive. The ``cut`` and ``clip`` parameters can be used
        to control the extent of the curve, but datasets that have many observations
        close to a natural boundary may be better served by a different visualization
        method.

        Similar considerations apply when a dataset is naturally discrete or "spiky"
        (containing many repeated observations of the same value). Kernel density
        estimation will always produce a smooth curve, which would be misleading
        in these situations.

        The units on the density axis are a common source of confusion. While kernel
        density estimation produces a probability distribution, the height of the curve
        at each point gives a density, not a probability. A probability can be obtained
        only by integrating the density across a range. The curve is normalized so
        that the integral over all possible values is 1, meaning that the scale of
        the density axis depends on the data values.
        """
        # prepare data
        mask, feat, hue = self._grouping(self._col, self._target)

        # sort: hue levels are ranked by the aggregate(s) of the non-object feature
        if hue and mask[feat].dtype.kind != 'O':
            order = self._order_by_agg(mask, hue, feat, hue_agg)
        else:
            order = None

        # plot
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        kdeplot(mask, x = feat, hue = hue, hue_order = order, cut = cut, bw_method = bw_method,
                bw_adjust = bw_adjust, warn_singular = warn_singular, color = color,
                palette = palette if hue else None, log_scale = log_scale, ax = ax)

        # decorate
        if hue:
            self._lift_legend(ax)
        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)

        # update iteration control
        self._n += 1

    def best_fit(self,
                 cols: Optional[SEQUENCE_LIKE] = None,
                 method: str = 'parametric',
                 distr: Union[str, List[str]] = 'popular',
                 stats: str = 'RSS',
                 alpha: float = 0.05,
                 verbose: Union[str, int] = 30,
                 **kwargs) -> None:
        """
        Find the best fit distribution for parametric, non-parametric, and discrete
        distributions using DistFit module

        Nothing is returned; the fitted models are stored on the instance (see
        `Attributes`).

        Parameters
        ----------
        cols: sequence (lists, tuples, NumPy arrays or Pandas Base Index) or None
            column names of features to analyze. If `None`, all columns at class
            initialization will be used.
        method: str
            specify how the best fit distribution is determined,
            One of ['parametric', 'quantile', 'percentile', 'discrete'].

            - For the parametric approach, the distfit library can determine the best
              fit across 89 theoretical distributions.
            - For the non-parametric approach (assume that the data does not follow a
              specific probability distribution), either the quantile or percentile
              method is used; where confidence intervals of distribution boundaries are
              computed based on either quantiles or percentiles. e.g.:
              ci_upper = np.quantile(X, 1 - alpha), ci_lower = np.quantile(X, alpha)
              where 'X' is feature values and 'alpha' is Significance alpha (i.e: 0.05)
            - In case the dataset contains discrete values, the best fit is then derived
              using the binomial distribution.
        distr: str or list of str
            the (set) of distribution to test. str can be "popular",
            name of distribution or list of specific theoretical
            distribution names, for example:

            - 'popular':[norm, expon, pareto, dweibull, t, genextreme,
              gamma, lognorm, beta, uniform, loggamma]
            - 'full'
            - 'norm', 't', 'k': Test for one specific distribution.
            - ['norm', 't', 'k', ...]: Test for a list of distributions.

            If ``method`` = 'discrete', then binomial distribution is used.
        stats: str
            specify the scoring statistics for the goodness of fit test,
            One of ['RSS', 'wasserstein', 'ks', 'energy'].
        alpha: float
            Significance alpha
        verbose: str or int
            set the verbose messages using string or integer:

            - [0, 60, None, 'silent', 'off', 'no']: No message.
            - 10, 'debug': Messages from debug level and higher.
            - 20, 'info': Messages from info level and higher.
            - 30, 'warning': Messages from warning level and higher.
            - 50, 'critical': Messages from critical level and higher.
        **kwargs
            forwarded unchanged to the ``distfit`` constructor.

        Attributes
        ----------
        z_best_fit_results_: dict,
            where keys are column(s) name(s) and values are fitted model(s)
        z_feats_out_: numpy array,
            excluded columns having null values; refreshed on every successful call
            and empty when no column was excluded.

        Raises
        ------
        TypeError
            if ``cols`` is not a supported sequence or a column is not of an
            integer, float or complex dtype.
        ValueError
            if ``stats`` is not a supported score or every column has null values.

        Note
        ----
        - Columns having null values will not be analyzed
        - For full list of parameters see `<https://erdogant.github.io/distfit>`__.
        """
        # Input check
        if cols is not None:
            if not isinstance(cols, (list, tuple, np.ndarray, pd.Index)):
                raise TypeError("'cols' parameter accepts a sequence, e.g: Lists, Tuples, NumPy Arrays or Pandas Base Index. "
                                f"However, '{type(cols)}' was received!")
            # assign columns to analyze
            self._bf_cols = np.array(cols)
        else:
            self._bf_cols = np.array(self._cols)
        if np.any([self._ori_df[col].dtype.kind not in 'ifc' for col in self._bf_cols]):
            raise TypeError("please only use columns having numeric data type")
        if stats not in ['RSS', 'wasserstein', 'ks', 'energy']:
            raise ValueError("'stats' parameter must be one of the following arguments: 'RSS', 'wasserstein', 'ks', or 'energy', "
                             f"however, '{stats}' was received!")

        # exclude columns with null values
        has_null = self._ori_df[self._bf_cols].isnull().any().to_numpy()
        null_cols = self._bf_cols[has_null]
        n_ori, n_out = len(self._bf_cols), len(null_cols)
        self._bf_cols = self._bf_cols[~np.isin(self._bf_cols, null_cols)]
        if n_out == n_ori:
            raise ValueError("All columns have null(missing) values, can't fit!")

        # always refresh the attribute so it never carries over from an earlier call
        self.z_feats_out_ = null_cols  # attributes
        if n_out:
            # highlight exclusion
            _z_log.info(f"{n_out} out of {n_ori} columns having null values will not be analyzed")

        # fit dists
        fit_results = {}  # best fit model(s) container
        for col in tqdm(self._bf_cols, desc = 'Finding Best Fits....', disable = self._hide_p_bar):
            dfit = distfit(method = method, distr = distr, stats = stats, alpha = alpha, todf = True,
                           random_state = self._random_state, verbose = verbose, **kwargs)
            dfit.fit_transform(self._ori_df[col])  # using original dataframe ignoring `frac`
            fit_results[col] = dfit  # update best fit container
        self.z_best_fit_results_ = fit_results  # attributes

    @itr_plot
    def best_vis(self):
        """
        Visualize best fit model results `distfit module`.

        Plots the model stored in ``z_best_fit_results_`` for the column currently
        being iterated.

        NOTE(review): this method was documented as calling `best_fit` when models
        were not already fit; nothing in this body does so, so that presumably
        happens in the ``itr_plot`` decorator - confirm.
        """
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        self.z_best_fit_results_[self._col].plot(emp_properties = {'color': 'red', 'linewidth' : 1},
                                                 pdf_properties = {'color': 'green', 'linewidth' : 1},
                                                 cii_properties = {'color' : 'darkorange', 'linewidth': 1, 'size': 10},
                                                 fontsize = 13, grid = False, figsize = self._figsize,
                                                 title = f'{self._col}', ax = ax)

        # decorate
        col_dtype = self.z_df_[self._col].dtype.kind
        self._decorate_plot(ax, dtype = col_dtype)
        plt.legend(ncol = 3, loc = 8, bbox_to_anchor = [0.5, 1.2], columnspacing = 1.0, labelspacing = 0.0,
                   handletextpad = 0.0, handlelength = 1.5, fancybox = True, shadow = True)

        self._n += 1  # update iteration control