Source code for zaps.eda._dist

import pandas as pd

import numpy as np

from distfit import distfit

from tqdm.auto import tqdm

from matplotlib import pyplot as plt

from seaborn import countplot, histplot, boxplot, violinplot, kdeplot, move_legend

from typing import Optional, Union, Tuple, List

import numpy.typing as npt

from .._utils import SEQUENCE_LIKE, itr_plot, PlotMixin

from .._logr import _z_log

###################################################################


[docs]
class Dist(PlotMixin):
    """
    Visualizing and finding the best fit distribution for parametric, non-parametric 
    and discrete distributions.
    
    Visualizations include the following plots `Seaborn Module`:

    - Count plot
    - Histograms
    - Box plot
    - Violin plot
    - Kernel density estimation(kde) plot
        
    Notes
    -----
    - Categorical features are considered irrespective of their dtype, either string 
      or numbers, users of this class are advised to explicitly specify whether the
      columns are categorical using ``cat_cols`` parameter as this will affect both
      grouping and plots.
    - If not explicitly specified then it is assumed that if features ``cols`` are 
      categorical then ``target`` must be numeric and vice versa. This is only to 
      dictate the direction of categorical grouping of numeric data.
    - Categories are preprocessed to highlight `rare` levels; only frequent levels are 
      displayed as is while others are grouped into a single level called `rare`. This 
      behavior can be controlled by ``top_n`` and ``rare_thresh`` parameters. For 
      severely imbalanced datasets, ensure that ``rare_thresh`` parameter account for 
      minority class when plotting conditional distributions.
    - Missing values `NaN` in categorical features are not removed, rather, 
      considered as a separate level called `missing`. If missing values happen to be 
      also rare, then they will be labeled as `rare` instead of `missing`.
    - Keep in mind the ``frac`` parameter when using the preprocessed DataFrame out of 
      this class.
      
    Parameters
    ----------
    df: pandas dataframe
        data source
    cols: sequence (lists, tuples, NumPy arrays or Pandas Base Index),
        column names of features to plot. Better to use homogeneous subsets, example:
        either all categorical or all numeric features; categorical subset can have
        multiple dtypes (object or numeric) depending on the nature of the feature
    target: str or None
        target column name
    cat_cols: Bool or None
        indicate whether ``cols`` are categorical in nature or not and in-turn the 
        direction of categorical grouping of numeric data, for example: for 
        `binary target`, numeric ``cols`` are to be grouped by the categorical 
        ``target`` and vice versa. If `None`, inferred automatically.
    rare_thresh, top_n: float, int
        max cardinality beyond which lvls are grouped and analysed as a single level;
        If ``rare_thresh`` = 0 or ``top_n`` < 2 then all levels are analysed as is,
        otherwise these levels are grouped as a single `rare` level.

        Notes
        -----
        - both are independent, for example considering categorical level to be 
          `rare` (1% - 5%) can still result in a high cardinality categorical
          feature (> N levels)
        - missing values will be displayed as `rare` rather than `missing` if the 
          new `missing` category is below ``rare_thresh``
    frac: float or None
        fraction of dataframe to use as a sample 
        for analysis:

            - 0 < ``frac`` < 1 returns a random sample with size ``frac``. 
            - ``frac`` = 1 returns shuffled dataframe.
            - ``frac`` > 1 up-sample the dataframe, sampling of the same row more 
              than once.
    random_state: int
        for reproducibility, controls the random number generator for ``frac``
        parameter and `best_fit` method.
    figsize: tuple or None 
        dimensions of matplotlib figure (width, height)
    n_rows: int
        number of rows in matplotlib subplot figure
    n_cols: int
        number of columns in matplotlib subplot figure
    silent: Bool
        solicit user input for continuation during iterative plotting. If `True`,
        plotting proceeds without user interaction.
    hide_p_bar: Bool
        triggers hiding progress bar (tqdm module); Default 'False'
        
    Attributes
    ----------
    z_df: pandas dataframe
        preprocessed dataframe that was used internally
    z_freq_lvls_map: dict
        where keys are column(s) name(s) and values are frequent levels. Only 
        applicable when ``cols`` are categorical
    xludd_feats: numpy array
        categorical column names excluded from the analysis being dominated by 
        rare levels. Only applicable when ``cols`` are categorical

    """

    def __init__(self, 
                 df: pd.DataFrame,
                 cols: SEQUENCE_LIKE,
                 target: Optional[str] = None,
                 cat_cols: Optional[bool] = None,
                 rare_thresh: float = 0.05,
                 top_n: int = 25,
                 frac: Optional[float] = None,
                 random_state: int = 45,
                 figsize: Optional[Tuple[int, int]] = None,
                 n_rows: Optional[int] = None,
                 n_cols: Optional[int] = None,
                 silent: bool = False,
                 hide_p_bar: bool = False):
        """
        Validate the inputs, store the configuration and preprocess the data.

        Parameters are described in the class docstring.

        Raises
        ------
        TypeError
            if ``cols`` is not a list, tuple, NumPy array or pandas Index; if a 
            truthy ``target`` is not a string; or if any of ``cols`` has a datetime
            dtype while ``cat_cols`` is truthy.
        KeyError
            if any of ``cols`` or ``target`` is not a column of ``df``.
        """
        
        # input checks  
        if not isinstance(cols, (list, tuple, np.ndarray, pd.Index)):
            raise TypeError("'cols' parameter accepts a sequence e.g: Lists, Tuples, NumPy Arrays or Pandas Base Index. "
                            f"However, '{type(cols)}' was received!")
        if target and not isinstance(target, str):
            raise TypeError("please pass 'target' column name as string")

        # Membership is tested label by label: stacking `cols` and `target` into a 
        # single NumPy array coerces mixed labels (e.g. integer column names next to
        # a string target) to one dtype and would falsely report them as missing.
        requested = list(cols) + ([target] if target else [])
        if any(label not in df.columns for label in requested):
            raise KeyError("Missing columns! Please ensure that all columns to analyze are included in the DataFrame")

        # dtype kind 'M' is NumPy's datetime64 kind
        if any(df[col].dtype.kind == 'M' for col in cols) and cat_cols: 
            raise TypeError("Datetime columns are casted as categorical, Please ensure 'cat_cols' parameter is set to 'False'")
        
        # prepare data for preprocessing
        if frac:
            # up-sampling (frac > 1) is only possible when sampling with replacement
            self.z_df_ = df.sample(frac = frac, replace = frac > 1, random_state = random_state).copy()
        else:
            self.z_df_ = df.copy()

        self._ori_df = df.copy() # to be used in `best_fit` and dtype checks. TODO: Alternative for this excess overhead
        self._cols = cols
        self._target = target
        self._cat_cols = cat_cols
        self._rare_thresh = rare_thresh
        self._top_n = top_n
        self._random_state = random_state
        self._figsize = figsize
        self._n_rows = n_rows
        self._n_cols = n_cols
        self._silent = silent
        self._hide_p_bar = hide_p_bar
        
        # preprocess categorical columns
        self._preprocess()



[docs]
    @itr_plot
    def cp(self, 
           stat: str = 'count',
           native_scale: bool = False,
           legend: Union[str, bool] = 'auto',
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize categorical feature distribution using Seaborn's Count Plot
        
        Note
        ----
        Categorical features plagued with rare levels (< rare_thresh) will be skipped;
        Only those having at least 2 frequent levels are plotted
        
        Parameters
        ----------
        stat: str
            One of 'count', 'percent', 'proportion' or 'probability'. Statistic to compute; 
            when not 'count', bar heights will be normalized so that they sum to 100 
            (for 'percent') or 1 (otherwise) across the plot.
        native_scale: bool
            When True, numeric or datetime values on the categorical axis will maintain
            their original scaling rather than being converted to fixed indices.
        legend: "auto", "brief", "full", or False
            How to draw the legend. If "brief", numeric `hue` and `size`
            variables will be represented with a sample of evenly spaced values.
            If "full", every group will get an entry in the legend. If "auto",
            choose between brief or full representation based on number of levels.
            If `False`, no legend data is added and no legend is drawn.
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Functions, must either work 
            when passed a Series/DataFrame or when passed to Series/DataFrame.apply.
            Only applicable for conditional distributions between numeric and categorical 
            variable, otherwise data is sorted ascendingly following the frequency(count) 
            of categorical levels. Levels are ordered ascendingly by the resulting
            aggregate(s), e.g.:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(<aggregates>)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max 
            number of ticks. If the string 'auto', the number of bins will be 
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply 
            'nbins'.
        tight: bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits 
            are only expanded using the margins; This does *not* set the margins to zero. 
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired

        """

        # prepare data
        # NOTE(review): `self._col`, `self._fig` and `self._n` are not assigned in this
        # method - presumably managed by the `itr_plot` decorator; confirm.
        mask, feat, hue = self._grouping(self._col, self._target)
        
        # sort
        if hue:
            if mask[feat].dtype.kind != 'O':
                # Sort by the aggregate columns pandas actually produced: callables in
                # `hue_agg` are relabeled (e.g. np.sum -> 'sum'), so passing `hue_agg`
                # itself to `sort_values` raises KeyError for anything but names.
                agg = mask.groupby(hue)[feat].agg(hue_agg)
                if isinstance(agg, pd.Series):
                    # a single (non-list) aggregator yields a Series
                    order = agg.sort_values().index
                else:
                    order = agg.sort_values(list(agg.columns)).index
            else:
                order = mask.groupby(hue)[feat].agg(['count']).sort_values(['count']).index
        elif self._cat_cols:
            order = mask[feat].value_counts().sort_values().index
        else:
            order = None
        
        # plot: the computed ordering applies to hue levels when grouping,
        # otherwise to the levels on the x axis
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        countplot(mask, x = feat, hue = hue, order = order if not hue else None, 
                  hue_order = order if hue else None, stat = stat, native_scale = native_scale,  
                  legend = legend, color = color, palette = palette if hue else None, 
                  log_scale = log_scale, ax = ax)
        
        # decorate
        if hue and hue != feat:
            move_legend(ax, 'center', bbox_to_anchor = (.5, 1.3), columnspacing = .4, ncol = 4,
                        labelspacing = 0.0, handletextpad = 0.0, handlelength = 1, fancybox = True, 
                        shadow = True)
        
        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)
        
        # update iteration control
        self._n += 1

          


[docs]
    @itr_plot
    def hs(self,
           bins: Union[str, int, List[int], npt.NDArray[np.int_]] = 'doane',
           stat: str = 'count',
           multiple: str = 'layer',
           element: str = 'bars',
           fill: bool = True,
           discrete: bool = False,
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize numeric features distribution using Seaborn's Hist Plot
        
        Parameters
        ----------
        bins: str, number, vector, or a pair of such values
            histogram bins, Generic bin 
            parameter that can be:
            
                - the name of a reference rule, 
                - the number of bins, 
                - the breaks of the bins.

            Passed to :func:`numpy.histogram_bin_edges`.

            Notes
            -----
            `str` can be one of ['auto', 'fd', 'doane', 'scott',
            'stone', 'rice', 'sturges' or 'sqrt']
        stat: str
            Aggregate statistic to compute in each histogram bin.

            - **count**: show the number of observations in each bin
            - **frequency**: show the number of observations divided by the bin width
            - **probability** or **proportion**: normalize such that bar heights sum to 1
            - **percent**: normalize such that bar heights sum to 100
            - **density**: normalize such that the total area of the histogram equals 1
        multiple: {"layer", "dodge", "stack", "fill"}
            Approach to resolving multiple elements when semantic mapping creates subsets.
            Only relevant with univariate data.
        element: {"bars", "step", "poly"}
            Visual representation of the histogram statistic.
            Only relevant with univariate data.
        fill: bool
            If True, fill in the space under the histogram.
        discrete: bool
            If True, default to ``binwidth=1`` and draw the bars so that they are
            centered on their corresponding data points. This avoids "gaps" that may
            otherwise appear when using discrete (integer) data.
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Only applicable for conditional
            distributions between numeric and categorical variable; hue levels are 
            then ordered ascendingly by the resulting aggregate(s), e.g.:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(<aggregates>)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max 
            number of ticks. If the string 'auto', the number of bins will be 
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply 
            'nbins'.
        tight : bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits 
            are only expanded using the margins; This does *not* set the margins to zero. 
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired
            
        Note
        ----
        The choice of bins for computing and plotting a histogram can exert
        substantial influence on the insights that one is able to draw from the
        visualization. If the bins are too large, they may erase important features.
        On the other hand, bins that are too small may be dominated by random
        variability, obscuring the shape of the true underlying distribution. The
        default bin size is determined using a reference rule that depends on the
        sample size and variance. This works well in many cases, (i.e., with
        "well-behaved" data) but it fails in others. It is always a good idea to try
        different bin sizes to be sure that you are not missing something important.
        This function allows you to specify bins in several different ways, such as
        by setting the total number of bins to use, the width of each bin, or the
        specific locations where the bins should break.

        """

        # prepare data
        # NOTE(review): `self._col`, `self._fig` and `self._n` are not assigned in this
        # method - presumably managed by the `itr_plot` decorator; confirm.
        mask, feat, hue = self._grouping(self._col, self._target)
        
        # sort
        if hue and mask[feat].dtype.kind != 'O':
            # Sort by the aggregate columns pandas actually produced: callables in
            # `hue_agg` are relabeled (e.g. np.sum -> 'sum'), so passing `hue_agg`
            # itself to `sort_values` raises KeyError for anything but names.
            agg = mask.groupby(hue)[feat].agg(hue_agg)
            if isinstance(agg, pd.Series):
                # a single (non-list) aggregator yields a Series
                order = agg.sort_values().index
            else:
                order = agg.sort_values(list(agg.columns)).index
        else:
            order = None
            
        # plot
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        histplot(mask, x = feat, hue = hue, hue_order = order, bins = bins, stat = stat,
                 multiple = multiple, element = element, fill = fill, discrete = discrete,
                 color = color, palette = palette if hue else None, log_scale = log_scale, ax = ax)
        
        # decorate
        if hue:
            move_legend(ax, 'center', bbox_to_anchor = (.5, 1.3), columnspacing = .4, ncol = 4,
                        labelspacing = 0.0, handletextpad = 0.0, handlelength = 1, fancybox = True, 
                        shadow = True)

        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)
        
        # update iteration control
        self._n += 1 

        


[docs]
    @itr_plot
    def bo(self, 
           hue: Optional[str] = None, 
           fill: bool = True,
           showmeans: bool = True,
           meanprops: Optional[dict] = None,
           medianprops: Optional[dict] = None,
           whis: Union[float, Tuple[float, float]] = 1.5, 
           fliersize: Optional[float] = None,
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize numeric features distribution using Seaborn's Box Plot
        
        Parameters
        ----------
        hue: str
            column name for additional layer of categorization.
        fill: bool
            If True, use a solid patch. Otherwise, draw as line art.
        showmeans: bool
            Show the arithmetic means; also drawn as a line (passed as ``meanline``).
        meanprops: dict
            Specifies the style of the mean. If not given, a default style is used.
        medianprops: dict
            Specifies the style of the median. If not given, a default style is used.
        whis: float or pair of floats
            Parameter that controls whisker length. If scalar, whiskers are drawn
            to the farthest datapoint within *whis * IQR* from the nearest hinge.
            If a tuple, it is interpreted as percentiles that whiskers represent.
        fliersize: float
            Size of the markers used to indicate outlier observations.
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Only applicable for conditional
            distributions between numeric and categorical variable; categorical 
            levels are then ordered ascendingly by the resulting aggregate(s), e.g.:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(<aggregates>)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max 
            number of ticks. If the string 'auto', the number of bins will be 
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply 
            'nbins'.
        tight : bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits 
            are only expanded using the margins; This does *not* set the margins to zero. 
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired

        """
        
        # prepare data; `hue_` is the grouping column returned by `_grouping` (drawn on
        # the y axis), distinct from the user supplied `hue` layer
        # NOTE(review): `self._col`, `self._fig` and `self._n` are not assigned in this
        # method - presumably managed by the `itr_plot` decorator; confirm.
        mask, feat, hue_ = self._grouping(self._col, self._target)
            
        # sort
        if hue_ and mask[feat].dtype.kind != 'O':
            # Sort by the aggregate columns pandas actually produced: callables in
            # `hue_agg` are relabeled (e.g. np.sum -> 'sum'), so passing `hue_agg`
            # itself to `sort_values` raises KeyError for anything but names.
            agg = mask.groupby(hue_)[feat].agg(hue_agg)
            if isinstance(agg, pd.Series):
                # a single (non-list) aggregator yields a Series
                order = agg.sort_values().index
            else:
                order = agg.sort_values(list(agg.columns)).index
        else:
            order = None
        
        # plot
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        
        # fall back to the default mean/median styling when none is supplied
        mean_props = dict(linewidth = 2, markeredgecolor = 'black', markerfacecolor = 'firebrick') if not meanprops else meanprops
        median_props = dict(linewidth = 1.5, color = 'black') if not medianprops else medianprops
        
        boxplot(mask, x = feat, y = hue_, order = order, hue = hue if hue else None, fill = fill, showmeans = showmeans,
                meanline = showmeans, meanprops = mean_props, medianprops = median_props, whis = whis, fliersize = fliersize, 
                color = color, palette = palette if hue else None, log_scale = log_scale, ax = ax)

        # decorate
        if hue:
            move_legend(ax, 'center', bbox_to_anchor = (.5, 1.3), columnspacing = .4, ncol = 4,
                        labelspacing = 0.0, handletextpad = 0.0, handlelength = 1, fancybox = True, 
                        shadow = True)
        
        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)
        
        # update iteration control
        self._n += 1 




[docs]
    @itr_plot
    def vi(self, 
           hue: Optional[str] = None, 
           fill: bool = False,
           inner: Optional[str] = 'quart', 
           split: bool = False, 
           cut: float = 2,
           bw_method: Union[str, float] = 'scott',
           bw_adjust: float = 1,
           density_norm: str = 'area',
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize numeric features distribution using Seaborn's Violin Plot
        
        Parameters
        ----------
        hue: str
            column name for additional layer of categorization.
        fill: bool
            If True, use a solid patch. Otherwise, draw as line art. 
        inner: {"box", "quart", "point", "stick", None}
            Representation of the data in the violin interior. 
            One of the following:

                - **box**: draw a miniature box-and-whisker plot
                - **quart**: show the quartiles of the data
                - **point** or **stick**: show each observation
        split: bool
            Show an un-mirrored distribution, alternating sides when using `hue`.
        cut: float
            Distance, in units of bandwidth, to extend the density past extreme
            data points. Set to 0 to limit the violin within the data range.
        bw_method: {"scott", "silverman", float}
            Either the name of a reference rule or the scale factor to use when
            computing the kernel bandwidth. The actual kernel size will be
            determined by multiplying the scale factor by the standard deviation of
            the data within each group.
        bw_adjust: float
            Factor that scales the bandwidth to use more or less smoothing.
        density_norm: {"area", "count", "width"}
            Method that normalizes each density to determine the violin's width.
            If `area`, each violin will have the same area. If `count`, the width
            will be proportional to the number of observations. If `width`, each
            violin will have the same width.       
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Only applicable for conditional
            distributions between numeric and categorical variable; categorical 
            levels are then ordered ascendingly by the resulting aggregate(s), e.g.:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(<aggregates>)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired  
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max 
            number of ticks. If the string 'auto', the number of bins will be 
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply 
            'nbins'.
        tight: bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits 
            are only expanded using the margins; This does *not* set the margins to zero. 
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired

        """

        # prepare data; `hue_` is the grouping column returned by `_grouping` (drawn on
        # the y axis), distinct from the user supplied `hue` layer
        # NOTE(review): `self._col`, `self._fig` and `self._n` are not assigned in this
        # method - presumably managed by the `itr_plot` decorator; confirm.
        mask, feat, hue_ = self._grouping(self._col, self._target)
            
        # sort
        if hue_ and mask[feat].dtype.kind != 'O':
            # Sort by the aggregate columns pandas actually produced: callables in
            # `hue_agg` are relabeled (e.g. np.sum -> 'sum'), so passing `hue_agg`
            # itself to `sort_values` raises KeyError for anything but names.
            agg = mask.groupby(hue_)[feat].agg(hue_agg)
            if isinstance(agg, pd.Series):
                # a single (non-list) aggregator yields a Series
                order = agg.sort_values().index
            else:
                order = agg.sort_values(list(agg.columns)).index
        else:
            order = None
        
        # plot
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        violinplot(mask, x = feat, y = hue_, order = order, hue = hue if hue else None, inner = inner, 
                   fill = fill, split = split, cut = cut, bw_method = bw_method,
                   bw_adjust = bw_adjust, density_norm = density_norm, color = color, 
                   palette = palette if hue else None, log_scale = log_scale, ax = ax)
        
        # decorate
        if hue:
            move_legend(ax, 'center', bbox_to_anchor = (.5, 1.3), columnspacing = .4, ncol = 4,
                        labelspacing = 0.0, handletextpad = 0.0, handlelength = 1, fancybox = True, 
                        shadow = True)
        
        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme)
        
        # update iteration control
        self._n += 1 




[docs]
    @itr_plot
    def kd(self,
           cut: float = 0,
           bw_method: Union[str, float] = 'scott',
           bw_adjust: float = 1,
           warn_singular: bool = False,
           hue_agg: List[str] = ['mean'],
           log_scale: Optional[Union[int, bool, Tuple[int, bool]]] = False,
           color: str = 'lightblue',
           palette: str = 'Paired',
           nbins: Union[int, str] = 'auto',
           axis: str = 'x',
           tight: Optional[bool] = None,
           x_ax_rotation: Optional[int] = None,
           theme: str = 'darkorange'):
        """
        Visualize numeric features distribution using Seaborn's KDE Plot
        
        Parameters
        ----------
        cut: float
            Factor, for KDE plot, that determines how far to reach past 
            extreme data points. Set to 0, truncate the plot at the data limits.
        bw_method: string, scalar, or callable
            The method used to calculate the estimator bandwidth. This can be
            'scott', 'silverman', a scalar constant or a callable. If a scalar,
            this will be used directly as `kde.factor`. If a callable, it should
            take a `gaussian_kde` instance as only parameter and return a scalar.
            See :class:`scipy.stats.gaussian_kde` for more details.
        bw_adjust: float
            Factor that multiplicatively scales the value chosen using
            ``bw_method``. Increasing will make the curve smoother. See Notes.
        warn_singular: bool
            If True, issue a warning when trying to estimate the density of data
            with zero variance
        hue_agg: list
            list of functions and/or function names, e.g.: [np.sum, 'mean']
            to use for aggregating the data. Only applicable for conditional
            distributions between numeric and categorical variable, otherwise 
            data is sorted ascendingly following the frequency(count) of categorical 
            levels. Example:
            pandas.DataFrame.groupby(category)[value].agg(hue_agg).sort_values(hue_agg)
        log_scale: bool or number, or pair of bools or numbers
            Set axis scale(s) to log. A single value sets the data axis for any numeric
            axes in the plot. A pair of values sets each axis independently.
            Numeric values are interpreted as the desired base (default 10).
            When `None` or `False`, seaborn defers to the existing Axes scale.
        color: str
            adjust color of seaborn plots as desired
        palette: str
            adjust color of 'hue' as desired; See seaborn.color_palette('palette_name')
        nbins: int or 'auto'
            For plot decoration, maximum number of axis intervals; one less than max 
            number of ticks. If the string 'auto', the number of bins will be 
            automatically determined based on the length of the axis.
        axis: str
            For plot decoration, one of ['both', 'x', 'y'], axis on which to apply 
            'nbins'.
        tight: bool or None
            For plot decoration, controls expansion of axis limits, if 'True' axis limits 
            are only expanded using the margins; This does *not* set the margins to zero. 
            If 'False', further expand the axis limits using the axis major locator.
        x_ax_rotation: int or None
            For plot decoration, set degree of x_ticks rotation.
        theme: str
            adjust axis and title colors as desired
            
        Notes
        -----
        The *bandwidth*, or standard deviation of the smoothing kernel, is an
        important parameter. Misspecification of the bandwidth can produce a
        distorted representation of the data. Much like the choice of bin width in a
        histogram, an over-smoothed curve can erase true features of a
        distribution, while an under-smoothed curve can create false features out of
        random variability. The rule-of-thumb that sets the default bandwidth works
        best when the true distribution is smooth, unimodal, and roughly bell-shaped.
        It is always a good idea to check the default behavior by using ``bw_adjust``
        to increase or decrease the amount of smoothing.

        Because the smoothing algorithm uses a Gaussian kernel, the estimated density
        curve can extend to values that do not make sense for a particular dataset.
        For example, the curve may be drawn over negative values when smoothing data
        that are naturally positive. The ``cut`` and ``clip`` parameters can be used
        to control the extent of the curve, but datasets that have many observations
        close to a natural boundary may be better served by a different visualization
        method.

        Similar considerations apply when a dataset is naturally discrete or "spiky"
        (containing many repeated observations of the same value). Kernel density
        estimation will always produce a smooth curve, which would be misleading
        in these situations.

        The units on the density axis are a common source of confusion. While kernel
        density estimation produces a probability distribution, the height of the curve
        at each point gives a density, not a probability. A probability can be obtained
        only by integrating the density across a range. The curve is normalized so
        that the integral over all possible values is 1, meaning that the scale of
        the density axis depends on the data values.

        """
        
        # prepare data
        mask, feat, hue = self._grouping(self._col, self._target)
        
        # sort
        if hue and mask[feat].dtype.kind != 'O':        
            order = mask.groupby(hue)[feat].agg(hue_agg).sort_values(hue_agg).index
        else:
            order = None

        # plot
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        kdeplot(mask, x = feat, hue = hue, hue_order = order, cut = cut, bw_method = bw_method, 
                bw_adjust = bw_adjust, warn_singular = warn_singular, color = color, palette = palette if hue else None, 
                log_scale = log_scale, ax = ax)
        
        # decorate
        if hue:
            move_legend(ax, 'center', bbox_to_anchor = (.5, 1.3), columnspacing = .4, ncol = 4,
                        labelspacing = 0.0, handletextpad = 0.0, handlelength = 1, fancybox = True, 
                        shadow = True)

        col_dtype = mask[feat].dtype.kind
        self._decorate_plot(ax, col_dtype, log_scale, nbins, axis, tight, x_ax_rotation, theme);
        
        # update iteration control
        self._n += 1

        


[docs]
    def best_fit(self, 
                 cols: Optional[SEQUENCE_LIKE] = None,
                 method: str = 'parametric',
                 distr: Union[str, List[str]] = 'popular',
                 stats: str = 'RSS',
                 alpha: float = 0.05,
                 verbose: Union[str, int] = 30,
                 **kwargs) -> dict:
        """
        Find the best fit distribution for parametric, non-parametric, and discrete 
        distributions using DistFit module
        
        Parameters
        ---------- 
        cols: sequence (lists, tuples, NumPy arrays or Pandas Base Index) or None
            column names of features to analyze. If `None`, all columns at class
            initialization will be used.
        method: str
            specify how the best fit distribution is determined, 
            One of ['parametric', 'quantile', 'percentile', 'discrete'].

            - For the parametric approach, the distfit library can determine the best 
              fit across 89 theoretical distributions. 

            - For the non-parametric approach (assume that the data does not follow a 
              specific probability distribution), either the quantile or percentile 
              method is used; where confidence intervals of distribution boundaries are 
              computed based on either quantiles or percentiles. e.g.: 
              ci_upper = np.quantile(X, 1 - alpha), ci_lower = np.quantile(X, alpha) 
              where 'X' is feature values and 'alpha' is Significance alpha (i.e: 0.05)

            - In case the dataset contains discrete values, the best fit is then derived 
              using the binomial distribution.
        distr: str or list of str
            the (set) of distribution to test. str can be "popular", 
            name of distribution or list of specific theoretical 
            distribution names, for example:

                - 'popular':[norm, expon, pareto, dweibull, t, genextreme, 
                  gamma, lognorm, beta, uniform, loggamma]
                - 'full'
                - 'norm', 't', 'k': Test for one specific distribution.
                - ['norm', 't', 'k', ...]: Test for a list of distributions.

            If ``method`` = 'discrete', then binomial distribution is used.
        stats: str
            specify the scoring statistics for the goodness of fit test, 
            One of ['RSS', 'wasserstein', 'ks', 'energy'].
        alpha: float
            Significance alpha
        verbose: str or int
            set the verbose messages using string or integer:

            - [0, 60, None, 'silent', 'off', 'no']: No message.
            - 10, 'debug': Messages from debug level and higher.
            - 20, 'info': Messages from info level and higher.
            - 30, 'warning': Messages from warning level and higher.
            - 50, 'critical': Messages from critical level and higher.
        **kwargs:
            forwarded unchanged to the ``distfit`` constructor.

        Returns
        -------
        dict
            where keys are column(s) name(s) and values are fitted model(s); the 
            same object stored in ``z_best_fit_results_``.

        Raises
        ------
        TypeError
            if ``cols`` is not a supported sequence type, or if any selected column 
            does not have a numeric data type.
        ValueError
            if ``stats`` is not a supported argument, if no columns are selected, or 
            if every selected column has null values.
        
        Attributes
        ----------
        z_best_fit_results_: dict,
            where keys are column(s) name(s) and values are fitted model(s)
        z_feats_out_: numpy array,
            excluded columns having null values; only assigned when at least one 
            column is excluded.

        Note
        ----
        - Columns having null values will not be analyzed
        - For full list of parameters see `<https://erdogant.github.io/distfit>`__.

        """
        
        # Input check
        if cols is not None:
            if not isinstance(cols, (list, tuple, np.ndarray, pd.Index)):
                raise TypeError("'cols' parameter accepts a sequence, e.g: Lists, Tuples, NumPy Arrays or Pandas Base Index. "
                                f"However, '{type(cols)}' was received!")
            # assign columns to analyze
            self._bf_cols = np.array(cols)
        else:
            self._bf_cols = np.array(self._cols)

        # an empty selection would otherwise fall through to the misleading
        # "All columns have null(missing) values" error further down
        if self._bf_cols.size == 0:
            raise ValueError("no columns to analyze, 'cols' is empty!")

        if any(self._ori_df[col].dtype.kind not in 'ifc' for col in self._bf_cols):
            raise TypeError("please only use columns having numeric data type")
        if stats not in ['RSS', 'wasserstein', 'ks', 'energy']:
            raise ValueError("'stats' parameter must be one of the following arguments: 'RSS', 'wasserstein', 'ks', or 'energy', "
                             f"however, '{stats}' was received!")

        # exclude columns with null values; the mask is positionally aligned 
        # with `self._bf_cols` because the frame is selected in that same order
        has_null = self._ori_df[self._bf_cols].isnull().any().to_numpy()
        null_cols = self._bf_cols[has_null]
        n_ori, n_out = len(self._bf_cols), len(null_cols)
        self._bf_cols = self._bf_cols[~has_null]
        
        if n_out == n_ori:
            raise ValueError("All columns have null(missing) values, can't fit!")

        # highlight exclusion
        if n_out:
            self.z_feats_out_ = null_cols # attributes
            _z_log.info(f"{n_out} out of {n_ori} columns having null values will not be analyzed")
        
        # fit dists
        fit_results = {} # best fit model(s) container

        for col in tqdm(self._bf_cols, desc = 'Finding Best Fits....', disable = self._hide_p_bar):

            # a fresh model per column so each fitted object is kept independently
            dfit = distfit(method = method, distr = distr, stats = stats, alpha = alpha, todf = True, 
                           random_state = self._random_state, verbose = verbose, **kwargs)

            dfit.fit_transform(self._ori_df[col]) # using original dataframe ignoring `frac`

            fit_results[col] = dfit # update best fit container

        self.z_best_fit_results_ = fit_results # attributes

        return fit_results

            
            

[docs]
    @itr_plot
    def best_vis(self):
        """
        Draw the fitted `distfit` model of the current column on a new subplot.

        The model is read from ``z_best_fit_results_`` using the column name held 
        in ``_col``, plotted on the next free grid slot, decorated, and the 
        iteration counter ``_n`` is advanced by one.

        Note
        ----
        This method body does not call `best_fit` itself; fitting models that 
        were not already fit is presumably handled by the ``itr_plot`` decorator 
        (to be confirmed against its implementation).
        """

        # line styles for the empirical curve, fitted pdf and confidence bounds
        emp_style = {'color': 'red', 'linewidth' : 1}
        pdf_style = {'color': 'green', 'linewidth' : 1}
        cii_style = {'color' : 'darkorange', 'linewidth': 1, 'size': 10}

        # next free slot in the subplot grid
        ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)

        # fitted model of the column being plotted
        model = self.z_best_fit_results_[self._col]
        model.plot(emp_properties = emp_style,
                   pdf_properties = pdf_style,
                   cii_properties = cii_style,
                   fontsize = 13,
                   grid = False,
                   figsize = self._figsize,
                   title = f'{self._col}',
                   ax = ax)

        # decorate
        self._decorate_plot(ax, dtype = self.z_df_[self._col].dtype.kind)

        # legend is placed above the current axes, centered horizontally
        plt.legend(ncol = 3, loc = 8, bbox_to_anchor = [0.5, 1.2], columnspacing = 1.0,
                   labelspacing = 0.0, handletextpad = 0.0, handlelength = 1.5,
                   fancybox = True, shadow = True)

        # update iteration control
        self._n += 1