# Source code for zaps.eda._uni_analysis

import pandas as pd

import numpy as np

from packaging.version import parse

from scipy import stats

from tqdm.auto import tqdm

from matplotlib import pyplot as plt

from plotly.graph_objs import Bar, Figure

from typing import Optional, Union, Tuple

from IPython.display import display

from .._utils import SEQUENCE_LIKE, itr_plot, PlotMixin

from .._logr import _z_log

###################################################################


# [docs]
class UniStat(PlotMixin):
    '''
    Calculate and visualize Univariate statistics for all features, identifying 
    feature problems (e.g.: missing values, skew, rare categories,...)
    
    Parameters
    ----------
    df: pandas dataframe
        data source
    col_drop: sequence (lists, tuples, NumPy arrays or Pandas Base Index) or None
        column(s) name(s) to exclude when analysing duplicates
    card_thresh: int
        threshold for considering categorical feature to be of high cardinality
    rare_thresh: float
        threshold below which categorical levels are considered to be rare 
        (e.g.: 1% or 5%). If 0 then all levels are considered during the analysis.
    skw_thresh: int
        threshold for highlighting and plotting skewed numeric distributions, assuming 
        normal theoretical distribution. Features are considered to be skewed if outside
        bounds of ``skw_thresh``: abs(skew score) > ``skw_thresh``
    figsize: tuple or None
        dimensions of matplotlib figure (width, height)
    n_rows: int
        number of rows in matplotlib subplot figure
    n_cols: int
        number of columns in matplotlib subplot figure
    silent: Bool
        solicit user input for continuation during iterative plotting. If `True`,
        plotting proceeds without user interaction.
    hide_p_bar: Bool
        triggers hiding progress bar (tqdm module); Default `False`
    theme: str
        adjust axis and title colors as desired
    color: str
        adjust color of Plotly Bar as desired
    
    Note
    ----
    Data types are inferred automatically; however, to get the best separation of categorical
    and numeric ``cols``, it is better to ensure that the correct data types are applied before
    using this class.

    '''
    
    def __init__(self,
                 df: pd.DataFrame,
                 col_drop: Optional[SEQUENCE_LIKE] = None,
                 card_thresh: int = 10,
                 rare_thresh: float = 0.05,
                 skw_thresh: int = 1,
                 figsize: Optional[Tuple[int, int]] = None,
                 n_rows: Optional[int] = None,
                 n_cols: Optional[int] = None,
                 silent: bool = False,
                 hide_p_bar: bool = False,
                 theme: str = 'darkorange',
                 color: str = 'lightblue'):
        '''
        Validate the inputs, then store a copy of ``df`` together with the settings.

        Raises
        ------
        TypeError
            if ``col_drop`` is given but is not a list, tuple, NumPy array or Pandas Index
        KeyError
            if any name in ``col_drop`` is not a column of ``df``
        ValueError
            if ``card_thresh`` or ``rare_thresh`` is negative

        '''

        # input check
        if col_drop is not None:
            if not isinstance(col_drop, (list, tuple, np.ndarray, pd.Index)):
                raise TypeError("'col_drop' parameter accepts a sequence e.g: Lists, Tuples, NumPy Arrays or Pandas Base Index. "
                                f"However, '{type(col_drop)}' was received!")
            if np.any(~np.isin(col_drop, df.columns)):
                raise KeyError("Missing columns! Please ensure that column name passed to 'col_drop' parameter "
                               "is included in the DataFrame")
        # each threshold is tested on its own: `(a or b) < 0` only inspects the
        # first truthy operand, which let a negative `rare_thresh` slip through
        # whenever `card_thresh` was non-zero
        if card_thresh < 0 or rare_thresh < 0:
            raise ValueError("cardinality and/or rare threshold(s) can't be less than 0")

        _z_log.info("Missing values in categorical columns will be considered as an additional 'missing' level "
                    "when calculating cardinality and rare levels percent")

        # work on a copy so the caller's DataFrame is never modified
        self._df = df.copy()
        self._col_drop = col_drop
        self._card_thresh = card_thresh # exposed through the `card_thresh` property
        self.rare_thresh = rare_thresh
        self.skw_thresh = skw_thresh
        self._figsize = figsize
        self._n_rows = n_rows
        self._n_cols = n_cols
        self._silent = silent
        self._hide_p_bar = hide_p_bar
        self._theme = theme
        self._color = color
    

    @property # could've been exposed, just an example of property usage
    def card_thresh(self):
        """Return the cardinality threshold currently stored on the instance."""
        threshold = self._card_thresh
        return threshold


    @card_thresh.setter
    def card_thresh(self, value):
        """Store ``value`` as the new threshold; otherwise print a notice and keep the old one."""
        # guard clause: anything that is not >= 0 is rejected without raising
        if not value >= 0:
            print(f"cardinality threshold can't be negative, default value of {self._card_thresh} was not changed")
            return
        self._card_thresh = value


    @card_thresh.deleter
    def card_thresh(self):
        """Remove the stored cardinality threshold from the instance."""
        delattr(self, '_card_thresh')
        


# [docs]
    def peek(self, disp_res: bool = True) -> Tuple[pd.Index, np.ndarray, pd.DataFrame]:
        '''
        Calculate univariate statistics for Numeric and Categorical features while 
        identifying the following:
        
        - Proportion of missing data
        - Highly skewed features, assuming normal distribution
        - High cardinality categorical features
        - Proportion of rare categorical levels
        
        Parameters
        ----------
        disp_res: Bool
            triggers displaying summary results
        
        Attributes
        ----------
        z_summary_: Pandas DataFrame
            info about the dataframe
        z_miss_data_: Pandas Series
            Proportion of missing data
        z_hc_data_: Pandas Series
            Count of categories/levels of high cardinality categorical columns
        z_rare_cat_: Pandas DataFrame
            Count and proportion of rare categories/levels (an empty Pandas Series 
            when there are no categorical columns)
        z_univ_stat_df_: Pandas DataFrame
            univariate statistics
        z_inf_out_: Pandas Index
            numeric columns excluded for containing `inf` values; only set when 
            such columns are found
        
        Returns
        -------
        num_cols: Pandas Index
            Numeric column names
        cat_cols: NumPy array
            Categorical column names
        dup_df: Pandas DataFrame
            duplicate rows

        '''
        
        # capture all missing values before modification
        null_vals = self._df.isnull().mean().round(4)

        # dynamic progress bar for non iterable execution; each entry is both
        # the progress bar text and the name of the step being executed
        result_list = ['Categorical', 'Numeric', 'Univariate Statistics', 'Missing Data', 'High Cardinality Features', 
                       'Rare Categorical Features', 'Duplicates']
        
        p_bar = tqdm(result_list, desc = '', disable = self._hide_p_bar) # progress bar
        
        for step in p_bar:
            p_bar.set_description(f'Capturing {step}')

            # Categorical Features
            if step == 'Categorical':
                cats = self._df.select_dtypes(['object', 'bool', 'category']).columns
                # Checking for `ifc` instead of `i`
                # for features that better be
                # treated as categorical
                discats = {col: dtype for col in self._df.columns for dtype in [self._df[col].dtype]
                           if dtype.kind in 'ifc' and self._df[col].nunique() <= 20} # dict to assign original dtypes latter
                discats_ = list(discats)
                cat_cols = np.r_[discats_, cats]

                # presence is decided by length, not by `any()`: `any()` tests the
                # truthiness of the labels, so a lone column named 0 or '' would
                # be reported as "no columns"
                has_cat = len(cat_cols) > 0

                # capturing original values and dtypes then
                # unifying display of missing values under `missing`
                # label for both discats and cats as missing values
                # in cats are ignored by pandas.describe
                cat_df = self._df[cat_cols] # preserve original values and dtypes
                cat_nulls = cat_cols[null_vals[cat_cols] > 0]
                for col in cat_nulls:
                    self._df[col] = np.where(self._df[col].isna(), 'missing', self._df[col])

                # for summary stats to be treated
                # as categorical if not having nulls
                self._df[discats_] = self._df[discats_].astype(str)
                            
            # Numeric Features
            elif step == 'Numeric':
                num_cols = self._df.columns[~self._df.columns.isin(cat_cols)]
                
                # remove `inf` columns, if any
                inf_mask = np.isinf(self._df[num_cols]).any()
                if inf_mask.any():
                    self.z_inf_out_ = num_cols[inf_mask] # attributes
                    _z_log.info("Some columns contain 'inf' values and will be excluded.")
                    num_cols = num_cols[~np.isin(num_cols, self.z_inf_out_)]

                    n_inf = len(self.z_inf_out_)
                else:
                    n_inf = 0

            # Summary Statistics
            elif step == 'Univariate Statistics':
                # datetime correct dtype
                kwargs = ({} if parse(pd.__version__) > parse('1.5') else {'datetime_is_numeric': True}) 

                # stats
                univ_stat_df = self._df[np.r_[num_cols, cat_cols]].describe(include = 'all', **kwargs)

                # back to original values and dtypes so if peek() called again
                # missing values re-appear after being labeled "missing"
                self._df[cat_cols] = cat_df

                # add dtypes to summary stats, reflecting changes to discats
                univ_stat_df.loc['d_types'] = self._df.dtypes

                # calculate skew, kurt and normality for numeric features only
                # skew: how much a distribution is pushed left or right
                # Rule of thumb between -1 and 1
                univ_stat_df.loc['skw'] = round(self._df[num_cols].skew(skipna = True), 2)

                # kurtosis: how much of the distribution is in the tail
                # kurtosis of normal == 0
                univ_stat_df.loc['krts'] = round(self._df[num_cols].kurt(skipna = True), 2)
                
                # spotting rare levels in categorical features
                rare_count = [(self._df[col].value_counts(dropna = False, normalize = True) < self.rare_thresh).sum() for col in cat_cols]

                univ_stat_df.loc['n_rare_lvls', cat_cols] = rare_count
                if has_cat:
                    pct_rare = univ_stat_df[cat_cols].loc['n_rare_lvls'] / univ_stat_df[cat_cols].loc['unique']
                else:
                    pct_rare = np.nan
                univ_stat_df.loc['pct_rare_lvls', cat_cols] = pct_rare

                # re-arrange index - cosmetics
                re_idx = np.r_[univ_stat_df.index[:2], univ_stat_df.index[-2:], univ_stat_df.index[2:-2]]
                univ_stat_df = univ_stat_df.reindex(re_idx)
            
            # missing data    
            elif step == 'Missing Data':
                univ_stat_df.loc['pct_missing'] = null_vals
                miss_data = univ_stat_df.T[univ_stat_df.T.pct_missing > 0]['pct_missing'].sort_values()
            
            # Overall Categorical Feature Cardinality
            elif step == 'High Cardinality Features':
                cat_card = self._df[cat_cols].nunique(dropna = False).sort_values()
                hc_data = cat_card[cat_card > self._card_thresh]
            
            # Rare Categorical Levels
            elif step == 'Rare Categorical Features':
                if has_cat:
                    rare_cat = univ_stat_df.T[univ_stat_df.T.n_rare_lvls > 0][['unique', 'n_rare_lvls', 'pct_rare_lvls']]\
                    .sort_values('pct_rare_lvls')
                else:
                    rare_cat = hc_data # no cat_cols means empty pd.Series as is hc_data
            
            # duplicate samples
            else:
                if self._col_drop is not None: # e.g: drop target and check identical features with different target values
                    mask = self._df.drop(np.array(self._col_drop), axis = 1)
                    dup_df = self._df[mask.duplicated(keep = False)].sort_values(list(mask.columns))
                else: # in all columns
                    dup_df = self._df[self._df.duplicated(keep = False)].sort_values(list(self._df.columns))

        # attributes  
        self.z_summary_ = pd.DataFrame([{'rows': self._df.shape[0], 'columns': self._df.shape[1], 'num_feats': len(num_cols), 
                                         'cat_feats': len(cat_cols), 'high_card_cats': len(hc_data), 'rare_lvl_cats': len(rare_cat), 
                                         'n_feats_missing_data': len(miss_data), 'n_inf_feats': n_inf, 'duplicates': len(dup_df)}])

        # display results
        if disp_res:
            # numeric view: columns ordered by skew, highlighting the `skw` cell
            # of features whose abs(skew) exceeds `skw_thresh`
            if len(num_cols) > 0:
                num_view = (univ_stat_df[num_cols]
                            .sort_values('skw', ascending = False, axis = 1).dropna().style.apply(
                                lambda x: ['background: olive' if abs(x.loc['skw']) > self.skw_thresh else '' for i in x],
                                subset = pd.IndexSlice['skw':'skw']).format(precision = 3, thousands = ','))
            else:
                num_view = 'N/A'

            # categorical view: columns ordered by number of levels, highlighting
            # the `pct_rare_lvls` cell of features where at least half the levels are rare
            if has_cat:
                cat_view = (univ_stat_df[cat_cols]
                            .sort_values('unique', ascending = False, axis = 1).dropna().style.apply(
                                lambda x: ['background: grey' if x.loc['pct_rare_lvls'] >= .5 else '' for i in x], 
                                subset = pd.IndexSlice['pct_rare_lvls':'pct_rare_lvls']).format(precision = 3, thousands = ','))
            else:
                cat_view = 'N/A'

            display('** Data Summary **', self.z_summary_.T.style.hide(axis = 1).format(thousands = ','),
                    '** Univariate stats - Numeric Features **', num_view, 
                    '** Univariate stats - Categorical Features **', cat_view)

        # attributes
        self.z_miss_data_ = miss_data
        self.z_hc_data_ = hc_data
        self.z_rare_cat_ = rare_cat
        self.z_univ_stat_df_ = univ_stat_df
         
        return num_cols, cat_cols, dup_df

    


# [docs]
    def stats_plot(self, width: Optional[int] = None, height: Optional[int] = None):
        '''
        Interactive plots visualizing:

        - Proportion of missing data
        - High cardinality categorical features
        - Proportion of rare categorical levels

        If the result attributes set by ``peek`` are not available yet, ``peek`` is 
        invoked first with ``disp_res = False``. For each of the three statistics a 
        Plotly bar chart is shown when there is data, otherwise an info message is logged.

        Parameters
        ----------
        width: int, default `None`
            The figure width in pixels
        height: int, default `None`
            The figure height in pixels

        '''

        # check if plot data is ready; only a missing attribute triggers `peek`,
        # any other error is left to propagate instead of being swallowed
        try:
            miss_d, hc_d, rare_d = self.z_miss_data_, self.z_hc_data_, self.z_rare_cat_
        except AttributeError:
            self.peek(disp_res = False)
            miss_d, hc_d, rare_d = self.z_miss_data_, self.z_hc_data_, self.z_rare_cat_

        def _show_bar(bar, title, y_title, **y_axis):
            # wrap one Bar trace in a Figure, apply the shared decoration, and show it
            fig = Figure(data = [bar])
            fig.update_layout(dict(paper_bgcolor = 'rgba(0,0,0,0)', plot_bgcolor = 'rgba(0,0,0,0)', 
                                   hoverlabel = dict(bgcolor = 'darkred'), 
                                   title = dict(text = title, x = 0.5, xref = 'paper', 
                                                font = dict(family = 'Arial', size = 20, color = self._theme)),
                                   width = width, height = height))
            fig.update_xaxes(title = 'Features', color = self._theme)
            fig.update_yaxes(showgrid = False, title = y_title, color = self._theme, **y_axis)
            fig.show()

        if len(miss_d) > 0: # plot missing data stats
            _show_bar(Bar(x = miss_d.index, y = miss_d.values, marker = dict(color = self._color), 
                          customdata = self._df[miss_d.index].dtypes.values.astype(str), 
                          hoverlabel = dict(namelength=0), 
                          hovertemplate = '<br>'.join(['Feat: %{x}', 'Missing Value: %{y:.1%}', 
                                                       'Data Type: %{customdata}'])),
                      title = 'Quantifying Missing data',
                      y_title = 'Percentage of Missing Values')
        else:
            _z_log.info('No Missing Values')

        if len(hc_d) > 0: # plot high cardinality stats
            _show_bar(Bar(x = hc_d.index, y = hc_d.values, marker = dict(color = self._color), 
                          hoverlabel = dict(namelength=0),
                          hovertemplate = '<br>'.join(['Feat: %{x}', 'n_lvls: %{y:.0f}'])),
                      title = f'High Cardinality Categorical Features (>{self._card_thresh} levels)',
                      y_title = '(log) Size of Unique Categories',
                      type = 'log') # level counts are drawn on a log axis
        else:
            _z_log.info(f'No High Cardinality Categorical Features (>{self._card_thresh})')

        if len(rare_d) > 0: # plot rare categorical levels stats
            customdata = np.stack(rare_d.values, axis = 0) # hover data

            _show_bar(Bar(x = rare_d.index, y = rare_d.pct_rare_lvls, 
                          marker = dict(color = self._color), 
                          customdata = customdata, hoverlabel = dict(namelength=0),
                          hovertemplate = '<br>'.join(['Feat: %{x}', 'n_lvls: %{customdata[0]:.0f}',
                                                       'n_rare_lvls: %{customdata[1]:.0f}', 
                                                       'pct_rare_Lvls: %{y:.1%}'])),
                      title = f'Proportion of Rare Categorical Levels (<{self.rare_thresh})',
                      y_title = '% of Rare Levels')
        else:
            _z_log.info(f'No Rare Categorical Levels (<{self.rare_thresh})')




# [docs]
    @itr_plot(n_cols = 6, figsize = (24, 11))
    def skew_plot(self, dist: str = 'norm', cols: Optional[SEQUENCE_LIKE] = None):
        '''
        Draw a Probability Plot of one feature against a theoretical distribution 
        (default Normal), used to inspect highly skewed features.

        `Probability Plot`: the ordered, unscaled feature values go on the `Y-axis` 
        and the theoretical `Expected` quantiles of the distribution go on the 
        `X-axis`. The quantiles are `dist.ppf(p)`, where `p` is the order statistics 
        of the uniform distribution and `ppf` is the inverse `cdf`; for the Normal 
        case these are Z-scores of the standard normal distribution.

        Reading the plot:

        - When the feature really follows the chosen distribution, the blue dots 
          form a straight line and sit on the red line.
        - The red line is the `OLS` best-fit of the ordered values on the 
          theoretical quantiles: `OLS(dist.ppf(p), sort(x))`.
        - When the two distributions are linearly related but not similar, the dots 
          are roughly linear without lying on the red line.

        The degree of similarity seen this way can guide further data preprocessing.
        
        Parameters
        ----------
        dist: str or stats.distributions instance
            Distribution or distribution function name, `norm` by default. Any object 
            exposing a `ppf` method, like a `stats.distributions` instance, is 
            accepted as well.
        cols: sequence (lists, tuples, NumPy arrays or Pandas Base Index) or None
            column(s) name(s) to fit distribution, if `None`, then `peek` method is 
            invoked and ``cols`` are those having abs(skew score) > ``skew threshold``.

        '''
        # NOTE(review): `cols` is not read in this body, and `self._fig`, `self._n`
        # and `self._col` are not set in it either - presumably all handled by the
        # `itr_plot` decorator; confirm against its implementation.

        # the subplot has to exist before probplot draws through pyplot
        axis = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
        fit_res = stats.probplot(self._df[self._col], dist = dist, plot = plt, rvalue = False)

        # fit_res[1][2] is the `r` of the least-squares fit; squared for the title
        r_sq = fit_res[1][2]**2
        if np.isnan(r_sq):
            title = f'{self._col}'
        else:
            title = f'{self._col} \n $r^2$ = {r_sq:.2f}'

        plt.title(title, color = self._theme)
        self._decorate_plot(axis, theme = self._theme)

        # advance to the next subplot position
        self._n += 1