import pandas as pd
import numpy as np
from packaging.version import parse
from scipy import stats
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
from plotly.graph_objs import Bar, Figure
from typing import Optional, Union, Tuple
from IPython.display import display
from .._utils import SEQUENCE_LIKE, itr_plot, PlotMixin
from .._logr import _z_log
###################################################################
class UniStat(PlotMixin):
    '''
    Calculate and visualize univariate statistics for all features, identifying
    feature problems (e.g. missing values, skew, rare categories, ...).

    Parameters
    ----------
    df: pandas DataFrame
        data source
    col_drop: sequence (list, tuple, NumPy array or pandas Index) or None
        column name(s) to exclude when analysing duplicates
    card_thresh: int
        threshold for considering a categorical feature to be of high cardinality
    rare_thresh: float
        threshold below which categorical levels are considered to be rare
        (e.g. 1% or 5%). If 0, no level falls below the threshold, so none is
        counted as rare.
    skw_thresh: int
        threshold for highlighting and plotting skewed numeric distributions, assuming
        a normal theoretical distribution. Features are considered to be skewed if outside
        the bounds of ``skw_thresh``: abs(skew score) > ``skw_thresh``
    figsize: tuple or None
        dimensions of matplotlib figure (width, height)
    n_rows: int
        number of rows in matplotlib subplot figure
    n_cols: int
        number of columns in matplotlib subplot figure
    silent: bool
        solicit user input for continuation during iterative plotting. If `True`,
        plotting proceeds without user interaction.
    hide_p_bar: bool
        triggers hiding the progress bar (tqdm module); default `False`
    theme: str
        color applied to axis and title text
    color: str
        color of the Plotly bars

    Note
    ----
    Data types will be inferred automatically; however, to get optimal separation of
    categorical and numeric ``cols`` it is better to ensure that correct data types are
    applied before using this class.
    '''
def __init__(self,
             df: pd.DataFrame,
             col_drop: Optional[SEQUENCE_LIKE] = None,
             card_thresh: int = 10,
             rare_thresh: float = 0.05,
             skw_thresh: int = 1,
             figsize: Optional[Tuple[int, int]] = None,
             n_rows: Optional[int] = None,
             n_cols: Optional[int] = None,
             silent: bool = False,
             hide_p_bar: bool = False,
             theme: str = 'darkorange',
             color: str = 'lightblue'):
    '''
    Validate the inputs and store a private copy of ``df`` together with the
    analysis / plotting settings (parameters are described in the class docstring).

    Raises
    ------
    TypeError
        if ``col_drop`` is neither `None` nor a list, tuple, NumPy array or pandas Index
    KeyError
        if any name in ``col_drop`` is not a column of ``df``
    ValueError
        if ``card_thresh`` or ``rare_thresh`` is negative
    '''
    # input check
    if col_drop is not None:
        if not isinstance(col_drop, (list, tuple, np.ndarray, pd.Index)):
            raise TypeError("'col_drop' parameter accepts a sequence e.g: Lists, Tuples, NumPy Arrays or Pandas Base Index. "
                            f"However, '{type(col_drop)}' was received!")
        if np.any(~np.isin(col_drop, df.columns)):
            raise KeyError("Missing columns! Please ensure that column name passed to 'col_drop' parameter "
                           "is included in the DataFrame")
    # each threshold is tested on its own: `(card_thresh or rare_thresh) < 0`
    # compared only the first truthy value and let a negative `rare_thresh` through
    if card_thresh < 0 or rare_thresh < 0:
        raise ValueError("cardinality and/or rare threshold(s) can't be less than 0")
    _z_log.info("Missing values in categorical columns will be considered as an additional 'missing' level "
                "when calculating cardinality and rare levels percent")
    # work on a copy so the caller's DataFrame is not modified by this class
    self._df = df.copy()
    self._col_drop = col_drop
    self._card_thresh = card_thresh  # exposed through the `card_thresh` property
    self.rare_thresh = rare_thresh
    self.skw_thresh = skw_thresh
    self._figsize = figsize
    self._n_rows = n_rows
    self._n_cols = n_cols
    self._silent = silent
    self._hide_p_bar = hide_p_bar
    self._theme = theme
    self._color = color
@property
def card_thresh(self):
    """Cardinality threshold currently in effect."""
    # NOTE: a plain attribute would have worked too; kept as a property usage example
    return self._card_thresh

@card_thresh.setter
def card_thresh(self, value):
    """Update the cardinality threshold; values that are not >= 0 are rejected."""
    if not (value >= 0):
        # report and keep the stored threshold untouched
        print(f"cardinality threshold can't be negative, default value of {self._card_thresh} was not changed")
        return
    self._card_thresh = value

@card_thresh.deleter
def card_thresh(self):
    """Remove the stored cardinality threshold."""
    del self._card_thresh
[docs]
def peek(self, disp_res: bool = True) -> Tuple[pd.Index, np.ndarray, pd.DataFrame]:
    '''
    Calculate univariate statistics for Numeric and Categorical features while
    identifying the following:

    - Proportion of missing data
    - Highly skewed features, assuming normal distribution
    - High cardinality categorical features
    - Proportion of rare categorical levels

    Parameters
    ----------
    disp_res: bool
        triggers displaying summary results

    Attributes
    ----------
    z_summary_: pandas DataFrame
        one-row overview of the dataframe (shape, feature counts, duplicates, ...)
    z_miss_data_: pandas Series
        proportion of missing data for features having missing values
    z_hc_data_: pandas Series
        count of categories/levels of high cardinality categorical columns
    z_rare_cat_: pandas DataFrame
        count and proportion of rare categories/levels (an empty Series when
        there are no categorical columns)
    z_univ_stat_df_: pandas DataFrame
        univariate statistics
    z_inf_out_: pandas Index
        numeric columns excluded for containing `inf`; only set when such columns exist

    Returns
    -------
    num_cols: pandas Index
        Numeric column names
    cat_cols: NumPy array
        Categorical column names
    dup_df: pandas DataFrame
        duplicate rows
    '''
    # capture all missing values before modification
    null_vals = self._df.isnull().mean().round(4)
    # dynamic progress bar for non iterable execution
    result_list = ['Categorical', 'Numeric', 'Univariate Statistics', 'Missing Data', 'High Cardinality Features',
                   'Rare Categorical Features', 'Duplicates'] # progress bar text
    p_bar = tqdm(range(len(result_list)), desc = '', disable = self._hide_p_bar) # progress bar
    for i in p_bar:
        p_bar.set_description(f'Capturing {result_list[i]}')
        # Categorical Features
        if result_list[i] == 'Categorical':
            cats = self._df.select_dtypes(['object', 'bool', 'category']).columns
            # Checking for `ifc` instead of `i`
            # for features that better be
            # treated as categorical (at most 20 distinct values)
            discats_ = [col for col in self._df.columns
                        if self._df[col].dtype.kind in 'ifc' and self._df[col].nunique() <= 20]
            cat_cols = np.r_[discats_, cats]
            # capturing original values and dtypes then
            # unifying display of missing values under `missing`
            # label for both discats and cats as missing values
            # in cats are ignored by pandas.describe
            cat_df = self._df[cat_cols] # preserve original values and dtypes
            cat_nulls = cat_cols[null_vals[cat_cols] > 0]
            for col in cat_nulls:
                self._df[col] = np.where(self._df[col].isna(), 'missing', self._df[col])
            # for summary stats to be treated
            # as categorical if not having nulls
            self._df[discats_] = self._df[discats_].astype(str)
        # Numeric Features
        elif result_list[i] == 'Numeric':
            num_cols = self._df.columns[~self._df.columns.isin(cat_cols)]
            # remove `inf` columns, if any
            inf_mask = np.isinf(self._df[num_cols]).any()
            if inf_mask.any():
                self.z_inf_out_ = num_cols[inf_mask] # attributes
                _z_log.info("Some columns contain 'inf' values and will be excluded.")
                num_cols = num_cols[~np.isin(num_cols, self.z_inf_out_)]
                n_inf = len(self.z_inf_out_)
            else:
                n_inf = 0
        # Summary Statistics
        elif result_list[i] == 'Univariate Statistics':
            # datetime correct dtype: `datetime_is_numeric` is still required on
            # every pandas 1.x release (incl. 1.5.x) and was removed in pandas 2.0
            kwargs = ({} if parse(pd.__version__).major >= 2 else {'datetime_is_numeric': True})
            # stats
            univ_stat_df = self._df[np.r_[num_cols, cat_cols]].describe(include = 'all', **kwargs)
            # back to original values and dtypes so if peek() called again
            # missing values re-appear after being labeled "missing"
            self._df[cat_cols] = cat_df
            # add dtypes to summary stats, reflecting changes to discats
            univ_stat_df.loc['d_types'] = self._df.dtypes
            # calculate skew, kurt and normality for numeric features only
            # skew: how much a distribution is pushed left or right
            # Rule of Thumb between -1 and 1
            univ_stat_df.loc['skw'] = round(self._df[num_cols].skew(skipna = True), 2)
            # kurtosis: how much of the distribution is in the tail
            # kurtosis of normal == 0
            univ_stat_df.loc['krts'] = round(self._df[num_cols].kurt(skipna = True), 2)
            # spotting rare levels in categorical features
            rare_count = [(self._df[col].value_counts(dropna = False, normalize = True) < self.rare_thresh).sum()
                          for col in cat_cols]
            univ_stat_df.loc['n_rare_lvls', cat_cols] = rare_count
            # emptiness is tested with len(): any() looks at the truthiness of the
            # labels themselves and fails for columns named 0 or ''
            univ_stat_df.loc['pct_rare_lvls', cat_cols] = \
                univ_stat_df[cat_cols].loc['n_rare_lvls'] / univ_stat_df[cat_cols].loc['unique'] \
                if len(cat_cols) > 0 else np.nan
            # re-arrange index - cosmetics
            re_idx = np.r_[univ_stat_df.index[:2], univ_stat_df.index[-2:], univ_stat_df.index[2:-2]]
            univ_stat_df = univ_stat_df.reindex(re_idx)
        # missing data
        elif result_list[i] == 'Missing Data':
            univ_stat_df.loc['pct_missing'] = null_vals
            miss_data = univ_stat_df.T[univ_stat_df.T.pct_missing > 0]['pct_missing'].sort_values()
        # Overall Categorical Feature Cardinality
        elif result_list[i] == 'High Cardinality Features':
            cat_card = self._df[cat_cols].nunique(dropna = False).sort_values()
            hc_data = cat_card[cat_card > self._card_thresh]
        # Rare Categorical Levels
        elif result_list[i] == 'Rare Categorical Features':
            rare_cat = univ_stat_df.T[univ_stat_df.T.n_rare_lvls > 0][['unique', 'n_rare_lvls', 'pct_rare_lvls']]\
                .sort_values('pct_rare_lvls') if len(cat_cols) > 0 else hc_data # no cat_cols means empty pd.Series as is hc_data
        # duplicate samples
        else:
            if self._col_drop is not None: # e.g: drop target and check identical features with different target values
                mask = self._df.drop(np.array(self._col_drop), axis = 1)
                dup_df = self._df[mask.duplicated(keep = False)].sort_values(list(mask.columns))
            else: # in all columns
                dup_df = self._df[self._df.duplicated(keep = False)].sort_values(list(self._df.columns))
    # attributes - stored before rendering so that a display failure does not lose the results
    self.z_summary_ = pd.DataFrame([{'rows': self._df.shape[0], 'columns': self._df.shape[1], 'num_feats': len(num_cols),
                                     'cat_feats': len(cat_cols), 'high_card_cats': len(hc_data), 'rare_lvl_cats': len(rare_cat),
                                     'n_feats_missing_data': len(miss_data), 'n_inf_feats': n_inf, 'duplicates': len(dup_df)}])
    self.z_miss_data_ = miss_data
    self.z_hc_data_ = hc_data
    self.z_rare_cat_ = rare_cat
    self.z_univ_stat_df_ = univ_stat_df
    # display results
    if disp_res:
        if len(num_cols) > 0:
            # highlight features whose abs(skew) exceeds the skew threshold
            num_view = univ_stat_df[num_cols]\
                .sort_values('skw', ascending = False, axis = 1).dropna().style.apply(
                    lambda x: ['background: olive' if abs(x.loc['skw']) > self.skw_thresh else '' for _ in x],
                    subset = pd.IndexSlice['skw':'skw']).format(precision = 3, thousands = ',')
        else:
            num_view = 'N/A'
        if len(cat_cols) > 0:
            # highlight features where at least half of the levels are rare
            cat_view = univ_stat_df[cat_cols]\
                .sort_values('unique', ascending = False, axis = 1).dropna().style.apply(
                    lambda x: ['background: grey' if x.loc['pct_rare_lvls'] >= .5 else '' for _ in x],
                    subset = pd.IndexSlice['pct_rare_lvls':'pct_rare_lvls']).format(precision = 3, thousands = ',')
        else:
            cat_view = 'N/A'
        display('** Data Summary **', self.z_summary_.T.style.hide(axis = 1).format(thousands = ','),
                '** Univariate stats - Numeric Features **', num_view,
                '** Univariate stats - Categorical Features **', cat_view)
    return num_cols, cat_cols, dup_df
[docs]
def stats_plot(self, width: Optional[int] = None, height: Optional[int] = None):
    '''
    Interactive plots visualizing:

    - Proportion of missing data
    - High cardinality categorical features
    - Proportion of rare categorical levels

    `peek` is run silently first when its results are not available yet. A log
    message is emitted instead of a plot for every statistic that has no data.

    Parameters
    ----------
    width: int, default `None`
        The figure width in pixels
    height: int, default `None`
        The figure height in pixels
    '''
    # check if plot data is ready; only a missing attribute triggers `peek`
    # (a bare `except` would also have swallowed unrelated errors and interrupts)
    if not all(hasattr(self, attr) for attr in ('z_miss_data_', 'z_hc_data_', 'z_rare_cat_')):
        self.peek(disp_res = False)
    miss_d, hc_d, rare_d = self.z_miss_data_, self.z_hc_data_, self.z_rare_cat_

    def _render(bar, title, y_title, **y_axis):
        # shared decoration: transparent background, themed title and axes
        fig = Figure(data = [bar])
        fig.update_layout(dict(paper_bgcolor = 'rgba(0,0,0,0)', plot_bgcolor = 'rgba(0,0,0,0)',
                               hoverlabel = dict(bgcolor = 'darkred'),
                               title = dict(text = title, x = 0.5, xref = 'paper',
                                            font = dict(family = 'Arial', size = 20, color = self._theme)),
                               width = width, height = height))
        fig.update_xaxes(title = 'Features', color = self._theme)
        fig.update_yaxes(showgrid = False, title = y_title, color = self._theme, **y_axis)
        fig.show()

    if len(miss_d) > 0: # plot missing data stats
        _render(Bar(x = miss_d.index, y = miss_d.values, marker = dict(color = self._color),
                    customdata = self._df[miss_d.index].dtypes.values.astype(str),
                    hoverlabel = dict(namelength=0),
                    hovertemplate = '<br>'.join(['Feat: %{x}', 'Missing Value: %{y:.1%}',
                                                 'Data Type: %{customdata}'])),
                title = 'Quantifying Missing data',
                y_title = 'Percentage of Missing Values')
    else:
        _z_log.info('No Missing Values')

    if len(hc_d) > 0: # plot high cardinality stats (log-scaled y axis)
        _render(Bar(x = hc_d.index, y = hc_d.values, marker = dict(color = self._color),
                    hoverlabel = dict(namelength=0),
                    hovertemplate = '<br>'.join(['Feat: %{x}', 'n_lvls: %{y:.0f}'])),
                title = f'High Cardinality Categorical Features (>{self._card_thresh} levels)',
                y_title = '(log) Size of Unique Categories',
                type = 'log')
    else:
        _z_log.info(f'No High Cardinality Categorical Features (>{self._card_thresh})')

    if len(rare_d) > 0: # plot rare categorical levels stats
        customdata = np.stack(rare_d.values, axis = 0) # hover data
        _render(Bar(x = rare_d.index, y = rare_d.pct_rare_lvls,
                    marker = dict(color = self._color),
                    customdata = customdata, hoverlabel = dict(namelength=0),
                    hovertemplate = '<br>'.join(['Feat: %{x}', 'n_lvls: %{customdata[0]:.0f}',
                                                 'n_rare_lvls: %{customdata[1]:.0f}',
                                                 'pct_rare_Lvls: %{y:.1%}'])),
                title = f'Proportion of Rare Categorical Levels (<{self.rare_thresh})',
                y_title = '% of Rare Levels')
    else:
        _z_log.info(f'No Rare Categorical Levels (<{self.rare_thresh})')
[docs]
@itr_plot(n_cols = 6, figsize = (24, 11))
def skew_plot(self, dist: str = 'norm', cols: Optional[SEQUENCE_LIKE] = None):
    '''
    Draw a probability plot of a feature against a theoretical distribution
    (Normal by default), intended for highly skewed features.

    A probability plot puts the ordered, unscaled feature values on the `Y-axis`
    and the theoretical quantiles `dist.ppf(p)` on the `X-axis`, where `p` are the
    order statistics of the uniform distribution and `ppf` is the inverse `cdf`.

    When the feature follows the chosen distribution, the blue dots fall on the
    red line. The red line is the least-squares fit of the ordered values on the
    theoretical quantiles; dots that line up but sit off that line indicate a
    distribution that is linearly related to, but not the same as, the theoretical
    one. The plot therefore shows how similar the two distributions are, which
    helps in choosing further preprocessing.

    Parameters
    ----------
    dist: str or stats.distributions instance
        Distribution or distribution function name. The default is `norm` for a
        normal probability plot. Objects that look enough like a `stats.distributions`
        instance (i.e. they have a `ppf` method) are also accepted.
    cols: sequence (lists, tuples, NumPy arrays or Pandas Base Index) or None
        column(s) name(s) to fit distribution, if `None`, then `peek` method is invoked
        and ``cols`` are those having abs(skew score) > ``skew threshold``.
    '''
    # NOTE(review): `self._fig`, `self._col` and `self._n` are not set in this class body;
    # presumably the `itr_plot` decorator provides them per plotted column - confirm there
    axes = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
    # second item of the probplot result is the fit (slope, intercept, r)
    fit_params = stats.probplot(self._df[self._col], dist = dist, plot = plt, rvalue = False)[1]
    r_squared = fit_params[2]**2
    # show r-squared under the column name only when it is defined
    if np.isnan(r_squared):
        title = f'{self._col}'
    else:
        title = f'{self._col} \n $r^2$ = {r_squared:.2f}'
    plt.title(title, color = self._theme)
    self._decorate_plot(axes, theme = self._theme)
    # advance to the next subplot position
    self._n += 1