import pandas as pd
import numpy as np
import time
from statsmodels.formula.api import ols, mnlogit
from statsmodels.nonparametric.smoothers_lowess import lowess
from scipy import stats
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
from seaborn import regplot, scatterplot, histplot
from plotly.graph_objs import Heatmap, Figure, layout
from plotly.express import scatter, scatter_3d, colors, get_trendline_results
from typing import Optional, Union, Tuple, List
from itertools import product
from textwrap import wrap
from .._utils import SEQUENCE_LIKE, itr_plot, PlotMixin
from .._logr import _z_log
###################################################################
[docs]
class NumAna(PlotMixin):
"""
Collection of Numeric features analysis that includes:
- Regression
- Correlation
- visualizations
Parameters
----------
df: pandas dataframe
data source
cols: sequence (lists, tuples, NumPy arrays or Pandas Base Index)
column names of numeric features
target: str
target column name, categorical target will be encode as integer.
degree: int
If ``degree`` is greater than 1, ``fit`` is ignored and polynomial
regression is applied to the nth ``degree``.
fit: str or None
type of regression to fit. One of `ols`, `logit` or `lws`. If `ols`
then Ordinary Least Squares regression is applied. If `logit` then
logistic regression will be fitted, if `lws` then Locally Weighted
Scatterplot Smoothing non-parametric regression is applied. If `None`
it's either Ordinary Least Squares or logistic regression based on
type of ``target``.
method: str
Only applicable when ``fit`` = `logit`. The following solvers from
`scipy.optimize` are accepted:
- **newton** for Newton-Raphson, ‘nm’ for Nelder-Mead
- **bfgs** for Broyden-Fletcher-Goldfarb-Shanno (BFGS)
- **lbfgs** for limited-memory BFGS with optional box constraints
- **powell** for modified Powell’s method
- **cg** for conjugate gradient
- **ncg** for Newton-conjugate gradient
- **basinhopping** for global basin-hopping solver
- **minimize** for generic wrapper of scipy minimize
(BFGS by default)
Note
----
Each solver has several optional unique arguments. See ``**kwargs``
parameter below (or scipy.optimize) for the available arguments that
each solver supports.
lowess_frac: float
Between 0 and 1. The fraction of the data used when estimating each
y-value for lowess fit.
it: int
The number of residual-based reweightings to perform for lowess fit.
delta: float
Distance within which to use linear-interpolation instead of weighted
regression for lowess fit. `'delta'` can be used to save computations.
For each `x_i`, regressions are skipped for points closer than ``delta``.
The next regression is fit for the farthest point within delta of `x_i` and
all points in between are estimated by linearly interpolating between the
two regression fits.
Judicious choice of delta can cut computation time considerably
for large data (N > 5000). A good choice is ``delta`` = 0.01 * range(x).
nans_d: dict or None
dictionary where keys are column names and values are missing(nan)
replacements. To perform multiple imputation for several numeric columns.
frac: float or None
fraction of dataframe to use as a sample
for analysis:
- 0 < ``frac`` < 1 returns a random sample with size ``frac``.
- ``frac`` = 1 returns shuffled dataframe.
- ``frac`` > 1 up-sample the dataframe, sampling of the same row more
than once.
random_state: int
for reproducibility, controls the random number generator for ``frac``
parameter.
figsize: tuple or None
dimensions of matplotlib figure (width, height)
n_rows: int
number of rows in matplotlib subplot figure
n_cols: int
number of columns in matplotlib subplot figure
silent: Bool
solicit user input for continuation during iterative plotting. If `True`,
plotting proceeds without user interaction.
hide_p_bar: Bool
triggers hiding progress bar (tqdm module); Default 'False'
theme: str
adjust axis and title colors as desired
Keyword Args
------------
warn_convergence: bool, optional
If True, checks the model for the converged flag. If the
converged flag is False, a ConvergenceWarning is issued.
All other kwargs are passed to the chosen solver.
newton
tol: float
Relative error in params acceptable for convergence.
nm -- Nelder Mead
xtol: float
Relative error in params acceptable for convergence
ftol: float
Relative error in loglike(params) acceptable for
convergence
maxfun: int
Maximum number of function evaluations to make.
bfgs
gtol: float
Stop when norm of gradient is less than gtol.
norm: float
Order of norm (np.inf is max, -np.inf is min)
epsilon
If fprime is approximated, use this value for the step
size. Only relevant if LikelihoodModel.score is None.
lbfgs
m: int
This many terms are used for the Hessian approximation.
factr: float
A stop condition that is a variant of relative error.
pgtol: float
A stop condition that uses the projected gradient.
epsilon
If fprime is approximated, use this value for the step
size. Only relevant if LikelihoodModel.score is None.
maxfun: int
Maximum number of function evaluations to make.
bounds: sequence
(min, max) pairs for each element in x,
defining the bounds on that parameter.
Use None for one of min or max when there is no bound
in that direction.
cg
gtol: float
Stop when norm of gradient is less than gtol.
norm: float
Order of norm (np.inf is max, -np.inf is min)
epsilon: float
If fprime is approximated, use this value for the step
size. Can be scalar or vector. Only relevant if
Likelihoodmodel.score is None.
ncg
fhess_p: callable f'(x,*args)
Function which computes the Hessian of f times an arbitrary
vector, p. Should only be supplied if
LikelihoodModel.hessian is None.
avextol: float
Stop when the average relative error in the minimizer
falls below this amount.
epsilon: float or ndarray
If fhess is approximated, use this value for the step size.
Only relevant if Likelihoodmodel.hessian is None.
powell
xtol: float
Line-search error tolerance
ftol: float
Relative error in loglike(params) for acceptable for
convergence.
maxfun: int
Maximum number of function evaluations to make.
start_direc: ndarray
Initial direction set.
basinhopping
niter: int
The number of basin hopping iterations.
niter_success: int
Stop the run if the global minimum candidate remains the
same for this number of iterations.
T: float
The "temperature" parameter for the accept or reject
criterion. Higher "temperatures" mean that larger jumps
in function value will be accepted. For best results
`T` should be comparable to the separation (in function
value) between local minima.
stepsize: float
Initial step size for use in the random displacement.
interval: int
The interval for how often to update the `stepsize`.
minimizer: dict
Extra keyword arguments to be passed to the minimizer
`scipy.optimize.minimize()`, for example 'method' - the
minimization method (e.g. 'L-BFGS-B'), or 'tol' - the
tolerance for termination. Other arguments are mapped
from explicit argument of `fit`:
- `args` <- `fargs`
- `jac` <- `score`
- `hess` <- `hess`
minimize
min_method: str, optional
Name of minimization method to use.
Any method specific arguments can be passed directly.
For a list of methods and their arguments, see
documentation of `scipy.optimize.minimize`.
If no method is specified, then BFGS is used.
Attributes
----------
z_inf_out : numpy array
excluded columns having `inf` values, if any.
z_nans: numpy array
numeric column names where imputation of `nan` values took place.
z_df_: pandas dataframe
preprocessed dataframe that was used internally
"""
def __init__(self,
             df: pd.DataFrame,
             cols: SEQUENCE_LIKE,
             target: str,
             degree: int = 1,
             fit: Optional[str] = None,
             method: str = 'newton',
             lowess_frac: float = 2/3,
             it: int = 3,
             delta: float = 0.0,
             nans_d: Optional[dict] = None,
             frac: Optional[float] = None,
             random_state: int = 45,
             figsize: Optional[Tuple[int, int]] = None,
             n_rows: Optional[int] = None,
             n_cols: Optional[int] = None,
             silent: bool = False,
             hide_p_bar: bool = False,
             theme: str = 'darkorange',
             **kwargs):
    """
    Validate the arguments, build the working dataframe and resolve which
    regression fit to use. Parameters are described in the class docstring.

    Raises
    ------
    TypeError
        if ``cols`` is not a list, tuple, NumPy array or pandas Index, if
        ``target`` is not a string, or if a truthy ``nans_d`` is not a dict.
    KeyError
        if any of ``cols``, ``target`` or the keys of ``nans_d`` is not a
        column of ``df``.
    ValueError
        if any of ``cols`` is not of integer, float or complex dtype, or if
        ``fit`` is not one of `ols`, `logit` or `lws`.
    """
    # ---- argument validation ----
    accepted_sequences = (list, tuple, np.ndarray, pd.Index)
    if not isinstance(cols, accepted_sequences):
        raise TypeError("'cols' parameter accepts a sequence e.g: Lists, Tuples, NumPy Arrays or Pandas Base Index. "
                        f"However, '{type(cols)}' was received!")
    if not isinstance(target, str):
        raise TypeError("Please pass target column name as string")
    requested = np.r_[cols, [target]]
    if np.any(~np.isin(requested, df.columns)):
        raise KeyError("Missing columns! Please ensure that all columns to analyze are included in the DataFrame")
    if any(df[name].dtype.kind not in 'ifc' for name in cols):
        raise ValueError("All 'cols' must be numeric please refer to parameter description")
    if fit and fit not in ('ols', 'logit', 'lws'):
        raise ValueError("'fit' parameter accepts the following arguments: 'ols', 'logit' or 'lws', "
                         f"however, '{fit}' was received!")
    if nans_d:
        if not isinstance(nans_d, dict):
            raise TypeError("Please pass 'nans_d' parameter as a dictionary. Example: {'column name': imputation value}")
        impute_keys = np.array(list(nans_d.keys()))
        if np.any(~np.isin(impute_keys, df.columns)):
            raise KeyError("Missing columns! Please ensure that all columns to impute are included in the DataFrame")

    # ---- working copy (the caller's dataframe is never edited) ----
    if frac:
        # sampling with replacement is only needed when up-sampling
        self.z_df_ = df.sample(frac=frac, replace=bool(frac > 1), random_state=random_state).copy()
    else:
        self.z_df_ = df.copy()

    # ---- handling infs and nans ----
    self._cols = np.array(cols)
    self._slash_n_impute(nans_d)  # attributes

    # ---- decide which fit applies ----
    target_kind = self.z_df_[target].dtype.kind
    str_target = target_kind in 'bO'
    # integer targets with at most 20 distinct values are treated as categorical
    cat_target = (target_kind in 'i' and self.z_df_[target].nunique() <= 20) or str_target
    if degree > 1:
        _z_log.info("Polynomial Least Squares fit will be applied")
        resolved_fit = None
    elif fit == 'logit' and not cat_target:
        # requested logistic fit does not suit this target: switch to regression
        resolved_fit = 'ols'
        _z_log.info(f"Logistic fit will not be applied rather OLS instead! `{target}` is of high cardinality.")
    elif fit:
        resolved_fit = fit
        if fit == 'ols' and cat_target:
            _z_log.warning(f"OLS fit will be applied on `{target}` that appears to be categorical.")
    elif cat_target:
        # no explicit request: pick the generic fit for a categorical target
        resolved_fit = 'logit'
        _z_log.info("logistic fit will be applied")
    else:
        resolved_fit = 'ols'
        _z_log.info("Ordinary Least Squares fit will be applied")
    self._fit = resolved_fit

    # ---- encode a boolean/object target as integers ----
    # only string-like targets are encoded because OLS is allowed on a
    # categorical (integer) target
    if str_target:
        codes, uniques = self.z_df_[target].factorize()
        self.z_df_[target] = codes
        # Labels keep their order of appearance (not alphabetical), so the
        # last label carries the highest code; this is the order used when
        # reading fitted model parameters for the plot text.
        # TODO: prompt user for sorting?
        self._lbl = uniques
    if self._fit == 'logit':
        if not hasattr(self, '_lbl'):  # discrete (already numeric) target
            # unlike above, sorting is required to match fitted model parameters
            self._lbl = np.sort(self.z_df_[target].unique())
        self._binary_t = len(self._lbl) == 2

    # ---- remaining configuration ----
    self._target = target
    self._degree = degree
    self._method = method
    self._lowess_frac = lowess_frac
    self._it = it
    self._delta = delta
    self._hide_p_bar = hide_p_bar
    self._figsize = figsize
    self._n_rows = n_rows
    self._n_cols = n_cols
    self._silent = silent
    self._theme = theme
    self._kwargs = kwargs
[docs]
def corr(self,
         disp_corr: str = 'pearson',
         quant: float = .75,
         thresh: Optional[float] = None,
         alpha: Optional[float] = None,
         plot: bool = False,
         ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Calculate Pearson (linear) and Spearman (monotonic) correlation
    and generate heat-map visualizations;

    Visualizations `Plotly Module`:
        - v1: Feature ``cols`` vs Target correlation either overall or
          for significant results only (P-value <= ``alpha``).
        - v2: Feature correlation for a selection of highly correlated
          features with target.

    Note
    ----
    to check correlation for categorical features, encode categories as
    integers first (e.g.: LabelEncoder, OrdinalEncoder, ...).

    Parameters
    ----------
    disp_corr: str
        One of ['pearson', 'spearman'] (lowercase, it is used as a column
        key), correlation method to be used in:
            - calculating correlation between features
            - sorting v1
    quant: float
        proportion of features to be used in calculating feature
        correlation and display in v2; default is top 25% (> q3)
        of features that are highly correlated with ``target``
    thresh: float or None
        minimum correlation strength between features to display
        in v2; if truthy, only display abs(correlation) >= ``thresh``
    alpha: float or None
        Significance alpha for rejecting null hypothesis (e.g.:0.05).
        if truthy, v1 displays only features whose Pearson or Spearman
        p-value is <= ``alpha``
    plot: Bool
        whether to run visualizations or not

    Returns
    -------
    corr_df: pandas dataframe
        correlation coefficient and p-value for each feature vs ``target``;
        returned transposed, i.e. one column per feature and the rows
        'pearson', 'p_val_pear', 'spearman', 'p_val_spr'
    feat_corr_df: pandas dataframe
        correlation coefficient of highly correlated features, only
        ``quant`` features are included
    """
    # input checks
    self._input_validation(disp_corr = disp_corr)

    # feature vs target correlation; labels are kept apart from the numbers
    # so neither has to be coerced into a common array dtype
    target_values = self.z_df_[self._target]
    labels, rows = [], []
    for col in tqdm(self._cols, desc = 'Calculating Corr....', disable = self._hide_p_bar):
        r_pea, p_pea = stats.pearsonr(target_values, self.z_df_[col])
        r_spr, p_spr = stats.spearmanr(target_values, self.z_df_[col])
        labels.append(col)
        rows.append((r_pea, p_pea, r_spr, p_spr))
    corr_df = pd.DataFrame(rows, columns = ['pearson', 'p_val_pear', 'spearman', 'p_val_spr'],
                           index = labels).sort_values(disp_corr, ascending = False)

    # highly correlated features (>= q?)
    strength = corr_df[disp_corr].abs()
    feat_corr_df = self.z_df_[corr_df.index[strength >= strength.quantile(quant)]].corr(disp_corr)

    if plot:
        if alpha:
            # plotting correlations' statistically significant results
            # null hypothesis: two sets of data are uncorrelated/have no ordinal correlation
            # reject Null if P Val <= significance threshold(alpha)
            significant = (corr_df.p_val_pear <= alpha) | (corr_df.p_val_spr <= alpha)
            plot_data = corr_df[significant].sort_values(disp_corr, ascending = False)
        else:
            plot_data = corr_df.sort_values(disp_corr, ascending = False)
        corr_n_cols = len(plot_data)

        # plotting feature correlation
        feat_hm_title = None
        if thresh:
            corr_plot = feat_corr_df[abs(feat_corr_df) >= thresh]
            # square matrix: when the count of non-nan cells equals its size,
            # only the diagonal passed the threshold
            only_diagonal = len(corr_plot)**2 - corr_plot.isna().sum().sum() == len(corr_plot)
            if only_diagonal:  # skip plotting as none > thresh
                feat_corr_n_cols = 1
            else:
                feat_hm_title = f'{disp_corr} correlation heatmap (>={thresh:.0%}) of top {1 - quant:.0%} features'
                feat_corr_n_cols = len(corr_plot)
        else:
            corr_plot = feat_corr_df
            feat_hm_title = f'{disp_corr} correlation heatmap of top {1 - quant:.0%} features'
            feat_corr_n_cols = len(corr_plot)

        if corr_n_cols:
            # limit plotting to 30 features per map
            self._paged_heat_plots(plot_data, corr_n_cols, 30, f'Correlation Heatmap - {self._target}',
                                   desc = 'Plotting Target Correlation Heat Maps....',
                                   prompt = "to continue press 'Enter' or input any value to exit.")
        else:
            _z_log.info("No Significant Correlation Was Noted!")

        if feat_corr_n_cols > 1:
            self._paged_heat_plots(corr_plot, feat_corr_n_cols, 15, feat_hm_title,
                                   desc = 'Plotting Feature Correlation Heat Maps....',
                                   prompt = "to continue plotting press 'Enter' or input any value to exit.",
                                   feat_corr = True)
        else:
            _z_log.info(f"No high correlation among top {1 - quant:.0%} features that are highly correlated with `target`")
    return corr_df.T.copy(), feat_corr_df

def _paged_heat_plots(self, data, total, lim, title, desc, prompt, **heat_kws):
    """
    Draw ``data`` as successive heat maps of at most ``lim`` features each.

    ``self._n`` is reset to 0 and then compared against ``total`` after every
    ``_heat_plot`` call (NOTE(review): ``_heat_plot`` is expected to advance
    ``self._n`` - confirm). Unless ``self._silent`` is set, the user is asked
    between pages whether to continue; any non-blank input stops plotting.
    ``heat_kws`` are forwarded to ``_heat_plot``.
    """
    self._n = 0  # control iterations and slicing indices
    n_pages = int(np.ceil(total / lim))
    for _ in tqdm(range(n_pages), desc = desc, disable = self._hide_p_bar):
        self._heat_plot(data, lim, title, **heat_kws)
        if total <= self._n:
            continue
        remaining = f"{total - self._n} out of {total} features remaining"
        if self._silent:
            _z_log.info(f"{remaining}.")
            continue
        _z_log.info(f"{remaining}, {prompt}")
        time.sleep(2)
        if input().strip().lower().replace(' ',''):
            break
[docs]
def fit_models(self):
    """
    Univariate model fitting, one model per column in ``cols``:
        - Polynomial regression (when ``degree`` > 1)
        - Ordinary Least Squares regression
        - Locally Weighted Scatterplot Smoothing non-parametric regression
        - Logistic regression

    A column whose fit raises an error is skipped and reported rather than
    aborting the whole run.

    Attributes
    ----------
    z_fit_results_: dict
        where keys are ``cols`` and values are fitted regression model(s).
    z_fit_out_: numpy array
        excluded ``cols``, if any, causing regression fit errors.

    Returns
    -------
    self
        the instance itself, so calls can be chained.
    """
    # input checks
    self._input_validation()
    # attributes
    self.z_fit_results_ = {}  # fitted models
    failed = []  # (column, error) for features not fitted
    for col in tqdm(self._cols, desc = 'Fitting Models....', disable = self._hide_p_bar):
        # Q("{}") controls column names having numbers and spaces;
        # the formula interface adds the constant automatically
        formula = f'Q("{self._target}")~Q("{col}")'
        try:
            if self._degree > 1:
                # Least-squares fit of a polynomial of nth degree
                model = np.polynomial.polynomial.polyfit(self.z_df_[col], self.z_df_[self._target], self._degree)
            elif self._fit == 'ols':
                # Ordinary Least Squares Regression
                model = ols(formula = formula, data = self.z_df_).fit()
            elif self._fit == 'logit':
                # logistic Regression, binary or multiclass
                model = mnlogit(formula = formula, data = self.z_df_).fit(disp = 0,
                                                                         method = self._method,
                                                                         **self._kwargs)
            else:
                # Locally Weighted Scatterplot Smoothing non-parametric regression
                model = lowess(self.z_df_[self._target], self.z_df_[col], frac = self._lowess_frac, it = self._it,
                               delta = self._delta).T
        except Exception as err:
            # best effort: skip this feature but keep the reason, and let
            # KeyboardInterrupt/SystemExit propagate instead of swallowing them
            failed.append((col, err))
        else:
            self.z_fit_results_[col] = model
    if failed:
        _z_log.info(f"{len(failed)} out of {len(self._cols)} features were not fit! "
                    "Please ensure that data to fit matches model requirements.")
        details = ', '.join(f"`{col}` ({type(err).__name__}: {err})" for col, err in failed)
        _z_log.warning(f"Features not fit: {details}")
    self.z_fit_out_ = np.asarray([col for col, _ in failed])  # attributes
    return self
[docs]
@itr_plot(n_cols = 6, figsize = (24, 11))
def vis_fit(self,
            olrs_idx: Optional[Union[pd.Index, list]] = None,
            olrs_mapping: Optional[dict] = None,
            x_jitter: Optional[float] = None,
            y_jitter: Optional[float] = None,
            scatter_kws: Optional[dict] = {'alpha': 0.3},
            tc_color: str = 'orange',
            olrs_color: str = 'red',
            nbins: Union[int, str] = 'auto',
            axis: str = 'x',
            tight: Optional[bool] = None,
            x_ax_rotation: Optional[int] = None,
            ):
    """
    Scatter plot visualization of univariate regression fits `Seaborn Module`

    This body draws a single subplot for ``self._col``.
    NOTE(review): ``self._fig``, ``self._col``, ``self._n`` and ``self._lrs``
    are not set here and ``olrs_idx``/``olrs_mapping`` are not read here;
    presumably the ``itr_plot`` decorator handles them - confirm.

    Parameters
    ----------
    olrs_idx: pandas index, list or None,
        Index of outlier data points
    olrs_mapping: dict or None
        column names as keys and outlier data points indices as values
        (pandas index or list) to highlight during plotting. Outliers
        from each column are plotted against their respective plot
    {x, y}_jitter: float or None
        adds random noise to the observations on {x, y}_axis.
        applicable to main scatter plot of `x` and `y`.
    scatter_kws : dict or None
        Additional keyword arguments passed to `plt.scatter` and
        `plt.plot`. Applied to main scatter plot of `x` and `y`.
    tc_color: str
        Color of OLS trendline or Sigmoid/LOWESS Curve
    olrs_color: str
        Color of outlier data points
    nbins: int or 'auto'
        For plot decoration, maximum number of axis intervals; 1 - max
        number of ticks. If the string 'auto', the number of bins will be
        automatically determined based on the length of the axis.
    axis: str
        For plot decoration, one of ['both', 'x', 'y'], axis on which to apply
        ``nbins``.
    tight : bool or None
        For plot decoration, controls expansion of axis limits, if `True` axis limits
        are only expanded using the margins; This does *not* set the margins to zero.
        If `False`, further expand the axis limits using the axis major locator.
    x_ax_rotation: int or None
        For plot decoration, set degree of x_ticks rotation.
    """
    # hand seaborn a copy so the shared default dict can never be altered
    scatter_kws = None if scatter_kws is None else dict(scatter_kws)
    # prepare plots
    ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
    model = self.z_fit_results_[self._col]
    x, grid = self._plot_data(self.z_df_, self._col)
    title = None  # lowess fits carry no equation text
    if self._degree > 1:
        # fit plot
        y_pred = np.polynomial.polynomial.polyval(grid, model)[:,-1]
        # plot text
        title = "$" + self._func_text(model, poly = True, text_wrap = True) + "$"
    elif self._fit == 'ols':
        # fit plot
        paras = model.params.values
        y_pred = grid.dot(paras)  # model.fittedvalues
        # plot text
        title = "$" + self._func_text(paras) + "$" + '\n' + \
                f'$r^2$ = {model.rsquared:.4f}'
    elif self._fit == 'logit':
        # fit plot
        y_pred = self._logistic_pred(grid, model)
        # plot text
        null_ll = model.llnull
        full_ll = model.llf
        # The likelihood function is the probability that the data were generated by the
        # model parameters; fitting looks for the coefficients that maximize it.
        # McFadden's pseudo-R-squared measures the model's performance, higher values
        # indicate a better fit, similar to R^2 under least squares regression:
        #     1 - (Log-Likelihood / LL-Null) where
        #     Log-Likelihood(full model): maximized log-likelihood using all parameters
        #     LL-Null(null model): maximized log-likelihood when only an intercept is included
        pr = 1 - (full_ll / null_ll)  # model.prsquared
        # p-value from a likelihood-ratio test of the full versus null model.
        # significance (p-value <.05) favors the full model (including the feature) over the
        # intercept-only model, e.g. in a binary classification the feature does have an
        # effect on observing a positive class label; the size of that effect is given by
        # the model coefficients, i.e. the change in the log-odds of the positive class
        # for each unit change in the feature value
        ll_stat = -2 * (null_ll - full_ll)  # likelihood ratio Chi-Squared test statistic
        # calculate p-value of test statistic using n degrees of freedom
        llrp = stats.chi2.sf(ll_stat, model.df_model)  # model.llr_pvalue
        if self._binary_t:
            # model params
            paras = model.params.values.ravel()
        else:
            # displaying results of last class label compared to the base(reference)
            # class, i.e.: the change in log-odds of last class as a result of unit
            # change in that feature
            # TODO: what about other class labels?
            paras = model.params.values[:,-1]
        title = "$" + self._func_text(paras) + "$" + '\n' + \
                f'P$r^2$ = {pr:.4f}' + ' | ' + f'llrp = {llrp:.4f}'
    else:  # lowess
        # fit plot: the stored result holds the x values and the smoothed estimates
        x, y_pred = model
    if title is not None:
        # drawn on `ax` explicitly rather than on pyplot's current axes
        ax.set_title(title, color = self._theme)
    # Main scatter plot
    regplot(data = self.z_df_, x = self._col, y = self._target, fit_reg = False, x_jitter = x_jitter,
            y_jitter = y_jitter, scatter_kws = scatter_kws, ax = ax)
    # fit plot
    ax.plot(x, y_pred, c = tc_color)
    # Overlay Outliers
    if hasattr(self, '_lrs') and self._col in self._lrs:  # single check for both mapping or idx
        # keep only outlier labels still present in the (possibly sampled) dataframe
        idx = np.array(self._lrs[self._col])[np.isin(self._lrs[self._col], self.z_df_.index)]
        if len(idx):  # edge case: using `frac` and full index
            outliers_df = self.z_df_.loc[idx]
            scatterplot(data = outliers_df, x = self._col, y = self._target, color = olrs_color, ax = ax)
    # decorate
    col_dtype = self.z_df_[self._col].dtype.kind
    self._decorate_plot(ax, dtype = col_dtype, nbins = nbins, axis = axis, tight = tight,
                        x_ax_rotation = x_ax_rotation, theme = self._theme)
    self._n += 1
[docs]
@itr_plot(n_cols = 6, figsize = (24, 11))
def vis_ols_fit(self):
    """
    Histograms and Scatter plots for Assessing OLS residuals'
    normality and homoscedasticity assumptions

    Two subplots are drawn for ``self._col``: a histogram of the fitted
    model's residuals, and a scatter of its fitted values against those
    residuals with a horizontal reference line at zero. The fitted model
    must expose ``resid`` and ``fittedvalues``.
    NOTE(review): ``self._fig``, ``self._col`` and ``self._n`` are not set
    here; presumably the ``itr_plot`` decorator provides them - confirm.
    """
    # fitted model
    model = self.z_fit_results_[self._col]
    # Residuals Normality assumption
    hist_ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
    histplot(model.resid, bins = 'doane', ax = hist_ax)
    # labels go through the axes object rather than pyplot's current axes
    hist_ax.set_title(f'{self._col}', color = self._theme)
    hist_ax.set_xlabel('Residuals')
    self._decorate_plot(hist_ax, theme = self._theme)
    # Residuals homoscedasticity assumption
    resid_ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n + 1)
    scatterplot(x = model.fittedvalues, y = model.resid, ax = resid_ax)  # predictions vs residuals
    resid_ax.axhline(0, alpha = 0.5, color = 'r')
    resid_ax.set_title(f'Residual Plot - {self._col}', color = self._theme)
    resid_ax.set_xlabel('Predictions')
    resid_ax.set_ylabel('Residuals')
    self._decorate_plot(resid_ax, theme = self._theme)
    self._n += 2  # update iteration control
[docs]
def vis_multi(self,
col: str,
olrs_idx: Optional[Tuple[pd.core.indexes.base.Index, list]] = None,
color: Optional[Union[str, int, pd.Series]]= None,
size: Optional[Union[str, int, pd.Series]] = None,
size_max: int = 15,
symbol: Optional[Union[str, int, pd.Series]] = None,
symbol_sequence: Optional[List[str]] = None,
symbol_map: Optional[dict] = None,
hover_name: Optional[Union[str, int, pd.Series]] = None,
hover_data: Optional[Union[str, list[str, int], pd.Series, dict]] = None,
custom_data: Optional[Union[str, list[str, int], pd.Series]] = None,
text: Optional[Union[str, int, pd.Series]] = None,
facet_row: Optional[Union[str, int, pd.Series]] = None,
facet_col: Optional[Union[str, int, pd.Series]] = None,
facet_col_wrap: int = 0,
facet_row_spacing: Optional[float] = None,
facet_col_spacing: Optional[float] = None,
error_x: Optional[Union[str, int, pd.Series]] = None,
error_x_minus: Optional[Union[str, int, pd.Series]] = None,
error_y: Optional[Union[str, int, pd.Series]] = None,
error_y_minus: Optional[Union[str, int, pd.Series]] = None,
labels: Optional[dict] = None,
color_discrete_sequence: Optional[List[str]] = None,
color_continuous_scale: Optional[List[str]] = None,
opacity: Optional[float] = None,
marginal_x: Optional[str] = None,
marginal_y: Optional[str] = None,
category_orders: Optional[dict] = None,
trendline: Optional[str] = None,
trendline_options: Optional[dict] = None,
trendline_color_override: Optional[str] = None,
trendline_scope: str = 'trace',
log_x: bool = False,
log_y: bool = False,
range_x: Optional[List[float]] = None,
range_y: Optional[List[float]] = None,
title: Optional[str] = None,
template: Optional[Union[str, dict]] = None,
width: Optional[int] = None,
height: Optional[int] = None,
theme: str = 'darkorange'):
"""
Interactive multivariate scatter plot
visualization and trend analysis `Plotly Module`
Parameters
----------
col: str
Name of column that goes to `x` axis
olrs_idx: pandas index, list or None
Index of outlier data points
color: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
assign color to marks.
size: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
assign mark sizes.
size_max: int (default `15`)
Set the maximum mark size when using `size`.
symbol: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
assign symbols to marks.
symbol_sequence: list of str
Strings should define valid plotly.js symbols. When `symbol` is set,
values in that column are assigned symbols by cycling through
`symbol_sequence` in the order described in `category_orders`, unless
the value of `symbol` is a key in `symbol_map`.
symbol_map: dict with str keys and str values (default `{}`)
String values should define plotly.js symbols Used to override
`symbol_sequence` to assign a specific symbols to marks corresponding
with specific values. Keys in `symbol_map` should be values in the
column denoted by `symbol`. Alternatively, if the values of `symbol`
are valid symbol names, the string `'identity'` may be passed to cause
them to be used directly.
hover_name: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like appear in bold
in the hover tooltip.
hover_data: str, or list of str or int, or Series or array-like, or dict
Either a name or list of names of columns in `data_frame`, or pandas
Series, or array_like objects or a dict with column names as keys, with
values True (for default formatting) False (in order to remove this
column from hover information), or a formatting string, for example
`':.3f'` or `'|%a'` or list-like data to appear in the hover tooltip or
tuples with a bool or formatting string as first element, and list-like
data to appear in hover as second element Values from these columns
appear as extra data in the hover tooltip.
custom_data: str, or list of str or int, or Series or array-like
Either name or list of names of columns in `data_frame`, or pandas
Series, or array_like objects Values from these columns are extra data,
to be used in widgets or Dash callbacks for example. This data is not
user-visible but is included in events emitted by the figure (lasso
selection etc.)
text: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like appear in the
figure as text labels.
facet_row: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
assign marks to facetted subplots in the vertical direction.
facet_col: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
assign marks to facetted subplots in the horizontal direction.
facet_col_wrap: int
Maximum number of facet columns. Wraps the column variable at this
width, so that the column facets span multiple rows. Ignored if 0, and
forced to 0 if `facet_row` or a `marginal` is set.
facet_row_spacing: float between 0 and 1
Spacing between facet rows, in paper units. Default is 0.03 or 0.07
when facet_col_wrap is used.
facet_col_spacing: float between 0 and 1
Spacing between facet columns, in paper units Default is 0.02.
error_x: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
size x-axis error bars. If `error_x_minus` is `None`, error bars will
be symmetrical, otherwise `error_x` is used for the positive direction
only.
error_x_minus: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
size x-axis error bars in the negative direction. Ignored if `error_x`
is `None`.
error_y: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
size y-axis error bars. If `error_y_minus` is `None`, error bars will
be symmetrical, otherwise `error_y` is used for the positive direction
only.
error_y_minus: str or int or Series or array-like
Either a name of a column in `data_frame`, or a pandas Series or
array_like object. Values from this column or array_like are used to
size y-axis error bars in the negative direction. Ignored if `error_y`
is `None`.
labels: dict with str keys and str values (default `{}`)
By default, column names are used in the figure for axis titles, legend
entries and hovers. This parameter allows this to be overridden. The
keys of this dict should correspond to column names, and the values
should correspond to the desired label to be displayed.
color_discrete_sequence: list of str
Strings should define valid CSS-colors. When `color` is set and the
values in the corresponding column are not numeric, values in that
column are assigned colors by cycling through `color_discrete_sequence`
in the order described in `category_orders`, unless the value of
`color` is a key in `color_discrete_map`. Various useful color
sequences are available in the `plotly.express.colors` submodules,
specifically `plotly.express.colors.qualitative`.
color_continuous_scale: list of str
Strings should define valid CSS-colors This list is used to build a
continuous color scale when the column denoted by `color` contains
numeric data. Various useful color scales are available in the
`plotly.express.colors` submodules, specifically
`plotly.express.colors.sequential`, `plotly.express.colors.diverging`
and `plotly.express.colors.cyclical`.
opacity: float
Value between 0 and 1. Sets the opacity for markers.
marginal_x: str
One of `'rug'`, `'box'`, `'violin'`, or `'histogram'`. If set, a
horizontal subplot is drawn above the main plot, visualizing the
x-distribution.
marginal_y: str
One of `'rug'`, `'box'`, `'violin'`, or `'histogram'`. If set, a
vertical subplot is drawn to the right of the main plot, visualizing
the y-distribution.
category_orders: dict with str keys and list of str values (default `{}`)
By default, in Python 3.6+, the order of categorical values in axes,
legends and facets depends on the order in which these values are first
encountered in `data_frame` (and no order is guaranteed by default in
Python below 3.6). This parameter is used to force a specific ordering
of values per column. The keys of this dict should correspond to column
names, and the values should be lists of strings corresponding to the
specific display order desired.
trendline: str or None
One of `'ols'`, `'lowess'`, `'rolling'`, `'expanding'` or `'ewm'`. If
`'ols'`, an Ordinary Least Squares regression line will be drawn for
each discrete-color/symbol group. If `'lowess`', a Locally Weighted
Scatterplot Smoothing line will be drawn for each discrete-color/symbol
group. If `'rolling`', a Rolling (e.g. rolling average, rolling median)
line will be drawn for each discrete-color/symbol group. If
`'expanding`', an Expanding (e.g. expanding average, expanding sum)
line will be drawn for each discrete-color/symbol group. If `'ewm`', an
Exponentially Weighted Moment (e.g. exponentially-weighted moving
average) line will be drawn for each discrete-color/symbol group. See
the docstrings for the functions in
`plotly.express.trendline_functions` for more details on these
functions and how to configure them with the `trendline_options`
argument.
trendline_options: dict or None
Options passed as the first argument to the function from
`plotly.express.trendline_functions` named in the `trendline`
argument. Valid keys for the `trendline_options` dict are as
follows:
ols
add_constant: bool, default 'True'
if `False`, the trendline passes through the origin
but if `True` a y-intercept is fitted.
log_x and log_y: bool, default 'False'
if `True` the OLS is computed with respect to the base
10 logarithm of the input. Note that this means no zeros
can be present in the input.
lowess
frac: float, default '0.6666666'
Between 0 and 1. The fraction of the data used when
estimating each y-value.
rolling
function: function, str, list or dict, default 'mean'
Function to use for aggregating the data. If a function,
must either work when passed a Series/Dataframe or when
passed to Series/Dataframe.apply. Accepted combinations
are:
- function
- string function name
- list of functions and/or
function names, e.g. [np.sum, 'mean']
- dict of axis labels -> functions,
function names or list of such.
function_args: dict
function arguments. For examples please refer to 'win_type'
argument documentation below.
window: int, timedelta, str, offset, or BaseIndexer subclass
- Size of the moving window.
- If an integer, the fixed number of observations used for
each window.
- If a timedelta, str, or offset, the time period of each
window. Each window will be a variable sized based on the
observations included in the time-period. This is only valid
for datetimelike indexes.
- If a BaseIndexer subclass, the window boundaries based on the
defined ``get_window_bounds`` method. Additional rolling
keyword arguments, namely ``min_periods``, ``center``,
``closed`` and ``step`` will be passed to ``get_window_bounds``.
min_periods: int, default None
Minimum number of observations in window required to have a value,
otherwise, result is ``np.nan``.
center: bool, default False
- If False, set the window labels as the right edge of the window index.
- If True, set the window labels as the center of the window index.
win_type: str, default None
- If ``None``, all points are evenly weighted.
- If a string, it must be a valid `scipy.signal window function
<https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
- e.g.: [`barthann`, `bartlett`, `blackman`, `blackmanharris`,
`bohman`, `boxcar`, `chebwin`, `cosine`, `exponential`, `flattop`,
`gaussian`, `general_gaussian`, `hamming`, `hann`, `kaiser`,
`nuttall`, `parzen`, `triang`, `tukey`]
- Certain Scipy window types require additional parameters to be
passed in the aggregation function. The additional parameters
must match the keywords specified in the Scipy window type method
signature.
- `window` and `rolling` are pandas subclasses utilizing window
functions from `scipy` module.
- If `win_type` is not `None` a `window` subclass is returned,
otherwise a `rolling` subclass is returned. This affects the
way `function` argument behaves, see examples below.
on: str, optional
- For a DataFrame, a column label or Index level on which
to calculate the rolling window, rather than the DataFrame's index.
- Provided integer column is ignored and excluded from result since
an integer index is not used to calculate the rolling window.
closed: str, default None
- If ``'right'``, the first point in the window is excluded from
calculations.
- If ``'left'``, the last point in the window is excluded from
calculations.
- If ``'both'``, no points in the window are excluded from
calculations.
- If ``'neither'``, the first and last points in the window are
excluded from calculations.
- Default ``None`` (``'right'``).
step: int, default None
Evaluate the window at every ``step`` result, equivalent to slicing as
``[::step]``. ``window`` must be an integer. Using a step argument
other than None or 1 will produce a result with a different shape than
the input.
expanding
function and function_args
same as in `rolling`
min_periods: int, default 1
Minimum number of observations in window required to have a value;
otherwise, result is ``np.nan``.
ewm
function and function_args
same as in `rolling`
com: float, optional
Specify decay in terms of center of mass
span: float, optional
Specify decay in terms of span
halflife: float, str, timedelta, optional
- Specify decay in terms of half-life
- If ``times`` is specified, a timedelta convertible unit over which an
observation decays to half its value. Only applicable to ``mean()``,
and halflife value will not apply to the other functions.
alpha: float, optional
Specify smoothing factor
min_periods: int, default 0
Minimum number of observations in window required to have a value;
otherwise, result is ``np.nan``.
adjust: bool, default True
Divide by decaying adjustment factor in beginning periods to account
for imbalance in relative weightings (viewing EWMA as a moving
average).
ignore_na: bool, default False
Ignore missing values when calculating weights.
times : np.ndarray, Series, default None
- Only applicable to ``mean()``.
- Times corresponding to the observations. Must be monotonically
increasing and ``datetime64[ns]`` dtype.
- If 1-D array like, a sequence with the same shape as the observations.
trendline_color_override: str or None
Valid CSS color. If provided, and if ``trendline`` is set, all trendlines
will be drawn in this color rather than in the same color as the traces
from which they draw their inputs.
trendline_scope: str (one of `'trace'` or `'overall'`, default `'trace'`)
If `'trace'`, then one trendline is drawn per trace (i.e. per color,
symbol, facet, animation frame etc) and if `'overall'` then one
trendline is computed for the entire dataset, and replicated across all
facets.
log_x: boolean (default `False`)
If `True`, the x-axis is log-scaled in cartesian coordinates.
log_y: boolean (default `False`)
If `True`, the y-axis is log-scaled in cartesian coordinates.
range_x: list of two numbers
If provided, overrides auto-scaling on the x-axis in cartesian
coordinates.
range_y: list of two numbers
If provided, overrides auto-scaling on the y-axis in cartesian
coordinates.
title: str
The figure title.
template: str or dict or plotly.graph_objects.layout.Template instance
The figure template name (must be a key in plotly.io.templates) or
definition.
width: int (default `None`)
The figure width in pixels.
height: int (default `None`)
The figure height in pixels.
theme: str,
adjust axis and title colors as desired
Attributes
----------
z_plotly_ols_fit: pandas dataframe
fitted Ordinary Least Squares model(s)
z_plotly_fit: pandas dataframe
fitted Logistic or Polynomial model(s)
z_plotly_fit_out: pandas dataframe
groups where fitting models fails, only applicable
for Logistic or Polynomial fits if ``facet`` is assigned
Rolling Examples
----------------
>>> # Custom Function
>>> # pandas
>>> series.rolling('win_type' = None).aggregate(**opts)
>>> # trendline_options - lambda is the euclidean distance
>>> tl_opts = dict(
>>> function = 'aggregate',
>>> function_args = dict(
>>> func = lambda x: np.sqrt(x.dot(x))
>>> ),
>>> win_type = None)
>>> # Rolling object
>>> # pandas
>>> series.rolling('win_type' = None).sum(**opts)
>>> # trendline_options
>>> tl_opts = dict(
>>> function = 'sum',
>>> function_args = None,
>>> win_type = None)
>>> # Window object
>>> # pandas
>>> series.rolling('win_type' = 'gaussian').sum(**opts)
>>> # trendline_options - 'std' is parameter required by
>>> # 'gaussian' window function, not the aggregation function 'sum'
>>> tl_opts = dict(
>>> function = 'sum',
>>> function_args = dict(std = 2),
>>> win_type = 'gaussian')
"""
# force order on facet grid if not already
self._facets = [f for f in [facet_row, facet_col] if f]
if any(self._facets):
# ensure complete mapping incase only
# single facet order was set by user
# otherwise, override default values
category_orders = {
f'{f}': self.z_df_[f].unique() for f in self._facets} | (category_orders if category_orders else {})
# Check active arguments and column type
# for proper formatting of hover templates
args = [col, self._target, color, symbol, size, *self._facets] # all columns arguments
active_args, numeric, categorical = self._plotly_args(args)
if not hover_data:
# Apply proper formatting, hide facets and display index
# for main scatter plot
hover_data = {k:':,.3f' for k in numeric} | \
{k: False for k in self._facets} | {'index': (':,.0f', self.z_df_.index)}
# set colors if not already
if not color_discrete_sequence:
color_discrete_sequence = colors.qualitative.Dark24
if not color_continuous_scale:
color_continuous_scale = colors.sequential.Viridis # ignored if `color` is binary feature
# Main scatter plot using copy of dataFrame, accounting for fraction - if any
fig = scatter(self.z_df_, x = col, y = self._target, color = color, size = size, size_max = size_max,
symbol = symbol, symbol_sequence = symbol_sequence, symbol_map = symbol_map,
hover_name = hover_name, hover_data = hover_data, custom_data = custom_data,
text = text, facet_row = facet_row, facet_col = facet_col, facet_col_wrap = facet_col_wrap,
facet_row_spacing = facet_row_spacing, facet_col_spacing = facet_col_spacing,
error_x = error_x, error_x_minus = error_x_minus, error_y = error_y,
error_y_minus = error_y_minus, labels = labels,
color_discrete_sequence = color_discrete_sequence, color_continuous_scale = color_continuous_scale,
opacity = opacity, marginal_x = marginal_x, marginal_y = marginal_y, category_orders = category_orders,
trendline = trendline, trendline_options = trendline_options,
trendline_color_override = trendline_color_override, trendline_scope = trendline_scope,
log_x = log_x, log_y = log_y, range_x = range_x, range_y = range_y, title = title,
template = template, width = width, height = height
)
if trendline == 'ols':
self.z_plotly_ols_fit_ = get_trendline_results(fig) # attributes
# mapping position on facet grid, if any
if facet_col_wrap and (any([marginal_x, marginal_y]) or facet_row or not facet_col):
facet_col_wrap = 0 # Ignore facet column wrapping to match plotly logic
row_map, col_map = self._facet_map(category_orders, facet_row, facet_col, facet_col_wrap)
# fit identifier for hovertemplate
if self._degree > 1:
fit = f'Polynomial(degree = {self._degree})'
fit_cols = [f'x^{i}' for i in range(self._degree + 1)] # columns of fit results dataframe: polynomial coefs
elif self._fit == 'logit':
fit = 'logistic'
fit_cols = ['fit'] # fitted models
else:
fit = None
# overlay fits
if fit:
# log transformation takes place only during fitting
# this is controlled by designated keys in
# `trendline_options` attribute.
# Plot axis are displayed in original input values
# unless `log_x` or log_y` attributes are activated
# which in-turn updates `fig` x/y-axis
log_x = log_y = False
col_ = col
target_ = self._target
if trendline_options:
log_x = trendline_options.get("log_x", False)
log_y = trendline_options.get("log_y", False)
if log_y and fit != 'logistic':
if np.any(self.z_df_[self._target] <= 0):
log_y = False
_z_log.info(f"Log_y transformation was not applied, {self._target} includes non-positive values")
else:
self.z_df_[f'{self._target}_'] = np.log10(self.z_df_[self._target])
target_ = f'{self._target}_'
if log_x:
if np.any(self.z_df_[col] <= 0):
log_x = False
_z_log.info(f"Log_x transformation was not applied, {col} includes non-positive values")
else:
self.z_df_[f'{col}_'] = np.log10(self.z_df_[col])
col_ = f'{col}_'
self._models = [] # fit dataframe container
self._fit_fail = [] # failed fits container
# fit data is grouped by: color(if categorical) + symbol + facets
# plotly grouping order is color -> symbol -> facet row -> facet col
groupers = [color, symbol] if color in categorical else [symbol]
full_gs = groupers + self._facets
self._g_map = {col: self.z_df_[col].unique() for col in full_gs if col}
if self._g_map and trendline_scope != 'overall':
# update columns of results dataframe
if self._facets:
fit_cols = list(self._g_map.keys()) + fit_cols
# nested loops for all combinations in grouping map
for comb in product(*[self._g_map[col] for col in self._g_map.keys()], repeat = 1):
mask = self.z_df_[(self.z_df_[self._g_map.keys()] == comb).all(1)]
if len(mask.dropna()) > 1 and mask[self._target].nunique() > 1:
# location on facet_grid
if facet_col_wrap:
row_idx = row_map[mask[facet_col].unique()[0]]
else:
row_idx = row_map[mask[facet_row].unique()[0]] if row_map else 1
col_idx = col_map[mask[facet_col].unique()[0]] if col_map else 1
# plot data
x, y_pred, hovertemplate, name = self._fit_models(col_, target_, mask, fit, self._degree, row_idx,
col_idx, comb, groupers, log_x, log_y)
# plot
if name: # None if fit fails
fig.add_scatter(x = x, y = y_pred, hovertemplate = hovertemplate, showlegend = False,
name = name, row = row_idx, col = col_idx)
else:
# check pre-fit
# only applicable for
# overall non-transformed fits
model = self.z_fit_results_.get(f'{col_}') if hasattr(self, 'z_fit_results_') else False
# plot data
x, y_pred, hovertemplate, name = self._fit_models(col_, target_, self.z_df_, fit, self._degree,
log_x = log_x, log_y = log_y, model = model)
# plot
if name:
fig.add_scatter(x = x, y = y_pred, hovertemplate = hovertemplate, showlegend = False,
name = name, row = 'all', col = 'all')
# attributes
self.z_plotly_fit_ = pd.DataFrame(self._models, columns = fit_cols)
if self._fit_fail:
self.z_plotly_fit_out_ = pd.DataFrame(self._fit_fail)
# overlay outliers
if olrs_idx is not None:
idx = np.array(olrs_idx)[np.isin(olrs_idx, self.z_df_.index)]
if len(idx):
# Editing Outliers Scatter hover template
# Meta define values(column names) to be accessed within the hovertemplate
meta = active_args[~np.isin(active_args, self._facets)] # only active cols ignoring facets
# outlier dataframe
outliers_df = self.z_df_.loc[idx].copy()
# location on facet_grid, if any
if facet_col_wrap:
outliers_df['row_idx'] = outliers_df[facet_col].map(row_map)
else:
outliers_df['row_idx'] = outliers_df[facet_row].map(row_map) if row_map else 1
outliers_df['col_idx'] = outliers_df[facet_col].map(col_map) if col_map else 1
# unique facet grid coordinates
cord_list = sorted(list(set(zip(outliers_df['row_idx'].values, outliers_df['col_idx'].values))))
# plot
for idx in cord_list:
temp_df = outliers_df[(outliers_df['row_idx'] == idx[0]) & (outliers_df['col_idx'] == idx[1])]
# fetching all active arguments from outlier dataframe
# for each column displayed in hovertemplate
customdata = np.stack([temp_df[_] for _ in meta], axis = -1)
# workaround as hovertemplate contradicts python format string (% & f)
hovertemplate = \
'<br>'.join(['%{meta[i]}: %{customdata[i]:,.3f}'.replace('i', f'{i}') if _ in numeric
else '%{meta[i]}: %{customdata[i]}'.replace('i', f'{i}')
for i, _ in enumerate(meta)]) + '<br>''Index: %{text}<extra></extra>'
# overlay outliers in each respective facet position
fig.add_scatter(x = temp_df[col], y = temp_df[self._target],
meta = meta, customdata = customdata, mode = 'markers',
marker = dict(size = 10, color = 'red', line = dict(color = 'orange', width = 2)),
hovertemplate = hovertemplate, text = temp_df.index, showlegend = False,
name = 'outliers', row = idx[0], col = idx[1],
)
if not title:
title = f"'{self._target}' interactive scatter plot"
# Decorate
fig.update_layout(dict(paper_bgcolor = 'rgba(0,0,0,0)', plot_bgcolor = 'rgba(0,0,0,0)',
hoverlabel = dict(bgcolor = 'brown', font = dict(family = 'Rockwell',
size = 16, color = 'moccasin')),
legend = dict(orientation = 'h', xanchor = 'right', x = 1,
title_font_family = 'Arial', font = dict(family = 'Rockwell',
size = 14, color = 'lemonchiffon'),
bgcolor = 'dimgrey', bordercolor = 'Black', borderwidth = 2),
title = dict(text = title, x = 0.5, y = .99, xref = 'paper',
font = dict(family = 'Arial', size = 20, color = theme))
))
fig.update_xaxes(showgrid = False, color = theme, title_standoff = 10)
fig.update_yaxes(showgrid = False, color = theme)
# check if traces exists other than trendlines
# if they are the last trace then no other
# traces exists as they get plotted first
if len(fig.data) > 1 and 'trendline' not in fig.data[-1]['hovertemplate']: # other traces exists
fig.update_traces(selector = -1, showlegend = True) # only show last trace in legend
fig.for_each_annotation(lambda x: x.update(text = x.text.split('=')[-1], bgcolor = 'dimgrey',
font = dict(family = 'Rockwell', size = 16, color = 'oldlace')))
# return fig
# fig.write_image('figure.png', scale=4)
fig.show();
[docs]
def vis_multi_d(self,
                x: str,
                y: str,
                z: Optional[str] = None,
                olrs_idx: Optional[Union[pd.Index, list]] = None,
                color: Optional[Union[str, int, pd.Series]] = None,
                symbol: Optional[Union[str, int, pd.Series]] = None,
                symbol_sequence: Optional[List[str]] = None,
                symbol_map: Optional[dict] = None,
                size: Optional[Union[str, int, pd.Series]] = None,
                size_max: int = 20,
                text: Optional[Union[str, int, pd.Series]] = None,
                hover_name: Optional[Union[str, int, pd.Series]] = None,
                hover_data: Optional[Union[str, List[Union[str, int]], pd.Series, dict]] = None,
                custom_data: Optional[Union[str, List[Union[str, int]], pd.Series]] = None,
                error_x: Optional[Union[str, int, pd.Series]] = None,
                error_x_minus: Optional[Union[str, int, pd.Series]] = None,
                error_y: Optional[Union[str, int, pd.Series]] = None,
                error_y_minus: Optional[Union[str, int, pd.Series]] = None,
                error_z: Optional[Union[str, int, pd.Series]] = None,
                error_z_minus: Optional[Union[str, int, pd.Series]] = None,
                animation_frame: Optional[Union[str, int, pd.Series]] = None,
                animation_group: Optional[Union[str, int, pd.Series]] = None,
                category_orders: Optional[dict] = None,
                labels: Optional[dict] = None,
                color_discrete_sequence: Optional[List[str]] = None,
                color_continuous_scale: Optional[List[str]] = None,
                opacity: Optional[float] = None,
                log_x: bool = False,
                log_y: bool = False,
                log_z: bool = False,
                range_x: Optional[List[float]] = None,
                range_y: Optional[List[float]] = None,
                range_z: Optional[List[float]] = None,
                title: Optional[str] = None,
                template: Optional[Union[str, dict]] = None,
                width: Optional[int] = None,
                height: Optional[int] = None,
                theme: str = 'darkorange'):
    """
    Interactive 3D multivariate scatter plot visualization `Plotly Module`

    The figure is displayed with ``fig.show()``; nothing is returned.

    Parameters
    ----------
    x: str
        Name of column that goes to `x` axis
    y: str
        Name of column that goes to `y` axis
    z: str or None
        Name of column that goes to `z` axis. If `None`,
        z-axis is the ``target`` variable, unless the ``target`` is
        already assigned to ``x`` or ``y`` in which case ``z`` stays `None`.
    olrs_idx: pandas index, list or None
        Index of outlier data points. Labels that are not present in the
        plotted dataframe are ignored.
    color: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        assign color to marks.
    symbol: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        assign symbols to marks.
    symbol_sequence: list of str
        Strings should define valid plotly.js symbols. When ``symbol`` is set,
        values in that column are assigned symbols by cycling through
        ``symbol_sequence`` in the order described in ``category_orders``, unless
        the value of ``symbol`` is a key in ``symbol_map``.
    symbol_map: dict with str keys and str values (default `{}`)
        String values should define plotly.js symbols. Used to override
        ``symbol_sequence`` to assign a specific symbols to marks corresponding
        with specific values. Keys in ``symbol_map`` should be values in the
        column denoted by ``symbol``. Alternatively, if the values of ``symbol``
        are valid symbol names, the string `'identity'` may be passed to cause
        them to be used directly.
    size: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        assign mark sizes.
    size_max: int (default `20`)
        Set the maximum mark size when using ``size``.
    text: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like appear in the
        figure as text labels.
    hover_name: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like appear in bold
        in the hover tooltip.
    hover_data: str, or list of str or int, or Series or array-like, or dict
        Either a name or list of names of columns in `data_frame`, or pandas
        Series, or array_like objects or a dict with column names as keys, with
        values `True` (for default formatting) `False` (in order to remove this
        column from hover information), or a formatting string, for example
        `':.3f'` or `'|%a'` or list-like data to appear in the hover tooltip or
        tuples with a bool or formatting string as first element, and list-like
        data to appear in hover as second element. Values from these columns
        appear as extra data in the hover tooltip. If not provided, numeric
        active columns are formatted as `':,.3f'` and the dataframe index
        is displayed.
    custom_data: str, or list of str or int, or Series or array-like
        Either name or list of names of columns in `data_frame`, or pandas
        Series, or array_like objects. Values from these columns are extra data,
        to be used in widgets or Dash callbacks for example. This data is not
        user-visible but is included in events emitted by the figure (lasso
        selection etc.)
    error_x: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        size x-axis error bars. If ``error_x_minus`` is `None`, error bars will
        be symmetrical, otherwise `error_x` is used for the positive direction
        only.
    error_x_minus: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        size x-axis error bars in the negative direction. Ignored if ``error_x``
        is `None`.
    error_y: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        size y-axis error bars. If ``error_y_minus`` is `None`, error bars will
        be symmetrical, otherwise ``error_y`` is used for the positive direction
        only.
    error_y_minus: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        size y-axis error bars in the negative direction. Ignored if ``error_y``
        is `None`.
    error_z: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        size z-axis error bars. If ``error_z_minus`` is `None`, error bars will
        be symmetrical, otherwise ``error_z`` is used for the positive direction
        only.
    error_z_minus: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        size z-axis error bars in the negative direction. Ignored if ``error_z``
        is `None`.
    animation_frame: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        assign marks to animation frames.
    animation_group: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used to
        provide object-constancy across animation frames: rows with matching
        ``animation_group`` will be treated as if they describe the same object
        in each frame.
    category_orders: dict with str keys and list of str values (default `{}`)
        By default, in Python 3.6+, the order of categorical values in axes,
        legends and facets depends on the order in which these values are first
        encountered in `data_frame` (and no order is guaranteed by default in
        Python below 3.6). This parameter is used to force a specific ordering
        of values per column. The keys of this dict should correspond to column
        names, and the values should be lists of strings corresponding to the
        specific display order desired.
    labels: dict with str keys and str values (default `{}`)
        By default, column names are used in the figure for axis titles, legend
        entries and hovers. This parameter allows this to be overridden. The
        keys of this dict should correspond to column names, and the values
        should correspond to the desired label to be displayed.
    color_discrete_sequence: list of str
        Strings should define valid CSS-colors. When ``color`` is set and the
        values in the corresponding column are not numeric, values in that
        column are assigned colors by cycling through ``color_discrete_sequence``
        in the order described in ``category_orders``, unless the value of
        ``color`` is a key in ``color_discrete_map``. Various useful color
        sequences are available in the `plotly.express.colors` submodules,
        specifically `plotly.express.colors.qualitative`.
    color_continuous_scale: list of str
        Strings should define valid CSS-colors. This list is used to build a
        continuous color scale when the column denoted by `color` contains
        numeric data. Various useful color scales are available in the
        `plotly.express.colors` submodules, specifically
        `plotly.express.colors.sequential`, `plotly.express.colors.diverging`
        and `plotly.express.colors.cyclical`.
    opacity: float or None
        Value between 0 and 1. Sets the opacity for markers.
    log_x: boolean (default `False`)
        If `True`, the x-axis is log-scaled in cartesian coordinates.
    log_y: boolean (default `False`)
        If `True`, the y-axis is log-scaled in cartesian coordinates.
    log_z: boolean (default `False`)
        If `True`, the z-axis is log-scaled in cartesian coordinates.
    range_x: list of two numbers
        If provided, overrides auto-scaling on the x-axis in cartesian
        coordinates.
    range_y: list of two numbers
        If provided, overrides auto-scaling on the y-axis in cartesian
        coordinates.
    range_z: list of two numbers
        If provided, overrides auto-scaling on the z-axis in cartesian
        coordinates.
    title: str
        The figure title.
    template: str or dict or plotly.graph_objects.layout.Template instance
        The figure template name (must be a key in plotly.io.templates) or
        definition.
    width: int (default `None`)
        The figure width in pixels.
    height: int (default `None`)
        The figure height in pixels.
    theme: str
        adjust axis and title colors as desired
    """
    # Setting target variable to z-axis when it is not already on x or y
    if not z and self._target not in (x, y):
        z = self._target

    # Editing Main Scatter hover template
    args = [x, y, z, color, symbol, size]  # all column arguments ignoring facets
    active_args, numeric, _ = self._plotly_args(args)
    if not hover_data:
        # Apply proper formatting and display index
        # for main scatter plot
        hover_data = {k: ':,.3f' for k in numeric} | {'index': (':,.0f', self.z_df_.index)}

    # set colors if not already
    c_disc = color_discrete_sequence if color_discrete_sequence else colors.qualitative.Bold_r
    c_cont = color_continuous_scale if color_continuous_scale else colors.sequential.YlOrRd

    # 3D scatter plot
    fig = scatter_3d(self.z_df_, x = x, y = y, z = z, color = color, symbol = symbol,
                     symbol_sequence = symbol_sequence, symbol_map = symbol_map,
                     size = size, size_max = size_max, text = text, hover_name = hover_name,
                     hover_data = hover_data, custom_data = custom_data, error_x = error_x,
                     error_x_minus = error_x_minus, error_y = error_y, error_y_minus = error_y_minus,
                     error_z = error_z, error_z_minus = error_z_minus, animation_frame = animation_frame,
                     animation_group = animation_group, category_orders = category_orders, labels = labels,
                     color_discrete_sequence = c_disc, color_continuous_scale = c_cont,
                     opacity = opacity, log_x = log_x, log_y = log_y, log_z = log_z, range_x = range_x,
                     range_y = range_y, range_z = range_z, title = title, template = template, width = width,
                     height = height)

    # overlay outliers
    if olrs_idx is not None:
        # keep only the outlier labels that exist in the plotted dataframe
        idx = np.array(olrs_idx)[np.isin(olrs_idx, self.z_df_.index)]
        if len(idx):  # edge case: using `frac` and full index
            # Editing Outliers Scatter hover template
            # `meta` holds the column names referenced from the hovertemplate
            meta = active_args
            # fetching all active arguments from outlier dataframe
            outliers_df = self.z_df_.loc[idx].copy()
            customdata = np.stack([outliers_df[name] for name in meta], axis = -1)
            # Numeric columns use the same ':,.3f' precision as the main trace;
            # the previous ':,.0f' rounded outlier values to whole numbers.
            # <extra></extra> removes the trace name from the hover box
            hover_lines = [
                f'%{{meta[{i}]}}: %{{customdata[{i}]:,.3f}}' if name in numeric
                else f'%{{meta[{i}]}}: %{{customdata[{i}]}}'
                for i, name in enumerate(meta)
            ]
            hovertemplate = '<br>'.join(hover_lines) + '<br>Index: %{text}<extra></extra>'
            fig.add_scatter3d(x = outliers_df[x], y = outliers_df[y], z = outliers_df[z],
                              meta = meta, customdata = customdata, mode = 'markers',
                              marker = dict(size = 10, color = 'yellow', line = dict(color = 'red', width = 4)),
                              hovertemplate = hovertemplate, text = outliers_df.index, name = 'outliers'
                              )

    if not title:
        title = f'3D Visualization of {self._target}'

    # Decorate
    fig.update_layout(dict(paper_bgcolor = 'rgba(0,0,0,0)', plot_bgcolor = 'rgba(0,0,0,0)',
                           # 3d scene decoration
                           scene = dict(
                               aspectratio = dict(x = 1.5, y = 1.5, z = 1),  # axes ratio
                               # axes decoration
                               xaxis = dict(backgroundcolor = 'darkred', showgrid = False,
                                            color = 'darkred'),
                               yaxis = dict(backgroundcolor = 'darkslateblue', showgrid = False,
                                            color = 'darkslateblue'),
                               zaxis = dict(backgroundcolor = 'darkolivegreen', showgrid = False,
                                            color = 'darkolivegreen')),
                           hoverlabel = dict(bgcolor = 'brown',
                                             font = dict(family = 'Rockwell', size = 16, color = 'moccasin')),
                           legend = dict(orientation = 'h', xanchor = 'right', x = 1,
                                         title_font_family = 'Arial', font = dict(family = 'Rockwell',
                                                                                  size = 14, color = 'lemonchiffon'),
                                         bgcolor = 'dimgrey', bordercolor = 'Black', borderwidth = 2),
                           title = dict(text = title, x = 0.5, y = .99, xref = 'paper',
                                        font = dict(family = 'Arial', size = 20, color = theme))))
    fig.update_traces(marker = dict(line = dict(width = 2)))  # scatter markers decoration
    fig.show()
def _input_validation(self, disp_corr: Optional[str] = None):
""" validate inputs before execution """
if not len(self._cols):
raise AttributeError("No feature to analyze! Please ensure that input columns are valid")
if disp_corr and disp_corr not in ['pearson', 'spearman']:
raise ValueError("'disp_corr' parameter must be one of the following arguments: 'pearson' or 'spearman', "
f"however, '{disp_corr}' was received!")
def _heat_plot(self, plot_data, lim, title, feat_corr = False):
    """
    Draw one correlation heatmap for the next ``lim`` rows of ``plot_data``.

    Rows ``self._n`` to ``self._n + lim`` are sliced and transposed, so
    the sliced rows end up on the x-axis. The figure is displayed with
    ``fig.show()`` and ``self._n`` is advanced by ``lim`` so that the
    next call continues where this one stopped.

    Parameters
    ----------
    plot_data: pandas dataframe
        Correlation results. When ``feat_corr`` is `False` its columns
        are read in pairs: coefficient column followed by p-value column.
    lim: int
        Number of rows of ``plot_data`` displayed per heatmap.
    title: str
        The figure title.
    feat_corr: bool
        If `True` all values are displayed as coefficients and `NaN`
        cells are shown empty, otherwise every second row of the
        transposed slice is treated as p-values shown on hover.
    """
    # limit features per map
    window = plot_data[self._n: self._n + lim].T
    raw = window.values
    data = raw.round(2)  # displayed coefficients are rounded to 2 decimals
    x = window.columns

    # heat map data and annotations
    if not feat_corr:
        z = data[::2, :]  # coef values
        y = window.index[::2]  # corr text
        # p-values are taken un-rounded: the hover template formats them
        # with 3 decimals, which the 2-decimal rounding used to truncate
        customdata = raw[1::2, :]
        hovtemp = '<br>'.join([f'{self._target} vs %{{x}}',
                               'pval = %{customdata:.3f}<extra></extra>'])
    else:
        z = np.where(np.isnan(data), '', data)  # hide values < thresh
        y = window.index
        customdata = None
        hovtemp = '<br>'.join(['%{y} vs %{x}<extra></extra>'])

    # one text annotation per heatmap cell
    annotations = [layout.Annotation(text = str(value), x = x[col_idx], y = y[row_idx],
                                     bgcolor = 'black', bordercolor = 'purple',
                                     showarrow = False,
                                     font = dict(family = 'Arial', size = 11, color = 'oldlace'))
                   for row_idx, row in enumerate(z) for col_idx, value in enumerate(row)]

    # Heatmaps
    fig = Figure(
        data = [Heatmap(z = z, x = x, y = y, zmin = -1, zmax = 1, customdata = customdata,
                        hovertemplate = hovtemp, colorscale = 'icefire', showscale = False
                        )])

    # Decorate
    fig.update_layout(
        dict(paper_bgcolor = 'rgba(0,0,0,0)', plot_bgcolor = 'rgba(0,0,0,0)',
             hoverlabel = dict(bgcolor = 'darkred'), annotations = annotations,
             title = dict(text = title, x = 0.5, xref = 'paper',
                          font = dict(family = 'Arial', size = 20, color = self._theme))
             ))
    fig.update_xaxes(showgrid = False, color = self._theme)
    fig.update_yaxes(showgrid = False, color = self._theme)
    fig.show()

    self._n += lim  # update iteration control and slicing indices
def _plot_data(self, df, col) -> Tuple[np.ndarray, np.ndarray]:
""" Data used to plot generated predictions """
lim = np.linspace(df[col].min(), df[col].max(), 100)
grid = np.c_[np.ones(len(lim)), lim]
x = grid[:,1]
return x, grid
def _logistic_pred(self, grid, model) -> Tuple[np.ndarray]:
""" Generate predictions from fitted logistic regression model """
# Use a constant logit of zero for the first class in multiclass
# because the reference class is never predicted directly
logits = np.c_[np.zeros(len(grid)), grid.dot(model.params)]
# class probability
# using softmax function
y_pred = np.exp(logits) / (np.sum(np.exp(logits), axis = 1, keepdims = 1)) # softmax = sigmoid for binary class
y_pred = y_pred[:,-1] # positive class label for binary or last label for multiclass
return y_pred
def _func_text(self, params, poly = False, text_wrap = False)-> str:
    """
    Format regression equation and model parameters text.

    Parameters
    ----------
    params: sequence of float
        model coefficients, constant term first
    poly: bool
        If False the coefficients are joined into a linear ``f(x) = ...x``
        expression. If True a polynomial ``P(x) = ...`` is written with the
        i-th coefficient multiplying x to the power i.
    text_wrap: bool
        Only applicable when ``poly`` is True. If True the equation is wrapped
        at 70 characters, the lines are joined with a dollar-newline-dollar
        separator and powers are written with ``^`` (plot title). If False
        powers keep their HTML ``<sup>`` markup (plotly hovertemplate).

    Returns
    -------
    str
        formatted equation, every coefficient shown with 3 decimals
    """
    # replacements run in insertion order;
    # a plus sign followed by a negative coefficient reads as a subtraction
    replace_map = {"+ -": "- "}
    if not poly:
        coef_list = [f"{coef:,.3f}" for coef in params]
        equation = "f(x) = " + " + ".join(coef_list) + "x"
    else:
        coef_list = [f"{coef:,.3f}x<sup>{i}</sup>" for i, coef in enumerate(params)]
        equation = "P(x) = " + " + ".join(coef_list)
        # simplify the constant and linear terms while the closing tag still
        # delimits the exponent, so that x^10, x^11, ... are left untouched
        replace_map = replace_map | {"x<sup>0</sup>": "", "x<sup>1</sup>": "x"}
        if text_wrap: # seaborn plot title
            equation = "$ \n $".join(wrap(equation, 70))
            replace_map = replace_map | {"<sup>": "^", "</sup>": "",
                                         "+$ \n $-": "-$ \n $"}
    for old, new in replace_map.items():
        equation = equation.replace(old, new)
    return equation
def _plotly_args(self, args) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Split the plot arguments in use into numeric and categorical features.

    Parameters
    ----------
    args: iterable
        column names passed as plot arguments; falsy entries are treated
        as unused and dropped, duplicates are collapsed

    Returns
    -------
    active_args: numpy array
        unique arguments in use, in no particular order
    numeric: numpy array
        active arguments whose column in ``self.z_df_`` has dtype kind
        `i`, `f` or `c`
    categorical: numpy array
        the remaining active arguments
    """
    # a set drops duplicates; order does not need to be preserved
    active_args = np.array(list({name for name in args if name}))
    # column dtypes of active arguments
    numeric_names = []
    for name in active_args:
        if self.z_df_[name].dtype.kind in 'ifc':
            numeric_names.append(name)
    numeric = np.array(numeric_names)
    categorical = active_args[np.isin(active_args, numeric, invert = True)]
    return active_args, numeric, categorical
def _facet_map(self, category_orders, facet_row, facet_col, facet_col_wrap = None) -> Optional[Tuple[dict, dict]]:
    """
    Map category orders to their corresponding row and column positions within plotly facet grid.

    Rows are numbered bottom-up (the last row category gets row 1), matching
    plotly's facet row numbering, while columns are numbered left to right
    starting at 1.

    Parameters
    ----------
    category_orders: dict
        maps a feature name to the ordered sequence of its categories
    facet_row: str or None
        name of the feature faceted across rows
    facet_col: str or None
        name of the feature faceted across columns
    facet_col_wrap: int or None
        Wraps the column variable at this width. Ignored if ``facet_row`` is set.

    Returns
    -------
    row_map: dict or None
        category -> row position, None when rows are not faceted
    col_map: dict or None
        category -> column position, None when columns are not faceted
    """
    def positions(categories):
        # 1-based position of every category in the given order
        return {cat: pos for pos, cat in enumerate(categories, start = 1)}

    row_map = col_map = None
    if facet_col and facet_row:
        row_map = positions(list(category_orders[facet_row])[::-1])
        col_map = positions(category_orders[facet_col])
    elif facet_col:
        categories = list(category_orders[facet_col])
        if facet_col_wrap:
            # categories fill the grid left to right, top to bottom; the top
            # row carries the highest row number and only the bottom row may
            # be partially filled
            n_rows = -(-len(categories) // facet_col_wrap) # ceiling division
            row_map = {cat: n_rows - idx // facet_col_wrap for idx, cat in enumerate(categories)}
            col_map = {cat: idx % facet_col_wrap + 1 for idx, cat in enumerate(categories)}
        else:
            col_map = positions(categories)
    elif facet_row:
        row_map = positions(list(category_orders[facet_row])[::-1])
    return row_map, col_map
def _fit_models(self, col, target, fit_data, fit, degree, row_idx = None, col_idx = None,
comb = None, groupers = None, log_x = False, log_y = False, model = False) -> Optional[Tuple[np.ndarray, str]]:
"""
Generate fit data for Logistic and Polynomial fits in plotly scatter plots.

Fits a model of ``target`` on ``col`` (or reuses ``model`` when one is passed),
evaluates it on the grid returned by ``_plot_data`` and builds the hover text
and trace name of the fitted curve. The model is appended to ``self._models``.
Any exception raised along the way is swallowed: the failure is recorded in
``self._fit_fail`` when ``comb`` is given, otherwise an info message is logged.

Parameters
----------
col: str
    feature column name (x values)
target: str
    target column name (y values)
fit_data: data source
    must hold ``col`` and ``target``; passed to ``mnlogit`` as ``data``
    or indexed by column for the polynomial fit
fit: str
    `logistic` fits a logistic regression with ``mnlogit``, any other
    value fits a polynomial with numpy ``polyfit``
degree: int
    degree of the polynomial fit; a value above 1 also switches the
    equation text to its polynomial form
row_idx, col_idx:
    stored under `row` / `col` in the failure record
    (presumably the facet grid position, confirm against caller)
comb: sequence or None
    group values of the subset being fitted
groupers: sequence or None
    group names aligned with ``comb``; falsy names are skipped in the hover text
log_x, log_y: bool
    if True the x values / predictions are raised as powers of 10
model: fitted model, coefficients array or False
    previously fitted model to reuse instead of fitting a new one

Returns
-------
x, y_pred, hovertemplate, name
    a 4-tuple; ``y_pred``, ``hovertemplate`` and ``name`` are None
    when a failure occurred before they were computed
"""
# plot data
x, grid = self._plot_data(fit_data, col)
# generic return values
y_pred = hovertemplate = name = None
# indications of groups for each fit
gs = [f'{groupers[i]}: {comb[i]}' for i, _ in enumerate(groupers) if _] if groupers else []
# fitting, formatting and bookkeeping are all best-effort, see the except clause
try:
if fit == 'logistic':
# fit only when no fitted model was handed in
if not model:
model = mnlogit(formula = f'Q("{target}")~Q("{col}")', data = fit_data).fit(disp = 0, method = self._method,
**self._kwargs)
y_pred = self._logistic_pred(grid, model)
pr = model.prsquared
llrp = model.llr_pvalue
if self._binary_t:
paras = model.params.values.ravel()
y_hover = 'P(y = 1): %{y:,.0%}<b>(probability)</b><extra></extra>' # <extra></extra> remove trace name
name = f'{target} Logistic Fit'
else:
# multiclass: report the parameters and label of the last class only
paras = model.params.values[:,-1]
y_hover = 'P(y = c): %{y:,.0%}<b>(probability)</b><extra></extra>'.replace('c', f'{self._lbl[-1]}')
name = f'{target} Logistic Fit: {self._lbl[-1]} vs {self._lbl[0]}(base)'
# decorate
hover_text = [f'pR<sup>2</sup> = {pr:,.3f}',
f'llrp = {llrp:,.4f}</b><br>',
*gs,
'col: %{x:,.3f}'.replace('col', f'{col}'),
y_hover]
else:
# np.any is used because a reused polynomial model is an array of coefficients
if not np.any(model):
model = np.polynomial.polynomial.polyfit(fit_data[col], fit_data[target], degree)
paras = model
# the polynomial is evaluated on the whole grid; only the last column (the x values) is kept
y_pred = np.polynomial.polynomial.polyval(grid, model)[:,-1]
name = 'Polynomial Fit'
hover_text = [*gs,
'col: %{x:,.3f}'.replace('col', f'{col}'),
'c: %{y:,.3f}<b>(trend)</b><extra></extra>'.replace('c', target)]
# NOTE(review): indentation is lost in this copy of the file, so it is unclear whether
# the two back-transforms below apply to both fits or to the polynomial fit only - confirm
if log_x:
x = np.power(10, x)
if log_y:
y_pred = np.power(10, y_pred)
# decorate
text = self._func_text(paras, poly = True if degree > 1 else False)
# the logistic equation line has its <br> removed before the '<br>'.join below
hovertemplate = '<br>'.join([f'<b>Fit: {fit}</b>',
f'{text}</b><br>'.replace('<br>', '') if fit == 'logistic' else f'{text}</b><br>',
*hover_text
])
# update fit container
# unpack categories tuple and
# poly fit output of array dtype
if self._facets and comb:
self._models.append((*comb, *model) if np.ndim(model) else (*comb, model))
else:
self._models.append(model)
# NOTE(review): a bare except also traps KeyboardInterrupt and SystemExit;
# consider narrowing it to `except Exception`
except:
if comb:
# record which group (and row / col) could not be fitted
self._fit_fail.append(dict(zip(self._g_map.keys(), comb)) | {'row': row_idx, 'col':col_idx})
else:
_z_log.info(f"{fit} was not applied! Please ensure that data to fit matches model requirements")
return x, y_pred, hovertemplate, name