Source code for zaps.eda._num_analysis

import pandas as pd

import numpy as np

import time

from statsmodels.formula.api import ols, mnlogit

from statsmodels.nonparametric.smoothers_lowess import lowess

from scipy import stats

from tqdm.auto import tqdm

from matplotlib import pyplot as plt

from seaborn import regplot, scatterplot, histplot

from plotly.graph_objs import Heatmap, Figure, layout

from plotly.express import scatter, scatter_3d, colors, get_trendline_results

from typing import Optional, Union, Tuple, List

from itertools import product

from textwrap import wrap

from .._utils import SEQUENCE_LIKE, itr_plot, PlotMixin

from .._logr import _z_log

###################################################################

class NumAna(PlotMixin):
    """
    Collection of Numeric features analysis that includes:

    - Regression
    - Correlation
    - visualizations

    Parameters
    ----------
    df: pandas dataframe
        data source

    cols: sequence (lists, tuples, NumPy arrays or Pandas Base Index)
        column names of numeric features

    target: str
        target column name, categorical target will be encoded as integer.

    degree: int
        If ``degree`` is greater than 1, ``fit`` is ignored and polynomial regression is applied
        to the nth ``degree``.

    fit: str or None
        type of regression to fit. One of `ols`, `logit` or `lws`.
        If `ols` then Ordinary Least Squares regression is applied.
        If `logit` then logistic regression will be fitted,
        if `lws` then Locally Weighted Scatterplot Smoothing non-parametric regression is applied.
        If `None` it's either Ordinary Least Squares or logistic regression based on type of ``target``.

    method: str
        Only applicable when ``fit`` = `logit`.
        The following solvers from `scipy.optimize` are accepted:

        - **newton** for Newton-Raphson
        - **nm** for Nelder-Mead
        - **bfgs** for Broyden-Fletcher-Goldfarb-Shanno (BFGS)
        - **lbfgs** for limited-memory BFGS with optional box constraints
        - **powell** for modified Powell's method
        - **cg** for conjugate gradient
        - **ncg** for Newton-conjugate gradient
        - **basinhopping** for global basin-hopping solver
        - **minimize** for generic wrapper of scipy minimize (BFGS by default)

        Note
        ----
        Each solver has several optional unique arguments. See ``**kwargs`` parameter below
        (or scipy.optimize) for the available arguments that each solver supports.

    lowess_frac: float
        Between 0 and 1. The fraction of the data used when estimating each y-value for lowess fit.

    it: int
        The number of residual-based reweightings to perform for lowess fit.

    delta: float
        Distance within which to use linear-interpolation instead of weighted regression for lowess fit.
        `'delta'` can be used to save computations. For each `x_i`, regressions are skipped for points
        closer than ``delta``. The next regression is fit for the farthest point within delta of `x_i`
        and all points in between are estimated by linearly interpolating between the two regression fits.
        Judicious choice of delta can cut computation time considerably for large data (N > 5000).
        A good choice is ``delta`` = 0.01 * range(x).

    nans_d: dict or None
        dictionary where keys are column names and values are missing(nan) replacements.
        To perform multiple imputation for several numeric columns.

    frac: float or None
        fraction of dataframe to use as a sample for analysis:

        - 0 < ``frac`` < 1 returns a random sample with size ``frac``.
        - ``frac`` = 1 returns shuffled dataframe.
        - ``frac`` > 1 up-sample the dataframe, sampling of the same row more than once.

    random_state: int
        for reproducibility, controls the random number generator for ``frac`` parameter.

    figsize: tuple or None
        dimensions of matplotlib figure (width, height)

    n_rows: int
        number of rows in matplotlib subplot figure

    n_cols: int
        number of columns in matplotlib subplot figure

    silent: Bool
        solicit user input for continuation during iterative plotting.
        If `True`, plotting proceeds without user interaction.

    hide_p_bar: Bool
        triggers hiding progress bar (tqdm module); Default 'False'

    theme: str
        adjust axis and title colors as desired

    Keyword Args
    ------------
    warn_convergence: bool, optional
        If True, checks the model for the converged flag. If the converged flag is False,
        a ConvergenceWarning is issued.

    All other kwargs are passed to the chosen solver.

    newton
        tol: float
            Relative error in params acceptable for convergence.

    nm -- Nelder Mead
        xtol: float
            Relative error in params acceptable for convergence
        ftol: float
            Relative error in loglike(params) acceptable for convergence
        maxfun: int
            Maximum number of function evaluations to make.

    bfgs
        gtol: float
            Stop when norm of gradient is less than gtol.
        norm: float
            Order of norm (np.inf is max, -np.inf is min)
        epsilon
            If fprime is approximated, use this value for the step size.
            Only relevant if LikelihoodModel.score is None.

    lbfgs
        m: int
            This many terms are used for the Hessian approximation.
        factr: float
            A stop condition that is a variant of relative error.
        pgtol: float
            A stop condition that uses the projected gradient.
        epsilon
            If fprime is approximated, use this value for the step size.
            Only relevant if LikelihoodModel.score is None.
        maxfun: int
            Maximum number of function evaluations to make.
        bounds: sequence
            (min, max) pairs for each element in x, defining the bounds on that parameter.
            Use None for one of min or max when there is no bound in that direction.

    cg
        gtol: float
            Stop when norm of gradient is less than gtol.
        norm: float
            Order of norm (np.inf is max, -np.inf is min)
        epsilon: float
            If fprime is approximated, use this value for the step size. Can be scalar or vector.
            Only relevant if Likelihoodmodel.score is None.

    ncg
        fhess_p: callable f'(x,*args)
            Function which computes the Hessian of f times an arbitrary vector, p.
            Should only be supplied if LikelihoodModel.hessian is None.
        avextol: float
            Stop when the average relative error in the minimizer falls below this amount.
        epsilon: float or ndarray
            If fhess is approximated, use this value for the step size.
            Only relevant if Likelihoodmodel.hessian is None.

    powell
        xtol: float
            Line-search error tolerance
        ftol: float
            Relative error in loglike(params) for acceptable for convergence.
        maxfun: int
            Maximum number of function evaluations to make.
        start_direc: ndarray
            Initial direction set.

    basinhopping
        niter: int
            The number of basin hopping iterations.
        niter_success: int
            Stop the run if the global minimum candidate remains the same for this number of iterations.
        T: float
            The "temperature" parameter for the accept or reject criterion. Higher "temperatures" mean
            that larger jumps in function value will be accepted. For best results `T` should be
            comparable to the separation (in function value) between local minima.
        stepsize: float
            Initial step size for use in the random displacement.
        interval: int
            The interval for how often to update the `stepsize`.
        minimizer: dict
            Extra keyword arguments to be passed to the minimizer `scipy.optimize.minimize()`,
            for example 'method' - the minimization method (e.g. 'L-BFGS-B'), or 'tol' - the tolerance
            for termination. Other arguments are mapped from explicit argument of `fit`:

            - `args` <- `fargs`
            - `jac` <- `score`
            - `hess` <- `hess`

    minimize
        min_method: str, optional
            Name of minimization method to use. Any method specific arguments can be passed directly.
            For a list of methods and their arguments, see documentation of `scipy.optimize.minimize`.
            If no method is specified, then BFGS is used.

    Attributes
    ----------
    z_inf_out : numpy array
        excluded columns having `inf` values, if any.

    z_nans: numpy array
        numeric column names where imputation of `nan` values took place.

    z_df_: pandas dataframe
        preprocessed dataframe that was used internally
    """

    def __init__(self,
                 df: pd.DataFrame,
                 cols: SEQUENCE_LIKE,
                 target: str,
                 degree: int = 1,
                 fit: Optional[str] = None,
                 method: str = 'newton',
                 lowess_frac: float = 2/3,
                 it: int = 3,
                 delta: float = 0.0,
                 nans_d: Optional[dict] = None,
                 frac: Optional[float] = None,
                 random_state: int = 45,
                 figsize: Optional[Tuple[int, int]] = None,
                 n_rows: Optional[int] = None,
                 n_cols: Optional[int] = None,
                 silent: bool = False,
                 hide_p_bar: bool = False,
                 theme: str = 'darkorange',
                 **kwargs):

        # input checks
        if not isinstance(cols, (list, tuple, np.ndarray, pd.Index)):
            raise TypeError("'cols' parameter accepts a sequence e.g: Lists, Tuples, NumPy Arrays or Pandas Base Index. "
                            f"However, '{type(cols)}' was received!")

        if not isinstance(target, str):
            raise TypeError("Please pass target column name as string")

        # Membership is tested label by label: concatenating `cols` with the string
        # `target` into one NumPy array would coerce non-string labels (e.g. integer
        # column names) to strings and wrongly report them as missing.
        if any(label not in df.columns for label in [*cols, target]):
            raise KeyError("Missing columns! Please ensure that all columns to analyze are included in the DataFrame")

        if any(df[col].dtype.kind not in 'ifc' for col in cols):
            raise ValueError("All 'cols' must be numeric please refer to parameter description")

        if fit and fit not in ['ols', 'logit', 'lws']:
            raise ValueError("'fit' parameter accepts the following arguments: 'ols', 'logit' or 'lws', "
                             f"however, '{fit}' was received!")

        if nans_d:
            if not isinstance(nans_d, dict):
                raise TypeError("Please pass 'nans_d' parameter as a dictionary. Example: {'column name': imputation value}")

            # same label-by-label test as above, keys may be of mixed types
            if any(key not in df.columns for key in nans_d):
                raise KeyError("Missing columns! Please ensure that all columns to impute are included in the DataFrame")

        # work on a copy (optionally a sample) to avoid editing original dataframe
        if frac:
            # sampling with replacement is only required when up-sampling
            self.z_df_ = df.sample(frac = frac, replace = frac > 1, random_state = random_state).copy()
        else:
            self.z_df_ = df.copy()

        # handling infs and nans
        self._cols = np.array(cols)
        self._slash_n_impute(nans_d)

        # check and assign correct fits
        target_kind = self.z_df_[target].dtype.kind
        str_target = target_kind in 'bO'
        # integer targets with few distinct values are treated as categorical
        cat_target = (target_kind in 'i' and self.z_df_[target].nunique() <= 20) or str_target

        if degree > 1:
            _z_log.info("Polynomial Least Squares fit will be applied")
            self._fit = None
        elif not fit:  # assign generic fits
            if cat_target:
                self._fit = 'logit'
                _z_log.info("logistic fit will be applied")
            else:
                self._fit = 'ols'
                _z_log.info("Ordinary Least Squares fit will be applied")
        elif fit == 'logit' and not cat_target:  # switch to regression
            self._fit = 'ols'
            _z_log.info(f"Logistic fit will not be applied rather OLS instead! `{target}` is of high cardinality.")
        else:
            self._fit = fit
            if fit == 'ols' and cat_target:
                _z_log.warning(f"OLS fit will be applied on `{target}` that appears to be categorical.")

        # encoding categorical target as integer
        # checking for string only because OLS is allowed on categorical target
        if str_target:
            nums, labels = self.z_df_[target].factorize()
            # assign new values
            self.z_df_[target] = nums
            # used to capture last class label when added to plot text
            # discrete values are assigned following order of appearance not alphabetically
            # so last label = highest discrete value
            # left unsorted to match same order when capturing fitted model parameters
            # as these are sorted ascendingly
            # TODO: prompt user for sorting?
            self._lbl = labels

        if self._fit == 'logit':
            if not hasattr(self, '_lbl'):  # discrete target
                # unlike earlier this needs sorting to match fitted model parameters
                self._lbl = np.sort(self.z_df_[target].unique())
            self._binary_t = len(self._lbl) == 2

        # attributes
        self._target = target
        self._degree = degree
        self._method = method
        self._lowess_frac = lowess_frac
        self._it = it
        self._delta = delta
        self._hide_p_bar = hide_p_bar
        self._figsize = figsize
        self._n_rows = n_rows
        self._n_cols = n_cols
        self._silent = silent
        self._theme = theme
        self._kwargs = kwargs
def corr(self,
         disp_corr: str = 'pearson',
         quant: float = .75,
         thresh: Optional[float] = None,
         alpha: Optional[float] = None,
         plot: bool = False,
         ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Calculate Pearson (linear) and Spearman (monotonic) correlation and generate heat-map visualizations;

    Visualizations `Plotly Module`:

    - v1: Feature ``cols`` vs Target correlation either overall or for significant results only
      (P-value <= ``alpha``).
    - v2: Feature correlation for a selection of highly correlated features with target.

    Note
    ----
    to check correlation for categorical features, encode categories as integers first
    (e.g.: LabelEncoder, OrdinalEncoder, ...).

    Parameters
    ----------
    disp_corr: str
        One of ['pearson', 'spearman'], correlation method to be used in:

        - calculating correlation between features
        - sorting v1

    quant: float
        proportion of features to be used in calculating feature correlation and display in v2;
        default is top 25% (> q3) of features that are highly correlated with ``target``

    thresh: float or None
        minimum correlation strength between features to display in v2;
        if not `None`, only display correlation >= ``thresh``

    alpha: float or None
        Significance alpha for rejecting null hypothesis (e.g.:0.05).
        if not `None`, V1 display features with significant results only
        (corr coef p-value <= ``alpha``)

    plot: Bool
        whether to run visualizations or not

    Returns
    -------
    corr_df: pandas dataframe
        correlation coefficient and p-value for each feature vs ``target``;
        returned transposed, i.e. statistics as rows and features as columns.

    feat_corr_df: pandas dataframe
        correlation coefficient of highly correlated features, only ``quant`` features are included
    """

    # input checks
    self._input_validation(disp_corr = disp_corr)

    # Labels and statistics are collected separately: packing them into a single
    # NumPy array would cast every label to a string, so non-string column labels
    # could no longer be used to index the dataframe afterwards.
    labels, stats_rows = [], []
    for col in tqdm(self._cols, desc = 'Calculating Corr....', disable = self._hide_p_bar):
        r_pea, p_pea = stats.pearsonr(self.z_df_[self._target], self.z_df_[col])
        r_spr, p_spr = stats.spearmanr(self.z_df_[self._target], self.z_df_[col])
        labels.append(col)
        stats_rows.append((r_pea, p_pea, r_spr, p_spr))

    # Feature vs Target Correlation Dataframe
    corr_df = pd.DataFrame(stats_rows,
                           columns = ['pearson', 'p_val_pear', 'spearman', 'p_val_spr'],
                           index = labels).sort_values(disp_corr, ascending = False)

    # highly correlated features (>= q?)
    abs_mask = corr_df[disp_corr].abs()
    feat_corr_df = self.z_df_[corr_df.index[abs_mask >= abs_mask.quantile(quant)]].corr(disp_corr)

    if plot:
        if alpha:
            # plotting correlations' statistically significant results
            # null hypothesis: two sets of data are uncorrelated/have no ordinal correlation
            # reject Null if P Val <= significance threshold(alpha)
            significant = (corr_df.p_val_pear <= alpha) | (corr_df.p_val_spr <= alpha)
            plot_data = corr_df[significant].sort_values(disp_corr, ascending = False)
        else:
            plot_data = corr_df.sort_values(disp_corr, ascending = False)

        corr_n_cols = len(plot_data)

        # plotting feature correlation
        if thresh:
            corr_plot = feat_corr_df[feat_corr_df.abs() >= thresh]
            # square matrix: if only the diagonal survived the threshold
            # then nothing but self-correlation is left to plot
            only_diagonal = corr_plot.notna().sum().sum() == len(corr_plot)
            if only_diagonal:  # skip plotting as none > thresh
                feat_corr_n_cols = 1
            else:
                feat_hm_title = f'{disp_corr} correlation heatmap (>={thresh:.0%}) of top {1 - quant:.0%} features'
                feat_corr_n_cols = len(corr_plot)
        else:
            corr_plot = feat_corr_df
            feat_hm_title = f'{disp_corr} correlation heatmap of top {1 - quant:.0%} features'
            feat_corr_n_cols = len(corr_plot)

        if corr_n_cols:
            self._n = 0  # control iterations and slicing indices
            lim = 30  # limit plotting to 30 features per map
            title = f'Correlation Heatmap - {self._target}'

            for _ in tqdm(range(int(np.ceil(corr_n_cols / lim))),
                          desc = 'Plotting Target Correlation Heat Maps....', disable = self._hide_p_bar):
                self._heat_plot(plot_data, lim, title)
                if corr_n_cols > self._n:
                    if not self._silent:
                        _z_log.info(f"{corr_n_cols - self._n} out of {corr_n_cols} features remaining, to continue press "
                                    "'Enter' or input any value to exit.")
                        time.sleep(2)
                        if input().strip().lower().replace(' ', ''):
                            break
                    else:
                        _z_log.info(f"{corr_n_cols - self._n} out of {corr_n_cols} features remaining.")
        else:
            _z_log.info("No Significant Correlation Was Noted!")

        if feat_corr_n_cols > 1:
            self._n = 0
            lim = 15  # limit plotting to 15 features per map

            for _ in tqdm(range(int(np.ceil(feat_corr_n_cols / lim))),
                          desc = 'Plotting Feature Correlation Heat Maps....', disable = self._hide_p_bar):
                self._heat_plot(corr_plot, lim, feat_hm_title, feat_corr = True)
                if feat_corr_n_cols > self._n:
                    if not self._silent:
                        _z_log.info(f"{feat_corr_n_cols - self._n} out of {feat_corr_n_cols} features remaining, "
                                    "to continue plotting press 'Enter' or input any value to exit.")
                        time.sleep(2)
                        if input().strip().lower().replace(' ', ''):
                            break
                    else:
                        _z_log.info(f"{feat_corr_n_cols - self._n} out of {feat_corr_n_cols} features remaining.")
        else:
            _z_log.info(f"No high correlation among top {1 - quant:.0%} features that are highly correlated with `target`")

    return corr_df.T.copy(), feat_corr_df
def fit_models(self):
    """
    Univariate model fitting:

    - Polynomial regression
    - Ordinary Least Squares regression
    - Locally Weighted Scatterplot Smoothing non-parametric regression
    - Logistic regression

    One model is fitted per column in ``cols`` against ``target``. A column whose fit raises
    an error is skipped and reported in ``z_fit_out_`` rather than aborting the whole run.

    Attributes
    ----------
    z_fit_results_: dict
        where keys are ``cols`` and values are fitted regression model(s).

    z_fit_out_: numpy array
        excluded ``cols``, if any, causing regression fit errors.

    Returns
    -------
    self
        the instance itself, to allow method chaining.
    """

    # input checks
    self._input_validation()

    # attributes
    self.z_fit_results_ = {}  # fitted models
    fit_out = []  # features not fitted

    for col in tqdm(self._cols, desc = 'Fitting Models....', disable = self._hide_p_bar):
        try:
            # fit models
            if self._degree > 1:
                # Least-squares fit of a polynomial of nth degree
                model = np.polynomial.polynomial.polyfit(self.z_df_[col], self.z_df_[self._target], self._degree)
            elif self._fit == 'ols':
                # Ordinary Least Squares Regression
                # Q("{}")' controls column names having numbers and spaces
                # adds constant automatically
                model = ols(formula = f'Q("{self._target}")~Q("{col}")', data = self.z_df_).fit()
            elif self._fit == 'logit':
                # logistic Regression
                # binary or multiclass
                model = mnlogit(formula = f'Q("{self._target}")~Q("{col}")',
                                data = self.z_df_).fit(disp = 0, method = self._method, **self._kwargs)
            else:
                # Locally Weighted Scatterplot Smoothing non-parametric regression
                model = lowess(self.z_df_[self._target], self.z_df_[col],
                               frac = self._lowess_frac, it = self._it, delta = self._delta).T

            self.z_fit_results_[col] = model
        except Exception:
            # best-effort: a failing feature is recorded and skipped.
            # `Exception` (not a bare `except`) so that KeyboardInterrupt/SystemExit
            # still stop a long-running fit instead of being silently swallowed.
            fit_out.append(col)

    if fit_out:
        _z_log.info(f"{len(fit_out)} out of {len(self._cols)} features were not fit! "
                    "Please ensure that data to fit matches model requirements.")

    # always refreshed so that results of a previous call never linger
    self.z_fit_out_ = np.asarray(fit_out)

    return self
@itr_plot(n_cols = 6, figsize = (24, 11))
def vis_fit(self,
            olrs_idx: Optional[Tuple[pd.core.indexes.base.Index, list]] = None,
            olrs_mapping: Optional[dict] = None,
            x_jitter: Optional[float] = None,
            y_jitter: Optional[float] = None,
            scatter_kws: dict = {'alpha': 0.3},
            tc_color: str = 'orange',
            olrs_color: str = 'red',
            nbins: Union[int, str] = 'auto',
            axis: str = 'x',
            tight: Optional[bool] = None,
            x_ax_rotation: Optional[int] = None,
            ):
    """
    Scatter plot visualization of univariate regression fits `Seaborn Module`

    Draws, for the feature currently being iterated, the raw observations against ``target``,
    overlays the fitted trendline/curve and, when available, highlights outlier data points.

    Parameters
    ----------
    olrs_idx: pandas index, list or None
        Index of outlier data points

    olrs_mapping: dict or None
        column names as keys and outlier data points indices as values (pandas index or list)
        to highlight during plotting. Outliers from each column are plotted against
        their respective plot

    {x, y}_jitter: float or None
        adds random noise to the observations on {x, y}_axis.
        applicable to main scatter plot of `x` and `y`.

    scatter_kws : dict or None
        Additional keyword arguments passed to `plt.scatter` and `plt.plot`.
        Applied to main scatter plot of `x` and `y`.

    tc_color: str
        Color of OLS trendline or Sigmoid/LOWESS Curve

    olrs_color: str
        Color of outlier data points

    nbins: int or 'auto'
        For plot decoration, maximum number of axis intervals; 1 - max number of ticks.
        If the string 'auto', the number of bins will be automatically determined
        based on the length of the axis.

    axis: str
        For plot decoration, one of ['both', 'x', 'y'], axis on which to apply ``nbins``.

    tight : bool or None
        For plot decoration, controls expansion of axis limits, if `True` axis limits are only
        expanded using the margins; This does *not* set the margins to zero.
        If `False`, further expand the axis limits using the axis major locator.

    x_ax_rotation: int or None
        For plot decoration, set degree of x_ticks rotation.
    """
    # NOTE(review): `olrs_idx` / `olrs_mapping` are not read in this body; presumably the
    # `itr_plot` decorator consumes them and exposes the result as `self._lrs` - confirm.

    # prepare plots
    ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
    fitted = self.z_fit_results_[self._col]
    x, grid = self._plot_data(self.z_df_, self._col)

    title = None  # lowess fits carry no title

    if self._degree > 1:
        # polynomial fit: evaluate fitted coefficients over the plotting grid
        y_pred = np.polynomial.polynomial.polyval(grid, fitted)[:, -1]
        title = "".join(("$", self._func_text(fitted, poly = True, text_wrap = True), "$"))

    elif self._fit == 'ols':
        coefs = fitted.params.values
        y_pred = grid.dot(coefs)  # model.fittedvalues
        title = "".join(("$", self._func_text(coefs), "$", '\n', f'$r^2$ = {fitted.rsquared:.4f}'))

    elif self._fit == 'logit':
        y_pred = self._logistic_pred(grid, fitted)

        ll_null = fitted.llnull
        ll_full = fitted.llf

        # The likelihood function is the probability that the data were generated by the model
        # parameters; fitting searches for the coefficients maximizing it.
        # McFadden's pseudo-R-squared measures model performance, higher values indicate a
        # better fit, similar to R^2 under least squares regression. It is computed as
        # 1 - (Log-Likelihood / LL-Null) where
        #   Log-Likelihood (full model): maximized log-likelihood using all parameters
        #   LL-Null (null model): maximized log-likelihood when only an intercept is included
        pseudo_r2 = 1 - (ll_full / ll_null)  # model.prsquared

        # p-value of a likelihood-ratio test of the full versus null model.
        # Significance (p-value < .05) favors the full model (including the feature) over the
        # null one (intercept only). E.g. in binary classification the feature then does have
        # an effect on observing the positive class; the size of that effect is given by the
        # model coefficients, i.e. the change in log-odds of observing the positive class
        # for each unit change in the feature value.
        lr_stat = -2 * (ll_null - ll_full)  # likelihood ratio Chi-Squared test statistic
        # calculate p-value of test statistic using n degrees of freedom
        llrp = stats.chi2.sf(lr_stat, fitted.df_model)  # model.llr_pvalue

        if self._binary_t:
            coefs = fitted.params.values.ravel()
        else:
            # displaying results of last class label compared to the base (reference) class
            # i.e.: the change in log-odds of last class as a result of unit change in that feature
            # TODO: what about other class labels?
            coefs = fitted.params.values[:, -1]

        title = "".join(("$", self._func_text(coefs), "$", '\n',
                         f'P$r^2$ = {pseudo_r2:.4f}', ' | ', f'llrp = {llrp:.4f}'))

    else:
        # lowess: the fitted result already holds sorted x and smoothed y
        x, y_pred = fitted

    if title is not None:
        plt.title(title, color = self._theme)

    # Main scatter plot
    regplot(data = self.z_df_, x = self._col, y = self._target, fit_reg = False,
            x_jitter = x_jitter, y_jitter = y_jitter, scatter_kws = scatter_kws, ax = ax)

    # fit plot
    plt.plot(x, y_pred, c = tc_color)

    # Overlay Outliers; single check for both mapping or idx
    if hasattr(self, '_lrs') and self._col in self._lrs:
        candidates = self._lrs[self._col]
        present = np.isin(candidates, self.z_df_.index)
        idx = np.array(candidates)[present]
        if len(idx):  # edge case: using `frac` and full index
            scatterplot(data = self.z_df_.loc[idx], x = self._col, y = self._target,
                        color = olrs_color, ax = ax)

    # decorate
    self._decorate_plot(ax,
                        dtype = self.z_df_[self._col].dtype.kind,
                        nbins = nbins,
                        axis = axis,
                        tight = tight,
                        x_ax_rotation = x_ax_rotation,
                        theme = self._theme)

    self._n += 1
@itr_plot(n_cols = 6, figsize = (24, 11))
def vis_ols_fit(self):
    """
    Histograms and Scatter plots for Assessing OLS residuals' normality
    and homoscedasticity assumptions

    Two subplots are drawn for the feature currently being iterated: a histogram of the
    residuals followed by a scatter plot of predictions versus residuals.
    """

    # fitted model
    fitted = self.z_fit_results_[self._col]
    residuals = fitted.resid

    # Residuals Normality assumption
    hist_ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n)
    histplot(residuals, bins = 'doane', ax = hist_ax)
    plt.title(f'{self._col}', color = self._theme)
    plt.xlabel('Residuals')
    self._decorate_plot(hist_ax, theme = self._theme)

    # Residuals homoscedasticity assumption: predictions vs residuals
    resid_ax = self._fig.add_subplot(self._n_rows, self._n_cols, self._n + 1)
    scatterplot(x = fitted.fittedvalues, y = residuals, ax = resid_ax)
    plt.axhline(0, alpha = 0.5, color = 'r')
    plt.title(f'Residual Plot - {self._col}', color = self._theme)
    plt.xlabel('Predictions')
    plt.ylabel('Residuals')
    self._decorate_plot(resid_ax, theme = self._theme)

    # update iteration control: two subplot slots were consumed
    self._n += 2
[docs] def vis_multi(self, col: str, olrs_idx: Optional[Tuple[pd.core.indexes.base.Index, list]] = None, color: Optional[Union[str, int, pd.Series]]= None, size: Optional[Union[str, int, pd.Series]] = None, size_max: int = 15, symbol: Optional[Union[str, int, pd.Series]] = None, symbol_sequence: Optional[List[str]] = None, symbol_map: Optional[dict] = None, hover_name: Optional[Union[str, int, pd.Series]] = None, hover_data: Optional[Union[str, list[str, int], pd.Series, dict]] = None, custom_data: Optional[Union[str, list[str, int], pd.Series]] = None, text: Optional[Union[str, int, pd.Series]] = None, facet_row: Optional[Union[str, int, pd.Series]] = None, facet_col: Optional[Union[str, int, pd.Series]] = None, facet_col_wrap: int = 0, facet_row_spacing: Optional[float] = None, facet_col_spacing: Optional[float] = None, error_x: Optional[Union[str, int, pd.Series]] = None, error_x_minus: Optional[Union[str, int, pd.Series]] = None, error_y: Optional[Union[str, int, pd.Series]] = None, error_y_minus: Optional[Union[str, int, pd.Series]] = None, labels: Optional[dict] = None, color_discrete_sequence: Optional[List[str]] = None, color_continuous_scale: Optional[List[str]] = None, opacity: Optional[float] = None, marginal_x: Optional[str] = None, marginal_y: Optional[str] = None, category_orders: Optional[dict] = None, trendline: Optional[str] = None, trendline_options: Optional[dict] = None, trendline_color_override: Optional[str] = None, trendline_scope: str = 'trace', log_x: bool = False, log_y: bool = False, range_x: Optional[List[float]] = None, range_y: Optional[List[float]] = None, title: Optional[str] = None, template: Optional[Union[str, dict]] = None, width: Optional[int] = None, height: Optional[int] = None, theme: str = 'darkorange'): """ Interactive multivariate scatter plot visualization and trend analysis `Plotly Module` Parameters ---------- col: str Name of column that goes to `x` axis olrs_idx: pandas index, list or None Index of outlier data points 
color: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to assign color to marks. size: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to assign mark sizes. size_max: int (default `20`) Set the maximum mark size when using `size`. symbol: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to assign symbols to marks. symbol_sequence: list of str Strings should define valid plotly.js symbols. When `symbol` is set, values in that column are assigned symbols by cycling through `symbol_sequence` in the order described in `category_orders`, unless the value of `symbol` is a key in `symbol_map`. symbol_map: dict with str keys and str values (default `{}`) String values should define plotly.js symbols Used to override `symbol_sequence` to assign a specific symbols to marks corresponding with specific values. Keys in `symbol_map` should be values in the column denoted by `symbol`. Alternatively, if the values of `symbol` are valid symbol names, the string `'identity'` may be passed to cause them to be used directly. hover_name: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like appear in bold in the hover tooltip. 
hover_data: str, or list of str or int, or Series or array-like, or dict Either a name or list of names of columns in `data_frame`, or pandas Series, or array_like objects or a dict with column names as keys, with values True (for default formatting) False (in order to remove this column from hover information), or a formatting string, for example `':.3f'` or `'|%a'` or list-like data to appear in the hover tooltip or tuples with a bool or formatting string as first element, and list-like data to appear in hover as second element Values from these columns appear as extra data in the hover tooltip. custom_data: str, or list of str or int, or Series or array-like Either name or list of names of columns in `data_frame`, or pandas Series, or array_like objects Values from these columns are extra data, to be used in widgets or Dash callbacks for example. This data is not user-visible but is included in events emitted by the figure (lasso selection etc.) text: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like appear in the figure as text labels. facet_row: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to assign marks to facetted subplots in the vertical direction. facet_col: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to assign marks to facetted subplots in the horizontal direction. facet_col_wrap: int Maximum number of facet columns. Wraps the column variable at this width, so that the column facets span multiple rows. Ignored if 0, and forced to 0 if `facet_row` or a `marginal` is set. facet_row_spacing: float between 0 and 1 Spacing between facet rows, in paper units. Default is 0.03 or 0.0.7 when facet_col_wrap is used. 
facet_col_spacing: float between 0 and 1 Spacing between facet columns, in paper units Default is 0.02. error_x: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to size x-axis error bars. If `error_x_minus` is `None`, error bars will be symmetrical, otherwise `error_x` is used for the positive direction only. error_x_minus: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to size x-axis error bars in the negative direction. Ignored if `error_x` is `None`. error_y: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to size y-axis error bars. If `error_y_minus` is `None`, error bars will be symmetrical, otherwise `error_y` is used for the positive direction only. error_y_minus: str or int or Series or array-like Either a name of a column in `data_frame`, or a pandas Series or array_like object. Values from this column or array_like are used to size y-axis error bars in the negative direction. Ignored if `error_y` is `None`. labels: dict with str keys and str values (default `{}`) By default, column names are used in the figure for axis titles, legend entries and hovers. This parameter allows this to be overridden. The keys of this dict should correspond to column names, and the values should correspond to the desired label to be displayed. color_discrete_sequence: list of str Strings should define valid CSS-colors. When `color` is set and the values in the corresponding column are not numeric, values in that column are assigned colors by cycling through `color_discrete_sequence` in the order described in `category_orders`, unless the value of `color` is a key in `color_discrete_map`. 
Various useful color sequences are available in the `plotly.express.colors` submodules, specifically `plotly.express.colors.qualitative`. color_continuous_scale: list of str Strings should define valid CSS-colors This list is used to build a continuous color scale when the column denoted by `color` contains numeric data. Various useful color scales are available in the `plotly.express.colors` submodules, specifically `plotly.express.colors.sequential`, `plotly.express.colors.diverging` and `plotly.express.colors.cyclical`. opacity: float Value between 0 and 1. Sets the opacity for markers. marginal_x: str One of `'rug'`, `'box'`, `'violin'`, or `'histogram'`. If set, a horizontal subplot is drawn above the main plot, visualizing the x-distribution. marginal_y: str One of `'rug'`, `'box'`, `'violin'`, or `'histogram'`. If set, a vertical subplot is drawn to the right of the main plot, visualizing the y-distribution. category_orders: dict with str keys and list of str values (default `{}`) By default, in Python 3.6+, the order of categorical values in axes, legends and facets depends on the order in which these values are first encountered in `data_frame` (and no order is guaranteed by default in Python below 3.6). This parameter is used to force a specific ordering of values per column. The keys of this dict should correspond to column names, and the values should be lists of strings corresponding to the specific display order desired. trendline: str or None One of `'ols'`, `'lowess'`, `'rolling'`, `'expanding'` or `'ewm'`. If `'ols'`, an Ordinary Least Squares regression line will be drawn for each discrete-color/symbol group. If `'lowess`', a Locally Weighted Scatterplot Smoothing line will be drawn for each discrete-color/symbol group. If `'rolling`', a Rolling (e.g. rolling average, rolling median) line will be drawn for each discrete-color/symbol group. If `'expanding`', an Expanding (e.g. 
expanding average, expanding sum) line will be drawn for each discrete-color/symbol group. If `'ewm`', an Exponentially Weighted Moment (e.g. exponentially-weighted moving average) line will be drawn for each discrete-color/symbol group. See the docstrings for the functions in `plotly.express.trendline_functions` for more details on these functions and how to configure them with the `trendline_options` argument. trendline_options: dict or None Options passed as the first argument to the function from `plotly.express.trendline_functions` named in the `trendline` argument. Valid keys for the `trendline_options` dict are as follows: ols add_constant: bool, default 'True' if `False`, the trendline passes through the origin but if `True` a y-intercept is fitted. log_x and log_y: bool, default 'False' if `True` the OLS is computed with respect to the base 10 logarithm of the input. Note that this means no zeros can be present in the input. lowess frac: float, default '0.6666666' Between 0 and 1. The fraction of the data used when estimating each y-value. rolling function: function, str, list or dict, default 'mean' Function to use for aggregating the data. If a function, must either work when passed a Series/Dataframe or when passed to Series/Dataframe.apply. Accepted combinations are: - function - string function name - list of functions and/or function names, e.g. [np.sum, 'mean'] - dict of axis labels -> functions, function names or list of such. function_args: dict function arguments. For examples please refer to 'win_type' argument documentation below. window: int, timedelta, str, offset, or BaseIndexer subclass - Size of the moving window. - If an integer, the fixed number of observations used for each window. - If a timedelta, str, or offset, the time period of each window. Each window will be a variable sized based on the observations included in the time-period. This is only valid for datetimelike indexes. 
- If a BaseIndexer subclass, the window boundaries based on the defined ``get_window_bounds`` method. Additional rolling keyword arguments, namely ``min_periods``, ``center``, ``closed`` and ``step`` will be passed to ``get_window_bounds``. min_periods: int, default None Minimum number of observations in window required to have a value, otherwise, result is ``np.nan``. center: bool, default False - If False, set the window labels as the right edge of the window index. - If True, set the window labels as the center of the window index. win_type: str, default None - If ``None``, all points are evenly weighted. - If a string, it must be a valid `scipy.signal window function <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__. - e.g.: [`barthann`, `bartlett`, `blackman`, `blackmanharris`, `bohman`, `boxcar`, `chebwin`, `cosine`, `exponential`, `flattop`, `gaussian`, `general_gaussian`, `hamming`, `hann`, `kaiser`, `nuttall`, `parzen`, `triang`, `tukey`] - Certain Scipy window types require additional parameters to be passed in the aggregation function. The additional parameters must match the keywords specified in the Scipy window type method signature. - `window` and `rolling` are pandas subclasses utilizing window functions from `scipy` module. - If `win_type` is not `None` a `window` subclass is returned, otherwise a `rolling` subclass is returned. This affects the way `function` argument behaves, see examples below. on: str, optional - For a DataFrame, a column label or Index level on which to calculate the rolling window, rather than the DataFrame's index. - Provided integer column is ignored and excluded from result since an integer index is not used to calculate the rolling window. closed: str, default None - If ``'right'``, the first point in the window is excluded from calculations. - If ``'left'``, the last point in the window is excluded from calculations. 
- If ``'both'``, the no points in the window are excluded from calculations. - If ``'neither'``, the first and last points in the window are excluded from calculations. - Default ``None`` (``'right'``). step: int, default None Evaluate the window at every ``step`` result, equivalent to slicing as ``[::step]``. ``window`` must be an integer. Using a step argument other than None or 1 will produce a result with a different shape than the input. expanding function and function_args same as in `rolling` min_periods: int, default 1 Minimum number of observations in window required to have a value; otherwise, result is ``np.nan``. ewm function and function_args same as in `rolling` com: float, optional Specify decay in terms of center of mass span: float, optional Specify decay in terms of span halflife: float, str, timedelta, optional - Specify decay in terms of half-life - If ``times`` is specified, a timedelta convertible unit over which an observation decays to half its value. Only applicable to ``mean()``, and halflife value will not apply to the other functions. alpha: float, optional Specify smoothing factor min_periods: int, default 0 Minimum number of observations in window required to have a value; otherwise, result is ``np.nan``. adjust: bool, default True Divide by decaying adjustment factor in beginning periods to account for imbalance in relative weightings (viewing EWMA as a moving average). ignore_na: bool, default False Ignore missing values when calculating weights. times : np.ndarray, Series, default None - Only applicable to ``mean()``. - Times corresponding to the observations. Must be monotonically increasing and ``datetime64[ns]`` dtype. - If 1-D array like, a sequence with the same shape as the observations. trendline_color_override: str or None Valid CSS color. If provided, and if ``trendline`` is set, all trendlines will be drawn in this color rather than in the same color as the traces from which they draw their inputs. 
trendline_scope: str (one of `'trace'` or `'overall'`, default `'trace'`) If `'trace'`, then one trendline is drawn per trace (i.e. per color, symbol, facet, animation frame etc) and if `'overall'` then one trendline is computed for the entire dataset, and replicated across all facets. log_x: boolean (default `False`) If `True`, the x-axis is log-scaled in cartesian coordinates. log_y: boolean (default `False`) If `True`, the y-axis is log-scaled in cartesian coordinates. range_x: list of two numbers If provided, overrides auto-scaling on the x-axis in cartesian coordinates. range_y: list of two numbers If provided, overrides auto-scaling on the y-axis in cartesian coordinates. title: str The figure title. template: str or dict or plotly.graph_objects.layout.Template instance The figure template name (must be a key in plotly.io.templates) or definition. width: int (default `None`) The figure width in pixels. height: int (default `None`) The figure height in pixels. theme: str, adjust axis and title colors as desired Attributes ---------- z_plotly_ols_fit: pandas dataframe fitted Ordinary Least Squares model(s) z_plotly_fit: pandas dataframe fitted Logistic or Polynomial model(s) z_plotly_fit_out: pandas dataframe groups where fitting models fails, only applicable for Logistic or Polynomial fits if ``facet`` is assigned Rolling Examples ---------------- >>> # Custom Function >>> # pandas >>> series.rolling('win_type' = None).aggregate(**opts) >>> # trendline_options - lambda is the euclidean distance >>> tl_opts = dict( >>> function = 'aggregate', >>> function_args = dict( >>> func = lambda x: np.sqrt(x.dot(x)) >>> ), >>> win_type = None) >>> # Rolling object >>> # pandas >>> series.rolling('win_type' = None).sum(**opts) >>> # trendline_options >>> tl_opts = dict( >>> function = 'sum', >>> function_args = None, >>> win_type = None) >>> # Window object >>> # pandas >>> series.rolling('win_type' = 'gaussian').sum(**opts) >>> # trendline_options - 'std' is parameter 
required by >>> # 'gaussian' window function, not the aggregation function 'sum' >>> tl_opts = dict( >>> function = 'sum', >>> function_args = dict(std = 2), >>> win_type = 'gaussian') """ # force order on facet grid if not already self._facets = [f for f in [facet_row, facet_col] if f] if any(self._facets): # ensure complete mapping incase only # single facet order was set by user # otherwise, override default values category_orders = { f'{f}': self.z_df_[f].unique() for f in self._facets} | (category_orders if category_orders else {}) # Check active arguments and column type # for proper formatting of hover templates args = [col, self._target, color, symbol, size, *self._facets] # all columns arguments active_args, numeric, categorical = self._plotly_args(args) if not hover_data: # Apply proper formatting, hide facets and display index # for main scatter plot hover_data = {k:':,.3f' for k in numeric} | \ {k: False for k in self._facets} | {'index': (':,.0f', self.z_df_.index)} # set colors if not already if not color_discrete_sequence: color_discrete_sequence = colors.qualitative.Dark24 if not color_continuous_scale: color_continuous_scale = colors.sequential.Viridis # ignored if `color` is binary feature # Main scatter plot using copy of dataFrame, accounting for fraction - if any fig = scatter(self.z_df_, x = col, y = self._target, color = color, size = size, size_max = size_max, symbol = symbol, symbol_sequence = symbol_sequence, symbol_map = symbol_map, hover_name = hover_name, hover_data = hover_data, custom_data = custom_data, text = text, facet_row = facet_row, facet_col = facet_col, facet_col_wrap = facet_col_wrap, facet_row_spacing = facet_row_spacing, facet_col_spacing = facet_col_spacing, error_x = error_x, error_x_minus = error_x_minus, error_y = error_y, error_y_minus = error_y_minus, labels = labels, color_discrete_sequence = color_discrete_sequence, color_continuous_scale = color_continuous_scale, opacity = opacity, marginal_x = marginal_x, 
marginal_y = marginal_y, category_orders = category_orders, trendline = trendline, trendline_options = trendline_options, trendline_color_override = trendline_color_override, trendline_scope = trendline_scope, log_x = log_x, log_y = log_y, range_x = range_x, range_y = range_y, title = title, template = template, width = width, height = height ) if trendline == 'ols': self.z_plotly_ols_fit_ = get_trendline_results(fig) # attributes # mapping position on facet grid, if any if facet_col_wrap and (any([marginal_x, marginal_y]) or facet_row or not facet_col): facet_col_wrap = 0 # Ignore facet column wrapping to match plotly logic row_map, col_map = self._facet_map(category_orders, facet_row, facet_col, facet_col_wrap) # fit identifier for hovertemplate if self._degree > 1: fit = f'Polynomial(degree = {self._degree})' fit_cols = [f'x^{i}' for i in range(self._degree + 1)] # columns of fit results dataframe: polynomial coefs elif self._fit == 'logit': fit = 'logistic' fit_cols = ['fit'] # fitted models else: fit = None # overlay fits if fit: # log transformation takes place only during fitting # this is controlled by designated keys in # `trendline_options` attribute. 
# Plot axis are displayed in original input values # unless `log_x` or log_y` attributes are activated # which in-turn updates `fig` x/y-axis log_x = log_y = False col_ = col target_ = self._target if trendline_options: log_x = trendline_options.get("log_x", False) log_y = trendline_options.get("log_y", False) if log_y and fit != 'logistic': if np.any(self.z_df_[self._target] <= 0): log_y = False _z_log.info(f"Log_y transformation was not applied, {self._target} includes non-positive values") else: self.z_df_[f'{self._target}_'] = np.log10(self.z_df_[self._target]) target_ = f'{self._target}_' if log_x: if np.any(self.z_df_[col] <= 0): log_x = False _z_log.info(f"Log_x transformation was not applied, {col} includes non-positive values") else: self.z_df_[f'{col}_'] = np.log10(self.z_df_[col]) col_ = f'{col}_' self._models = [] # fit dataframe container self._fit_fail = [] # failed fits container # fit data is grouped by: color(if categorical) + symbol + facets # plotly grouping order is color -> symbol -> facet row -> facet col groupers = [color, symbol] if color in categorical else [symbol] full_gs = groupers + self._facets self._g_map = {col: self.z_df_[col].unique() for col in full_gs if col} if self._g_map and trendline_scope != 'overall': # update columns of results dataframe if self._facets: fit_cols = list(self._g_map.keys()) + fit_cols # nested loops for all combinations in grouping map for comb in product(*[self._g_map[col] for col in self._g_map.keys()], repeat = 1): mask = self.z_df_[(self.z_df_[self._g_map.keys()] == comb).all(1)] if len(mask.dropna()) > 1 and mask[self._target].nunique() > 1: # location on facet_grid if facet_col_wrap: row_idx = row_map[mask[facet_col].unique()[0]] else: row_idx = row_map[mask[facet_row].unique()[0]] if row_map else 1 col_idx = col_map[mask[facet_col].unique()[0]] if col_map else 1 # plot data x, y_pred, hovertemplate, name = self._fit_models(col_, target_, mask, fit, self._degree, row_idx, col_idx, comb, groupers, 
log_x, log_y) # plot if name: # None if fit fails fig.add_scatter(x = x, y = y_pred, hovertemplate = hovertemplate, showlegend = False, name = name, row = row_idx, col = col_idx) else: # check pre-fit # only applicable for # overall non-transformed fits model = self.z_fit_results_.get(f'{col_}') if hasattr(self, 'z_fit_results_') else False # plot data x, y_pred, hovertemplate, name = self._fit_models(col_, target_, self.z_df_, fit, self._degree, log_x = log_x, log_y = log_y, model = model) # plot if name: fig.add_scatter(x = x, y = y_pred, hovertemplate = hovertemplate, showlegend = False, name = name, row = 'all', col = 'all') # attributes self.z_plotly_fit_ = pd.DataFrame(self._models, columns = fit_cols) if self._fit_fail: self.z_plotly_fit_out_ = pd.DataFrame(self._fit_fail) # overlay outliers if olrs_idx is not None: idx = np.array(olrs_idx)[np.isin(olrs_idx, self.z_df_.index)] if len(idx): # Editing Outliers Scatter hover template # Meta define values(column names) to be accessed within the hovertemplate meta = active_args[~np.isin(active_args, self._facets)] # only active cols ignoring facets # outlier dataframe outliers_df = self.z_df_.loc[idx].copy() # location on facet_grid, if any if facet_col_wrap: outliers_df['row_idx'] = outliers_df[facet_col].map(row_map) else: outliers_df['row_idx'] = outliers_df[facet_row].map(row_map) if row_map else 1 outliers_df['col_idx'] = outliers_df[facet_col].map(col_map) if col_map else 1 # unique facet grid coordinates cord_list = sorted(list(set(zip(outliers_df['row_idx'].values, outliers_df['col_idx'].values)))) # plot for idx in cord_list: temp_df = outliers_df[(outliers_df['row_idx'] == idx[0]) & (outliers_df['col_idx'] == idx[1])] # fetching all active arguments from outlier dataframe # for each column displayed in hovertemplate customdata = np.stack([temp_df[_] for _ in meta], axis = -1) # workaround as hovertemplate contradicts python format string (% & f) hovertemplate = \ '<br>'.join(['%{meta[i]}: 
%{customdata[i]:,.3f}'.replace('i', f'{i}') if _ in numeric else '%{meta[i]}: %{customdata[i]}'.replace('i', f'{i}') for i, _ in enumerate(meta)]) + '<br>''Index: %{text}<extra></extra>' # overlay outliers in each respective facet position fig.add_scatter(x = temp_df[col], y = temp_df[self._target], meta = meta, customdata = customdata, mode = 'markers', marker = dict(size = 10, color = 'red', line = dict(color = 'orange', width = 2)), hovertemplate = hovertemplate, text = temp_df.index, showlegend = False, name = 'outliers', row = idx[0], col = idx[1], ) if not title: title = f"'{self._target}' interactive scatter plot" # Decorate fig.update_layout(dict(paper_bgcolor = 'rgba(0,0,0,0)', plot_bgcolor = 'rgba(0,0,0,0)', hoverlabel = dict(bgcolor = 'brown', font = dict(family = 'Rockwell', size = 16, color = 'moccasin')), legend = dict(orientation = 'h', xanchor = 'right', x = 1, title_font_family = 'Arial', font = dict(family = 'Rockwell', size = 14, color = 'lemonchiffon'), bgcolor = 'dimgrey', bordercolor = 'Black', borderwidth = 2), title = dict(text = title, x = 0.5, y = .99, xref = 'paper', font = dict(family = 'Arial', size = 20, color = theme)) )) fig.update_xaxes(showgrid = False, color = theme, title_standoff = 10) fig.update_yaxes(showgrid = False, color = theme) # check if traces exists other than trendlines # if they are the last trace then no other # traces exists as they get plotted first if len(fig.data) > 1 and 'trendline' not in fig.data[-1]['hovertemplate']: # other traces exists fig.update_traces(selector = -1, showlegend = True) # only show last trace in legend fig.for_each_annotation(lambda x: x.update(text = x.text.split('=')[-1], bgcolor = 'dimgrey', font = dict(family = 'Rockwell', size = 16, color = 'oldlace'))) # return fig # fig.write_image('figure.png', scale=4) fig.show();
def vis_multi_d(self,
                x: str,
                y: str,
                z: Optional[str] = None,
                olrs_idx: Optional[Union[pd.core.indexes.base.Index, list]] = None,
                color: Optional[Union[str, int, pd.Series]] = None,
                symbol: Optional[Union[str, int, pd.Series]] = None,
                symbol_sequence: Optional[List[str]] = None,
                symbol_map: Optional[dict] = None,
                size: Optional[Union[str, int, pd.Series]] = None,
                size_max: int = 20,
                text: Optional[Union[str, int, pd.Series]] = None,
                hover_name: Optional[Union[str, int, pd.Series]] = None,
                hover_data: Optional[Union[str, list[str, int], pd.Series, dict]] = None,
                custom_data: Optional[Union[str, list[str, int], pd.Series]] = None,
                error_x: Optional[Union[str, int, pd.Series]] = None,
                error_x_minus: Optional[Union[str, int, pd.Series]] = None,
                error_y: Optional[Union[str, int, pd.Series]] = None,
                error_y_minus: Optional[Union[str, int, pd.Series]] = None,
                error_z: Optional[Union[str, int, pd.Series]] = None,
                error_z_minus: Optional[Union[str, int, pd.Series]] = None,
                animation_frame: Optional[Union[str, int, pd.Series]] = None,
                animation_group: Optional[Union[str, int, pd.Series]] = None,
                category_orders: Optional[dict] = None,
                labels: Optional[dict] = None,
                color_discrete_sequence: Optional[List[str]] = None,
                color_continuous_scale: Optional[List[str]] = None,
                opacity: Optional[float] = None,
                log_x: bool = False,
                log_y: bool = False,
                log_z: bool = False,
                range_x: Optional[List[float]] = None,
                range_y: Optional[List[float]] = None,
                range_z: Optional[List[float]] = None,
                title: Optional[str] = None,
                template: Optional[Union[str, dict]] = None,
                width: Optional[int] = None,
                height: Optional[int] = None,
                theme: str = 'darkorange'):
    """
    Interactive 3D multivariate scatter plot visualization `Plotly Module`

    Builds a ``plotly.express.scatter_3d`` figure from the instance
    dataframe, optionally overlays the rows listed in ``olrs_idx`` as a
    separate highlighted `outliers` trace, applies the layout decoration
    and displays the figure.

    Parameters
    ----------
    x: str
        Name of column that goes to `x` axis
    y: str
        Name of column that goes to `y` axis
    z: str or None
        Name of column that goes to `z` axis.
        If `None`, z-axis is the ``target`` variable (unless ``target``
        is already assigned to ``x`` or ``y``).
    olrs_idx: pandas index, list or None
        Index of outlier data points. Only labels that exist in the
        plotted dataframe index are overlaid.
    color: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to assign color to marks.
    symbol: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to assign symbols to marks.
    symbol_sequence: list of str
        Strings should define valid plotly.js symbols. When ``symbol`` is
        set, values in that column are assigned symbols by cycling through
        ``symbol_sequence`` in the order described in ``category_orders``,
        unless the value of ``symbol`` is a key in ``symbol_map``.
    symbol_map: dict with str keys and str values (default `{}`)
        String values should define plotly.js symbols. Used to override
        ``symbol_sequence`` to assign a specific symbols to marks
        corresponding with specific values. Keys in ``symbol_map`` should
        be values in the column denoted by ``symbol``. Alternatively, if
        the values of ``symbol`` are valid symbol names, the string
        `'identity'` may be passed to cause them to be used directly.
    size: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to assign mark sizes.
    size_max: int (default `20`)
        Set the maximum mark size when using ``size``.
    text: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like appear in
        the figure as text labels.
    hover_name: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like appear in
        bold in the hover tooltip.
    hover_data: str, or list of str or int, or Series or array-like, or dict
        Either a name or list of names of columns in `data_frame`, or
        pandas Series, or array_like objects or a dict with column names
        as keys, with values `True` (for default formatting) `False` (in
        order to remove this column from hover information), or a
        formatting string, for example `':.3f'` or `'|%a'` or list-like
        data to appear in the hover tooltip or tuples with a bool or
        formatting string as first element, and list-like data to appear
        in hover as second element. Values from these columns appear as
        extra data in the hover tooltip.
        If not provided, numeric active columns are formatted with
        `':,.3f'` and the dataframe index is displayed.
    custom_data: str, or list of str or int, or Series or array-like
        Either name or list of names of columns in `data_frame`, or pandas
        Series, or array_like objects. Values from these columns are extra
        data, to be used in widgets or Dash callbacks for example. This
        data is not user-visible but is included in events emitted by the
        figure (lasso selection etc.)
    error_x: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to size x-axis error bars. If ``error_x_minus`` is `None`, error
        bars will be symmetrical, otherwise ``error_x`` is used for the
        positive direction only.
    error_x_minus: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to size x-axis error bars in the negative direction. Ignored if
        ``error_x`` is `None`.
    error_y: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to size y-axis error bars. If ``error_y_minus`` is `None`, error
        bars will be symmetrical, otherwise ``error_y`` is used for the
        positive direction only.
    error_y_minus: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to size y-axis error bars in the negative direction. Ignored if
        ``error_y`` is `None`.
    error_z: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to size z-axis error bars. If ``error_z_minus`` is `None`, error
        bars will be symmetrical, otherwise ``error_z`` is used for the
        positive direction only.
    error_z_minus: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to size z-axis error bars in the negative direction. Ignored if
        ``error_z`` is `None`.
    animation_frame: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to assign marks to animation frames.
    animation_group: str or int or Series or array-like
        Either a name of a column in `data_frame`, or a pandas Series or
        array_like object. Values from this column or array_like are used
        to provide object-constancy across animation frames: rows with
        matching ``animation_group`` will be treated as if they describe
        the same object in each frame.
    category_orders: dict with str keys and list of str values (default `{}`)
        By default, in Python 3.6+, the order of categorical values in
        axes, legends and facets depends on the order in which these
        values are first encountered in `data_frame` (and no order is
        guaranteed by default in Python below 3.6). This parameter is used
        to force a specific ordering of values per column. The keys of
        this dict should correspond to column names, and the values should
        be lists of strings corresponding to the specific display order
        desired.
    labels: dict with str keys and str values (default `{}`)
        By default, column names are used in the figure for axis titles,
        legend entries and hovers. This parameter allows this to be
        overridden. The keys of this dict should correspond to column
        names, and the values should correspond to the desired label to be
        displayed.
    color_discrete_sequence: list of str
        Strings should define valid CSS-colors. When ``color`` is set and
        the values in the corresponding column are not numeric, values in
        that column are assigned colors by cycling through
        ``color_discrete_sequence`` in the order described in
        ``category_orders``, unless the value of ``color`` is a key in
        ``color_discrete_map``. Various useful color sequences are
        available in the `plotly.express.colors` submodules, specifically
        `plotly.express.colors.qualitative`.
        Defaults to `plotly.express.colors.qualitative.Bold_r`.
    color_continuous_scale: list of str
        Strings should define valid CSS-colors. This list is used to build
        a continuous color scale when the column denoted by ``color``
        contains numeric data. Various useful color scales are available
        in the `plotly.express.colors` submodules, specifically
        `plotly.express.colors.sequential`,
        `plotly.express.colors.diverging` and
        `plotly.express.colors.cyclical`.
        Defaults to `plotly.express.colors.sequential.YlOrRd`.
    opacity: float or None
        Value between 0 and 1. Sets the opacity for markers.
    log_x: boolean (default `False`)
        If `True`, the x-axis is log-scaled in cartesian coordinates.
    log_y: boolean (default `False`)
        If `True`, the y-axis is log-scaled in cartesian coordinates.
    log_z: boolean (default `False`)
        If `True`, the z-axis is log-scaled in cartesian coordinates.
    range_x: list of two numbers
        If provided, overrides auto-scaling on the x-axis in cartesian
        coordinates.
    range_y: list of two numbers
        If provided, overrides auto-scaling on the y-axis in cartesian
        coordinates.
    range_z: list of two numbers
        If provided, overrides auto-scaling on the z-axis in cartesian
        coordinates.
    title: str
        The figure title. If not provided, a default title naming the
        ``target`` is used.
    template: str or dict or plotly.graph_objects.layout.Template instance
        The figure template name (must be a key in plotly.io.templates) or
        definition.
    width: int (default `None`)
        The figure width in pixels.
    height: int (default `None`)
        The figure height in pixels.
    theme: str
        adjust axis and title colors as desired

    Returns
    -------
    None
        The figure is displayed through ``fig.show()``.
    """

    # Setting target variable to z-axis, unless the target is already
    # plotted on another axis.
    # NOTE(review): when `z` is None and the target is already `x` or `y`,
    # `z` stays None and is forwarded as-is -- confirm that is intended.
    if not z and self._target not in (x, y):
        z = self._target

    # Editing Main Scatter hover template
    args = [x, y, z, color, symbol, size]  # all column arguments ignoring facets
    active_args, numeric, _ = self._plotly_args(args)

    if not hover_data:
        # Apply proper formatting and display index
        # for main scatter plot
        hover_data = {k: ':,.3f' for k in numeric} | {'index': (':,.0f', self.z_df_.index)}

    # set colors if not already
    c_disc = color_discrete_sequence if color_discrete_sequence else colors.qualitative.Bold_r
    c_cont = color_continuous_scale if color_continuous_scale else colors.sequential.YlOrRd

    # XD scatter plot
    fig = scatter_3d(self.z_df_,
                     x = x,
                     y = y,
                     z = z,
                     color = color,
                     symbol = symbol,
                     symbol_sequence = symbol_sequence,
                     symbol_map = symbol_map,
                     size = size,
                     size_max = size_max,
                     text = text,
                     hover_name = hover_name,
                     hover_data = hover_data,
                     custom_data = custom_data,
                     error_x = error_x,
                     error_x_minus = error_x_minus,
                     error_y = error_y,
                     error_y_minus = error_y_minus,
                     error_z = error_z,
                     error_z_minus = error_z_minus,
                     animation_frame = animation_frame,
                     animation_group = animation_group,
                     category_orders = category_orders,
                     labels = labels,
                     color_discrete_sequence = c_disc,
                     color_continuous_scale = c_cont,
                     opacity = opacity,
                     log_x = log_x,
                     log_y = log_y,
                     log_z = log_z,
                     range_x = range_x,
                     range_y = range_y,
                     range_z = range_z,
                     title = title,
                     template = template,
                     width = width,
                     height = height)

    # overlay outliers
    if olrs_idx is not None:
        # keep only the labels present in the plotted dataframe
        idx = np.array(olrs_idx)[np.isin(olrs_idx, self.z_df_.index)]

        if len(idx):  # edge case: using `frac` and full index
            # Editing Outliers Scatter hover template
            # `meta` holds the column names referenced by the hovertemplate
            meta = active_args
            numeric_cols = set(numeric)  # O(1), array-safe membership tests

            # fetching all active arguments from outlier dataframe
            outliers_df = self.z_df_.loc[idx].copy()
            customdata = np.stack([outliers_df[name] for name in meta], axis = -1)

            # Doubled braces emit the literal `%{...}` plotly placeholders.
            # Numeric columns use the same `,.3f` format as the main trace
            # so outlier values are not rounded to integers in the tooltip.
            # <extra></extra> remove trace name
            hover_rows = [
                f'%{{meta[{i}]}}: %{{customdata[{i}]:,.3f}}' if name in numeric_cols
                else f'%{{meta[{i}]}}: %{{customdata[{i}]}}'
                for i, name in enumerate(meta)
            ]
            hovertemplate = '<br>'.join(hover_rows) + '<br>Index: %{text}<extra></extra>'

            fig.add_scatter3d(x = outliers_df[x],
                              y = outliers_df[y],
                              z = outliers_df[z],
                              meta = meta,
                              customdata = customdata,
                              mode = 'markers',
                              marker = dict(size = 10,
                                            color = 'yellow',
                                            line = dict(color = 'red', width = 4)),
                              hovertemplate = hovertemplate,
                              text = outliers_df.index,
                              name = 'outliers'
                              )

    if not title:
        title = f'3D Visualization of {self._target}'

    fig.update_layout(dict(
        paper_bgcolor = 'rgba(0,0,0,0)',
        plot_bgcolor = 'rgba(0,0,0,0)',
        # 3d scene decoration
        scene = dict(
            aspectratio = dict(x = 1.5, y = 1.5, z = 1),  # axes ratio
            # axes decoration
            xaxis = dict(backgroundcolor = 'darkred',
                         showgrid = False,
                         color = 'darkred'),
            yaxis = dict(backgroundcolor = 'darkslateblue',
                         showgrid = False,
                         color = 'darkslateblue'),
            zaxis = dict(backgroundcolor = 'darkolivegreen',
                         showgrid = False,
                         color = 'darkolivegreen')),
        hoverlabel = dict(bgcolor = 'brown',
                          font = dict(family = 'Rockwell', size = 16, color = 'moccasin')),
        legend = dict(orientation = 'h',
                      xanchor = 'right',
                      x = 1,
                      title_font_family = 'Arial',
                      font = dict(family = 'Rockwell', size = 14, color = 'lemonchiffon'),
                      bgcolor = 'dimgrey',
                      bordercolor = 'Black',
                      borderwidth = 2),
        title = dict(text = title,
                     x = 0.5,
                     y = .99,
                     xref = 'paper',
                     font = dict(family = 'Arial', size = 20, color = theme))))

    # scatter markers decoration (applies to every trace in the figure)
    fig.update_traces(marker = dict(line = dict(width = 2)))

    fig.show()
def _input_validation(self, disp_corr: Optional[str] = None):
    """
    Validate inputs before execution.

    Parameters
    ----------
    disp_corr: str or None
        correlation type to display; when given it must be `pearson`
        or `spearman`.

    Raises
    ------
    AttributeError
        If ``self._cols`` is empty.
    ValueError
        If ``disp_corr`` is given and is not one of the accepted names.
    """
    # `len` is used on purpose: `_cols` may be an array-like whose
    # truth value is ambiguous
    if not len(self._cols):
        raise AttributeError("No feature to analyze! Please ensure that input columns are valid")

    if disp_corr and disp_corr not in ['pearson', 'spearman']:
        raise ValueError("'disp_corr' parameter must be one of the following arguments: 'pearson' or 'spearman', "
                         f"however, '{disp_corr}' was received!")


def _heat_plot(self, plot_data, lim, title, feat_corr = False):
    """
    Draw one annotated correlation heatmap and advance the paging index.

    Parameters
    ----------
    plot_data: pandas dataframe
        correlation results; rows ``self._n`` to ``self._n + lim`` are
        plotted (transposed).
    lim: int
        maximum number of features displayed per heatmap.
    title: str
        figure title.
    feat_corr: bool
        If `False`, rows of the transposed slice are read as alternating
        coefficient / p-value pairs (target correlation). If `True`, the
        slice is plotted as is (feature vs feature) and NaN cells are
        shown blank.

    Note
    ----
    ``self._n`` is incremented by ``lim`` after the figure is shown.
    """
    # limit features per map
    mask = plot_data[self._n: self._n + lim].T
    data = mask.values.round(2)
    x = mask.columns

    # heat map data and annotations
    if not feat_corr:
        z = data[::2, :]            # coef values
        y = mask.index[::2]         # corr text
        customdata = data[1::2, :]  # pvals
        hovtemp = '<br>'.join(['txt vs %{x}'.replace('txt', f'{self._target}'),
                               'pval = %{customdata:.3f}<extra></extra>'])
    else:
        z = np.where(np.isnan(data), '', data)  # hide values < thresh
        y = mask.index
        customdata = None
        hovtemp = '<br>'.join(['%{y} vs %{x}<extra></extra>'])

    # one text annotation per heatmap cell
    annotations = [
        layout.Annotation(text=str(value), x=x[col_idx], y=y[row_idx],
                          bgcolor='black', bordercolor='purple', showarrow=False,
                          font=dict(family='Arial', size=11, color='oldlace'))
        for row_idx, row in enumerate(z)
        for col_idx, value in enumerate(row)
    ]

    # Heatmaps
    fig = Figure(data=[Heatmap(z=z, x=x, y=y, zmin=-1, zmax=1,
                               customdata=customdata,
                               hovertemplate=hovtemp,
                               colorscale='icefire',
                               showscale=False)])

    # Decorate
    fig.update_layout(
        dict(paper_bgcolor='rgba(0,0,0,0)',
             plot_bgcolor='rgba(0,0,0,0)',
             hoverlabel=dict(bgcolor='darkred'),
             annotations=annotations,
             title=dict(text=title, x=0.5, xref='paper',
                        font=dict(family='Arial', size=20, color=self._theme))))
    fig.update_xaxes(showgrid=False, color=self._theme)
    fig.update_yaxes(showgrid=False, color=self._theme)
    fig.show()

    self._n += lim  # update iteration control and slicing indices


def _plot_data(self, df, col) -> Tuple[np.ndarray, np.ndarray]:
    """
    Build the evaluation grid used to plot generated predictions.

    Parameters
    ----------
    df: pandas dataframe
        data holding ``col``.
    col: str
        feature column name.

    Returns
    -------
    x: numpy array
        100 evenly spaced values between the minimum and maximum of ``col``.
    grid: numpy array
        two-column design matrix ``[1, x]`` (intercept column first).
    """
    lim = np.linspace(df[col].min(), df[col].max(), 100)
    grid = np.c_[np.ones(len(lim)), lim]
    x = grid[:, 1]
    return x, grid


def _logistic_pred(self, grid, model) -> np.ndarray:
    """
    Generate predictions from a fitted logistic regression model.

    Parameters
    ----------
    grid: numpy array
        design matrix ``[1, x]`` as returned by ``_plot_data``.
    model: fitted model
        object exposing the fitted coefficients as ``params``.

    Returns
    -------
    numpy array
        probability of the positive class for a binary target, or of the
        last label for a multiclass target, for every row of ``grid``.
    """
    # Use a constant logit of zero for the first class in multiclass
    # because the reference class is never predicted directly
    logits = np.c_[np.zeros(len(grid)), grid.dot(model.params)]

    # class probability using softmax function (softmax = sigmoid for binary class).
    # Shifting by the row maximum leaves the result unchanged but keeps
    # `exp` from overflowing to inf (which would yield nan probabilities)
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    exp_logits = np.exp(shifted)
    y_pred = exp_logits / np.sum(exp_logits, axis=1, keepdims=True)

    # positive class label for binary or last label for multiclass
    return y_pred[:, -1]


def _func_text(self, params, poly = False, text_wrap = False) -> str:
    """
    Format regression equation and model parameters text.

    Parameters
    ----------
    params: sequence of float
        model coefficients, lowest order first.
    poly: bool
        If `True` format as a polynomial ``P(x)`` with one power of `x`
        per coefficient, otherwise as ``f(x) = a + bx``.
    text_wrap: bool
        Only used when ``poly`` is `True`. If `True` the text is wrapped at
        70 characters, lines are joined with ``"$ \\n $"`` and exponents are
        written as ``^n`` (seaborn plot title); otherwise exponents are kept
        as ``<sup>n</sup>`` markup (plotly hovertemplate).

    Returns
    -------
    str
        formatted equation.
    """
    if not poly:
        coef_list = [f"{coef:,.3f}" for coef in params]
        equation = "f(x) = " + " + ".join(coef_list) + "x"
        replace_map = {"+ -": "- "}
    else:
        coef_list = [f"{coef:,.3f}x<sup>{i}</sup>" for i, coef in enumerate(params)]
        text = "P(x) = " + " + ".join(coef_list)

        # generic mapping for proper display; order matters.
        # Zero and first order terms are simplified while the exponent is
        # still enclosed in <sup></sup>, so that higher exponents starting
        # with the same digit (10, 11, ...) are left untouched
        replace_map = {"+ -": "- ",
                       "x<sup>0</sup>": "",
                       "x<sup>1</sup>": "x"}

        if text_wrap:  # seaborn plot title
            equation = "$ \n $".join(wrap(text, 70))
            replace_map = replace_map | {"<sup>": "^",
                                         "</sup>": "",
                                         "+$ \n $-": "-$ \n $"}
        else:  # plotly hovertemplate
            equation = text

    for old, new in replace_map.items():
        equation = equation.replace(old, new)

    return equation


def _plotly_args(self, args) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Identify numeric and categorical features for proper formatting.

    Parameters
    ----------
    args: sequence
        column arguments passed to the plot; falsy entries are ignored.

    Returns
    -------
    active_args: numpy array
        unique non-empty arguments (order is not preserved).
    numeric: numpy array
        active arguments whose column in ``self.z_df_`` has an integer,
        float or complex dtype.
    categorical: numpy array
        remaining active arguments.
    """
    # active arguments, no need to preserve order
    active_args = np.array(list({arg for arg in args if arg}))

    # column dtypes of active arguments
    numeric = np.array([col for col in active_args
                        if self.z_df_[col].dtype.kind in 'ifc'])
    categorical = active_args[~np.isin(active_args, numeric)]

    return active_args, numeric, categorical


def _facet_map(self, category_orders, facet_row, facet_col,
               facet_col_wrap = None) -> Tuple[Optional[dict], Optional[dict]]:
    """
    Map category orders to their corresponding row and column
    positions within plotly facet grid.

    Rows are numbered bottom-up (row 1 is the bottom row) while
    categories are laid out top-down, hence the reversed row mapping.

    Parameters
    ----------
    category_orders: dict
        ordered categories keyed by facet column name.
    facet_row: str or None
        column name used for facet rows.
    facet_col: str or None
        column name used for facet columns.
    facet_col_wrap: int or None
        maximum number of facet columns; ignored if ``facet_row`` is set.

    Returns
    -------
    row_map: dict or None
        category -> row position, `None` when rows are not faceted.
    col_map: dict or None
        category -> column position, `None` when columns are not faceted.
    """
    if facet_col and facet_row:
        n_rows_ = range(1, len(category_orders[facet_row]) + 1)
        n_cols_ = range(1, len(category_orders[facet_col]) + 1)
        row_map = dict(zip(np.flip(category_orders[facet_row]), n_rows_))
        col_map = dict(zip(category_orders[facet_col], n_cols_))

    elif facet_col:
        if facet_col_wrap:
            # Wraps the column variable at this width: categories fill the
            # grid left to right starting from the top row, so a partially
            # filled row (if any) is always the bottom one (row 1)
            cats = list(category_orders[facet_col])
            n_rows_ = int(np.ceil(len(cats) / facet_col_wrap))
            col_map = {cat: i % facet_col_wrap + 1 for i, cat in enumerate(cats)}
            # built in reverse to keep bottom-up insertion order
            row_map = {cats[i]: n_rows_ - i // facet_col_wrap
                       for i in reversed(range(len(cats)))}
        else:
            n_cols_ = range(1, len(category_orders[facet_col]) + 1)
            row_map = None
            col_map = dict(zip(category_orders[facet_col], n_cols_))

    elif facet_row:
        n_rows_ = range(1, len(category_orders[facet_row]) + 1)
        row_map = dict(zip(np.flip(category_orders[facet_row]), n_rows_))
        col_map = None

    else:
        row_map = col_map = None

    return row_map, col_map


def _fit_models(self, col, target, fit_data, fit, degree,
                row_idx = None, col_idx = None, comb = None, groupers = None,
                log_x = False, log_y = False,
                model = False) -> Tuple[np.ndarray, Optional[np.ndarray], Optional[str], Optional[str]]:
    """
    Generate fit data for Logistic and Polynomial fits in plotly scatter plots.

    Parameters
    ----------
    col: str
        feature column name.
    target: str
        target column name.
    fit_data: pandas dataframe
        data to fit.
    fit: str
        `logistic` fits a logistic regression, any other value fits a
        polynomial of the given ``degree``.
    degree: int
        polynomial degree; also selects polynomial formatting of the
        equation text when greater than 1.
    row_idx, col_idx: int or None
        facet position, only recorded when the fit fails.
    comb: tuple or None
        category combination of the group being fitted.
    groupers: sequence or None
        grouping column names matching ``comb``; falsy entries are skipped.
    log_x, log_y: bool
        If `True` the corresponding output is raised to the power of 10.
    model: fitted model, array of coefficients or False
        previously fitted model to reuse; a new one is fitted when falsy.

    Returns
    -------
    x: numpy array
        grid values of ``col``.
    y_pred: numpy array or None
        predictions over the grid.
    hovertemplate: str or None
        plotly hover template describing the fit.
    name: str or None
        trace name.

    Note
    ----
    The fitted model is appended to ``self._models``. If fitting fails,
    the failure is recorded in ``self._fit_fail`` (when ``comb`` is given)
    or logged, and the values computed so far are returned.
    """
    # plot data
    x, grid = self._plot_data(fit_data, col)

    # generic return values
    y_pred = hovertemplate = name = None

    # indications of groups for each fit
    gs = [f'{g}: {comb[i]}' for i, g in enumerate(groupers) if g] if groupers else []

    try:
        if fit == 'logistic':
            if not model:
                model = mnlogit(formula=f'Q("{target}")~Q("{col}")',
                                data=fit_data).fit(disp=0, method=self._method, **self._kwargs)

            y_pred = self._logistic_pred(grid, model)
            pr = model.prsquared
            llrp = model.llr_pvalue

            # <extra></extra> remove trace name
            if self._binary_t:
                paras = model.params.values.ravel()
                y_hover = 'P(y = 1): %{y:,.0%}<b>(probability)</b><extra></extra>'
                name = f'{target} Logistic Fit'
            else:
                paras = model.params.values[:, -1]
                y_hover = 'P(y = c): %{y:,.0%}<b>(probability)</b><extra></extra>'.replace('c', f'{self._lbl[-1]}')
                name = f'{target} Logistic Fit: {self._lbl[-1]} vs {self._lbl[0]}(base)'

            # decorate
            hover_text = [f'pR<sup>2</sup> = {pr:,.3f}',
                          f'llrp = {llrp:,.4f}</b><br>',
                          *gs,
                          'col: %{x:,.3f}'.replace('col', f'{col}'),
                          y_hover]
        else:
            if not np.any(model):
                model = np.polynomial.polynomial.polyfit(fit_data[col], fit_data[target], degree)

            paras = model
            # grid holds [1, x]; predictions for x are in the last column
            y_pred = np.polynomial.polynomial.polyval(grid, model)[:, -1]
            name = 'Polynomial Fit'
            hover_text = [*gs,
                          'col: %{x:,.3f}'.replace('col', f'{col}'),
                          'c: %{y:,.3f}<b>(trend)</b><extra></extra>'.replace('c', target)]

        if log_x:
            x = np.power(10, x)
        if log_y:
            y_pred = np.power(10, y_pred)

        # decorate
        text = self._func_text(paras, poly=bool(degree > 1))
        hovertemplate = '<br>'.join([f'<b>Fit: {fit}</b>',
                                     f'{text}</b><br>'.replace('<br>', '') if fit == 'logistic' else f'{text}</b><br>',
                                     *hover_text])

        # update fit container
        # unpack categories tuple and
        # poly fit output of array dtype
        if self._facets and comb:
            self._models.append((*comb, *model) if np.ndim(model) else (*comb, model))
        else:
            self._models.append(model)

    # best effort: a failed fit must not break the plot, but interrupts
    # (KeyboardInterrupt, SystemExit) are allowed to propagate
    except Exception:
        if comb:
            self._fit_fail.append(dict(zip(self._g_map.keys(), comb)) | {'row': row_idx, 'col': col_idx})
        else:
            _z_log.info(f"{fit} was not applied! Please ensure that data to fit matches model requirements")

    return x, y_pred, hovertemplate, name