Source code for lib5c.plotters.bias_heatmaps

"""
Module for plotting bias heatmaps.
"""

import numpy as np
import scipy.stats as stats
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

from lib5c.plotters.asymmetric_colormap import shifted_colormap
from lib5c.plotters.colormaps import get_colormap
from lib5c.util.mathematics import gmean
from lib5c.util.stratification import conservative_qcut
from lib5c.util.plotting import plotter


[docs]@plotter def plot_bias_heatmap(obs_counts, exp_counts, primermap, factor, bins=None, n_bins=None, cmap=None, vmin=None, vmax=None, midpoint=None, log=True, region=None, agg=gmean, asymmetric=False, print_variance=False, shuffle=0, zero_inflated=False, unique=False, despine=False, style='dark', dpi=300, **kwargs): """ Plots a bias heatmap. Parameters ---------- obs_counts : Dict[str, np.ndarray] The dict of observed counts. exp_counts : Dict[str, np.ndarray] The dict of expected counts. primermap : Dict[str, List[Dict[str, Any]]] Primermap or pixelmap describing the loci in ``obs_counts`` and ``exp_counts``. factor : str The bias factor to draw the bias heatmap for. This string must match a metadata key in ``primermap``. That is to say, if ``factor`` is ``'length'`` then we expect ``primermap[region][i]['length']`` to be a number representing the length of the ``i`` th fragment in the region specified by ``region``. bins : Optional[Sequence[numeric]] The endpoints of the bins to use to stratify the bias factor values. Either ``bins`` or ``n_bins`` must be specified. n_bins : Optional[int] The number of even-number bins to use to stratify the bias factor values. Either ``bins`` or ``n_bins`` must be specified. cmap : Optional[matplotlib.colors.Colormap] Pass a colormap to use for the heatmap. If this kwarg is not passed, the default 'bias' colormap is used. vmin : Optional[float] The minimum value to use for the heatmap. If this kwarg is not passed, the min of the data will be used. vmax : Optional[float] The maximum value to use for the heatmap. If this kwarg is not passed, the max of the data will be used. midpoint : Optional[float] The midpoint value to use for the colormap. If this kwarg is not passed, the colormap will be symmetric about its midpoint. This kwarg can be used to force the midpoint of the colormap to lie at a desired value, such as 0. log : bool Whether or not to show log-scale fold-enrichments in the heatmap. region : Optional[str] Pass a region name as a string to consider only the contacts in one particular region. If this kwarg is not passed, contacts for all regions in the input counts dicts will be used to generate the bias heatmap. agg : Callable[[np.ndarray], float] The aggregation function to use when summarizing the strata. This function should take in an array of floats and return a single summary value. asymmetric : bool Pass True to construct heatmaps using only the upper-triangular elements of the counts matrices, which can lead to asymmetric heatmaps. By default, the algorithm iterates over all elements of the counts matrices, enforcing symmetry in the bias models but incurring some redundancy in the actual counts information. print_variance : bool If True, the variance of the bias across the stratification grid will be printed in the plot title. shuffle : int Specify a number of random permutation null hypothesis simulations to perform. zero_inflated : bool Pass True here to treat the bias factor as "zero inflated", which will cause all the zero values to land in a dedicated "zero stratum" and allocate the remaining bins evenly among the positive data. This kwarg is ignored if the bins are passed explicitly. unique : bool Pass True to override `bins` and `n_bins` and simply put each unique value of the bias factor into its own stratum. dpi : int DPI to save figure at if auto-saving to a raster format. kwargs : kwargs Typical plotter kwargs. Returns ------- Tuple[pyplot axis, float] The first element is the pyplot axis plotted on, which will always be present. The second element is the variance of the enrichment with respect to the bias factor grid. The third element is the percentile value for this variance obtained from simulations under the null hypothesis. The fourth element is the 95% RI for the null hypothesis. These last three will be nan if no simulations were performed. Notes ----- The simulations were a cool idea, but in reality the 95% null hypothesis RI is incredibly small since it is very unlikely to see any enrichment at all if the obs and exp counts are forcibly de-correlated from the bias factors. Therefore the recommendation is to not simulate unless you're really sure you want it. """ # resolve cmap if cmap is None: cmap = get_colormap('bias') # resolve regions regions = list(obs_counts.keys()) if region is not None: regions = [region] # make dataframe list_of_dict = [] for region in regions: for i in range(len(obs_counts[region])): if asymmetric: jrange = range(i + 1) else: jrange = range(len(obs_counts[region])) for j in jrange: if np.isfinite(obs_counts[region][i, j]): list_of_dict.append( {'upstream_factor' : primermap[region][j][factor], 'downstream_factor': primermap[region][i][factor], 'obs' : obs_counts[region][i, j], 'exp' : exp_counts[region][i, j]}) df = pd.DataFrame(list_of_dict) # cut/qcut if unique or bins is not None: if unique: bins = np.unique(df.upstream_factor) bins = np.concatenate([[bins[0] - 0.001], bins]) df['Upstream %s' % factor] = pd.cut(df.upstream_factor, bins) df['Downstream %s' % factor] = pd.cut(df.downstream_factor, bins) elif n_bins is not None: if zero_inflated: bins = conservative_qcut(df.upstream_factor, n_bins) df['Upstream %s' % factor] = pd.cut(df.upstream_factor, bins) df['Downstream %s' % factor] = pd.cut(df.downstream_factor, bins) else: df['Upstream %s' % factor] = pd.qcut(df.upstream_factor, n_bins) df['Downstream %s' % factor] = pd.qcut(df.downstream_factor, n_bins) else: raise ValueError('must pass bins or n_bins, or set unique=True') # stratify and aggregate plot_df = _group_and_agg(df, factor, agg, log) # compute variance variance = np.var( plot_df.values[np.tril(np.ones_like(plot_df.values, dtype=bool))], ddof=1) # simulate variance simulated_variances = [] for i in range(shuffle): # shuffle permutation = np.random.permutation(len(df)) df.ix[:, 'obs'] = df['obs'][permutation].reset_index(drop=True) df.ix[:, 'exp'] = df['exp'][permutation].reset_index(drop=True) # stratify and aggregate temp_df = _group_and_agg(df, factor, agg, log) # compute variance simulated_variances.append(np.var( temp_df.values[np.tril(np.ones_like(temp_df.values, dtype=bool))], ddof=1)) if simulated_variances: simulated_variance_percentile = stats.percentileofscore( simulated_variances, variance, kind='strict') variance_95_ri = np.percentile(simulated_variances, 95) else: simulated_variance_percentile = np.nan variance_95_ri = np.nan # prepare cmap if midpoint is not None: cmap = shifted_colormap(cmap, midpoint=midpoint) # plot with sns.plotting_context(context='paper'): plt.clf() sns.heatmap(plot_df, square=True, cmap=cmap, vmin=vmin, vmax=vmax, cbar_kws={'ticks': [vmin, vmax]}) plt.yticks(rotation=0) plt.xticks(rotation=45, ha='right') if print_variance: if np.isfinite(simulated_variance_percentile): plt.title('Variance: %g (%.1f%%, 95%% H_0 RI = %g)' % (float(variance), simulated_variance_percentile, variance_95_ri)) else: plt.title('Variance: %g' % float(variance)) return variance, simulated_variance_percentile, variance_95_ri
def _group_and_agg(df, factor, agg=gmean, log=True): # prepare plotting dataframe plot_df = (df.groupby(['Upstream %s' % factor, 'Downstream %s' % factor])['obs'] .agg(lambda x: agg(x)) / df.groupby(['Upstream %s' % factor, 'Downstream %s' % factor])['exp'] .agg(lambda x: agg(x))).unstack().iloc[::-1] if log: plot_df = np.log2(plot_df) return plot_df