Source code for lib5c.algorithms.thresholding

import numpy as np
from scipy.ndimage import label, generate_binary_structure
from sklearn.metrics import confusion_matrix

from lib5c.util.parallelization import parallelize_regions
from lib5c.structures.dataset import Dataset
from lib5c.util.metrics import cohens_kappa
from lib5c.util.statistics import adjust_pvalues, stouffer, convert_to_two_tail


def two_way_thresholding(pvalues_superdict, primermap, conditions=None,
                         significance_threshold=1e-15, bh_fdr=False,
                         two_tail=False, concordant=False,
                         distance_threshold=24000, size_threshold=3,
                         background_threshold=0.6, report_clusters=True):
    """
    All-in-one heavy-lifting function for thresholding.

    Parameters
    ----------
    pvalues_superdict : dict of dict of np.ndarray
        The p-values to threshold.
    primermap : primermap
        The primermap associated with the pvalues_superdict.
    conditions : list of str, optional
        The list of condition names. Pass None to skip condition comparisons.
    significance_threshold : float
        The p-value or q-value to threshold significance with.
    bh_fdr : bool
        Pass True to apply multiple testing correction (BH-FDR) before
        checking the ``significance_threshold``.
    two_tail : bool
        If ``bh_fdr=True``, pass True here to perform the BH-FDR on
        two-tailed p-values, but only report the significant right-tail
        events as loops. Note that two-tailed p-values are only accurate if
        p-values were called using a continuous distribution.
    concordant : bool
        Pass True to report only those interactions which are significant in
        all replicates in each condition. Pass False to combine evidence from
        all replicates within each condition instead.
    distance_threshold : int
        Interactions with interaction distance (in bp) shorter than this
        will not be called.
    size_threshold : int
        Interactions within connected components smaller than this will not
        be called.
    background_threshold : float, optional
        The p-value threshold to use to call a background loop class. Pass
        None to skip calling a background class.
    report_clusters : bool
        Pass True to perform a second pass of connected component counting
        at the very end, reporting the numbers of clusters in each color
        category to the returned Dataset.

    Returns
    -------
    Dataset
        The results of the thresholding.
    """
    # resolve conditions
    if conditions is None:
        conditions = ['dummy_condition']

    # prepare dataset
    print('preparing dataset')
    d = Dataset.from_counts_superdict(
        pvalues_superdict, primermap, name='rep_pvalue', repinfo=conditions)
    d.dropna(name='rep_pvalue')

    # distance thresholding
    print('distance thresholding')
    filter_near_diagonal(d.df, distance=distance_threshold)

    # combine replicate information per condition
    for cond in d.conditions:
        d.df[('pvalue', cond)] = stouffer(
            d.df[[('rep_pvalue', rep) for rep in d.cond_reps[cond]]], axis=1)

    # significance thresholding
    print('significance thresholding')
    threshold_column = 'pvalue'
    if bh_fdr:
        if two_tail:
            d.apply_per_replicate(
                convert_to_two_tail, 'pvalue', 'two_tail_pvalue')
            d.apply_per_replicate(
                convert_to_two_tail, 'rep_pvalue', 'rep_two_tail_pvalue')
            d.apply_per_replicate(
                adjust_pvalues, 'two_tail_pvalue', 'two_tail_qvalue')
            d.apply_per_replicate(
                adjust_pvalues, 'rep_two_tail_pvalue', 'rep_two_tail_qvalue')
            # keep the two-tailed q-value only for right-tail events
            # (p < 0.5); left-tail events get a q-value of 1.0
            d.apply_per_replicate(
                lambda x, y: np.where(y < 0.5, x, 1.0),
                ['two_tail_qvalue', 'pvalue'], 'qvalue')
            d.apply_per_replicate(
                lambda x, y: np.where(y < 0.5, x, 1.0),
                ['rep_two_tail_qvalue', 'rep_pvalue'], 'rep_qvalue')
        else:
            d.apply_per_replicate(adjust_pvalues, 'pvalue', 'qvalue')
            d.apply_per_replicate(adjust_pvalues, 'rep_pvalue', 'rep_qvalue')
        threshold_column = 'qvalue'
    d.apply_per_replicate(
        lambda x: x < significance_threshold, threshold_column,
        'significant_unfiltered')
    d.apply_per_replicate(
        lambda x: x < significance_threshold, 'rep_%s' % threshold_column,
        'rep_significant_unfiltered')

    # cluster size filtering
    if size_threshold > 1:
        print('size thresholding')
        if not concordant:
            d.add_columns_from_counts_superdict(
                {cond: size_filter(
                    d.counts(name='significant_unfiltered', rep=cond),
                    size_threshold)
                 for cond in d.conditions},
                'significant', rep_order=d.conditions)
        else:
            d.add_columns_from_counts_superdict(
                {rep: size_filter(
                    d.counts(name='rep_significant_unfiltered', rep=rep),
                    size_threshold)
                 for rep in d.reps},
                'rep_significant', rep_order=d.reps)
    else:
        if not concordant:
            d.apply_per_replicate(
                lambda x: x, 'significant_unfiltered', 'significant')
        else:
            d.apply_per_replicate(
                lambda x: x, 'rep_significant_unfiltered', 'rep_significant')

    # concordance within conditions
    if concordant:
        print('computing concordance within conditions')
        for cond in d.conditions:
            d.df['concordant', cond] = np.all(
                [d.df.loc[:, ('rep_significant', rep)].values
                 for rep in d.cond_reps[cond]], axis=0)

    # intersections across conditions
    if len(d.conditions) == 2:
        print('intersecting across conditions')
        criterion = 'concordant' if concordant else 'significant'
        d.df['color'] = ''
        d.df.loc[d.df[criterion, d.conditions[0]] &
                 d.df[criterion, d.conditions[1]], 'color'] = 'constitutive'
        d.df.loc[d.df[criterion, d.conditions[0]] &
                 ~d.df[criterion, d.conditions[1]], 'color'] \
            = '%s_only' % d.conditions[0]
        d.df.loc[~d.df[criterion, d.conditions[0]] &
                 d.df[criterion, d.conditions[1]], 'color'] \
            = '%s_only' % d.conditions[1]

    # add background
    if background_threshold is not None:
        print('adding background')
        d.df.loc[np.all(d.df['rep_pvalue'].values > background_threshold,
                        axis=1), 'color'] = 'background'

    # report cluster info
    if report_clusters is True:
        print('counting final clusters')
        color_counts = d.counts('color')
        for color in set(d.df['color'].unique()) - {'background', ''}:
            d.add_column_from_counts(
                label_connected_components(color_counts, color), color)

    return d


def kappa(d):
    """
    Compute the Cohen's kappa values between the replicates of each
    condition.

    Parameters
    ----------
    d : Dataset
        Dataset processed by `two_way_thresholding()`.

    Returns
    -------
    dict
        The keys are condition names, the values are the kappa values.
    """
    key = 'rep_significant' \
        if ('rep_significant', d.reps[0]) in d.df.columns \
        else 'rep_significant_unfiltered'
    return {cond: cohens_kappa(*[d.df.loc[:, (key, rep)]
                                 for rep in d.cond_reps[cond]])
            for cond in d.conditions}
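
# Hypothetical usage: with `d` as returned by two_way_thresholding() above,
# each condition's kappa summarizes agreement between its replicates'
# boolean significance calls (1.0 is perfect agreement, 0.0 is chance level):
#
#     kappa(d)  # maps each condition name to a float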


def concordance_confusion(d):
    """
    Extract the within-condition concordance confusion matrices.

    Parameters
    ----------
    d : Dataset
        Dataset processed by `two_way_thresholding()`.

    Returns
    -------
    dict
        The keys are condition names, the values are the 2x2 confusion
        matrices.
    """
    key = 'rep_significant' \
        if ('rep_significant', d.reps[0]) in d.df.columns \
        else 'rep_significant_unfiltered'
    return {cond: confusion_matrix(*[d.df.loc[:, (key, rep)]
                                     for rep in d.cond_reps[cond]])
            for cond in d.conditions}
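
# Hypothetical usage: each value is the 2x2 cross-tabulation of two
# replicates' boolean calls within one condition ('ES' is a stand-in
# condition name):
#
#     cm = concordance_confusion(d)['ES']
#     cm[1, 1]  # interactions called significant in both replicates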


def color_confusion(d):
    """
    Extract the across-condition color confusion matrix.

    Parameters
    ----------
    d : Dataset
        Dataset processed by `two_way_thresholding()`.

    Returns
    -------
    np.ndarray
        The 2x2 confusion matrix.
    """
    criterion = 'concordant' if 'concordant' in {c[0] for c in list(d.df)} \
        else 'significant'
    return confusion_matrix(*[d.df.loc[:, (criterion, cond)]
                              for cond in d.conditions])
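
# Hypothetical usage: with exactly two conditions in `d`, rows of the
# returned matrix follow the first condition's calls and columns the
# second's, so entry [0, 0] counts interactions significant in neither:
#
#     color_confusion(d)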


def count_clusters(d):
    """
    Extract the final cluster counts.

    Parameters
    ----------
    d : Dataset
        Dataset processed by `two_way_thresholding()` called with
        `report_clusters=True`.

    Returns
    -------
    dict
        The keys are the color names as strings, the values are integers
        representing the cluster counts.
    """
    cluster_counts = {}
    for color in set(d.df['color'].unique()) - {'background', ''}:
        if not isinstance(color, str):
            continue
        # count unique (cluster label, region) pairs for this color
        row_idx = d.df[color] != 0
        cluster_counts[color] = len(np.unique(
            d.df.loc[row_idx, color].map(str) + '_' +
            d.df.loc[row_idx, 'region']))
    return cluster_counts
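
# Hypothetical usage: requires that two_way_thresholding() was run with
# report_clusters=True so the per-color cluster label columns exist:
#
#     count_clusters(d)  # dict mapping each color name to a cluster count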


def filter_near_diagonal(df, distance=24000, drop=True):
    """
    Drops rows from ``df`` whose 'distance' column is less than ``distance``.
    By default, dropping occurs in-place.

    Parameters
    ----------
    df : pd.DataFrame
        Must have a 'distance' column.
    distance : int
        Threshold for distance (in bp).
    drop : bool
        Pass True to drop the filtered rows in-place. Pass False to return
        an index subset for the filtered rows instead.
    """
    index_subset = df[df['distance'] < distance].index
    if drop:
        df.drop(index_subset, inplace=True)
    else:
        return index_subset
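
# A minimal, self-contained sketch of filter_near_diagonal() on a toy frame
# (the pandas import and the toy data are illustrative, not part of this
# module):
#
#     import pandas as pd
#     toy = pd.DataFrame({'distance': [4000, 24000, 100000]})
#     filter_near_diagonal(toy, distance=24000)
#     list(toy['distance'])  # -> [24000, 100000]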


@parallelize_regions
def size_filter(calls, threshold):
    """
    Removes calls which are in connected components smaller than a threshold.

    Parameters
    ----------
    calls : np.ndarray
        Boolean matrix of calls.
    threshold : int
        Connected components smaller than this will be removed.

    Returns
    -------
    np.ndarray
        The filtered calls.

    Examples
    --------
    >>> calls = np.array([[ True,  True, False,  True],
    ...                   [ True,  True, False, False],
    ...                   [False, False, False,  True],
    ...                   [ True, False,  True,  True]])
    >>> size_filter(calls, 3)
    array([[ True,  True, False, False],
           [ True,  True, False, False],
           [False, False, False, False],
           [False, False, False, False]])
    """
    components, num_components = label(
        np.tril(calls), structure=generate_binary_structure(2, 2))
    for i in range(1, num_components + 1):
        if np.sum(components == i) < threshold:
            components[components == i] = 0
    result = components > 0
    return result | result.T
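
# Design note: size_filter() above and label_connected_components() below
# both label np.tril() of the call matrix so that each symmetric interaction
# is counted only once, then mirror the result with its transpose to restore
# the full symmetric matrix.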


@parallelize_regions
def label_connected_components(colors, color):
    """
    Labels the connected components of a specific loop color.

    Parameters
    ----------
    colors : np.ndarray with string dtype
        The matrix of colors.
    color : str
        The color to label.

    Returns
    -------
    np.ndarray
        Same size and shape as colors, entries are ints which are the labels.

    Examples
    --------
    >>> colors = np.array([['a', 'a', 'b', 'a'],
    ...                    ['a', 'a', 'b', 'b'],
    ...                    ['b', 'b', 'b', 'a'],
    ...                    ['a', 'b', 'a', 'a']])
    >>> print(label_connected_components(colors, 'a'))
    [[1 1 0 2]
     [1 1 0 0]
     [0 0 0 3]
     [2 0 3 3]]
    """
    calls = colors == color
    components, _ = label(
        np.tril(calls), structure=generate_binary_structure(2, 2))
    return components | components.T