Source code for lib5c.util.counts_superdict

"""
Module for creating and performing operations on ``counts_superdict`` data
structures. ``counts_superdict`` data structures are conceptually just
collections of named counts dicts. Typically, the members of the collection
correspond to replicates or conditions in a 5C experiment. ``counts_superdict``
data structures are implemented as dicts of dicts of 2D numpy arrays as shown in
this example::

    counts_superdict[replicate_name][region_name] = 2D numpy array

Here the innermost values are the square symmetric 2D numpy arrays representing
the counts for the specified replicate and region. In other words, they are a
dict whose keys are replicate names as strings and whose values are standard
counts data structures. See ``lib5c.parsers.counts.load_counts()`` or
``lib5c.parsers.counts.load_primer_counts()``.
"""

from copy import deepcopy

import numpy as np
from scipy import stats

from lib5c.util.counts import calculate_pvalues, flatten_counts_to_list,\
    unflatten_counts_from_list


def make_atlas(counts_superdict, pvalues_superdict=None, distribution=stats.norm,
               percentile_threshold=None):
    """
    Computes the maximum counts and minimum p-values for each region of a 5C
    dataset across multiple replicates or conditions.

    Parameters
    ----------
    counts_superdict : dict of counts dicts
        The counts for all replicates or conditions to be used to generate the
        atlas. The keys are the replicate or condition names, the values are
        standard counts dicts containing the counts for that replicate. See
        ``lib5c.parsers.counts.load_counts()`` or
        ``lib5c.parsers.counts.load_primer_counts()``.
    pvalues_superdict : dict of counts dicts or None
        The counts for all replicates or conditions to be used to generate the
        atlas. The keys are the replicate or condition names, the values are
        standard counts dicts containing the p-values for that replicate. If
        None is passed, the p-values will be automatically computed using
        ``lib5c.util.counts.calculate_regional_pvalues()``.
    distribution : subclass of scipy.stats.rv_continuous
        If pvalue_superdict is None, this kwarg specifies the distribution to
        use when modeling the counts.
    percentile_threshold : float between 0 and 100 or None
        If passed, all p-value modeling kwargs are ignored and all p-value
        modeling steps are skipped. Instead, the returned atlas peaks will
        contain a dummy p-value, which will be 0.0 whenever the peak passes
        the percentile threshold, and 1.0 otherwise.

    Returns
    -------
    (counts dict, counts dict) tuple
        The first element in the tuple is the counts dict containing the
        maximum counts observed across all replicates at each primer or bin
        combination in each region. The second element in the tuple is the
        parallel counts dict containing the corresponding minimum p-values.
    """
    # determine replicates and regions
    replicates = list(counts_superdict.keys())
    regions = list(counts_superdict[replicates[0]].keys())

    # generate pvalues_superdict if None was passed
    if not pvalues_superdict:
        pvalues_superdict = make_pvalues_superdict(
            counts_superdict, distribution=distribution,
            percentile_threshold=percentile_threshold)

    # compute atlas; start from the first replicate's values and improve them
    max_counts = deepcopy(counts_superdict[replicates[0]])
    min_pvalues = deepcopy(pvalues_superdict[replicates[0]])
    for rep in replicates:
        for region in regions:
            # vectorized replacement for the original elementwise double
            # loop: wherever this replicate beats the running max count,
            # take its count and the matching p-value (NaN comparisons are
            # False, so NaN entries never overwrite the running max, exactly
            # as in the elementwise version)
            mask = counts_superdict[rep][region] > max_counts[region]
            max_counts[region][mask] = counts_superdict[rep][region][mask]
            # NOTE(review): this records the p-value of the replicate with
            # the max count; it equals the true minimum p-value only when
            # higher counts imply lower p-values — confirm with callers
            min_pvalues[region][mask] = pvalues_superdict[rep][region][mask]
    return max_counts, min_pvalues
def make_pvalues_superdict(counts_superdict, distribution=stats.norm,
                           percentile_threshold=None):
    """
    Makes a counts_superdict-like data structure containing automatically
    computed p-values for each replicate for each region.

    The counts within each region are modeled independently for each replicate
    using the distribution specified by the distribution kwarg. See
    ``lib5c.util.counts.calculate_regional_pvalues()``.

    Parameters
    ----------
    counts_superdict : dict of counts dicts
        The counts for all replicates or conditions for which p-values should
        be computed. The keys are the replicate or condition names, the values
        are standard counts dicts containing the counts for that replicate.
        See ``lib5c.parsers.counts.load_counts()`` or
        ``lib5c.parsers.counts.load_primer_counts()``.
    distribution : subclass of scipy.stats.rv_continuous
        The distribution to use when modeling the counts.
    percentile_threshold : float between 0 and 100 or None
        If passed, the distribution kwarg is ignored and p-value modeling is
        skipped. Instead, the returned data structure will contain dummy
        p-values, which will be 0.0 whenever the peak passes the percentile
        threshold, and 1.0 otherwise. This percentile threshold is applied
        independently for each region and each replicate.

    Returns
    -------
    dict of counts dicts
        A parallel data structure containing the corresponding p-values.
    """
    pvalues_superdict = {}
    for rep in counts_superdict:
        # calculate_pvalues() returns a tuple; the p-values dict is first
        pvalues_superdict[rep] = calculate_pvalues(
            counts_superdict[rep], distribution, percentile_threshold)[0]
    return pvalues_superdict
def make_atlas_peaks(counts_superdict, pvalues_superdict=None, max_counts=None,
                     min_pvalues=None, distribution=stats.norm,
                     percentile_threshold=None):
    """
    Reshapes count and p-value information across multiple replicates into a
    peaks data structure compatible with the ``lib5c.algorithms.clustering``
    subpackage.

    Parameters
    ----------
    counts_superdict : dict of counts dicts
        The counts for all replicates or conditions to be used to generate the
        atlas. The keys are the replicate or condition names, the values are
        standard counts dicts containing the counts for that replicate. See
        ``lib5c.parsers.counts.load_counts()`` or
        ``lib5c.parsers.counts.load_primer_counts()``.
    pvalues_superdict : dict of counts dicts or None
        The p-values for all replicates or conditions to be used to generate
        the atlas. The keys are the replicate or condition names, the values
        are standard counts dicts containing the p-values for that replicate.
        See ``lib5c.parsers.counts.load_counts()`` or
        ``lib5c.parsers.counts.load_primer_counts()``. If None is passed,
        p-values will be automatically computed for each replicate for each
        region by modeling each region within each replicate independently
        with the distribution specified by the distribution kwarg. See
        ``lib5c.util.counts.calculate_regional_pvalues()``.
    max_counts : counts dict or None
        A standard counts dict containing the maximum count value observed for
        every interaction in each region across all replicates. If None is
        passed, this will be computed automatically based on the
        ``counts_superdict``. See ``lib5c.util.counts.make_atlas()``.
    min_pvalues : counts dict or None
        A standard counts dict containing the minimum p-value observed for
        every interaction in each region across all replicates. If None is
        passed, this will be computed automatically based on the
        ``pvalues_superdict``. See ``lib5c.util.counts.make_atlas()``.
    distribution : subclass of scipy.stats.rv_continuous
        If pvalue_superdict is None, this kwarg specifies the distribution to
        use when modeling the counts.
    percentile_threshold : float between 0 and 100 or None
        If passed, all p-value modeling kwargs are ignored and all p-value
        modeling steps are skipped. Instead, the returned atlas peaks will
        contain a dummy p-value, which will be 0.0 whenever the peak passes
        the percentile threshold, and 1.0 otherwise.

    Returns
    -------
    complex dict structure
        The returned peaks data structure has the following structure::

            atlas_peaks[region_name] = {
                'x': int,
                'y': int,
                'value': float,
                'pvalue': float,
                'replicates' : {
                    rep_name: {
                        'value': float,
                        'pvalue': float
                    }
                }
            }

        Here ``region_name`` is a string referring to the region name. The 'x'
        and 'y' keys describe the x- and y-coordinate of the peak,
        respectively. The 'value' and 'pvalue' keys describe the max count and
        minimum p-value observed at those coordinates within the specified
        region across all replicates. The 'replicates' key's value is a dict
        containing more information about the values and p-values observed at
        the specified coordinates in each of the replicates in the original
        counts_superdict and pvalues_superdict.

    Notes
    -----
    Only the lower-triangular and diagonal elements (those for which the
    x-coordinate is greater than or equal to the y-coordinate) of each
    region's counts are included in the returned data structure to prevent
    duplication of data.
    """
    # deduce replicates and regions
    replicates = list(counts_superdict.keys())
    regions = list(counts_superdict[replicates[0]].keys())

    # create pvalues_superdict if not present
    if not pvalues_superdict:
        pvalues_superdict = make_pvalues_superdict(
            counts_superdict, distribution=distribution,
            percentile_threshold=percentile_threshold)

    # compute max_counts and min_pvalues if not present
    if (not max_counts) or (not min_pvalues):
        max_counts, min_pvalues = make_atlas(counts_superdict,
                                             pvalues_superdict)

    # make atlas peaks
    atlas_peaks = {}
    for region in regions:
        region_peaks = []
        size = len(max_counts[region])
        for x in range(size):
            # y <= x: lower triangle plus diagonal only, see Notes
            for y in range(x + 1):
                region_peaks.append({
                    'x': x,
                    'y': y,
                    'value': max_counts[region][x, y],
                    'pvalue': min_pvalues[region][x, y],
                    'replicates': {
                        rep: {'value': counts_superdict[rep][region][x, y],
                              'pvalue': pvalues_superdict[rep][region][x, y]}
                        for rep in replicates}})
        atlas_peaks[region] = region_peaks
    return atlas_peaks
def counts_superdict_to_matrix(counts_superdict, rep_order=None,
                               region_order=None, discard_nan=False):
    """
    Convert a counts_superdict structure to a matrix.

    Parameters
    ----------
    counts_superdict : Dict[Dict[np.ndarray]]
        The keys of the outer dict are replicate names, the keys of the inner
        dict are region names, the values are square symmetric arrays of
        counts for the specified replicate and region.
    rep_order : Optional[List[str]]
        The order in which the replicates should be arranged. Pass None to use
        the order of ``counts_superdict.keys()``.
    region_order : Optional[List[str]]
        The order in which the regions should be concatenated. Pass None to
        use the order of the keys.
    discard_nan : bool
        If True, positions containing ``nan`` values will be removed.

    Returns
    -------
    np.ndarray
        Each row is a replicate, each column is a position. The order of the
        columns is described in
        ``lib5c.util.counts.flatten_counts_to_list()``, honoring the passed
        region order.

    Examples
    --------
    >>> import numpy as np
    >>> counts_superdict = {'Rep1': {'A': np.array([[1, 2], [2, 3]]),
    ...                              'B': np.array([[4, 8], [8, 10]])},
    ...                     'Rep2': {'A': np.array([[3, 5], [5, 6]]),
    ...                              'B': np.array([[7, 9], [9, np.nan]])}}
    >>> rep_order = ['Rep1', 'Rep2']
    >>> region_order = ['A', 'B']
    >>> counts_superdict_to_matrix(counts_superdict, rep_order, region_order)
    array([[ 1.,  2.,  3.,  4.,  8., 10.],
           [ 3.,  5.,  6.,  7.,  9., nan]])
    >>> counts_superdict_to_matrix(counts_superdict, rep_order, region_order,
    ...                            discard_nan=True)
    array([[1., 2., 3., 4., 8.],
           [3., 5., 6., 7., 9.]])
    """
    # fill in defaults: reps from the outer dict, regions from the first rep
    if rep_order is None:
        rep_order = list(counts_superdict)
    if region_order is None:
        region_order = list(counts_superdict[rep_order[0]])

    # one flattened row per replicate, regions concatenated in region_order;
    # nans are kept here so that columns stay aligned across replicates
    rows = [flatten_counts_to_list(counts_superdict[rep],
                                   region_order=region_order,
                                   discard_nan=False)
            for rep in rep_order]
    matrix = np.array(rows)

    # drop any column that contains a nan in any replicate
    if discard_nan:
        nan_free_columns = ~np.isnan(matrix).any(axis=0)
        matrix = matrix[:, nan_free_columns]
    return matrix
def matrix_to_counts_superdict(matrix, rep_order, region_order, pixelmap):
    """
    Converts a matrix back into a counts_superdict structure.

    Parameters
    ----------
    matrix : np.ndarray
        The matrix to convert to a counts superdict. The rows are replicates,
        the columns are bin-bin pairs or FFLJs.
    rep_order : list of str
        The replicate names to use in the counts_superdict, in the order of
        the columns of ``matrix``.
    region_order : list of str
        The order in which the regions were concatenated down the rows of
        ``matrix``.
    pixelmap : pixelmap
        Needed to precompute the size of each region when preparing the
        counts_superdict.

    Returns
    -------
    counts_superdict
        The counts_superdict representation of the input matrix.

    Examples
    --------
    >>> import numpy as np
    >>> matrix = np.array([[1, 2, 3, 4, 8, 10],
    ...                    [3, 5, 6, 7, 9, np.nan]])
    >>> rep_order = ['Rep1', 'Rep2']
    >>> region_order = ['A', 'B']
    >>> pixelmap = {'A': [{}, {}], 'B': [{}, {}]}
    >>> counts_superdict = matrix_to_counts_superdict(
    ...     matrix, rep_order, region_order, pixelmap)
    >>> list(sorted(counts_superdict.keys()))
    ['Rep1', 'Rep2']
    >>> list(sorted(counts_superdict['Rep1'].keys()))
    ['A', 'B']
    >>> list(sorted(counts_superdict['Rep2'].keys()))
    ['A', 'B']
    >>> counts_superdict['Rep1']['A']
    array([[1., 2.],
           [2., 3.]])
    >>> counts_superdict['Rep1']['B']
    array([[ 4.,  8.],
           [ 8., 10.]])
    >>> counts_superdict['Rep2']['A']
    array([[3., 5.],
           [5., 6.]])
    >>> counts_superdict['Rep2']['B']
    array([[ 7.,  9.],
           [ 9., nan]])
    """
    # idiomatic pairing of each replicate name with its row of the matrix;
    # enumerate (rather than zip) preserves the original IndexError if
    # rep_order has more entries than matrix has rows
    return {rep: unflatten_counts_from_list(matrix[i, :], region_order,
                                            pixelmap)
            for i, rep in enumerate(rep_order)}