Source code for lib5c.algorithms.expected

"""
Module for computing expected models for 5C interaction data.
"""

import numpy as np
import powerlaw as pl
from statsmodels.nonparametric.smoothers_lowess import lowess

from lib5c.util.parallelization import parallelize_regions
from lib5c.util.bed import get_mid_to_mid_distance
from lib5c.util.mathematics import gmean
from lib5c.util.counts import flatten_regional_counts, propagate_nans
from lib5c.algorithms.donut_filters import donut_filt, lower_left_filt


@parallelize_regions
def make_poly_log_log_fragment_expected_matrix(obs_matrix, regional_primermap):
    """
    Build an expected matrix for a fragment-level observed counts matrix by
    fitting a simple power law (degree-1 polynomial in log-log space).

    Parameters
    ----------
    obs_matrix : np.ndarray
        The matrix of observed counts.
    regional_primermap : List[Dict[str, Any]]
        Primermap describing the loci in the region represented by
        ``obs_matrix``. Needed to compute the pairwise distances between the
        elements of the contact matrix.

    Returns
    -------
    np.ndarray
        The expected matrix.
    """
    pairwise_distances = make_distance_matrix(regional_primermap)
    one_d_model = poly_log_log_fragment(obs_matrix, pairwise_distances)
    return make_expected_matrix_from_dict(one_d_model, pairwise_distances)
@parallelize_regions
def make_poly_log_log_binned_expected_matrix(obs_matrix,
                                             exclude_near_diagonal=False):
    """
    Build an expected matrix for a bin-level observed counts matrix by
    fitting a simple power law (degree-1 polynomial in log-log space).

    Parameters
    ----------
    obs_matrix : np.ndarray
        The matrix of observed counts.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model.

    Returns
    -------
    np.ndarray
        The expected matrix.
    """
    one_d_model = poly_log_log_binned(
        obs_matrix, exclude_near_diagonal=exclude_near_diagonal)
    return make_expected_matrix_from_list(one_d_model)
@parallelize_regions
def make_powerlaw_binned_expected_matrix(obs_matrix,
                                         exclude_near_diagonal=False):
    """
    Build an expected matrix for a bin-level observed counts matrix by
    fitting a discrete power law distribution to its distance dependence.

    Parameters
    ----------
    obs_matrix : np.ndarray
        The matrix of observed counts.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model.

    Returns
    -------
    np.ndarray
        The expected matrix.
    """
    one_d_model = powerlaw_binned(
        obs_matrix, exclude_near_diagonal=exclude_near_diagonal)
    return make_expected_matrix_from_list(one_d_model)
def get_distance_expected(obs_matrix, regional_primermap=None, level='bin',
                          powerlaw=False, regression=False, degree=1,
                          lowess_smooth=False, lowess_frac=0.8,
                          log_transform='auto', exclude_near_diagonal=False):
    """
    Convenience function for computing a regional one-dimensional expected
    model from a matrix of observed counts, with properties that can be
    customized by kwargs.

    Parameters
    ----------
    obs_matrix : np.ndarray
        The matrix of observed counts to model.
    regional_primermap : Optional[List[Dict[str, Any]]]
        The primermap for this region. Required if ``obs_matrix`` is
        fragment-level.
    level : {'bin', 'fragment'}
        The level of ``obs_matrix``.
    powerlaw : bool
        Whether or not to fit a discrete power law distribution to the data.
    regression : bool
        Whether or not to use a polynomial regression model.
    degree : int
        The degree of the regression model to use.
    lowess_smooth : bool
        Whether or not to use lowess smoothing to compute the model.
    lowess_frac : float
        The lowess smoothing fraction parameter.
    log_transform : {'counts', 'both', 'none', 'auto'}
        What to transform into log space.

        * counts: log-transform only the counts but not the distances. This
          results in semi-log models, which don't work on fragment-level data
          yet.
        * both: log-transform both the counts and the distances, resulting in
          log-log models.
        * none: don't log anything.
        * auto: automatically pick a reasonable choice based on the other
          kwargs.
    exclude_near_diagonal : bool
        If regression or lowess_smooth are True, set this kwarg to True to
        ignore the first third of the distance scales when fitting the model.

    Returns
    -------
    Union[List[float], Dict[int, float]]
        The one-dimensional expected model. For bin-level data, this is a
        list of floats, where the ``i`` th element of the list corresponds to
        the expected value for interactions between loci separated by ``i``
        bins. For fragment-level data, this is a dict mapping interaction
        distances in units of base pairs to the appropriate expected values.
    """
    # make 1-D expected
    if level == 'bin':
        # bin level, don't need a distance matrix
        if powerlaw:
            print('using power law 1-D distance model')
            distance_expected = powerlaw_binned(
                obs_matrix, exclude_near_diagonal=exclude_near_diagonal)
        elif regression:
            print('using polynomial log-log 1-D distance model')
            distance_expected = poly_log_log_binned(
                obs_matrix, degree=degree,
                exclude_near_diagonal=exclude_near_diagonal)
        elif lowess_smooth:
            if log_transform == 'counts':
                print('using lowess log-counts 1-D distance model')
                distance_expected = lowess_binned_log_counts(
                    obs_matrix, frac=lowess_frac,
                    exclude_near_diagonal=exclude_near_diagonal)
            elif log_transform == 'none':
                print('using lowess unlogged 1-D distance model')
                distance_expected = lowess_binned(
                    obs_matrix, frac=lowess_frac,
                    exclude_near_diagonal=exclude_near_diagonal)
            else:  # catches 'both' and 'auto'
                print('using lowess log-log 1-D distance model')
                distance_expected = lowess_log_log_binned(
                    obs_matrix, frac=lowess_frac,
                    exclude_near_diagonal=exclude_near_diagonal)
        else:
            if log_transform == 'none':
                print('using unlogged empirical binned 1-D distance model')
                distance_expected = empirical_binned(
                    obs_matrix, log_transform=False)
            else:  # catches 'both' and 'auto'; 'counts' is illegal
                print('defaulting to logged empirical binned 1-D distance '
                      'model')
                distance_expected = empirical_binned(
                    obs_matrix, log_transform=True)
    else:
        # fragment level, need a distance matrix
        distance_matrix = make_distance_matrix(regional_primermap)
        # sanity-check the dtype of the whole matrix; the previous check
        # passed a scalar element to np.issubdtype(), which is not a valid
        # dtype-like argument on modern numpy
        assert np.issubdtype(distance_matrix.dtype, np.integer)
        if lowess_smooth:
            print('using lowess log-log 1-D distance model')
            distance_expected = lowess_log_log_fragment(
                obs_matrix, distance_matrix)
        else:
            print('defaulting to polynomial log-log 1-D distance model')
            distance_expected = poly_log_log_fragment(
                obs_matrix, distance_matrix, degree=degree)
    return distance_expected
def get_global_distance_expected(counts, primermap=None, level='bin',
                                 powerlaw=False, regression=False, degree=1,
                                 lowess_smooth=False, lowess_frac=0.8,
                                 log_transform='auto',
                                 exclude_near_diagonal=False):
    """
    Convenience function for computing a global one-dimensional expected model
    from a dict of observed counts, with properties that can be customized by
    kwargs.

    Parameters
    ----------
    counts : Dict[str, np.ndarray]
        The dict of observed counts to model.
    primermap : Optional[Dict[str, List[Dict[str, Any]]]]
        A primermap corresponding to ``counts``.
    level : {'bin', 'fragment'}
        The level of ``counts``.
    powerlaw : bool
        Whether or not to fit a discrete power law distribution to the data.
    regression : bool
        Whether or not to use a polynomial regression model.
    degree : int
        The degree of the regression model to use.
    lowess_smooth : bool
        Whether or not to use lowess smoothing to compute the model.
    lowess_frac : float
        The lowess smoothing fraction parameter.
    log_transform : {'counts', 'both', 'none', 'auto'}
        What to transform into log space.

        * counts: log-transform only the counts but not the distances. This
          results in semi-log models, which don't work on fragment-level data
          yet.
        * both: log-transform both the counts and the distances, resulting in
          log-log models.
        * none: don't log anything.
        * auto: automatically pick a reasonable choice based on the other
          kwargs.
    exclude_near_diagonal : bool
        If regression or lowess_smooth are True, set this kwarg to True to
        ignore the first third of the distance scales when fitting the model.

    Returns
    -------
    Union[List[float], Dict[int, float]]
        The one-dimensional expected model. For bin-level data, this is a
        list of floats, where the ``i`` th element of the list corresponds to
        the expected value for interactions between loci separated by ``i``
        bins. For fragment-level data, this is a dict mapping interaction
        distances in units of base pairs to the appropriate expected values.
    """
    # make 1-D expected; the kwargs select which global_* model function to
    # dispatch to, mirroring the regional version in get_distance_expected()
    if level == 'bin':
        # bin level, don't need a distance matrix
        if powerlaw:
            print('using power law 1-D distance model')
            distance_expected = global_powerlaw_binned(
                counts, exclude_near_diagonal=exclude_near_diagonal)
        elif regression:
            print('using polynomial log-log 1-D distance model')
            distance_expected = global_poly_log_log_binned(
                counts, degree=degree,
                exclude_near_diagonal=exclude_near_diagonal)
        elif lowess_smooth:
            # the log_transform kwarg picks the lowess flavor
            if log_transform == 'counts':
                print('using lowess log-counts 1-D distance model')
                distance_expected = global_lowess_binned_log_counts(
                    counts, frac=lowess_frac,
                    exclude_near_diagonal=exclude_near_diagonal)
            elif log_transform == 'none':
                print('using lowess unlogged 1-D distance model')
                distance_expected = global_lowess_binned(
                    counts, frac=lowess_frac,
                    exclude_near_diagonal=exclude_near_diagonal)
            else:
                # catches 'both' and 'auto'
                print('using lowess log-log 1-D distance model')
                distance_expected = global_lowess_log_log_binned(
                    counts, frac=lowess_frac,
                    exclude_near_diagonal=exclude_near_diagonal)
        else:
            # no model kwargs set: fall back to the empirical model
            if log_transform == 'none':
                print('using unlogged empirical binned 1-D distance model')
                distance_expected = global_empirical_binned(
                    counts, log_transform=False)
            else:
                print('defaulting to logged empirical binned 1-D distance '
                      'model')
                distance_expected = global_empirical_binned(
                    counts, log_transform=True)
    else:
        # fragment level, need a distance matrix
        distance_matrix = make_distance_matrix(primermap)
        if lowess_smooth:
            print('using lowess log-log 1-D distance model')
            distance_expected = global_lowess_log_log_fragment(
                counts, distance_matrix)
        else:
            print('defaulting to polynomial log-log 1-D distance model')
            distance_expected = global_poly_log_log_fragment(
                counts, distance_matrix, degree=degree)
    return distance_expected
@parallelize_regions
def make_expected_matrix(obs_matrix, regional_primermap=None, level='bin',
                         powerlaw=False, regression=False, degree=1,
                         lowess_smooth=False, lowess_frac=0.8,
                         log_transform='auto', monotonic=False, donut=False,
                         w=15, p=5, donut_frac=0.2, min_exp=0.1,
                         log_donut=False, max_donut_ll=False,
                         distance_expected=None, exclude_near_diagonal=False):
    """
    Convenience function for computing a complete expected matrix given a
    matrix of observed counts that can be customized with a variety of kwargs.

    Parameters
    ----------
    obs_matrix : np.ndarray
        The matrix of observed counts to make an expected matrix for.
    regional_primermap : Optional[List[Dict[str, Any]]]
        The primermap for this region. Required if ``obs_matrix`` is
        fragment-level.
    level : {'bin', 'fragment'}
        The level of ``obs_matrix``.
    powerlaw : bool
        Whether or not to fit a discrete power law distribution to the data.
    regression : bool
        Whether or not to use a polynomial regression model.
    degree : int
        The degree of the regression model to use.
    lowess_smooth : bool
        Whether or not to use lowess smoothing to compute the model.
    lowess_frac : float
        The lowess smoothing fraction parameter.
    log_transform : {'counts', 'both', 'none', 'auto'}
        What to transform into log space.

        * counts: log-transform only the counts but not the distances. This
          results in semi-log models, which don't work on fragment-level data
          yet.
        * both: log-transform both the counts and the distances, resulting in
          log-log models.
        * none: don't log anything.
        * auto: automatically pick a reasonable choice based on the other
          kwargs.
    monotonic : bool
        Pass True to force the one-dimensional expected model to be monotonic.
    donut : bool
        Pass True to apply donut-filter local correction to the expected
        model. Not implemented for fragment-level input data.
    w : int
        The outer width of the donut when using donut correction. Should be an
        odd integer.
    p : int
        The inner width of the donut when using donut correction. Should be an
        odd integer.
    donut_frac : float
        If the fraction of possible elements in the donut that lie within the
        region and have non-infinite values is lower than this fraction then
        the donut-corrected value at that point will be NaN.
    min_exp : float
        If the sum of the 1-D expected matrix under the donut or lower left
        footprint for a particular pixel is less than this value, set the
        output at this pixel to nan to avoid numerical instability related to
        division by small numbers.
    log_donut : bool
        Pass True to perform donut correction in log-counts space.
    max_donut_ll : bool
        If ``donut`` is True, pass True here too to make the donut correction
        use the maximum of the "donut" and "lower-left" regions.
    distance_expected : Optional[Union[List[float], Dict[int, float]]]
        Pass a one-dimensional expected model to use it instead of computing a
        new one from scratch according to the other kwargs.
    exclude_near_diagonal : bool
        If regression or lowess_smooth are True, set this kwarg to True to
        ignore the first third of the distance scales when fitting the model.

    Returns
    -------
    Tuple[np.ndarray, Union[List[float], Dict[int, float]], Optional[
    np.ndarray]]
        The first element of the tuple is the expected matrix. The second
        element of the tuple is the one-dimensional expected model, which will
        be a list of expected values if ``level`` was 'bin' or a dict mapping
        integer distances to expected values if ``level`` was 'fragment'. The
        third element will be the pairwise distance matrix if ``level`` was
        'fragment', but will simply be None if ``level`` was 'bin'.
    """
    # get distance expected if it wasn't passed
    if distance_expected is None:
        distance_expected = get_distance_expected(
            obs_matrix, regional_primermap=regional_primermap, level=level,
            powerlaw=powerlaw, regression=regression, degree=degree,
            lowess_smooth=lowess_smooth, lowess_frac=lowess_frac,
            log_transform=log_transform,
            exclude_near_diagonal=exclude_near_diagonal)

    # make a distance matrix if we are at the fragment level
    if level == 'fragment':
        distance_matrix = make_distance_matrix(regional_primermap)
        # check the matrix dtype; the previous check passed a scalar element
        # to np.issubdtype(), which is not a valid dtype-like argument on
        # modern numpy
        assert np.issubdtype(distance_matrix.dtype, np.integer)
    else:
        distance_matrix = None

    # force monotonic
    if monotonic:
        print('forcing monotonicity')
        distance_expected = force_monotonic(distance_expected)

    # make expected matrix
    if level == 'bin':
        expected_matrix = make_expected_matrix_from_list(distance_expected)
    else:
        expected_matrix = make_expected_matrix_from_dict(
            distance_expected, distance_matrix)
    # a global 1-D model can be longer than this region; trim to fit
    expected_matrix = expected_matrix[:len(obs_matrix), :len(obs_matrix)]

    # donut filter
    if donut:
        print('applying donut correction')
        # propagate nan's to expected
        obs_matrix, expected_matrix = propagate_nans(
            obs_matrix, expected_matrix)
        # log transform if requested
        if log_donut:
            obs_matrix = np.log(obs_matrix + 1)
            expected_matrix = np.log(expected_matrix + 1)
        # donut correct
        if not max_donut_ll:
            expected_matrix = donut_filt(
                obs_matrix, expected_matrix, p, w, max_percent=donut_frac,
                min_exp=min_exp)
        else:
            # compute two matrices
            donut_matrix = donut_filt(obs_matrix, expected_matrix, p, w,
                                      max_percent=donut_frac,
                                      min_exp=min_exp)
            ll_matrix = lower_left_filt(obs_matrix, expected_matrix, p, w,
                                        max_percent=donut_frac,
                                        min_exp=min_exp)
            # max operation; fmax ignores nans in one of the two inputs
            expected_matrix = np.fmax(donut_matrix, ll_matrix)
        # undo log transform
        if log_donut:
            expected_matrix = np.exp(expected_matrix) - 1

    return expected_matrix, distance_expected, distance_matrix
def make_expected_matrix_from_list(distance_expected):
    """
    Converts a bin-level one-dimensional expected model into an expected
    matrix.

    Parameters
    ----------
    distance_expected : List[float]
        The one-dimensional distance expected model to make a matrix out of.
        The ``i`` th element is the expected value for loci separated by ``i``
        bins.

    Returns
    -------
    np.ndarray
        The expected matrix, with ``[i, j]`` equal to
        ``distance_expected[abs(i - j)]``.
    """
    # vectorized replacement for the previous O(n^2) Python double loop:
    # |i - j| is the bin separation for each matrix element, so fancy-indexing
    # the 1-D model with that separation matrix fills the whole matrix at once
    expected = np.asarray(distance_expected, dtype=float)
    indices = np.arange(len(expected))
    separation = np.abs(indices[:, None] - indices[None, :])
    return expected[separation]
def make_expected_matrix_from_dict(distance_expected, distance_matrix):
    """
    Converts a fragment-level one-dimensional expected model into an expected
    matrix.

    Parameters
    ----------
    distance_expected : Dict[int, float]
        A mapping from interaction distances in units of base pairs to the
        expected value at that distance.
    distance_matrix : np.ndarray
        The pairwise distance matrix for the fragments in this region.

    Returns
    -------
    np.ndarray
        The expected matrix.
    """
    result = np.zeros_like(distance_matrix, dtype=float)
    # walk the lower triangle (diagonal included) and mirror each looked-up
    # value across the diagonal to keep the matrix symmetric
    for row in range(len(result)):
        for col in range(row + 1):
            value = distance_expected[distance_matrix[row, col]]
            result[row, col] = value
            result[col, row] = value
    return result
def make_expected_dict_from_matrix(expected_matrix, distance_matrix):
    """
    Convert an expected matrix into a dict representation of the
    one-dimensional expected model it embodies.

    Parameters
    ----------
    expected_matrix : np.ndarray
        The expected matrix.
    distance_matrix : np.ndarray
        The pairwise distance matrix for the fragments in this region.

    Returns
    -------
    Dict[int, float]
        A mapping from interaction distances in units of base pairs to the
        expected value at that distance. When several finite elements share a
        distance, the value from the last one visited (lower triangle, row
        then column order) wins.
    """
    # comprehension over the lower triangle preserves the original visiting
    # order, so duplicate distances resolve to the same winner as before
    return {
        distance_matrix[row, col]: expected_matrix[row, col]
        for row in range(len(expected_matrix))
        for col in range(row + 1)
        if np.isfinite(expected_matrix[row, col])
    }
@parallelize_regions
def make_distance_matrix(regional_primermap):
    """
    Construct a pairwise distance matrix for the fragments in a region from
    the primermap describing those fragments.

    Parameters
    ----------
    regional_primermap : List[Dict[str, Any]]
        The primermap for this region.

    Returns
    -------
    np.ndarray
        The pairwise distance matrix for all fragments in this region in
        units of base pairs.
    """
    num_fragments = len(regional_primermap)
    # element [row, col] holds the mid-to-mid distance between fragment col
    # and fragment row, matching the original argument order
    rows = [[get_mid_to_mid_distance(regional_primermap[col],
                                     regional_primermap[row])
             for col in range(num_fragments)]
            for row in range(num_fragments)]
    return np.array(rows, dtype=int)
[docs]def poly_log_log_fragment(regional_counts, distances, degree=1, pseudocount=1): """ Make a regional one-dimensional fragment-level expected model by fitting a polynomial in log-log space. Parameters ---------- regional_counts : np.ndarray The observed counts matrix for this region. distances : np.ndarray The pairwise distance matrix for all fragments in this region in units of base pairs. degree : int The degree of the polynomial to fit. pseudocount : int The pseudocount to add to the counts before logging. Returns ------- Dict[int, float] A mapping from interaction distances in units of base pairs to the expected value at that distance. """ # log transform log_regional_counts = np.log(regional_counts + pseudocount) log_distances = np.log(distances + pseudocount) # make data of the form [log_distance, log_count], ignoring nans data = np.asarray([[log_distances[i, j], log_regional_counts[i, j]] for i in range(len(log_regional_counts)) for j in range(i + 1) if np.isfinite(log_regional_counts[i, j])]) # do the linear fit fit = np.poly1d(np.polyfit(data[:, 0], data[:, 1], degree)) # repackage distance_expected = { dist: np.exp(fit(np.log(dist + pseudocount))) - pseudocount for dist in np.unique(distances.flatten()) } return distance_expected
[docs]def global_poly_log_log_fragment(counts, distances, degree=1, pseudocount=1): """ Make a global one-dimensional fragment-level expected model by fitting a polynomial in log-log space. Parameters ---------- counts : Dict[str, np.ndarray] The observed counts dict to fit the model to. distances : Dict[str, np.ndarray] A dict of pairwise distance matrices describing the genomic distances between the elements of the matrices in ``counts``. The keys and array dimensions should match the keys and array dimensions of ``counts``. degree : int The degree of the polynomial to fit. pseudocount : int The pseudocount to add to the counts before logging. Returns ------- Dict[int, float] A mapping from interaction distances in units of base pairs to the expected value at that distance. """ # log transform log_counts = {region: np.log(counts[region] + pseudocount) for region in counts.keys()} log_distances = {region: np.log(distances[region] + pseudocount) for region in distances.keys()} # make data of the form [log_distance, log_count], ignoring nans data = np.asarray([[log_distances[region][i, j], log_counts[region][i, j]] for region in log_counts.keys() for i in range(len(log_counts[region])) for j in range(i + 1) if np.isfinite(log_counts[region][i, j])]) # do the linear fit fit = np.poly1d(np.polyfit(data[:, 0], data[:, 1], degree)) # repackage distance_expected = { dist: np.exp(fit(np.log(dist + pseudocount))) - pseudocount for dist in np.unique(np.concatenate([distances[region].flatten() for region in distances.keys()])) } return distance_expected
def poly_log_log_binned(regional_counts, degree=1, pseudocount=1,
                        exclude_near_diagonal=False):
    """
    Make a regional one-dimensional bin-level expected model by fitting a
    polynomial in log-log space.

    Parameters
    ----------
    regional_counts : np.ndarray
        The observed counts matrix for this region.
    degree : int
        The degree of the polynomial to fit.
    pseudocount : int
        The pseudocount to add to the counts before logging.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; those distances fall back to the empirical model.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins.
    """
    # log transform
    log_regional_counts = np.log(regional_counts + pseudocount)
    # establish total size
    size = len(regional_counts)
    # make data of the form [log_distance, log_count], ignoring nans
    data = np.asarray([[np.log(i - j + pseudocount),
                        log_regional_counts[i, j]]
                       for i in range(len(log_regional_counts))
                       for j in range(i + 1)
                       if np.isfinite(log_regional_counts[i, j])])
    # filter out short distances if requested
    # np.int was removed in numpy 1.24; plain floor division is equivalent
    # here since size is non-negative
    key_index = size // 3 if exclude_near_diagonal else 0
    key_log_distance = np.log(key_index + pseudocount)
    filtered_data = np.asarray([x for x in data if x[0] >= key_log_distance])
    # do the linear fit
    fit = np.poly1d(
        np.polyfit(filtered_data[:, 0], filtered_data[:, 1], degree))
    # get empirical expected for distances excluded from the fit
    empirical = empirical_binned(regional_counts, log_transform=True)
    # repackage, undoing the log transform on fitted values
    distance_expected = [empirical[i] if i < key_index
                         else np.exp(fit(np.log(i + pseudocount))) -
                         pseudocount
                         for i in range(size)]
    return distance_expected
def global_poly_log_log_binned(counts, degree=1, pseudocount=1,
                               exclude_near_diagonal=False):
    """
    Make a global one-dimensional bin-level expected model by fitting a
    polynomial in log-log space.

    Parameters
    ----------
    counts : Dict[str, np.ndarray]
        The observed counts dict to fit the model to.
    degree : int
        The degree of the polynomial to fit.
    pseudocount : int
        The pseudocount to add to the counts before logging.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; those distances fall back to the empirical model.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins. The length of this list will match the size
        of the largest region in the input counts dict.
    """
    # log transform
    log_counts = {region: np.log(counts[region] + pseudocount)
                  for region in counts.keys()}
    # make data of the form [log_distance, log_count], ignoring nans
    data = np.asarray([[np.log(i - j + pseudocount), log_counts[region][i, j]]
                       for region in log_counts.keys()
                       for i in range(len(log_counts[region]))
                       for j in range(i + 1)
                       if np.isfinite(log_counts[region][i, j])])
    # establish total size
    size = max([len(counts[region]) for region in counts.keys()])
    # filter out short distances if requested
    # np.int was removed in numpy 1.24; plain floor division is equivalent
    # here since size is non-negative
    key_index = size // 3 if exclude_near_diagonal else 0
    key_log_distance = np.log(key_index + pseudocount)
    filtered_data = np.asarray([x for x in data if x[0] >= key_log_distance])
    # do the linear fit
    fit = np.poly1d(
        np.polyfit(filtered_data[:, 0], filtered_data[:, 1], degree))
    # get empirical expected for distances excluded from the fit
    empirical = global_empirical_binned(counts, log_transform=True)
    # repackage, undoing the log transform on fitted values
    distance_expected = [empirical[i] if i < key_index
                         else np.exp(fit(np.log(i + pseudocount))) -
                         pseudocount
                         for i in range(size)]
    return distance_expected
def powerlaw_binned(regional_counts, exclude_near_diagonal=False):
    """
    Make a regional one-dimensional bin-level expected model by fitting a
    discrete power law distribution to the empirical distance dependence.

    Parameters
    ----------
    regional_counts : np.ndarray
        The observed counts matrix for this region.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; those distances keep their empirical values.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins.
    """
    # empirical 1-D model, floored to ints so it can play the role of an
    # un-normalized pdf over distances
    emp = np.array(empirical_binned(regional_counts, log_transform=True))
    pdf_weights = np.floor(emp).astype(int)
    size = len(regional_counts)
    start = int(size / 3) if exclude_near_diagonal else 1
    # sample each distance once per unit of weight; clipping at zero matches
    # the original list-multiplication, which produced nothing for
    # non-positive weights
    dists = np.arange(start, size)
    samples = np.repeat(dists, np.clip(pdf_weights[start:], 0, None))
    # fit the discrete power law over the modelled distance range
    model = pl.Fit(samples, xmin=start, xmax=size - 1, discrete=True)
    # the scale factor is how many times we would draw from the pdf to
    # recover something on the scale of the original empirical data
    scale = pdf_weights[start:].sum()
    tail = model.power_law.pdf(dists) * scale
    # stitch the empirical head onto the fitted tail
    return np.concatenate([emp[:start], tail])
def global_powerlaw_binned(counts, exclude_near_diagonal=False):
    """
    Make a global one-dimensional bin-level expected model by fitting a
    discrete power law distribution to the empirical distance dependence.

    Parameters
    ----------
    counts : Dict[str, np.ndarray]
        The observed counts dict to fit the model to.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; those distances keep their empirical values.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins. The length of this list will match the size
        of the largest region in the input counts dict.
    """
    # empirical 1-D model, floored to ints so it can play the role of an
    # un-normalized pdf over distances
    emp = np.array(global_empirical_binned(counts, log_transform=True))
    pdf_weights = np.floor(emp).astype(int)
    # the model spans the largest region
    size = max([len(counts[region]) for region in counts.keys()])
    start = int(size / 3) if exclude_near_diagonal else 1
    # sample each distance once per unit of weight; clipping at zero matches
    # the original list-multiplication, which produced nothing for
    # non-positive weights
    dists = np.arange(start, size)
    samples = np.repeat(dists, np.clip(pdf_weights[start:], 0, None))
    # fit the discrete power law over the modelled distance range
    model = pl.Fit(samples, xmin=start, xmax=size - 1, discrete=True)
    # the scale factor is how many times we would draw from the pdf to
    # recover something on the scale of the original empirical data
    scale = pdf_weights[start:].sum()
    tail = model.power_law.pdf(dists) * scale
    # stitch the empirical head onto the fitted tail
    return np.concatenate([emp[:start], tail])
def lowess_log_log_fragment(regional_counts, distances, pseudocount=1,
                            frac=0.8):
    """
    Make a regional one-dimensional fragment-level expected model by
    performing lowess regression in log-log space.

    Parameters
    ----------
    regional_counts : np.ndarray
        The observed counts matrix for this region.
    distances : np.ndarray
        The pairwise distance matrix for all fragments in this region in
        units of base pairs.
    pseudocount : int
        The pseudocount to add to the counts before logging.
    frac : float
        The lowess smoothing fraction parameter to use.

    Returns
    -------
    Dict[int, float]
        A mapping from interaction distances in units of base pairs to the
        expected value at that distance. Distances the smoother never
        produced map to nan.
    """
    log_counts = np.log(regional_counts + pseudocount)
    log_dists = np.log(distances + pseudocount)
    # assemble (log distance, log count) pairs from the lower triangle,
    # skipping entries whose logged count is not finite
    points = np.asarray([
        [log_dists[row, col], log_counts[row, col]]
        for row in range(len(log_counts))
        for col in range(row + 1)
        if np.isfinite(log_counts[row, col])
    ])
    # NOTE(review): delta is derived from the y-range here, but lowess's
    # delta parameter is in x units -- confirm this is intentional
    smoothed = lowess(points[:, 1], points[:, 0], frac=frac, it=3,
                      delta=0.01 * np.ptp(points[:, 1]))
    # undo the log transform; distances round back to integer bp
    dist_bp = np.rint(np.exp(smoothed[:, 0]) - pseudocount).astype(int)
    exp_counts = np.exp(smoothed[:, 1]) - pseudocount
    expected = dict(zip(dist_bp, exp_counts))
    # any observed distance the smoother didn't cover gets a nan placeholder
    for dist in np.unique(distances.flatten()):
        if dist not in expected:
            expected[dist] = np.nan
    return expected
def global_lowess_log_log_fragment(counts, distances, pseudocount=1,
                                   frac=0.8):
    """
    Make a global one-dimensional fragment-level expected model by performing
    lowess regression in log-log space.

    Parameters
    ----------
    counts : Dict[str, np.ndarray]
        The observed counts dict to fit the model to.
    distances : Dict[str, np.ndarray]
        A dict of pairwise distance matrices describing the genomic distances
        between the elements of the matrices in ``counts``. The keys and
        array dimensions should match the keys and array dimensions of
        ``counts``.
    pseudocount : int
        The pseudocount to add to the counts before logging.
    frac : float
        The lowess smoothing fraction parameter to use.

    Returns
    -------
    Dict[int, float]
        A mapping from interaction distances in units of base pairs to the
        expected value at that distance. Distances the smoother never
        produced map to nan.
    """
    log_counts = {r: np.log(counts[r] + pseudocount) for r in counts.keys()}
    log_dists = {r: np.log(distances[r] + pseudocount)
                 for r in distances.keys()}
    # pool (log distance, log count) pairs across all regions, lower triangle
    # only, skipping entries whose logged count is not finite
    points = np.asarray([
        [log_dists[r][row, col], log_counts[r][row, col]]
        for r in log_counts.keys()
        for row in range(len(log_counts[r]))
        for col in range(row + 1)
        if np.isfinite(log_counts[r][row, col])
    ])
    # NOTE(review): delta is derived from the y-range here, but lowess's
    # delta parameter is in x units -- confirm this is intentional
    smoothed = lowess(points[:, 1], points[:, 0], frac=frac, it=3,
                      delta=0.01 * np.ptp(points[:, 1]))
    # undo the log transform; distances round back to integer bp
    dist_bp = np.rint(np.exp(smoothed[:, 0]) - pseudocount).astype(int)
    exp_counts = np.exp(smoothed[:, 1]) - pseudocount
    expected = dict(zip(dist_bp, exp_counts))
    # any observed distance the smoother didn't cover gets a nan placeholder
    all_dists = np.unique(np.concatenate(
        [distances[r].flatten() for r in distances.keys()]))
    for dist in all_dists:
        if dist not in expected:
            expected[dist] = np.nan
    return expected
def lowess_log_log_binned(regional_counts, pseudocount=1, frac=0.8,
                          exclude_near_diagonal=False):
    """
    Make a regional one-dimensional bin-level expected model by performing
    lowess regression in log-log space.

    Parameters
    ----------
    regional_counts : np.ndarray
        The observed counts matrix for this region.
    pseudocount : int
        The pseudocount to add to the counts before logging.
    frac : float
        The lowess smoothing fraction parameter to use.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; those distances fall back to the empirical model.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins.
    """
    # log transform
    log_regional_counts = np.log(regional_counts + pseudocount)
    # make data of the form [distance, count], ignoring nans
    data = np.asarray([[np.log(i - j + pseudocount),
                        log_regional_counts[i, j]]
                       for i in range(len(log_regional_counts))
                       for j in range(i + 1)
                       if np.isfinite(log_regional_counts[i, j])])
    # establish total size
    size = len(regional_counts)
    # filter out short distances if requested
    # np.int was removed in numpy 1.24; plain floor division is equivalent
    # here since size is non-negative
    key_index = size // 3 if exclude_near_diagonal else 0
    key_log_distance = np.log(key_index + pseudocount)
    filtered_data = np.asarray([x for x in data if x[0] >= key_log_distance])
    # do the lowess fit
    # NOTE(review): delta is derived from the y-range here, but lowess's
    # delta parameter is in x units -- confirm this is intentional
    fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
                 delta=0.01 * np.ptp(filtered_data[:, 1]))
    # unlog
    fit_dists = np.rint(np.exp(fit[:, 0]) - pseudocount).astype(int)
    fit_counts = np.exp(fit[:, 1]) - pseudocount
    # get empirical expected for distances excluded from the fit
    empirical = empirical_binned(regional_counts, log_transform=True)
    # repackage; distances the smoother didn't produce become nan
    distance_expected = {fit_dists[i]: fit_counts[i] for i in range(len(fit))}
    distance_expected = [empirical[i] if i < key_index
                         else distance_expected[i]
                         if i in distance_expected else np.nan
                         for i in range(size)]
    return distance_expected
def global_lowess_log_log_binned(counts, pseudocount=1, frac=0.8,
                                 exclude_near_diagonal=False):
    """
    Make a global one-dimensional bin-level expected model by performing
    lowess regression in log-log space.

    Parameters
    ----------
    counts : Dict[str, np.ndarray]
        The observed counts dict to fit the model to.
    pseudocount : int
        The pseudocount to add to the counts before logging.
    frac : float
        The lowess smoothing fraction parameter to use.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; the empirical expected is used there instead.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins. The length of this list will match the size
        of the largest region in the input counts dict.
    """
    # log transform
    log_counts = {region: np.log(counts[region] + pseudocount)
                  for region in counts.keys()}

    # make data of the form [distance, count], ignoring nans
    data = np.asarray([[np.log(i - j + pseudocount),
                        log_counts[region][i, j]]
                       for region in log_counts.keys()
                       for i in range(len(log_counts[region]))
                       for j in range(i + 1)
                       if np.isfinite(log_counts[region][i, j])])

    # establish total size (largest region)
    size = max([len(counts[region]) for region in counts.keys()])

    # filter out the near-diagonal distances if requested
    # note: np.int was removed from NumPy 1.24; use floor division instead
    key_index = size // 3 if exclude_near_diagonal else 0
    key_log_distance = np.log(key_index + pseudocount)
    filtered_data = np.asarray([x for x in data if x[0] >= key_log_distance])

    # do the lowess fit
    fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
                 delta=0.01 * np.ptp(filtered_data[:, 1]))

    # unlog
    fit_dists = np.rint(np.exp(fit[:, 0]) - pseudocount).astype(int)
    fit_counts = np.exp(fit[:, 1]) - pseudocount

    # get empirical expected to fill in the excluded near-diagonal distances
    empirical = global_empirical_binned(counts, log_transform=True)

    # repackage: empirical below the cutoff, fitted value above it, nan when
    # the fit produced no value for that distance
    distance_expected = {fit_dists[i]: fit_counts[i] for i in range(len(fit))}
    distance_expected = [empirical[i] if i < key_index
                         else distance_expected[i] if i in distance_expected
                         else np.nan
                         for i in range(size)]
    return distance_expected
def force_monotonic(distance_expected):
    """
    Force a one-dimensional distance expected to be monotonic
    (non-increasing with distance).

    Parameters
    ----------
    distance_expected : Union[List[float], Dict[int, float], np.ndarray]
        The one-dimensional expected model to force to monotonicity. If the
        model describes bin-level data, this should be a list of floats, where
        the ``i`` th element of the list corresponds to the expected value for
        interactions between loci separated by ``i`` bins. If the model
        describes fragment-level data, this should be a dict mapping
        interaction distances in units of base pairs to the expected value at
        that distance.

    Returns
    -------
    Union[List[float], Dict[int, float], np.ndarray]
        The forced-monotonic version of the input one-dimensional expected
        model.

    Raises
    ------
    NotImplementedError
        If ``distance_expected`` is not a list, dict, or ndarray.
    """
    # use isinstance rather than type() == so that subclasses are accepted;
    # check ndarray first since arrays are neither lists nor dicts
    if isinstance(distance_expected, np.ndarray):
        return np.minimum.accumulate(distance_expected)
    elif isinstance(distance_expected, list):
        # running minimum over increasing bin distance
        monotonic_expected = []
        min_value = np.inf
        for value in distance_expected:
            min_value = min(min_value, value)
            monotonic_expected.append(min_value)
        return monotonic_expected
    elif isinstance(distance_expected, dict):
        # running minimum over distances visited in sorted order
        monotonic_expected = {}
        min_value = np.inf
        for dist in sorted(distance_expected.keys()):
            min_value = min(min_value, distance_expected[dist])
            monotonic_expected[dist] = min_value
        return monotonic_expected
    else:
        raise NotImplementedError('cannot force monotonicity of a %s'
                                  % type(distance_expected))
def empirical_binned(regional_counts, log_transform=True):
    """
    Make a regional one-dimensional bin-level expected model by taking an
    average of the interaction values at each distance.

    Parameters
    ----------
    regional_counts : np.ndarray
        The observed counts matrix for this region.
    log_transform : bool
        Pass True to take the geometric mean instead of the arithmetic mean,
        which is equivalent to averaging log-transformed counts.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins.
    """
    # geometric mean when averaging in log space, nan-aware arithmetic mean
    # otherwise
    average = gmean if log_transform else np.nanmean

    # one value per offdiagonal; an offdiagonal with no finite entries maps
    # to nan
    expected = []
    for k in range(len(regional_counts)):
        offdiagonal = np.diag(regional_counts, k=k)
        if np.any(np.isfinite(offdiagonal)):
            expected.append(average(offdiagonal))
        else:
            expected.append(np.nan)
    return expected
def global_empirical_binned(counts, log_transform=True):
    """
    Make a global one-dimensional bin-level expected model by taking an
    average of the interaction values at each distance.

    Parameters
    ----------
    counts : Dict[str, np.ndarray]
        The observed counts dict to fit the model to.
    log_transform : bool
        Pass True to take the geometric mean instead of the arithmetic mean,
        which is equivalent to averaging log-transformed counts.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins. The length of this list will match the size
        of the largest region in the input counts dict.
    """
    # model length is set by the largest region
    size = max(len(counts[region]) for region in counts.keys())

    # geometric mean when averaging in log space, nan-aware arithmetic mean
    # otherwise
    average = gmean if log_transform else np.nanmean

    # pool the k-th offdiagonal across all regions, then average; a pooled
    # offdiagonal with no finite entries maps to nan
    expected = []
    for k in range(size):
        pooled = np.concatenate([np.diag(counts[region], k=k)
                                 for region in counts.keys()])
        if np.any(np.isfinite(pooled)):
            expected.append(average(pooled))
        else:
            expected.append(np.nan)
    return expected
def lowess_binned_log_counts(regional_counts, pseudocount=1, frac=0.8,
                             exclude_near_diagonal=False):
    """
    Make a regional one-dimensional bin-level expected model by performing
    lowess regression in log-counts space, excluding the first third of the
    region and only using the empirical geometric means there instead.

    Parameters
    ----------
    regional_counts : np.ndarray
        The observed counts matrix for this region.
    pseudocount : int
        The pseudocount to add to the counts before logging.
    frac : float
        The lowess smoothing fraction parameter to use.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; the empirical expected is used there instead.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins.
    """
    # log counts
    regional_counts = np.log(regional_counts + pseudocount)

    # establish size
    size = len(regional_counts)

    # make offdiagonals
    offdiagonals = [np.diag(regional_counts, k=i) for i in range(size)]

    # get empirical expected (counts are already logged, so use the
    # arithmetic mean here)
    empirical = empirical_binned(regional_counts, log_transform=False)

    # make data of the form [distance, count], ignoring nans
    data = np.asarray([[dist, count]
                       for dist in range(size)
                       for count in offdiagonals[dist]
                       if np.isfinite(count)])

    # don't try to fit the first third of the region
    # note: np.int was removed from NumPy 1.24; use floor division instead
    key_index = size // 3 if exclude_near_diagonal else 0
    filtered_data = np.asarray([x for x in data if x[0] >= key_index])

    # do the lowess fit
    fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
                 delta=0.01 * np.ptp(filtered_data[:, 1]))

    # filter the fit to just the region above the join point, then build a
    # first-wins lookup table (replaces an O(n^2) linear scan per distance)
    filtered_fit = np.asarray([x for x in fit if x[0] >= key_index])
    fit_lookup = {}
    for x in filtered_fit:
        if x[0] not in fit_lookup:
            fit_lookup[x[0]] = x[1]

    # construct an array that will represent the joined fit: empirical below
    # the join point, fitted values (falling back to empirical) above it
    joined_fit = np.zeros(size)
    for i in range(key_index):
        joined_fit[i] = empirical[i]
    for i in range(key_index, size):
        joined_fit[i] = fit_lookup.get(i, empirical[i])

    # unlog
    joined_fit = [np.exp(x) - pseudocount for x in joined_fit]
    return joined_fit
def lowess_binned(regional_counts, frac=0.8, exclude_near_diagonal=False):
    """
    Make a regional one-dimensional bin-level expected model by performing
    lowess regression in unlogged space, excluding the first third of the
    region and only using the empirical arithmetic means there instead.

    Parameters
    ----------
    regional_counts : np.ndarray
        The observed counts matrix for this region.
    frac : float
        The lowess smoothing fraction parameter to use.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; the empirical expected is used there instead.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins.
    """
    # establish size
    size = len(regional_counts)

    # make offdiagonals
    offdiagonals = [np.diag(regional_counts, k=i) for i in range(size)]

    # get empirical expected
    empirical = empirical_binned(regional_counts, log_transform=False)

    # make data of the form [distance, count], ignoring nans
    data = np.asarray([[dist, count]
                       for dist in range(size)
                       for count in offdiagonals[dist]
                       if np.isfinite(count)])

    # don't try to fit the first third of the region
    # note: np.int was removed from NumPy 1.24; use floor division instead
    key_index = size // 3 if exclude_near_diagonal else 0
    filtered_data = np.asarray([x for x in data if x[0] >= key_index])

    # do the lowess fit
    fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
                 delta=0.01 * np.ptp(filtered_data[:, 1]))

    # filter the fit to just the region above the join point, then build a
    # first-wins lookup table (replaces an O(n^2) linear scan per distance)
    filtered_fit = np.asarray([x for x in fit if x[0] >= key_index])
    fit_lookup = {}
    for x in filtered_fit:
        if x[0] not in fit_lookup:
            fit_lookup[x[0]] = x[1]

    # construct an array that will represent the joined fit: empirical below
    # the join point, fitted values (falling back to empirical) above it
    joined_fit = np.zeros(size)
    for i in range(key_index):
        joined_fit[i] = empirical[i]
    for i in range(key_index, size):
        joined_fit[i] = fit_lookup.get(i, empirical[i])
    return joined_fit
def global_lowess_binned_log_counts(counts, pseudocount=1, frac=0.8,
                                    exclude_near_diagonal=False):
    """
    Make a global one-dimensional bin-level expected model by performing
    lowess regression in log-counts space, excluding the first third of the
    distance scales and only using the empirical geometric means there
    instead.

    Parameters
    ----------
    counts : Dict[str, np.ndarray]
        The observed counts dict to fit the model to.
    pseudocount : int
        The pseudocount to add to the counts before logging.
    frac : float
        The lowess smoothing fraction parameter to use.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; the empirical expected is used there instead.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins. The length of this list will match the size
        of the largest region in the input counts dict.
    """
    # log counts
    log_counts = {region: np.log(counts[region] + pseudocount)
                  for region in counts.keys()}

    # establish total size (largest region)
    size = max([len(log_counts[region]) for region in log_counts.keys()])

    # make offdiagonals, pooling each distance across regions
    offdiagonals = [np.concatenate([np.diag(log_counts[region], k=i)
                                    for region in log_counts.keys()])
                    for i in range(size)]

    # get empirical expected (counts are already logged, so use the
    # arithmetic mean here)
    empirical = global_empirical_binned(log_counts, log_transform=False)

    # make data of the form [distance, count], ignoring nans
    data = np.asarray([[dist, count]
                       for dist in range(size)
                       for count in offdiagonals[dist]
                       if np.isfinite(count)])

    # don't try to fit the first third of the region
    # note: np.int was removed from NumPy 1.24; use floor division instead
    key_index = size // 3 if exclude_near_diagonal else 0
    filtered_data = np.asarray([x for x in data if x[0] >= key_index])

    # do the lowess fit
    fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
                 delta=0.01 * np.ptp(filtered_data[:, 1]))

    # filter the fit to just the region above the join point, then build a
    # first-wins lookup table (replaces an O(n^2) linear scan per distance)
    filtered_fit = np.asarray([x for x in fit if x[0] >= key_index])
    fit_lookup = {}
    for x in filtered_fit:
        if x[0] not in fit_lookup:
            fit_lookup[x[0]] = x[1]

    # construct an array that will represent the joined fit: empirical below
    # the join point, fitted values (falling back to empirical) above it
    joined_fit = np.zeros(size)
    for i in range(key_index):
        joined_fit[i] = empirical[i]
    for i in range(key_index, size):
        joined_fit[i] = fit_lookup.get(i, empirical[i])

    # unlog
    joined_fit = [np.exp(x) - pseudocount for x in joined_fit]
    return joined_fit
def global_lowess_binned(counts, frac=0.8, exclude_near_diagonal=False):
    """
    Make a global one-dimensional bin-level expected model by performing
    lowess regression in unlogged space, excluding the first third of the
    distance scales and only using the empirical arithmetic means there
    instead.

    Parameters
    ----------
    counts : Dict[str, np.ndarray]
        The observed counts dict to fit the model to.
    frac : float
        The lowess smoothing fraction parameter to use.
    exclude_near_diagonal : bool
        Pass True to ignore the first third of the distance scales when
        fitting the model; the empirical expected is used there instead.

    Returns
    -------
    List[float]
        The one-dimensional expected model. The ``i`` th element of the list
        corresponds to the expected value for interactions between loci
        separated by ``i`` bins. The length of this list will match the size
        of the largest region in the input counts dict.
    """
    # establish total size (largest region)
    size = max([len(counts[region]) for region in counts.keys()])

    # make offdiagonals, pooling each distance across regions
    offdiagonals = [np.concatenate([np.diag(counts[region], k=i)
                                    for region in counts.keys()])
                    for i in range(size)]

    # get empirical expected
    empirical = global_empirical_binned(counts, log_transform=False)

    # make data of the form [distance, count], ignoring nans
    data = np.asarray([[dist, count]
                       for dist in range(size)
                       for count in offdiagonals[dist]
                       if np.isfinite(count)])

    # don't try to fit the first third of the region
    # note: np.int was removed from NumPy 1.24; use floor division instead
    key_index = size // 3 if exclude_near_diagonal else 0
    filtered_data = np.asarray([x for x in data if x[0] >= key_index])

    # do the lowess fit
    fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
                 delta=0.01 * np.ptp(filtered_data[:, 1]))

    # filter the fit to just the region above the join point, then build a
    # first-wins lookup table (replaces an O(n^2) linear scan per distance)
    filtered_fit = np.asarray([x for x in fit if x[0] >= key_index])
    fit_lookup = {}
    for x in filtered_fit:
        if x[0] not in fit_lookup:
            fit_lookup[x[0]] = x[1]

    # construct an array that will represent the joined fit: empirical below
    # the join point, fitted values (falling back to empirical) above it
    joined_fit = np.zeros(size)
    for i in range(key_index):
        joined_fit[i] = empirical[i]
    for i in range(key_index, size):
        joined_fit[i] = fit_lookup.get(i, empirical[i])
    return joined_fit
@parallelize_regions
def interpolate_expected(expected_matrix, regional_primermap, distance):
    """
    Interpolate the value of an expected model (represented as a matrix) at an
    arbitrary distance scale.

    Parameters
    ----------
    expected_matrix : np.ndarray
        The expected matrix to use as a source for interpolation.
    regional_primermap : List[Dict[str, Any]]
        The primermap for this region.
    distance : int
        The interaction distance at which to estimate the expected value, in
        base pairs.

    Returns
    -------
    float
        The interpolated expected value, or -1 if ``distance`` is outside of
        the range of the expected model.
    """
    # pairwise genomic distances for this region
    distance_matrix = make_distance_matrix(regional_primermap)

    # flatten both matrices to parallel 1-D arrays
    flat_distances = np.array(flatten_regional_counts(distance_matrix))
    flat_expected = np.array(flatten_regional_counts(expected_matrix))

    # keep only positions where both distance and expected are finite
    finite = np.isfinite(flat_distances) & np.isfinite(flat_expected)
    kept_distances = flat_distances[finite]
    kept_expected = flat_expected[finite]

    # average the expected values observed at each unique distance to build
    # the interpolation model
    xp = np.unique(kept_distances)
    fp = np.array([np.mean(kept_expected[kept_distances == x]) for x in xp])

    # linear interpolation; -1 signals a distance outside the model's range
    return np.interp(distance, xp, fp, left=-1, right=-1)