"""
Module for computing expected models for 5C interaction data.
"""
import numpy as np
import powerlaw as pl
from statsmodels.nonparametric.smoothers_lowess import lowess
from lib5c.util.parallelization import parallelize_regions
from lib5c.util.bed import get_mid_to_mid_distance
from lib5c.util.mathematics import gmean
from lib5c.util.counts import flatten_regional_counts, propagate_nans
from lib5c.algorithms.donut_filters import donut_filt, lower_left_filt
[docs]@parallelize_regions
def make_poly_log_log_fragment_expected_matrix(obs_matrix, regional_primermap):
"""
Convenience function for quickly making an expected matrix for a
fragment-level observed counts matrix based on a simple power law
relationship.
Parameters
----------
obs_matrix : np.ndarray
The matrix of observed counts.
regional_primermap : List[Dict[str, Any]]
Primermap describing the loci in the region represented by
``obs_matrix``. Necessary to figure out distances between elements in
the contact matrix.
Returns
-------
np.ndarray
The expected matrix.
"""
distance_matrix = make_distance_matrix(regional_primermap)
distance_expected = poly_log_log_fragment(
obs_matrix, distance_matrix)
return make_expected_matrix_from_dict(distance_expected, distance_matrix)
[docs]@parallelize_regions
def make_poly_log_log_binned_expected_matrix(obs_matrix,
exclude_near_diagonal=False):
"""
Convenience function for quickly making an expected matrix for a bin-level
observed counts matrix based on a simple power law relationship.
Parameters
----------
obs_matrix : np.ndarray
The matrix of observed counts.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
np.ndarray
The expected matrix.
"""
distance_expected = poly_log_log_binned(
obs_matrix, exclude_near_diagonal=exclude_near_diagonal)
return make_expected_matrix_from_list(distance_expected)
[docs]@parallelize_regions
def make_powerlaw_binned_expected_matrix(obs_matrix,
exclude_near_diagonal=False):
"""
Convenience function for quickly making an expected matrix for a bin-level
observed counts matrix based on a simple power law relationship.
Parameters
----------
obs_matrix : np.ndarray
The matrix of observed counts.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
np.ndarray
The expected matrix.
"""
distance_expected = powerlaw_binned(
obs_matrix, exclude_near_diagonal=exclude_near_diagonal)
return make_expected_matrix_from_list(distance_expected)
[docs]def get_distance_expected(obs_matrix, regional_primermap=None, level='bin',
powerlaw=False, regression=False, degree=1,
lowess_smooth=False, lowess_frac=0.8,
log_transform='auto', exclude_near_diagonal=False):
"""
Convenience function for computing a regional one-dimensional expected model
from a matrix of observed counts, with properties that can be customized by
kwargs.
Parameters
----------
obs_matrix : np.ndarray
The matrix of observed counts to model.
regional_primermap : Optional[List[Dict[str, Any]]]
The primermap for this region. Required if ``obs_matrix`` is
fragment-level.
level : {'bin', 'fragment'}
The level of ``obs_matrix``.
powerlaw : bool
Whether or not to fit a discrete power law distribution to the data.
regression : bool
Whether or not to use a polynomial regression model.
degree : int
The degree of the regression model to use.
lowess_smooth : bool
Whether or not to use lowess smoothing to compute the model.
lowess_frac : float
The lowess smoothing fraction parameter.
log_transform : {'counts', 'both', 'none', 'auto'}
What to transform into log space.
* counts: log-transform only the counts but not the distances. This
results in semi-log models, which don't work on fragment-level
data yet.
* both: log-transform both the counts and the distances, resulting
in log-log models.
* none: don't log anything.
* auto: automatically pick a reasonably choice based on the other
kwargs.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
Union[List[float], Dict[int, float]]
The one-dimensional expected model. For bin-level data, this is a list
of floats, where the ``i`` th element of the list corresponds to the
expected value for interactions between loci separated by ``i`` bins.
For fragment-level data, this is a dict mapping interaction distances in
units of base pairs to the appropriate expected values.
"""
# make 1-D expected
if level == 'bin':
# bin level, don't need a distance matrix
if powerlaw:
print('using power law 1-D distance model')
distance_expected = powerlaw_binned(
obs_matrix, exclude_near_diagonal=exclude_near_diagonal)
elif regression:
print('using polynomial log-log 1-D distance model')
distance_expected = poly_log_log_binned(
obs_matrix, degree=degree,
exclude_near_diagonal=exclude_near_diagonal)
elif lowess_smooth:
if log_transform == 'counts':
print('using lowess log-counts 1-D distance model')
distance_expected = lowess_binned_log_counts(
obs_matrix, frac=lowess_frac,
exclude_near_diagonal=exclude_near_diagonal)
elif log_transform == 'none':
print('using lowess unlogged 1-D distance model')
distance_expected = lowess_binned(
obs_matrix, frac=lowess_frac,
exclude_near_diagonal=exclude_near_diagonal)
else: # catches 'both' and 'auto'
print('using lowess log-log 1-D distance model')
distance_expected = lowess_log_log_binned(
obs_matrix, frac=lowess_frac,
exclude_near_diagonal=exclude_near_diagonal)
else:
if log_transform == 'none':
print('using unlogged empirical binned 1-D distance model')
distance_expected = empirical_binned(
obs_matrix, log_transform=False)
else: # catches 'both' and 'auto'; 'counts' is illegal
print('defaulting to logged empirical binned 1-D distance '
'model')
distance_expected = empirical_binned(
obs_matrix, log_transform=True)
else:
# fragment level, need a distance matrix
distance_matrix = make_distance_matrix(regional_primermap)
assert np.issubdtype(distance_matrix[0, 1], int)
if lowess_smooth:
print('using lowess log-log 1-D distance model')
distance_expected = lowess_log_log_fragment(
obs_matrix, distance_matrix)
else:
print('defaulting to polynomial log-log 1-D distance model')
distance_expected = poly_log_log_fragment(
obs_matrix, distance_matrix, degree=degree)
return distance_expected
[docs]def get_global_distance_expected(counts, primermap=None, level='bin',
powerlaw=False, regression=False, degree=1,
lowess_smooth=False, lowess_frac=0.8,
log_transform='auto',
exclude_near_diagonal=False):
"""
Convenience function for computing a global one-dimensional expected model
from a dict of observed counts, with properties that can be customized by
kwargs.
Parameters
----------
counts : Dict[str, np.ndarray]
The dict of observed counts to model.
primermap : Optional[Dict[str, List[Dict[str, Any]]]]
A primermap corresponding to ``counts``.
level : {'bin', 'fragment'}
The level of ``counts``.
powerlaw : bool
Whether or not to fit a discrete power law distribution to the data.
regression : bool
Whether or not to use a polynomial regression model.
degree : int
The degree of the regression model to use.
lowess_smooth : bool
Whether or not to use lowess smoothing to compute the model.
lowess_frac : float
The lowess smoothing fraction parameter.
log_transform : {'counts', 'both', 'none', 'auto'}
What to transform into log space.
* counts: log-transform only the counts but not the distances. This
results in semi-log models, which don't work on fragment-level
data yet.
* both: log-transform both the counts and the distances, resulting
in log-log models.
* none: don't log anything.
* auto: automatically pick a reasonably choice based on the other
kwargs.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
Union[List[float], Dict[int, float]]
The one-dimensional expected model. For bin-level data, this is a list
of floats, where the ``i`` th element of the list corresponds to the
expected value for interactions between loci separated by ``i`` bins.
For fragment-level data, this is a dict mapping interaction distances in
units of base pairs to the appropriate expected values.
"""
# make 1-D expected
if level == 'bin':
# bin level, don't need a distance matrix
if powerlaw:
print('using power law 1-D distance model')
distance_expected = global_powerlaw_binned(
counts, exclude_near_diagonal=exclude_near_diagonal)
elif regression:
print('using polynomial log-log 1-D distance model')
distance_expected = global_poly_log_log_binned(
counts, degree=degree,
exclude_near_diagonal=exclude_near_diagonal)
elif lowess_smooth:
if log_transform == 'counts':
print('using lowess log-counts 1-D distance model')
distance_expected = global_lowess_binned_log_counts(
counts, frac=lowess_frac,
exclude_near_diagonal=exclude_near_diagonal)
elif log_transform == 'none':
print('using lowess unlogged 1-D distance model')
distance_expected = global_lowess_binned(
counts, frac=lowess_frac,
exclude_near_diagonal=exclude_near_diagonal)
else:
print('using lowess log-log 1-D distance model')
distance_expected = global_lowess_log_log_binned(
counts, frac=lowess_frac,
exclude_near_diagonal=exclude_near_diagonal)
else:
if log_transform == 'none':
print('using unlogged empirical binned 1-D distance model')
distance_expected = global_empirical_binned(
counts, log_transform=False)
else:
print('defaulting to logged empirical binned 1-D distance '
'model')
distance_expected = global_empirical_binned(
counts, log_transform=True)
else:
# fragment level, need a distance matrix
distance_matrix = make_distance_matrix(primermap)
if lowess_smooth:
print('using lowess log-log 1-D distance model')
distance_expected = global_lowess_log_log_fragment(
counts, distance_matrix)
else:
print('defaulting to polynomial log-log 1-D distance model')
distance_expected = global_poly_log_log_fragment(
counts, distance_matrix, degree=degree)
return distance_expected
[docs]@parallelize_regions
def make_expected_matrix(obs_matrix, regional_primermap=None, level='bin',
powerlaw=False, regression=False, degree=1,
lowess_smooth=False, lowess_frac=0.8,
log_transform='auto', monotonic=False, donut=False,
w=15, p=5, donut_frac=0.2, min_exp=0.1,
log_donut=False, max_donut_ll=False,
distance_expected=None, exclude_near_diagonal=False):
"""
Convenience function for computing a complete expected matrix given a matrix
of observed counts that can be customized with a variety of kwargs.
Parameters
----------
obs_matrix : np.ndarray
The matrix of observed counts to make an expected matrix for.
regional_primermap : Optional[List[Dict[str, Any]]]
The primermap for this region. Required if ``obs_matrix`` is
fragment-level.
level : {'bin', 'fragment'}
The level of ``obs_matrix``.
powerlaw : bool
Whether or not to fit a discrete power law distribution to the data.
regression : bool
Whether or not to use a polynomial regression model.
degree : int
The degree of the regression model to use.
lowess_smooth : bool
Whether or not to use lowess smoothing to compute the model.
lowess_frac : float
The lowess smoothing fraction parameter.
log_transform : {'counts', 'both', 'none', 'auto'}
What to transform into log space.
* counts: log-transform only the counts but not the distances. This
results in semi-log models, which don't work on fragment-level
data yet.
* both: log-transform both the counts and the distances, resulting
in log-log models.
* none: don't log anything.
* auto: automatically pick a reasonably choice based on the other
kwargs.
monotonic : bool
Pass True to force the one-dimensional expected model to be monotonic.
donut : bool
Pass True to apply donut-filter local correction to the expected model.
Not implemented for fragment-level input data.
w : int
The outer width of the donut when using donut correction. Should be an
odd integer.
p : int
The inner width of the donut when using donut correction. Should be an
odd integer.
donut_frac : float
If the fraction of possible elements in the donut that lie wihtin the
region and have non-infinte values is lower than this fraction then the
donut-corrected value at that point will be NaN.
min_exp : float
If the sum of the 1-D expected matrix under the donut or lower left
footprint for a particular pixel is less than this value, set the output
at this pixel to nan to avoid numerical instability related to division
by small numbers.
log_donut : bool
Pass True to perform donut correction in log-counts space.
max_donut_ll : bool
If ``donut`` is True, pass True here too to make the donut correction
use the maximum of the "donut" and "lower-left" regions.
distance_expected : Optional[Union[List[float], Dict[int, float]]]
Pass a one-dimensional expected model to use it instead of computing a
new one from scratch according to the other kwargs.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
Tuple[np.ndarray, Union[List[float], Dict[int, float]], Optional[
np.ndarray]]
The first element of the tuple is the expected matrix. The second
element of the tuple is the one-dimensional expected model, which will
be a list of expected values if ``level`` was 'bin' or a dict mapping
integer distances to expected values if ``level`` was 'fragment'. The
third element will be the pairwise distance matrix if ``level`` was
'fragment', but will simply be None if ``level`` was 'bin'.
"""
# get distance expected if it wasn't passed
if distance_expected is None:
distance_expected = get_distance_expected(
obs_matrix, regional_primermap=regional_primermap, level=level,
powerlaw=powerlaw, regression=regression, degree=degree,
lowess_smooth=lowess_smooth, lowess_frac=lowess_frac,
log_transform=log_transform,
exclude_near_diagonal=exclude_near_diagonal)
# make a distance matrix if we are at the fragment level
if level == 'fragment':
distance_matrix = make_distance_matrix(regional_primermap)
assert np.issubdtype(distance_matrix[0, 1], int)
else:
distance_matrix = None
# force monotonic
if monotonic:
print('forcing monotonicity')
distance_expected = force_monotonic(distance_expected)
# make expected matrix
if level == 'bin':
expected_matrix = make_expected_matrix_from_list(distance_expected)
else:
expected_matrix = make_expected_matrix_from_dict(
distance_expected, distance_matrix)
expected_matrix = expected_matrix[:len(obs_matrix), :len(obs_matrix)]
# donut filter
if donut:
print('applying donut correction')
# propagate nan's to expected
obs_matrix, expected_matrix = propagate_nans(
obs_matrix, expected_matrix)
# log transform if requested
if log_donut:
obs_matrix = np.log(obs_matrix + 1)
expected_matrix = np.log(expected_matrix + 1)
# donut correct
if not max_donut_ll:
expected_matrix = donut_filt(
obs_matrix, expected_matrix, p, w, max_percent=donut_frac,
min_exp=min_exp)
else:
# compute two matrices
donut_matrix = donut_filt(obs_matrix, expected_matrix, p, w,
max_percent=donut_frac, min_exp=min_exp)
ll_matrix = lower_left_filt(obs_matrix, expected_matrix, p, w,
max_percent=donut_frac, min_exp=min_exp)
# max operation
expected_matrix = np.fmax(donut_matrix, ll_matrix)
# undo log transform
if log_donut:
expected_matrix = np.exp(expected_matrix) - 1
return expected_matrix, distance_expected, distance_matrix
[docs]def make_expected_matrix_from_list(distance_expected):
"""
Converts a bin-level one-dimensional expected model into an expected matrix.
Parameters
----------
distance_expected : List[float]
The one-dimensional distance expected model to make a matrix out of.
Returns
-------
np.ndarray
The expected matrix.
"""
expected_matrix = np.zeros((len(distance_expected), len(distance_expected)),
dtype=float)
for i in range(len(expected_matrix)):
for j in range(i + 1):
expected_matrix[i, j] = distance_expected[i - j]
expected_matrix[j, i] = distance_expected[i - j]
return expected_matrix
[docs]def make_expected_matrix_from_dict(distance_expected, distance_matrix):
"""
Converts a fragment-level one-dimensional expected model into an expected
matrix.
Parameters
----------
distance_expected : Dict[int, float]
A mapping from interaction distances in units of base pairs to the
expected value at that distance.
distance_matrix : np.ndarray
The pairwise distance matrix for the fragments in this region.
Returns
-------
np.ndarray
The expected matrix.
"""
expected_matrix = np.zeros_like(distance_matrix, dtype=float)
for i in range(len(expected_matrix)):
for j in range(i + 1):
expected_matrix[i, j] = distance_expected[distance_matrix[i, j]]
expected_matrix[j, i] = distance_expected[distance_matrix[i, j]]
return expected_matrix
[docs]def make_expected_dict_from_matrix(expected_matrix, distance_matrix):
"""
Convert an expected matrix into a dict representation of the one-dimensional
expected model it embodies.
Parameters
----------
expected_matrix : np.ndarray
The expected matrix.
distance_matrix : np.ndarray
The pairwise distance matrix for the fragments in this region.
Returns
-------
Dict[int, float]
A mapping from interaction distances in units of base pairs to the
expected value at that distance.
"""
distance_expected = {}
for i in range(len(expected_matrix)):
for j in range(i + 1):
if np.isfinite(expected_matrix[i, j]):
distance_expected[distance_matrix[i, j]] = expected_matrix[i, j]
return distance_expected
[docs]@parallelize_regions
def make_distance_matrix(regional_primermap):
"""
Construct a pairwise distance matrix for the fragments in a region from the
primermap describing those fragments.
Parameters
----------
regional_primermap : List[Dict[str, Any]]
The primermap for this region.
Returns
-------
np.ndarray
The pairwise distance matrix for all fragments in this region in units
of base pairs.
"""
return np.array([[get_mid_to_mid_distance(regional_primermap[i],
regional_primermap[j])
for i in range(len(regional_primermap))]
for j in range(len(regional_primermap))], dtype=int)
[docs]def poly_log_log_fragment(regional_counts, distances, degree=1, pseudocount=1):
"""
Make a regional one-dimensional fragment-level expected model by fitting a
polynomial in log-log space.
Parameters
----------
regional_counts : np.ndarray
The observed counts matrix for this region.
distances : np.ndarray
The pairwise distance matrix for all fragments in this region in units
of base pairs.
degree : int
The degree of the polynomial to fit.
pseudocount : int
The pseudocount to add to the counts before logging.
Returns
-------
Dict[int, float]
A mapping from interaction distances in units of base pairs to the
expected value at that distance.
"""
# log transform
log_regional_counts = np.log(regional_counts + pseudocount)
log_distances = np.log(distances + pseudocount)
# make data of the form [log_distance, log_count], ignoring nans
data = np.asarray([[log_distances[i, j], log_regional_counts[i, j]]
for i in range(len(log_regional_counts))
for j in range(i + 1)
if np.isfinite(log_regional_counts[i, j])])
# do the linear fit
fit = np.poly1d(np.polyfit(data[:, 0], data[:, 1], degree))
# repackage
distance_expected = {
dist: np.exp(fit(np.log(dist + pseudocount))) - pseudocount
for dist in np.unique(distances.flatten())
}
return distance_expected
[docs]def global_poly_log_log_fragment(counts, distances, degree=1, pseudocount=1):
"""
Make a global one-dimensional fragment-level expected model by fitting a
polynomial in log-log space.
Parameters
----------
counts : Dict[str, np.ndarray]
The observed counts dict to fit the model to.
distances : Dict[str, np.ndarray]
A dict of pairwise distance matrices describing the genomic distances
between the elements of the matrices in ``counts``. The keys and array
dimensions should match the keys and array dimensions of ``counts``.
degree : int
The degree of the polynomial to fit.
pseudocount : int
The pseudocount to add to the counts before logging.
Returns
-------
Dict[int, float]
A mapping from interaction distances in units of base pairs to the
expected value at that distance.
"""
# log transform
log_counts = {region: np.log(counts[region] + pseudocount)
for region in counts.keys()}
log_distances = {region: np.log(distances[region] + pseudocount)
for region in distances.keys()}
# make data of the form [log_distance, log_count], ignoring nans
data = np.asarray([[log_distances[region][i, j], log_counts[region][i, j]]
for region in log_counts.keys()
for i in range(len(log_counts[region]))
for j in range(i + 1)
if np.isfinite(log_counts[region][i, j])])
# do the linear fit
fit = np.poly1d(np.polyfit(data[:, 0], data[:, 1], degree))
# repackage
distance_expected = {
dist: np.exp(fit(np.log(dist + pseudocount))) - pseudocount
for dist in np.unique(np.concatenate([distances[region].flatten()
for region in distances.keys()]))
}
return distance_expected
[docs]def poly_log_log_binned(regional_counts, degree=1, pseudocount=1,
exclude_near_diagonal=False):
"""
Make a regional one-dimensional bin-level expected model by fitting a
polynomial in log-log space.
Parameters
----------
regional_counts : np.ndarray
The observed counts matrix for this region.
degree : int
The degree of the polynomial to fit.
pseudocount : int
The pseudocount to add to the counts before logging.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins.
"""
# log transform
log_regional_counts = np.log(regional_counts + pseudocount)
# establish total size
size = len(regional_counts)
# make data of the form [log_distance, log_count], ignoring nans
data = np.asarray([[np.log(i - j + pseudocount), log_regional_counts[i, j]]
for i in range(len(log_regional_counts))
for j in range(i + 1)
if np.isfinite(log_regional_counts[i, j])])
# filter
key_index = np.int(size / 3) if exclude_near_diagonal else 0
key_log_distance = np.log(key_index + pseudocount)
filtered_data = np.asarray([x for x in data if x[0] >= key_log_distance])
# do the linear fit
fit = np.poly1d(
np.polyfit(filtered_data[:, 0], filtered_data[:, 1], degree))
# get empirical expected
empirical = empirical_binned(regional_counts, log_transform=True)
# repackage
distance_expected = [empirical[i]
if i < key_index
else np.exp(fit(np.log(i + pseudocount))) - pseudocount
for i in range(size)]
return distance_expected
[docs]def global_poly_log_log_binned(counts, degree=1, pseudocount=1,
exclude_near_diagonal=False):
"""
Make a global one-dimensional bin-level expected model by fitting a
polynomial in log-log space.
Parameters
----------
counts : Dict[str, np.ndarray]
The observed counts dict to fit the model to.
degree : int
The degree of the polynomial to fit.
pseudocount : int
The pseudocount to add to the counts before logging.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins. The length of this list will match the size of
the largest region in the input counts dict.
"""
# log transform
log_counts = {region: np.log(counts[region] + pseudocount)
for region in counts.keys()}
# make data of the form [log_distance, log_count], ignoring nans
data = np.asarray([[np.log(i - j + pseudocount), log_counts[region][i, j]]
for region in log_counts.keys()
for i in range(len(log_counts[region]))
for j in range(i + 1)
if np.isfinite(log_counts[region][i, j])])
# establish total size
size = max([len(counts[region]) for region in counts.keys()])
# filter
key_index = np.int(size / 3) if exclude_near_diagonal else 0
key_log_distance = np.log(key_index + pseudocount)
filtered_data = np.asarray([x for x in data if x[0] >= key_log_distance])
# do the linear fit
fit = np.poly1d(
np.polyfit(filtered_data[:, 0], filtered_data[:, 1], degree))
# get empirical expected
empirical = global_empirical_binned(counts, log_transform=True)
# repackage
distance_expected = [empirical[i]
if i < key_index
else np.exp(fit(np.log(i + pseudocount))) - pseudocount
for i in range(size)]
return distance_expected
[docs]def powerlaw_binned(regional_counts, exclude_near_diagonal=False):
"""
Make a regional one-dimensional bin-level expected model by fitting a
polynomial in log-log space.
Parameters
----------
regional_counts : np.ndarray
The observed counts matrix for this region.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins.
"""
# fit empirical distance dependence
empirical = np.array(empirical_binned(regional_counts, log_transform=True))
empirical_floored = np.floor(empirical).astype(int)
# make data of the form [distance], ignoring nans
# empirical_floored represents an empirical un-normalized pdf over distances
# data represents the empirical distance dependence as a sample from the
# empirical pdf over distances
size = len(regional_counts)
idx = int(size / 3) if exclude_near_diagonal else 1
data = np.concatenate([[i]*empirical_floored[i] for i in range(idx, size)])
# fit powerlaw
# size factor represents how many time we have to draw from the pdf
# to recover something on the scale of the original empirical data
fit = pl.Fit(data, xmin=idx, xmax=size-1, discrete=True)
size_factor = empirical_floored[idx:].sum()
powerlaw_expected = fit.power_law.pdf(np.arange(idx, size)) * size_factor
# return stitched distance dependence
return np.concatenate([empirical[0:idx], powerlaw_expected])
[docs]def global_powerlaw_binned(counts, exclude_near_diagonal=False):
"""
Make a global one-dimensional bin-level expected model by fitting a
polynomial in log-log space.
Parameters
----------
counts : Dict[str, np.ndarray]
The observed counts dict to fit the model to.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins. The length of this list will match the size of
the largest region in the input counts dict.
"""
# fit empirical distance dependence
empirical = np.array(global_empirical_binned(counts, log_transform=True))
empirical_floored = np.floor(empirical).astype(int)
# establish total size
size = max([len(counts[region]) for region in counts.keys()])
# make data of the form [distance], ignoring nans
# empirical_floored represents an empirical un-normalized pdf over distances
# data represents the empirical distance dependence as a sample from the
# empirical pdf over distances
idx = int(size / 3) if exclude_near_diagonal else 1
data = np.concatenate([[i]*empirical_floored[i] for i in range(idx, size)])
# fit powerlaw
# size factor represents how many time we have to draw from the pdf
# to recover something on the scale of the original empirical data
fit = pl.Fit(data, xmin=idx, xmax=size-1, discrete=True)
size_factor = empirical_floored[idx:].sum()
powerlaw_expected = fit.power_law.pdf(np.arange(idx, size)) * size_factor
# return stitched distance dependence
return np.concatenate([empirical[0:idx], powerlaw_expected])
[docs]def lowess_log_log_fragment(regional_counts, distances, pseudocount=1,
frac=0.8):
"""
Make a regional one-dimensional fragment-level expected model by performing
lowess regression in log-log space.
Parameters
----------
regional_counts : np.ndarray
The observed counts matrix for this region.
distances : np.ndarray
The pairwise distance matrix for all fragments in this region in units
of base pairs.
pseudocount : int
The pseudocount to add to the counts before logging.
frac : float
The lowess smoothing fraction parameter to use.
Returns
-------
Dict[int, float]
A mapping from interaction distances in units of base pairs to the
expected value at that distance.
"""
# log transform
log_regional_counts = np.log(regional_counts + pseudocount)
log_distances = np.log(distances + pseudocount)
# make data of the form [distance, count], ignoring nans
data = np.asarray([[log_distances[i, j], log_regional_counts[i, j]]
for i in range(len(log_regional_counts))
for j in range(i + 1)
if np.isfinite(log_regional_counts[i, j])])
# do the lowess fit
fit = lowess(data[:, 1], data[:, 0], frac=frac, it=3,
delta=0.01*np.ptp(data[:, 1]))
# unlog
fit_dists = np.rint(np.exp(fit[:, 0]) - pseudocount).astype(int)
fit_counts = np.exp(fit[:, 1]) - pseudocount
# repackage
distance_expected = {fit_dists[i]: fit_counts[i] for i in range(len(fit))}
# fill nans
for dist in np.unique(distances.flatten()):
if dist not in distance_expected:
distance_expected[dist] = np.nan
return distance_expected
[docs]def global_lowess_log_log_fragment(counts, distances, pseudocount=1, frac=0.8):
"""
Make a global one-dimensional fragment-level expected model by performing
lowess regression in log-log space.
Parameters
----------
counts : Dict[str, np.ndarray]
The observed counts dict to fit the model to.
distances : Dict[str, np.ndarray]
A dict of pairwise distance matrices describing the genomic distances
between the elements of the matrices in ``counts``. The keys and array
dimensions should match the keys and array dimensions of ``counts``.
pseudocount : int
The pseudocount to add to the counts before logging.
frac : float
The lowess smoothing fraction parameter to use.
Returns
-------
Dict[int, float]
A mapping from interaction distances in units of base pairs to the
expected value at that distance.
"""
# log transform
log_counts = {region: np.log(counts[region] + pseudocount)
for region in counts.keys()}
log_distances = {region: np.log(distances[region] + pseudocount)
for region in distances.keys()}
# make data of the form [distance, count], ignoring nans
data = np.asarray([[log_distances[region][i, j], log_counts[region][i, j]]
for region in log_counts.keys()
for i in range(len(log_counts[region]))
for j in range(i + 1)
if np.isfinite(log_counts[region][i, j])])
# do the lowess fit
fit = lowess(data[:, 1], data[:, 0], frac=frac, it=3,
delta=0.01*np.ptp(data[:, 1]))
# unlog
fit_dists = np.rint(np.exp(fit[:, 0]) - pseudocount).astype(int)
fit_counts = np.exp(fit[:, 1]) - pseudocount
# repackage
distance_expected = {fit_dists[i]: fit_counts[i] for i in range(len(fit))}
# fill nans
for dist in np.unique(np.concatenate([distances[region].flatten()
for region in distances.keys()])):
if dist not in distance_expected:
distance_expected[dist] = np.nan
return distance_expected
[docs]def lowess_log_log_binned(regional_counts, pseudocount=1, frac=0.8,
exclude_near_diagonal=False):
"""
Make a regional one-dimensional bin-level expected model by performing
lowess regression in log-log space.
Parameters
----------
regional_counts : np.ndarray
The observed counts matrix for this region.
pseudocount : int
The pseudocount to add to the counts before logging.
frac : float
The lowess smoothing fraction parameter to use.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins.
"""
# log transform
log_regional_counts = np.log(regional_counts + pseudocount)
# make data of the form [distance, count], ignoring nans
data = np.asarray([[np.log(i - j + pseudocount), log_regional_counts[i, j]]
for i in range(len(log_regional_counts))
for j in range(i + 1)
if np.isfinite(log_regional_counts[i, j])])
# establish total size
size = len(regional_counts)
# filter
key_index = np.int(size / 3) if exclude_near_diagonal else 0
key_log_distance = np.log(key_index + pseudocount)
filtered_data = np.asarray([x for x in data if x[0] >= key_log_distance])
# do the lowess fit
fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
delta=0.01*np.ptp(filtered_data[:, 1]))
# unlog
fit_dists = np.rint(np.exp(fit[:, 0]) - pseudocount).astype(int)
fit_counts = np.exp(fit[:, 1]) - pseudocount
# get empirical expected
empirical = empirical_binned(regional_counts, log_transform=True)
# repackage
distance_expected = {fit_dists[i]: fit_counts[i] for i in range(len(fit))}
distance_expected = [empirical[i]
if i < key_index
else distance_expected[i]
if i in distance_expected
else np.nan
for i in range(size)]
return distance_expected
[docs]def global_lowess_log_log_binned(counts, pseudocount=1, frac=0.8,
exclude_near_diagonal=False):
"""
Make a global one-dimensional bin-level expected model by performing lowess
regression in log-log space.
Parameters
----------
counts : Dict[str, np.ndarray]
The observed counts dict to fit the model to.
pseudocount : int
The pseudocount to add to the counts before logging.
frac : float
The lowess smoothing fraction parameter to use.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins. The length of this list will match the size of
the largest region in the input counts dict.
"""
# log transform
log_counts = {region: np.log(counts[region] + pseudocount)
for region in counts.keys()}
# make data of the form [distance, count], ignoring nans
data = np.asarray([[np.log(i - j + pseudocount), log_counts[region][i, j]]
for region in log_counts.keys()
for i in range(len(log_counts[region]))
for j in range(i + 1)
if np.isfinite(log_counts[region][i, j])])
# establish total size
size = max([len(counts[region]) for region in counts.keys()])
# filter
key_index = np.int(size / 3) if exclude_near_diagonal else 0
key_log_distance = np.log(key_index + pseudocount)
filtered_data = np.asarray([x for x in data if x[0] >= key_log_distance])
# do the lowess fit
fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
delta=0.01*np.ptp(filtered_data[:, 1]))
# unlog
fit_dists = np.rint(np.exp(fit[:, 0]) - pseudocount).astype(int)
fit_counts = np.exp(fit[:, 1]) - pseudocount
# get empirical expected
empirical = global_empirical_binned(counts, log_transform=True)
# repackage
distance_expected = {fit_dists[i]: fit_counts[i] for i in range(len(fit))}
distance_expected = [empirical[i]
if i < key_index
else distance_expected[i]
if i in distance_expected
else np.nan
for i in range(size)]
return distance_expected
[docs]def force_monotonic(distance_expected):
"""
Force a one-dimensional distance expected to be monotonic.
Parameters
----------
distance_expected : Union[List[float], Dict[int, float]]
The one-dimensional expected model to force to monotonicity. If the
model describes bin-level data, this should be a list of floats, where
the ``i`` th element of the list corresponds to the expected value for
interactions between loci separated by ``i`` bins. If the model
describes fragment-level data, this should be a dict mapping interaction
distances in units of base pairs to the expected value at that distance.
Returns
-------
Union[List[float], Dict[int, float]]
The forced-monotonic version of the input one-dimensional expected
model.
"""
if type(distance_expected) == list:
monotonic_expected = []
min_value = np.inf
for i in range(len(distance_expected)):
min_value = min(min_value, distance_expected[i])
monotonic_expected.append(min_value)
return monotonic_expected
elif type(distance_expected) == dict:
distances = sorted(distance_expected.keys())
monotonic_expected = {}
min_value = np.inf
for dist in distances:
min_value = min(min_value, distance_expected[dist])
monotonic_expected[dist] = min_value
return monotonic_expected
elif isinstance(distance_expected, np.ndarray):
return np.minimum.accumulate(distance_expected)
else:
raise NotImplementedError('cannot force monotonicity of a %s'
% type(distance_expected))
[docs]def empirical_binned(regional_counts, log_transform=True):
"""
Make a regional one-dimensional bin-level expected model by taking an
average of the interaction values at each distance.
Parameters
----------
regional_counts : np.ndarray
The observed counts matrix for this region.
log_transform : bool
Pass True to take the geometric mean instead of the arithmetic mean,
which is equivalent to averaging log-transformed counts.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins.
"""
# make offdiagonals
offdiagonals = [np.diag(regional_counts, k=i)
for i in range(len(regional_counts))]
# pick appropriate mean function
mean_function = np.nanmean
if log_transform:
mean_function = gmean
return [mean_function(offdiagonal)
if np.any(np.isfinite(offdiagonal))
else np.nan
for offdiagonal in offdiagonals]
[docs]def global_empirical_binned(counts, log_transform=True):
"""
Make a global one-dimensional bin-level expected model by taking an
average of the interaction values at each distance.
Parameters
----------
counts : Dict[str, np.ndarray]
The observed counts dict to fit the model to.
log_transform : bool
Pass True to take the geometric mean instead of the arithmetic mean,
which is equivalent to averaging log-transformed counts.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins. The length of this list will match the size of
the largest region in the input counts dict.
"""
# make offdiagonals
offdiagonals = [np.concatenate([np.diag(counts[region], k=i)
for region in counts.keys()])
for i in range(max([len(counts[region])
for region in counts.keys()]))]
# pick appropriate mean function
mean_function = np.nanmean
if log_transform:
mean_function = gmean
return [mean_function(offdiagonal)
if np.any(np.isfinite(offdiagonal))
else np.nan
for offdiagonal in offdiagonals]
[docs]def lowess_binned_log_counts(regional_counts, pseudocount=1, frac=0.8,
exclude_near_diagonal=False):
"""
Make a regional one-dimensional bin-level expected model by performing
lowess regression in log-counts space, excluding the first third of the
region and only using the emprical geometric means there instead.
Parameters
----------
regional_counts : np.ndarray
The observed counts matrix for this region.
pseudocount : int
The pseudocount to add to the counts before logging.
frac : float
The lowess smoothing fraction parameter to use.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins.
"""
# log counts
regional_counts = np.log(regional_counts + pseudocount)
# establish size
size = len(regional_counts)
# make offdiagonals
offdiagonals = [np.diag(regional_counts, k=i) for i in range(size)]
# get empirical expected
empirical = empirical_binned(regional_counts, log_transform=False)
# make data of the form [distance, count], ignoring nans
data = np.asarray([[dist, count]
for dist in range(size)
for count in offdiagonals[dist]
if np.isfinite(count)])
# don't try to fit the first third of the region
key_index = np.int(size / 3) if exclude_near_diagonal else 0
filtered_data = np.asarray([x for x in data if x[0] >= key_index])
# do the lowess fit
fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
delta=0.01*np.ptp(filtered_data[:, 1]))
# filter the fit to just the region above the join point
filtered_fit = np.asarray([x for x in fit if x[0] >= key_index])
# construct an array that will represent the joined fit
joined_fit = np.zeros(size)
for i in range(key_index):
joined_fit[i] = empirical[i]
for i in range(key_index, size):
query_result = [x for x in filtered_fit if x[0] == i]
if query_result:
joined_fit[i] = query_result[0][1]
else:
joined_fit[i] = empirical[i]
# unlog
joined_fit = [np.exp(x) - pseudocount for x in joined_fit]
return joined_fit
[docs]def lowess_binned(regional_counts, frac=0.8, exclude_near_diagonal=False):
"""
Make a regional one-dimensional bin-level expected model by performing
lowess regression in unlogged space, excluding the first third of the
region and only using the emprical geometric means there instead.
Parameters
----------
regional_counts : np.ndarray
The observed counts matrix for this region.
frac : float
The lowess smoothing fraction parameter to use.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins.
"""
# establish size
size = len(regional_counts)
# make offdiagonals
offdiagonals = [np.diag(regional_counts, k=i) for i in range(size)]
# get empirical expected
empirical = empirical_binned(regional_counts, log_transform=False)
# make data of the form [distance, count], ignoring nans
data = np.asarray([[dist, count]
for dist in range(size)
for count in offdiagonals[dist]
if np.isfinite(count)])
# don't try to fit the first third of the region
key_index = np.int(size / 3) if exclude_near_diagonal else 0
filtered_data = np.asarray([x for x in data if x[0] >= key_index])
# do the lowess fit
fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
delta=0.01*np.ptp(filtered_data[:, 1]))
# filter the fit to just the region above the join point
filtered_fit = np.asarray([x for x in fit if x[0] >= key_index])
# construct an array that will represent the joined fit
joined_fit = np.zeros(size)
for i in range(key_index):
joined_fit[i] = empirical[i]
for i in range(key_index, size):
query_result = [x for x in filtered_fit if x[0] == i]
if query_result:
joined_fit[i] = query_result[0][1]
else:
joined_fit[i] = empirical[i]
return joined_fit
[docs]def global_lowess_binned_log_counts(counts, pseudocount=1, frac=0.8,
exclude_near_diagonal=False):
"""
Make a global one-dimensional bin-level expected model by performing
lowess regression in log-counts space, excluding the first third of the
distance scales and only using the emprical geometric means there instead.
Parameters
----------
counts : Dict[str, np.ndarray]
The observed counts dict to fit the model to.
pseudocount : int
The pseudocount to add to the counts before logging.
frac : float
The lowess smoothing fraction parameter to use.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins. The length of this list will match the size of
the largest region in the input counts dict.
"""
# log counts
log_counts = {region: np.log(counts[region] + pseudocount)
for region in counts.keys()}
# establish total size
size = max([len(log_counts[region]) for region in log_counts.keys()])
# make offdiagonals
offdiagonals = [np.concatenate([np.diag(log_counts[region], k=i)
for region in log_counts.keys()])
for i in range(size)]
# get empirical expected
empirical = global_empirical_binned(log_counts, log_transform=False)
# make data of the form [distance, count], ignoring nans
data = np.asarray([[dist, count]
for dist in range(size)
for count in offdiagonals[dist]
if np.isfinite(count)])
# don't try to fit the first third of the region
key_index = np.int(size / 3) if exclude_near_diagonal else 0
filtered_data = np.asarray([x for x in data if x[0] >= key_index])
# do the lowess fit
fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
delta=0.01*np.ptp(filtered_data[:, 1]))
# filter the fit to just the region above the join point
filtered_fit = np.asarray([x for x in fit if x[0] >= key_index])
# construct an array that will represent the joined fit
joined_fit = np.zeros(size)
for i in range(key_index):
joined_fit[i] = empirical[i]
for i in range(key_index, size):
query_result = [x for x in filtered_fit if x[0] == i]
if query_result:
joined_fit[i] = query_result[0][1]
else:
joined_fit[i] = empirical[i]
# unlog
joined_fit = [np.exp(x) - pseudocount for x in joined_fit]
return joined_fit
[docs]def global_lowess_binned(counts, frac=0.8, exclude_near_diagonal=False):
"""
Make a global one-dimensional bin-level expected model by performing
lowess regression in unlogged space, excluding the first third of the
distance scales and only using the emprical arithmetic means there instead.
Parameters
----------
counts : Dict[str, np.ndarray]
The observed counts dict to fit the model to.
frac : float
The lowess smoothing fraction parameter to use.
exclude_near_diagonal : bool
If regression or lowess_smooth are True, set this kwarg to True to
ignore the first third of the distance scales when fitting the model.
Returns
-------
List[float]
The one-dimensional expected model. The ``i`` th element of the list
corresponds to the expected value for interactions between loci
separated by ``i`` bins. The length of this list will match the size of
the largest region in the input counts dict.
"""
# establish total size
size = max([len(counts[region]) for region in counts.keys()])
# make offdiagonals
offdiagonals = [np.concatenate([np.diag(counts[region], k=i)
for region in counts.keys()])
for i in range(size)]
# get empirical expected
empirical = global_empirical_binned(counts, log_transform=False)
# make data of the form [distance, count], ignoring nans
data = np.asarray([[dist, count]
for dist in range(size)
for count in offdiagonals[dist]
if np.isfinite(count)])
# don't try to fit the first third of the region
key_index = np.int(size / 3) if exclude_near_diagonal else 0
filtered_data = np.asarray([x for x in data if x[0] >= key_index])
# do the lowess fit
fit = lowess(filtered_data[:, 1], filtered_data[:, 0], frac=frac, it=3,
delta=0.01*np.ptp(filtered_data[:, 1]))
# filter the fit to just the region above the join point
filtered_fit = np.asarray([x for x in fit if x[0] >= key_index])
# construct an array that will represent the joined fit
joined_fit = np.zeros(size)
for i in range(key_index):
joined_fit[i] = empirical[i]
for i in range(key_index, size):
query_result = [x for x in filtered_fit if x[0] == i]
if query_result:
joined_fit[i] = query_result[0][1]
else:
joined_fit[i] = empirical[i]
return joined_fit
[docs]@parallelize_regions
def interpolate_expected(expected_matrix, regional_primermap, distance):
"""
Interpolate the value of an expected model (represented as a matrix) at an
arbitrary distance scale.
Parameters
----------
expected_matrix : np.ndarray
The expected matrix to use as a source for interpolation.
regional_primermap : List[Dict[str, Any]]
The primermap for this region.
distance : int
The interaction distance at which to estimate the expected value, in
base pairs.
Returns
-------
float
The interpolated expected value, or -1 if ``distance`` is outside of the
range of the expected model.
"""
# construct distance matrix
distance_matrix = make_distance_matrix(regional_primermap)
# flatten matrices
flat_distance_matrix = np.array(flatten_regional_counts(distance_matrix))
flat_expected_matrix = np.array(flatten_regional_counts(expected_matrix))
# filter nan's
filtered_distance_matrix = flat_distance_matrix[
np.isfinite(flat_distance_matrix) & np.isfinite(flat_expected_matrix)]
filtered_expected_matrix = flat_expected_matrix[
np.isfinite(flat_distance_matrix) & np.isfinite(flat_expected_matrix)]
# prepare model
xp = np.unique(filtered_distance_matrix)
fp = np.array(
[np.mean(filtered_expected_matrix[filtered_distance_matrix == x])
for x in xp])
# interpolate
return np.interp(distance, xp, fp, left=-1, right=-1)