"""
Module providing utilities for defining and constructing filter functions.
"""
import numpy as np
from scipy.stats import gmean, norm
[docs]def make_filter_function(function='gmean', threshold=0.0, norm_order=1,
bin_width=4000, sigma=12000.0, inverse=False,
gaussian=False):
"""
Convenience function for quickly constructing filtering functions with
desired properties.
Parameters
----------
function : {'sum', 'median', 'amean', 'gmean'}
The aggregation function to use. This is the operation that will be
applied to all points in the neighborhood, after weighting their values
if appropriate.
threshold : float
If less than this fraction of the values in a neighborhood are
non-infinite, the filter function will return nan for that neighborhood.
norm_order : int
The order of p-norm to use when computing distances.
bin_width : int
The width of each bin in base pairs. This value is used to scale certain
weights.
sigma : float
The value to use for the standard deviation of the Gaussian when using
Gaussian weights.
inverse : bool
Pass True to use "inverse" weights as in Yaffe and Tanay 2011.
gaussian : bool
Pass True to use Gaussian weights with standard deviation ``sigma``.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
if function == 'sum':
filter_function = simple_sum(threshold)
elif function == 'median':
filter_function = median(threshold)
elif function == 'amean':
if gaussian:
filter_function = amean_gaussian(sigma=sigma,
norm_ord=norm_order,
check_threshold=threshold)
elif inverse:
filter_function = amean_inverse(
bin_width=bin_width,
norm_ord=norm_order,
check_threshold=threshold)
else:
filter_function = arithmetic_mean(threshold)
elif function == 'gmean':
if gaussian:
filter_function = gmean_gaussian(sigma=sigma,
norm_ord=norm_order,
check_threshold=threshold)
elif inverse:
filter_function = gmean_inverse(
bin_width=bin_width,
norm_ord=norm_order,
check_threshold=threshold)
else:
filter_function = geometric_mean(threshold)
else:
raise ValueError('function %s not implemented' % function)
return filter_function
[docs]def check_neighboorhood_nonnan(neighborhood, threshold):
"""
Check to see if a neighborhood clears as specified non-nan fraction
threshold.
Parameters
----------
neighborhood : List[Dict[str, Any]]
A list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
threshold : float
If less than this fraction of the values in the neighborhood are
non-infinite, the neighborhood fails the check.
Returns
-------
bool
True if this neighborhood clears the threshold, otherwise False.
"""
num_nan = len([1 for i in neighborhood if ~np.isfinite(i['value'])])
total = len(neighborhood)
if total == 0 or float(num_nan) / total <= threshold:
return False
return True
[docs]def check_neighboorhood_positive(neighborhood, threshold):
"""
Check to see if a neighborhood clears as specified positive fraction
threshold.
Parameters
----------
neighborhood : List[Dict[str, Any]]
A list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
threshold : float
If less than this fraction of the values in the neighborhood are
positive, the neighborhood fails the check.
Returns
-------
bool
True if this neighborhood clears the threshold, otherwise False.
"""
num_positive = len([1 for i in neighborhood if i['value'] > 0])
total = len(neighborhood)
if total == 0 or float(num_positive) / total <= threshold:
return False
return True
[docs]def value_filter_function(function, function_kwargs=None, pseudocount=0,
check_function=None, check_threshold=None):
"""
Constructs a filter function that passes the values in the neighborhood to
an aggregation function.
Parameters
----------
function : Callable[Sequence[float], float]
The aggregation function to use on the values in each neighborhood.
function_kwargs : Optional[Dict[str, Any]]
Kwargs to be passed to ``function``.
pseudocount : float
A pseudocount to be added to the values before applying the aggregation
function. Useful if the aggregation function has catastrophic behavior
when one input value is zero.
check_function : Optional[Callable[[List[Dict[str, Any]], float], bool]]
A function that takes in a neighborhood and a threshold value and
performs some sort of test on the neighborhood, returning False if the
filter function should return NaN for the neighborhood because it fails
some critical condition.
check_threshold : float
The threshold to pass as the second arg to ``check_function``.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
# resolve function_kwargs
if function_kwargs is None:
function_kwargs = {}
def filter_function(neighborhood):
# honor check API
if check_function is not None:
if not check_function(neighborhood, check_threshold):
return np.nan
# extract values while adding a pseudocount
values = [i['value'] + pseudocount
for i in neighborhood
if np.isfinite(i['value'])]
# compute result based on function
if values:
return function(values, **function_kwargs)
else:
return np.nan
return filter_function
[docs]def norm_filter_function(weighted_function, norm_function, weighted_kwargs=None,
norm_kwargs=None, pseudocount=0, check_function=None,
check_threshold=None):
"""
Constructs a filter function that passes the value and some distance norm
(as specified by ``norm_function``) for each point in the neighborhood to a
special aggregation function capable of performing weighted aggregation
based on these distances.
Parameters
----------
weighted_function : Callable[[List[Dict[str, float]], float]
A special aggregation function that takes in a list of points
represented as dicts with the following structure::
{
'value': float,
'dist': float
}
where 'value' is the interaction value at that point and 'dist' is its
scalar distance from the neighborhood. This function should then return
a float representing the aggregate value of the neighborhood, weighted
using the distances.
norm_function : Callable[[Tuple[int]], float]
A function that takes in a tuple of ints representing the x- and y-axis
distances of a point to the neighborhood and returns a scalar value
representing the distance.
weighted_kwargs : Optional[Dict[str, Any]]
Kwargs to be passed to ``weighted_function``.
norm_kwargs : Optional[Dict[str, Any]]
Kwargs to be passed to ``norm_function``.
pseudocount : float
A pseudocount to be added to the values before applying the aggregation
function. Useful if the aggregation function has catastrophic behavior
when one input value is zero.
check_function : Optional[Callable[[List[Dict[str, Any]], float], bool]]
A function that takes in a neighborhood and a threshold value and
performs some sort of test on the neighborhood, returning False if the
filter function should return NaN for the neighborhood because it fails
some critical condition.
check_threshold : float
The threshold to pass as the second arg to ``check_function``.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
# resolve function_kwargs and norm_kwargs
if weighted_kwargs is None:
weighted_kwargs = {}
if norm_kwargs is None:
norm_kwargs = {}
def filter_function(neighborhood):
# honor check API
if check_function is not None:
if not check_function(neighborhood, check_threshold):
return np.nan
# prepare value and distance for function
values_distances = [{'value': i['value'] + pseudocount,
'dist' : norm_function((i['x_dist'], i['y_dist']),
**norm_kwargs)}
for i in neighborhood
if np.isfinite(i['value'])]
# compute results based on function
if values_distances:
return weighted_function(values_distances, **weighted_kwargs)
else:
return np.nan
return filter_function
[docs]def weighted_values_distances_function(weighting_function,
aggregating_function,
weighting_kwargs=None,
aggregating_kwargs=None,
cache=True):
"""
Constructs a weighted aggregation function appropriate for use with
``norm_filter_function()``.
Parameters
----------
weighting_function : Callable[[float], float]
A function that takes in a distance and returns a weight.
aggregating_function : Callable[[Sequence[float], Sequence[float]], float]
A special aggregating function that takes in the values and the weights
as parallel vectors and returns the aggregated value.
weighting_kwargs : Optional[Dict[str, Any]]
Kwargs to be passed to ``weighting_function``.
aggregating_kwargs : Optional[Dict[str, Any]]
Kwargs to be passed to ``aggregating_function``.
cache : bool
Pass True to make the returned function use a cache to avoid recomputing
expensive weighting function calls.
Returns
-------
Callable[[List[Dict[str, float]], float]
A special aggregation function that takes in a list of points
represented as dicts with the following structure::
{
'value': float,
'dist': float
}
where 'value' is the interaction value at that point and 'dist' is its
scalar distance from the neighborhood. This function returns a float
representing the aggregate value of the neighborhood, weighted using the
distances.
"""
# resolve weighting_kwargs and aggregating_kwargs
if weighting_kwargs is None:
weighting_kwargs = {}
if aggregating_kwargs is None:
aggregating_kwargs = {}
# create cache
if cache:
cached_weights = {}
def weighted_function(values_distances):
if cache:
weights = []
for i in values_distances:
if i['dist'] in cached_weights:
weights.append(cached_weights[i['dist']])
else:
weight = weighting_function(i['dist'], **weighting_kwargs)
cached_weights[i['dist']] = weight
weights.append(weight)
weights = np.array(weights)
else:
weights = np.array([weighting_function(i['dist'],
**weighting_kwargs)
for i in values_distances])
values = np.array([i['value'] for i in values_distances])
return aggregating_function(values, weights, **aggregating_kwargs)
return weighted_function
[docs]def inverse_weighting_function(distance, bin_width=None):
"""
The "inverse" weighting function used in Yaffe and Tanay 2011.
Parameters
----------
distance : float
The distance to compute a weight for, in base pairs.
bin_width : Optional[int]
The bin width in base pairs. Used to make results equivalent to Yaffe
and Tanay 2011 by scaling ``distance`` to units of bins. Pass None to
simply leave the distance in units of base pairs
Returns
-------
float
A weight appropriate for this distance.
"""
if bin_width is not None:
return 1 / ((distance / float(bin_width)) + 1)
return float(1) / (distance + 1)
[docs]def amean_inverse(bin_width=4000, norm_ord=1, check_threshold=0.2):
"""
Constructs a filter function that uses the arithmetic mean with "inverse"
weights as the aggregating function and a p-norm as the norm function.
Parameters
----------
bin_width : int
The bin width in base pairs.
norm_ord : int
The order of the p-norm to use to convert (x-dist, y-dist) vectors to
scalar distances.
check_threshold : float
If less than this fraction of the values in a neighborhood are
positive, the filter function will return NaN.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
return norm_filter_function(
weighted_function=weighted_values_distances_function(
weighting_function=inverse_weighting_function,
aggregating_function=weighted_amean,
weighting_kwargs={'bin_width': bin_width}
),
norm_function=np.linalg.norm,
norm_kwargs={'ord': norm_ord},
check_function=check_neighboorhood_positive,
check_threshold=check_threshold
)
[docs]def gmean_inverse(bin_width=4000, norm_ord=1, check_threshold=0.2):
"""
Constructs a filter function that uses the geometric mean with "inverse"
weights as the aggregating function and a p-norm as the norm function.
Parameters
----------
bin_width : int
The bin width in base pairs.
norm_ord : int
The order of the p-norm to use to convert (x-dist, y-dist) vectors to
scalar distances.
check_threshold : float
If less than this fraction of the values in a neighborhood are
positive, the filter function will return NaN.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
return norm_filter_function(
weighted_function=weighted_values_distances_function(
weighting_function=inverse_weighting_function,
aggregating_function=weighted_gmean,
weighting_kwargs={'bin_width': bin_width}
),
norm_function=np.linalg.norm,
norm_kwargs={'ord': norm_ord},
pseudocount=1,
check_function=check_neighboorhood_positive,
check_threshold=check_threshold
)
[docs]def weighted_amean(values, weights):
"""
Weighted version of the arithmetic mean.
Parameters
----------
values : Sequence[float]
The values to aggregate.
weights : Sequence[float]
The weights for each value.
Returns
-------
float
The weighted arithmetic mean of the values given the weights.
"""
normed_weights = weights / weights.sum()
return np.dot(values, normed_weights)
[docs]def weighted_gmean(values, weights):
"""
Weighted version of the geometric mean.
Parameters
----------
values : Sequence[float]
The values to aggregate.
weights : Sequence[float]
The weights for each value.
Returns
-------
float
The weighted geometric mean of the values given the weights.
"""
return np.exp(np.dot(np.log(values), weights) / np.sum(weights))
[docs]def amean_gaussian(sigma=1000.0, norm_ord=1, check_threshold=0.2):
"""
Constructs a filter function that uses the arithmetic mean with Gaussian
weights as the aggregating function and a p-norm as the norm function.
Parameters
----------
sigma : float
The standard deviation to use for the Gaussian when assigning weights.
norm_ord : int
The order of the p-norm to use to convert (x-dist, y-dist) vectors to
scalar distances.
check_threshold : float
If less than this fraction of the values in a neighborhood are
positive, the filter function will return NaN.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
return norm_filter_function(
weighted_function=weighted_values_distances_function(
weighting_function=norm.pdf,
aggregating_function=weighted_amean,
weighting_kwargs={'scale': sigma}
),
norm_function=np.linalg.norm,
norm_kwargs={'ord': norm_ord},
check_function=check_neighboorhood_positive,
check_threshold=check_threshold
)
[docs]def gmean_gaussian(sigma=1000.0, norm_ord=1, check_threshold=0.2):
"""
Constructs a filter function that uses the geometric mean with Gaussian
weights as the aggregating function and a p-norm as the norm function.
Parameters
----------
sigma : float
The standard deviation to use for the Gaussian when assigning weights.
norm_ord : int
The order of the p-norm to use to convert (x-dist, y-dist) vectors to
scalar distances.
check_threshold : float
If less than this fraction of the values in a neighborhood are
positive, the filter function will return NaN.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
return norm_filter_function(
weighted_function=weighted_values_distances_function(
weighting_function=norm.pdf,
aggregating_function=weighted_gmean,
weighting_kwargs={'scale': sigma}
),
norm_function=np.linalg.norm,
norm_kwargs={'ord': norm_ord},
pseudocount=1,
check_function=check_neighboorhood_positive,
check_threshold=check_threshold
)
[docs]def geometric_mean(check_threshold=0.2):
"""
Constructs a filter function that uses the unweighted geometric mean as the
aggregating function.
Parameters
----------
check_threshold : float
If less than this fraction of the values in a neighborhood are
positive, the filter function will return NaN.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
return value_filter_function(
function=gmean,
pseudocount=1,
check_function=check_neighboorhood_positive,
check_threshold=check_threshold
)
[docs]def arithmetic_mean(check_threshold=0.2):
"""
Constructs a filter function that uses the unweighted arithmetic mean as the
aggregating function.
Parameters
----------
check_threshold : float
If less than this fraction of the values in a neighborhood are
positive, the filter function will return NaN.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
return value_filter_function(
function=np.mean,
check_function=check_neighboorhood_positive,
check_threshold=check_threshold
)
[docs]def simple_sum(check_threshold=0.2):
"""
Constructs a filter function that uses a simple sum as the aggregating
function.
Parameters
----------
check_threshold : float
If less than this fraction of the values in a neighborhood are
positive, the filter function will return NaN.
Returns
-------
Callable[[List[Dict[str, Any]]], float]
The constructed filter function. This function takes in a "neighborhood"
and returns the filtered value given that neighborhood. A neighborhood
is represented as a list of "nearby points" where each nearby point is
represented as a dict of the following form::
{
'value': float,
'x_dist': int,
'y_dist': int
}
where 'value' is the value at the point and 'x_dist' and 'y_dist' are
its distances from the center of the neighborhood along the x- and
y-axis, respectively, in base pairs.
"""
return value_filter_function(
function=np.sum,
check_function=check_neighboorhood_positive,
check_threshold=check_threshold
)