Source code for lib5c.algorithms.filtering.filter_functions

"""
Module providing utilities for defining and constructing filter functions.
"""

import numpy as np
from scipy.stats import gmean, norm


[docs]def make_filter_function(function='gmean', threshold=0.0, norm_order=1, bin_width=4000, sigma=12000.0, inverse=False, gaussian=False): """ Convenience function for quickly constructing filtering functions with desired properties. Parameters ---------- function : {'sum', 'median', 'amean', 'gmean'} The aggregation function to use. This is the operation that will be applied to all points in the neighborhood, after weighting their values if appropriate. threshold : float If less than this fraction of the values in a neighborhood are non-infinite, the filter function will return nan for that neighborhood. norm_order : int The order of p-norm to use when computing distances. bin_width : int The width of each bin in base pairs. This value is used to scale certain weights. sigma : float The value to use for the standard deviation of the Gaussian when using Gaussian weights. inverse : bool Pass True to use "inverse" weights as in Yaffe and Tanay 2011. gaussian : bool Pass True to use Gaussian weights with standard deviation ``sigma``. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ if function == 'sum': filter_function = simple_sum(threshold) elif function == 'median': filter_function = median(threshold) elif function == 'amean': if gaussian: filter_function = amean_gaussian(sigma=sigma, norm_ord=norm_order, check_threshold=threshold) elif inverse: filter_function = amean_inverse( bin_width=bin_width, norm_ord=norm_order, check_threshold=threshold) else: filter_function = arithmetic_mean(threshold) elif function == 'gmean': if gaussian: filter_function = gmean_gaussian(sigma=sigma, norm_ord=norm_order, check_threshold=threshold) elif inverse: filter_function = gmean_inverse( bin_width=bin_width, norm_ord=norm_order, check_threshold=threshold) else: filter_function = geometric_mean(threshold) else: raise ValueError('function %s not implemented' % function) return filter_function
[docs]def check_neighboorhood_nonnan(neighborhood, threshold): """ Check to see if a neighborhood clears as specified non-nan fraction threshold. Parameters ---------- neighborhood : List[Dict[str, Any]] A list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. threshold : float If less than this fraction of the values in the neighborhood are non-infinite, the neighborhood fails the check. Returns ------- bool True if this neighborhood clears the threshold, otherwise False. """ num_nan = len([1 for i in neighborhood if ~np.isfinite(i['value'])]) total = len(neighborhood) if total == 0 or float(num_nan) / total <= threshold: return False return True
[docs]def check_neighboorhood_positive(neighborhood, threshold): """ Check to see if a neighborhood clears as specified positive fraction threshold. Parameters ---------- neighborhood : List[Dict[str, Any]] A list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. threshold : float If less than this fraction of the values in the neighborhood are positive, the neighborhood fails the check. Returns ------- bool True if this neighborhood clears the threshold, otherwise False. """ num_positive = len([1 for i in neighborhood if i['value'] > 0]) total = len(neighborhood) if total == 0 or float(num_positive) / total <= threshold: return False return True
[docs]def value_filter_function(function, function_kwargs=None, pseudocount=0, check_function=None, check_threshold=None): """ Constructs a filter function that passes the values in the neighborhood to an aggregation function. Parameters ---------- function : Callable[Sequence[float], float] The aggregation function to use on the values in each neighborhood. function_kwargs : Optional[Dict[str, Any]] Kwargs to be passed to ``function``. pseudocount : float A pseudocount to be added to the values before applying the aggregation function. Useful if the aggregation function has catastrophic behavior when one input value is zero. check_function : Optional[Callable[[List[Dict[str, Any]], float], bool]] A function that takes in a neighborhood and a threshold value and performs some sort of test on the neighborhood, returning False if the filter function should return NaN for the neighborhood because it fails some critical condition. check_threshold : float The threshold to pass as the second arg to ``check_function``. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ # resolve function_kwargs if function_kwargs is None: function_kwargs = {} def filter_function(neighborhood): # honor check API if check_function is not None: if not check_function(neighborhood, check_threshold): return np.nan # extract values while adding a pseudocount values = [i['value'] + pseudocount for i in neighborhood if np.isfinite(i['value'])] # compute result based on function if values: return function(values, **function_kwargs) else: return np.nan return filter_function
[docs]def norm_filter_function(weighted_function, norm_function, weighted_kwargs=None, norm_kwargs=None, pseudocount=0, check_function=None, check_threshold=None): """ Constructs a filter function that passes the value and some distance norm (as specified by ``norm_function``) for each point in the neighborhood to a special aggregation function capable of performing weighted aggregation based on these distances. Parameters ---------- weighted_function : Callable[[List[Dict[str, float]], float] A special aggregation function that takes in a list of points represented as dicts with the following structure:: { 'value': float, 'dist': float } where 'value' is the interaction value at that point and 'dist' is its scalar distance from the neighborhood. This function should then return a float representing the aggregate value of the neighborhood, weighted using the distances. norm_function : Callable[[Tuple[int]], float] A function that takes in a tuple of ints representing the x- and y-axis distances of a point to the neighborhood and returns a scalar value representing the distance. weighted_kwargs : Optional[Dict[str, Any]] Kwargs to be passed to ``weighted_function``. norm_kwargs : Optional[Dict[str, Any]] Kwargs to be passed to ``norm_function``. pseudocount : float A pseudocount to be added to the values before applying the aggregation function. Useful if the aggregation function has catastrophic behavior when one input value is zero. check_function : Optional[Callable[[List[Dict[str, Any]], float], bool]] A function that takes in a neighborhood and a threshold value and performs some sort of test on the neighborhood, returning False if the filter function should return NaN for the neighborhood because it fails some critical condition. check_threshold : float The threshold to pass as the second arg to ``check_function``. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ # resolve function_kwargs and norm_kwargs if weighted_kwargs is None: weighted_kwargs = {} if norm_kwargs is None: norm_kwargs = {} def filter_function(neighborhood): # honor check API if check_function is not None: if not check_function(neighborhood, check_threshold): return np.nan # prepare value and distance for function values_distances = [{'value': i['value'] + pseudocount, 'dist' : norm_function((i['x_dist'], i['y_dist']), **norm_kwargs)} for i in neighborhood if np.isfinite(i['value'])] # compute results based on function if values_distances: return weighted_function(values_distances, **weighted_kwargs) else: return np.nan return filter_function
[docs]def weighted_values_distances_function(weighting_function, aggregating_function, weighting_kwargs=None, aggregating_kwargs=None, cache=True): """ Constructs a weighted aggregation function appropriate for use with ``norm_filter_function()``. Parameters ---------- weighting_function : Callable[[float], float] A function that takes in a distance and returns a weight. aggregating_function : Callable[[Sequence[float], Sequence[float]], float] A special aggregating function that takes in the values and the weights as parallel vectors and returns the aggregated value. weighting_kwargs : Optional[Dict[str, Any]] Kwargs to be passed to ``weighting_function``. aggregating_kwargs : Optional[Dict[str, Any]] Kwargs to be passed to ``aggregating_function``. cache : bool Pass True to make the returned function use a cache to avoid recomputing expensive weighting function calls. Returns ------- Callable[[List[Dict[str, float]], float] A special aggregation function that takes in a list of points represented as dicts with the following structure:: { 'value': float, 'dist': float } where 'value' is the interaction value at that point and 'dist' is its scalar distance from the neighborhood. This function returns a float representing the aggregate value of the neighborhood, weighted using the distances. """ # resolve weighting_kwargs and aggregating_kwargs if weighting_kwargs is None: weighting_kwargs = {} if aggregating_kwargs is None: aggregating_kwargs = {} # create cache if cache: cached_weights = {} def weighted_function(values_distances): if cache: weights = [] for i in values_distances: if i['dist'] in cached_weights: weights.append(cached_weights[i['dist']]) else: weight = weighting_function(i['dist'], **weighting_kwargs) cached_weights[i['dist']] = weight weights.append(weight) weights = np.array(weights) else: weights = np.array([weighting_function(i['dist'], **weighting_kwargs) for i in values_distances]) values = np.array([i['value'] for i in values_distances]) return aggregating_function(values, weights, **aggregating_kwargs) return weighted_function
[docs]def inverse_weighting_function(distance, bin_width=None): """ The "inverse" weighting function used in Yaffe and Tanay 2011. Parameters ---------- distance : float The distance to compute a weight for, in base pairs. bin_width : Optional[int] The bin width in base pairs. Used to make results equivalent to Yaffe and Tanay 2011 by scaling ``distance`` to units of bins. Pass None to simply leave the distance in units of base pairs Returns ------- float A weight appropriate for this distance. """ if bin_width is not None: return 1 / ((distance / float(bin_width)) + 1) return float(1) / (distance + 1)
[docs]def amean_inverse(bin_width=4000, norm_ord=1, check_threshold=0.2): """ Constructs a filter function that uses the arithmetic mean with "inverse" weights as the aggregating function and a p-norm as the norm function. Parameters ---------- bin_width : int The bin width in base pairs. norm_ord : int The order of the p-norm to use to convert (x-dist, y-dist) vectors to scalar distances. check_threshold : float If less than this fraction of the values in a neighborhood are positive, the filter function will return NaN. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ return norm_filter_function( weighted_function=weighted_values_distances_function( weighting_function=inverse_weighting_function, aggregating_function=weighted_amean, weighting_kwargs={'bin_width': bin_width} ), norm_function=np.linalg.norm, norm_kwargs={'ord': norm_ord}, check_function=check_neighboorhood_positive, check_threshold=check_threshold )
[docs]def gmean_inverse(bin_width=4000, norm_ord=1, check_threshold=0.2): """ Constructs a filter function that uses the geometric mean with "inverse" weights as the aggregating function and a p-norm as the norm function. Parameters ---------- bin_width : int The bin width in base pairs. norm_ord : int The order of the p-norm to use to convert (x-dist, y-dist) vectors to scalar distances. check_threshold : float If less than this fraction of the values in a neighborhood are positive, the filter function will return NaN. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ return norm_filter_function( weighted_function=weighted_values_distances_function( weighting_function=inverse_weighting_function, aggregating_function=weighted_gmean, weighting_kwargs={'bin_width': bin_width} ), norm_function=np.linalg.norm, norm_kwargs={'ord': norm_ord}, pseudocount=1, check_function=check_neighboorhood_positive, check_threshold=check_threshold )
[docs]def weighted_amean(values, weights): """ Weighted version of the arithmetic mean. Parameters ---------- values : Sequence[float] The values to aggregate. weights : Sequence[float] The weights for each value. Returns ------- float The weighted arithmetic mean of the values given the weights. """ normed_weights = weights / weights.sum() return np.dot(values, normed_weights)
[docs]def weighted_gmean(values, weights): """ Weighted version of the geometric mean. Parameters ---------- values : Sequence[float] The values to aggregate. weights : Sequence[float] The weights for each value. Returns ------- float The weighted geometric mean of the values given the weights. """ return np.exp(np.dot(np.log(values), weights) / np.sum(weights))
[docs]def amean_gaussian(sigma=1000.0, norm_ord=1, check_threshold=0.2): """ Constructs a filter function that uses the arithmetic mean with Gaussian weights as the aggregating function and a p-norm as the norm function. Parameters ---------- sigma : float The standard deviation to use for the Gaussian when assigning weights. norm_ord : int The order of the p-norm to use to convert (x-dist, y-dist) vectors to scalar distances. check_threshold : float If less than this fraction of the values in a neighborhood are positive, the filter function will return NaN. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ return norm_filter_function( weighted_function=weighted_values_distances_function( weighting_function=norm.pdf, aggregating_function=weighted_amean, weighting_kwargs={'scale': sigma} ), norm_function=np.linalg.norm, norm_kwargs={'ord': norm_ord}, check_function=check_neighboorhood_positive, check_threshold=check_threshold )
[docs]def gmean_gaussian(sigma=1000.0, norm_ord=1, check_threshold=0.2): """ Constructs a filter function that uses the geometric mean with Gaussian weights as the aggregating function and a p-norm as the norm function. Parameters ---------- sigma : float The standard deviation to use for the Gaussian when assigning weights. norm_ord : int The order of the p-norm to use to convert (x-dist, y-dist) vectors to scalar distances. check_threshold : float If less than this fraction of the values in a neighborhood are positive, the filter function will return NaN. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ return norm_filter_function( weighted_function=weighted_values_distances_function( weighting_function=norm.pdf, aggregating_function=weighted_gmean, weighting_kwargs={'scale': sigma} ), norm_function=np.linalg.norm, norm_kwargs={'ord': norm_ord}, pseudocount=1, check_function=check_neighboorhood_positive, check_threshold=check_threshold )
[docs]def geometric_mean(check_threshold=0.2): """ Constructs a filter function that uses the unweighted geometric mean as the aggregating function. Parameters ---------- check_threshold : float If less than this fraction of the values in a neighborhood are positive, the filter function will return NaN. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ return value_filter_function( function=gmean, pseudocount=1, check_function=check_neighboorhood_positive, check_threshold=check_threshold )
[docs]def arithmetic_mean(check_threshold=0.2): """ Constructs a filter function that uses the unweighted arithmetic mean as the aggregating function. Parameters ---------- check_threshold : float If less than this fraction of the values in a neighborhood are positive, the filter function will return NaN. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ return value_filter_function( function=np.mean, check_function=check_neighboorhood_positive, check_threshold=check_threshold )
[docs]def simple_sum(check_threshold=0.2): """ Constructs a filter function that uses a simple sum as the aggregating function. Parameters ---------- check_threshold : float If less than this fraction of the values in a neighborhood are positive, the filter function will return NaN. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ return value_filter_function( function=np.sum, check_function=check_neighboorhood_positive, check_threshold=check_threshold )
[docs]def median(check_threshold=0.2): """ Constructs a filter function that uses the median as the aggregating function. Parameters ---------- check_threshold : float If less than this fraction of the values in a neighborhood are positive, the filter function will return NaN. Returns ------- Callable[[List[Dict[str, Any]]], float] The constructed filter function. This function takes in a "neighborhood" and returns the filtered value given that neighborhood. A neighborhood is represented as a list of "nearby points" where each nearby point is represented as a dict of the following form:: { 'value': float, 'x_dist': int, 'y_dist': int } where 'value' is the value at the point and 'x_dist' and 'y_dist' are its distances from the center of the neighborhood along the x- and y-axis, respectively, in base pairs. """ return value_filter_function( function=np.median, check_function=check_neighboorhood_positive, check_threshold=check_threshold )