Source code for lib5c.algorithms.filtering.unsmoothable_columns

"""
Module for identifying "unsmoothable columns" - sets of bins that don't contain
any non-zero fragments and are too wide to smooth over.
"""

import itertools

import numpy as np

from lib5c.algorithms.filtering.fragment_bin_filtering import (
    find_upstream_primers, find_nearby_fragments)
from lib5c.algorithms.trimming import wipe_counts
from lib5c.util.primers import guess_bin_step
from lib5c.util.parallelization import parallelize_regions


[docs]def unsmoothable_column_threshold_heuristic(window_width, bin_step):
    """
    This function defines the heuristic that determines how long a run of
    fragment-less bins must be before it is considered "unsmoothable".

    Parameters
    ----------
    window_width : int
        The width of the filtering window in base pairs.
    bin_step : int
        The "sampling rate" or "bin step".

    Returns
    -------
    int
        The maximum length of a run of fragment-less bins must be before it is
        considered "unsmoothable".
    """
    return ((window_width / 2) / bin_step) + 1


[docs]def find_unsmoothable_columns(regional_primermap, regional_pixelmap,
                              window_width, upstream_primer_mapping=None,
                              midpoint=False):
    """
    Identifies the unsmoothable columns in a region assuming that the smoothing
    was a filtering operation applied on fragment-level data.

    Parameters
    ----------
    regional_primermap : List[Dict[str, Any]]
        The primermap describing the primers for this region.
    regional_pixelmap : List[Dict[str, Any]]
        The pixelmap describing the bins for this region.
    window_width : int
        The width of the filtering window in base pairs.
    upstream_primer_mapping : Dict[int, int]
        A mapping from each bin index to the index of its nearest upstream
        primer. See ``lib5c.algorithms.filtering.fragment_bin_filtering
        .find_upstream_primers()``.
    midpoint : bool
        Pass True to restore legacy behavior when distances to fragments were
        based on their midpoints. The new behavior (with this kwarg set to
        False) is to compute distances to fragments based on their closest
        endpoint.

    Returns
    -------
    List[bool]
        A list of boolean values with length equal to the number of bins in the
        region. The ``i`` th element of this list is True if the ``i`` th bin in
        the region is an "unsmoothable column".
    """
    bin_step = guess_bin_step(regional_pixelmap)
    threshold = unsmoothable_column_threshold_heuristic(window_width, bin_step)

    if upstream_primer_mapping is None:
        upstream_primer_mapping = find_upstream_primers(regional_pixelmap,
                                                        regional_primermap)

    # a list of bool which is True if the ith "small bin" is empty
    blank_columns = [not bool(len(find_nearby_fragments(i,
                                                        regional_pixelmap,
                                                        regional_primermap,
                                                        upstream_primer_mapping,
                                                        bin_step / 2,
                                                        midpoint=midpoint)))
                     for i in range(len(regional_pixelmap))]

    # figure out which empty columns should be smoothed over
    unsmoothable_columns = []
    # group blank and nonblank columns
    for blank, run in itertools.groupby(blank_columns):
        # only write Trues if this is a group of more than threshold consecutive
        # blank columns
        run_as_list = list(run)
        if blank and len(run_as_list) > threshold:
            unsmoothable_columns.extend([True] * len(run_as_list))
        else:
            unsmoothable_columns.extend([False] * len(run_as_list))

    return unsmoothable_columns


[docs]@parallelize_regions
def wipe_unsmoothable_columns(binned_counts, primermap, pixelmap, window_width,
                              midpoint=False):
    """
    Convenience function for wiping the unsmoothable columns in a binned counts
    matrix assuming that the smoothing was a filtering operation applied on
    fragment-level data.

    Parameters
    ----------
    binned_counts : np.ndarray
        The matrix of binned counts to wipe unsmoothable columns from.
    primermap : List[Dict[str, Any]]
        The primermap describing the primers for this region.
    pixelmap : List[Dict[str, Any]]
        The pixelmap describing the bins for this region.
    window_width : int
        The width of the filtering window in base pairs.
    midpoint : bool
        Pass True to restore legacy behavior when distances to fragments were
        based on their midpoints. The new behavior (with this kwarg set to
        False) is to compute distances to fragments based on their closest
        endpoint.

    Returns
    -------
    np.ndarray
        The wiped matrix of binned counts.
    """
    unsmoothable_columns = find_unsmoothable_columns(primermap, pixelmap,
                                                     window_width,
                                                     midpoint=midpoint)
    return wipe_counts(binned_counts, np.where(unsmoothable_columns))


[docs]def find_prebinned_unsmoothable_columns(regional_counts, regional_pixelmap,
                                        window_width):
    """
    Identifies the unsmoothable columns in a region assuming that the smoothing
    was a filtering operation applied on data that was already bin-level.

    Parameters
    ----------
    regional_counts : np.ndarray
        The matrix of counts for this region.
    regional_pixelmap : List[Dict[str, Any]]
        The pixelmap describing the bins for this region.
    window_width : int
        The width of the filtering window in base pairs.

    Returns
    -------
    List[bool]
        A list of boolean values with length equal to the number of bins in the
        region. The ``i`` th element of this list is True if the ``i`` th bin in
        the region is an "unsmoothable column".
    """
    bin_step = guess_bin_step(regional_pixelmap)
    threshold = unsmoothable_column_threshold_heuristic(window_width, bin_step)

    # a list of bool which is True if the ith "small bin" is empty
    blank_columns = [not np.any(np.isfinite(regional_counts[:, i]))
                     for i in range(len(regional_counts))]

    # figure out which empty columns should be smoothed over
    unsmoothable_columns = []
    # group blank and nonblank columns
    for blank, run in itertools.groupby(blank_columns):
        # only write Trues if this is a group of more than threshold consecutive
        # blank columns
        run_as_list = list(run)
        if blank and len(run_as_list) > threshold:
            unsmoothable_columns.extend([True] * len(run_as_list))
        else:
            unsmoothable_columns.extend([False] * len(run_as_list))

    return unsmoothable_columns


[docs]@parallelize_regions
def wipe_prebinned_unsmoothable_columns(smoothed_counts, prebinned_counts,
                                        pixelmap, window_width):
    """
    Convenience function for wiping the unsmoothable columns in a binned counts
    matrix assuming that the smoothing was a filtering operation applied on
    bin-level data.

    Parameters
    ----------
    smoothed_counts : np.ndarray
        The matrix of smoothed counts to wipe unsmoothable columns from.
    prebinned_counts : np.ndarray
        The original binned counts matrix to use to identify zero-count columns.
    pixelmap : List[Dict[str, Any]]
        The pixelmap describing the bins for this region.
    window_width : int
        The width of the filtering window in base pairs.

    Returns
    -------
    np.ndarray
        The wiped matrix of binned counts.
    """
    unsmoothable_columns = find_prebinned_unsmoothable_columns(
        prebinned_counts, pixelmap, window_width)
    return wipe_counts(smoothed_counts, np.where(unsmoothable_columns))