Source code for lib5c.algorithms.clustering.valley

"""
Module for splitting clusters using a valley heuristic.
"""

from copy import deepcopy

from lib5c.algorithms.clustering.util import belongs_to_which
from lib5c.algorithms.clustering.knn import get_knn, classify_peak
from lib5c.algorithms.clustering.adjacency import make_clusters as \
    make_clusters_by_adjacency


[docs]def split_cluster(parent_cluster, guides, reassign=1.0):
    """
    Splits a cluster using the valley heuristic.

    Parameters
    ----------
    parent_cluster : cluster
        The cluster to split.
    guides : list of clusters
        The guides used to seed the child clusters. See
        ``lib5c.algorithms.clustering.valley.split_clusters()``.
    reassign : float between 0.0 and 1.0
        When a cluster is split, the peaks in the parent cluster with pvalues
        above this threshold may be discarded instead of being assigned to one
        or the other of the child clusters. When the value of this kwarg is 1.0,
        no peaks are discarded and all peaks are reassigned to one or the other
        of the child clusters. When the value of this kwarg is 0.0, all peaks
        are discarded and no peaks are reassigned to the child clusters.

    Returns
    -------
    list of clusters
        The child clusters of the parent cluster.

    Notes
    -----
    See ``lib5c.algorithms.clustering.valley.split_clusters()``.
    """

    # the child clusters
    child_clusters = [[] for guide in guides]

    # a list of unassigned_peaks
    unassigned_peaks = deepcopy(parent_cluster)

    # filter the unassigned_peaks based on the reassign threshold
    unassigned_peaks = [x for x in unassigned_peaks if x['pvalue'] < reassign]

    # sort the peaks
    unassigned_peaks.sort(key=lambda x: x['value'], reverse=True)

    prev_length = len(unassigned_peaks)
    iteration_at_length = 0
    while unassigned_peaks and iteration_at_length <= prev_length:
        if len(unassigned_peaks) == prev_length:
            iteration_at_length += 1
        else:
            prev_length = len(unassigned_peaks)
            iteration_at_length = 1
        # check to see if this peak is identical to a guide peak
        target = belongs_to_which(unassigned_peaks[0], guides)
        if target > -1:
            child_clusters[target].append(unassigned_peaks.pop(0))
        else:
            # find this peak's neighbors
            neighbors = get_knn(unassigned_peaks[0], [peak
                                                      for guide in guides
                                                      for peak in guide], 5)
            target = classify_peak(unassigned_peaks[0], neighbors,
                                   child_clusters, weighted=True,
                                   automove=False)
            if target > -1:
                # classification worked, move the peak
                child_clusters[target].append(unassigned_peaks.pop(0))
            else:
                # classification failed, send it back to the end of the line
                unassigned_peaks.append(unassigned_peaks.pop(0))

    return child_clusters


[docs]def split_clusters(clusters, reassign=1.0, size_threshold=3):
    """
    Splits all clusters in a list of clusters using a recursive valley-splitting
    heuristic.

    Parameters
    ---------
    clusters : list of clusters
        The list of clusters to split.
    reassign : float between 0.0 and 1.0
        When a cluster is split, the peaks in the parent cluster with pvalues
        above this threshold may be discarded instead of being assigned to one
        or the other of the child clusters. When the value of this kwarg is 1.0,
        no peaks are discarded and all peaks are reassigned to one or the other
        of the child clusters. When the value of this kwarg is 0.0, all peaks
        are discarded and no peaks are reassigned to the child clusters.
    size_threshold : int
        The minimum size of child clusters that will be created by the
        splitting. If a splitting operation would result in clusters smaller
        than this number, that splitting operation will not be performed.

    Returns
    -------
    list of clusters
        The split clusters.

    Notes
    -----
    A cluster will get split if there exists a p-value such that thresholding
    the cluster at that p-value results in the creation of at least two
    contiguous groups of high-confidence peaks (p-value below the threshold)
    larger than the ``size_threshold`` separated by a "valley" of low-confidence
    peaks (p-value above the threshold). This rule is applied recursively until
    only unsplittable, or "atomic", clusters remain.
    """
    step = 0.005
    atomic_clusters = []
    while clusters:
        for i in range(len(clusters)):
            if len(clusters[i]) >= 2*size_threshold:
                max_pvalue = max([peak['pvalue'] for peak in clusters[i]])
                current_pvalue = max_pvalue
                while True:
                    # tighten threshold
                    current_pvalue -= step

                    # filter peaks
                    filtered_peaks = [
                        x for x in clusters[i]
                        if x['pvalue'] < current_pvalue
                    ]

                    if filtered_peaks:
                        # cluster by adjacency
                        guides = make_clusters_by_adjacency(filtered_peaks)

                        # remove things that are too small
                        guides = [x for x in guides if len(x) > size_threshold]

                        if len(guides) > 1:
                            new_clusters = None
                            if reassign > 0.0:
                                new_clusters = split_cluster(clusters[i],
                                                             guides, reassign)
                            else:
                                new_clusters = guides
                            clusters.extend(new_clusters)
                            clusters.pop(i)
                            # break condition 1:
                            # successful split
                            break
                    else:
                        # the threshold is too tight; give up
                        atomic_clusters.append(clusters.pop(i))
                        # break condition 2:
                        # no threshold splits the cluster
                        break
                # breaks 1 and 2 feed into this one
                break
            else:
                atomic_clusters.append(clusters.pop(i))
                # break condition 3:
                # cluster is too small to even attempt split
                break

    return atomic_clusters