# Source code for lib5c.algorithms.enrichment

"""
Module for computing enrichments of annotations within categories of categorized
loops.
"""

from __future__ import division

from __future__ import absolute_import
import numpy as np
from scipy.ndimage import generic_filter
import scipy.stats as stats

from lib5c.util.lru_cache import lru_cache


@lru_cache(maxsize=None)
def process_annotations(annotation_label, region, annotationmaps, threshold=0,
                        margin=1):
    """
    Builds the per-bin presence vector for one annotation over one region.

    Intended to be called from inside the vectorized enrichment functions of
    this module, all of which take annotationmaps in the standard layout.

    Parameters
    ----------
    annotation_label : str
        Name of the annotation to extract. Must be a key of
        ``annotationmaps``.
    region : str
        Name of the region to extract. Must be a key of
        ``annotationmaps[annotation_label]``.
    annotationmaps : dict of annotationmap
        Annotation hit counts, keyed first by annotation name and then by
        region name::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        so that ``annotationmaps['annotation_a']['region_r']`` is a list of
        ints giving how many ``'annotation_a'`` features fall in each bin of
        ``'region_r'``.
    threshold : int
        A bin is marked as containing the annotation only when the hit counts
        summed over its window are strictly greater than ``threshold``.
    margin : int
        Half-width of the window, in bins. The window of a bin spans the bin
        itself plus ``margin`` bins on either side, which gives the
        intersection a "margin for error". Positions past either end of the
        region contribute zero hits.

    Returns
    -------
    np.ndarray
        One entry per bin of the selected region: 1 where the bin's window sum
        exceeds ``threshold`` and 0 elsewhere.

    Examples
    --------
    >>> annotationmaps = {'a': {'r1': [0, 0, 2, 1]}}
    >>> process_annotations('a', 'r1', annotationmaps)
    array([0, 1, 1, 1])
    >>> process_annotations('a', 'r1', annotationmaps, threshold=1, margin=0)
    array([0, 0, 1, 0])
    """
    hit_counts = annotationmaps[annotation_label][region]
    window_size = 1 + 2 * margin

    def _window_is_hit(window):
        # the filter hands us the counts inside one window at a time
        return window.sum() > threshold

    return generic_filter(hit_counts, _window_is_hit, size=window_size,
                          mode='constant')


@lru_cache(maxsize=None)
def count_intersections(annotation_a, annotation_b, region, category,
                        annotationmaps, looping_classes, threshold=0, margin=1,
                        asymmetric=False):
    """
    Counts the loops of one category within one region whose two ends are
    covered by a given pair of annotations.

    Parameters
    ----------
    annotation_a : str
        Annotation expected at one end of the loop. Must be a key of
        ``annotationmaps``.
    annotation_b : str
        Annotation expected at the other end of the loop. Must be a key of
        ``annotationmaps``.
    region : str
        Region within which intersections are counted.
    category : str
        Loop category whose loops are considered.
    annotationmaps : dict of annotationmap
        Annotation hit counts, keyed first by annotation name and then by
        region name::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        so that ``annotationmaps['annotation_a']['region_r']`` is a list of
        ints giving how many ``'annotation_a'`` features fall in each bin of
        ``'region_r'``.
    looping_classes : dict of np.ndarray with str dtype
        Maps region names to square, symmetric arrays sized like the region,
        holding the loop category name at every categorized position.
    threshold : int
        Passed through to ``process_annotations()``: a bin contains an
        annotation only when its window is hit strictly more than
        ``threshold`` times.
    margin : int
        Passed through to ``process_annotations()``: number of neighboring
        bins on each side that may contribute hits to a bin, acting as a
        "margin for error" in the intersection precision.
    asymmetric : bool
        Pass True to count only loops where A is upstream of B. Pass False to
        count loops joining A and B in either orientation.

    Returns
    -------
    int
        The total number of intersections.

    Examples
    --------
    >>> import numpy as np
    >>> clear_enrichment_caches()
    >>> annotationmaps = {'a': {'r1': [0, 0, 2, 1]},
    ...                   'b': {'r1': [1, 1, 0, 0]}}
    >>> looping_classes = {'r1': np.array([[''   , ''   , 'es' , 'ips'],
    ...                                    [''   , ''   , 'npc', 'npc'],
    ...                                    ['es' , 'npc', ''   , ''   ],
    ...                                    ['ips', 'npc', ''   , ''   ]],
    ...                                   dtype='U25')}
    >>> count_intersections('a', 'b', 'r1', 'es', annotationmaps,
    ...                     looping_classes, margin=0)
    1
    >>> count_intersections('a', 'b', 'r1', 'npc', annotationmaps,
    ...                     looping_classes, margin=0)
    2
    >>> count_intersections('a', 'b', 'r1', 'npc', annotationmaps,
    ...                     looping_classes, margin=0)
    2
    >>> count_intersections.cache_info()
    CacheInfo(hits=1, misses=2, maxsize=None, currsize=2)
    >>> count_intersections('a', 'b', 'r1', 'es', annotationmaps,
    ...                     looping_classes, margin=0, asymmetric=True)
    0
    >>> count_intersections('b', 'a', 'r1', 'es', annotationmaps,
    ...                     looping_classes, margin=0, asymmetric=True)
    1
    """
    coverage_a = process_annotations(annotation_a, region, annotationmaps,
                                     threshold=threshold, margin=margin)
    coverage_b = process_annotations(annotation_b, region, annotationmaps,
                                     threshold=threshold, margin=margin)

    # pair_mask[i, j] is True when bin i carries A and bin j carries B
    pair_mask = np.outer(coverage_a, coverage_b) > 0

    # keep only the lower triangle (diagonal included) of the category matrix
    category_loops = np.tril(looping_classes[region] == category)

    if asymmetric:
        # in the lower triangle the column index never exceeds the row index,
        # so the transpose selects pairs with A at the smaller bin index
        selector = pair_mask.T
    else:
        selector = pair_mask | pair_mask.T
    return category_loops[selector].sum()


@lru_cache(maxsize=None)
def count_intersections_all(annotation_a, annotation_b, category,
                            annotationmaps, looping_classes, threshold=0,
                            margin=1, asymmetric=False):
    """
    Totals, over every region, the loops of one category whose two ends are
    covered by ``annotation_a`` and ``annotation_b``.

    Parameters
    ----------
    annotation_a : str
        Annotation expected at one end of the loop.
    annotation_b : str
        Annotation expected at the other end of the loop.
    category : str
        Loop category whose loops are considered.
    annotationmaps : dict of annotationmap
        Annotation hit counts, keyed first by annotation name and then by
        region name::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        so that ``annotationmaps['annotation_a']['region_r']`` is a list of
        ints giving how many ``'annotation_a'`` features fall in each bin of
        ``'region_r'``.
    looping_classes : dict of np.ndarray with str dtype
        Maps region names to square, symmetric arrays sized like the region,
        holding the loop category name at every categorized position. Its keys
        determine which regions are visited.
    threshold : int
        A bin contains an annotation only when its window is hit strictly more
        than ``threshold`` times.
    margin : int
        Number of neighboring bins on each side that may contribute hits to a
        bin, acting as a "margin for error" in the intersection precision.
    asymmetric : bool
        Pass True to count only loops where A is upstream of B. Pass False to
        count loops joining A and B in either orientation.

    Returns
    -------
    int
        The total number of intersections across all regions.

    Examples
    --------
    >>> import numpy as np
    >>> annotationmaps = {'a': {'r1': [0, 0, 2], 'r2': [1, 0]},
    ...                   'b': {'r1': [1, 1, 0], 'r2': [0, 1]}}
    >>> looping_classes = {'r1': np.array([['npc', ''   , 'es' ],
    ...                                    [''   , ''   , 'npc'],
    ...                                    ['es' , 'npc', ''   ]],
    ...                                   dtype='U25'),
    ...                    'r2': np.array([[''   , 'es' ],
    ...                                    ['es' , ''   ]],
    ...                                   dtype='U25')}
    >>> count_intersections_all('a', 'b', 'es', annotationmaps,
    ...                         looping_classes, margin=0)
    2
    >>> count_intersections_all('a', 'b', 'npc', annotationmaps,
    ...                         looping_classes, margin=0)
    1
    """
    # one per-region count for each key of looping_classes, added together
    return sum(
        count_intersections(
            annotation_a, annotation_b, region_name, category,
            annotationmaps, looping_classes, threshold=threshold,
            margin=margin, asymmetric=asymmetric)
        for region_name in looping_classes.keys())


@lru_cache(maxsize=None)
def get_annotation_percentage(annotation_a, annotation_b, region, category,
                              annotationmaps, looping_classes, threshold=0,
                              margin=1, asymmetric=False):
    """
    Computes the fraction of the loops of one category in one region that
    connect ``annotation_a`` with ``annotation_b``.

    Parameters
    ----------
    annotation_a : str
        Annotation expected at one end of the loop.
    annotation_b : str
        Annotation expected at the other end of the loop.
    region : str
        Region within which the fraction is computed.
    category : str
        Loop category whose loops are considered.
    annotationmaps : dict of annotationmap
        Annotation hit counts, keyed first by annotation name and then by
        region name::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        so that ``annotationmaps['annotation_a']['region_r']`` is a list of
        ints giving how many ``'annotation_a'`` features fall in each bin of
        ``'region_r'``.
    looping_classes : dict of np.ndarray with str dtype
        Maps region names to square, symmetric arrays sized like the region,
        holding the loop category name at every categorized position.
    threshold : int
        A bin contains an annotation only when its window is hit strictly more
        than ``threshold`` times.
    margin : int
        Number of neighboring bins on each side that may contribute hits to a
        bin, acting as a "margin for error" in the intersection precision.
    asymmetric : bool
        Pass True to count only loops where A is upstream of B. Pass False to
        count loops joining A and B in either orientation.

    Returns
    -------
    float
        Number of loops of the category joining the two annotations, divided
        by the number of lower-triangle positions of the region assigned to
        the category.

    Examples
    --------
    >>> import numpy as np
    >>> annotationmaps = {'a': {'r1': [0, 0, 0, 1]},
    ...                   'b': {'r1': [1, 1, 0, 0]}}
    >>> looping_classes = {'r1': np.array([[''   , ''   , 'es' , 'ips'],
    ...                                    [''   , ''   , 'npc', 'npc'],
    ...                                    ['es' , 'npc', ''   , ''   ],
    ...                                    ['ips', 'npc', ''   , ''   ]],
    ...                                   dtype='U25')}
    >>> get_annotation_percentage('a', 'b', 'r1', 'ips', annotationmaps,
    ...                           looping_classes, margin=0)
    1.0
    >>> get_annotation_percentage('a', 'b', 'r1', 'npc', annotationmaps,
    ...                           looping_classes, margin=0)
    0.5
    """
    annotated_loops = count_intersections(
        annotation_a, annotation_b, region, category, annotationmaps,
        looping_classes, threshold=threshold, margin=margin,
        asymmetric=asymmetric)
    # denominator uses the same lower-triangle convention as the numerator
    category_loops = np.tril(looping_classes[region] == category).sum()
    return annotated_loops / category_loops


@lru_cache(maxsize=None)
def get_annotation_percentage_all(annotation_a, annotation_b, category,
                                  annotationmaps, looping_classes, threshold=0,
                                  margin=1, asymmetric=False):
    """
    Computes the fraction of the loops of one category, pooled over all
    regions, that connect ``annotation_a`` with ``annotation_b``.

    Parameters
    ----------
    annotation_a : str
        Annotation expected at one end of the loop.
    annotation_b : str
        Annotation expected at the other end of the loop.
    category : str
        Loop category whose loops are considered.
    annotationmaps : dict of annotationmap
        Annotation hit counts, keyed first by annotation name and then by
        region name::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        so that ``annotationmaps['annotation_a']['region_r']`` is a list of
        ints giving how many ``'annotation_a'`` features fall in each bin of
        ``'region_r'``.
    looping_classes : dict of np.ndarray with str dtype
        Maps region names to square, symmetric arrays sized like the region,
        holding the loop category name at every categorized position. Its keys
        determine which regions are pooled.
    threshold : int
        A bin contains an annotation only when its window is hit strictly more
        than ``threshold`` times.
    margin : int
        Number of neighboring bins on each side that may contribute hits to a
        bin, acting as a "margin for error" in the intersection precision.
    asymmetric : bool
        Pass True to count only loops where A is upstream of B. Pass False to
        count loops joining A and B in either orientation.

    Returns
    -------
    float
        Number of loops of the category joining the two annotations across all
        regions, divided by the number of lower-triangle positions assigned to
        the category across all regions.

    Examples
    --------
    >>> import numpy as np
    >>> annotationmaps = {'a': {'r1': [0, 0, 2], 'r2': [1, 0]},
    ...                   'b': {'r1': [1, 1, 0], 'r2': [0, 1]}}
    >>> looping_classes = {'r1': np.array([['npc', ''   , 'es' ],
    ...                                    [''   , ''   , 'npc'],
    ...                                    ['es' , 'npc', ''   ]],
    ...                                   dtype='U25'),
    ...                    'r2': np.array([['npc', 'es' ],
    ...                                    ['es' , 'npc']],
    ...                                   dtype='U25')}
    >>> get_annotation_percentage_all('a', 'b', 'es', annotationmaps,
    ...                               looping_classes, margin=0)
    1.0
    >>> get_annotation_percentage_all('a', 'b', 'npc', annotationmaps,
    ...                               looping_classes, margin=0)
    0.25
    """
    # pooled count of lower-triangle positions assigned to the category
    category_loops = sum(
        np.tril(looping_classes[region_name] == category).sum()
        for region_name in looping_classes.keys())
    annotated_loops = count_intersections_all(
        annotation_a, annotation_b, category, annotationmaps,
        looping_classes, threshold=threshold, margin=margin,
        asymmetric=asymmetric)
    return annotated_loops / category_loops


@lru_cache(maxsize=None)
def get_fold_change(annotation_a, annotation_b, region, category,
                    annotationmaps, looping_classes, threshold=0, margin=1,
                    asymmetric=False):
    """
    Computes the fold enrichment of the percentage of loops of a particular
    category in a particular region connecting specified annotations relative
    to the special "background" reference category.

    Parameters
    ----------
    annotation_a : str
        Annotation to look for on one side of the loop.
    annotation_b : str
        Annotation to look for on the other side of the loop.
    region : str
        The region to compute the fold enrichment within.
    category : str
        The category of loops to consider.
    annotationmaps : dict of annotationmap
        A dict describing the annotations. In total, it should have the
        following structure::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        where ``annotationmaps['annotation_a']['region_r']`` should be a list of
        ints describing the number of ``'annotation_a'``s present in each bin of
        ``'region_r'``.
    looping_classes : dict of np.ndarray with str dtype
        The keys should be region names as strings, the values should be square,
        symmetric arrays of the same size and shape as the indicated region,
        with string loop category names in the positions of categorized loops.
        Positions belonging to the reference category must be labeled
        ``'background'``.
    threshold : int
        Bins are defined to contain an annotation if they are "hit" strictly
        more than ``threshold`` times by the annotation.
    margin : int
        A bin is defined to contain an annotation if any bin within ``margin``
        bins is "hit" by the annotation. Corresponds to a "margin for error" in
        the intersection precision.
    asymmetric : bool
        Pass True to only count situations when A is upstream of B. Pass False
        to count intersections regardless of order.

    Returns
    -------
    float
        The fold enrichment: the annotation percentage of ``category`` divided
        by the annotation percentage of ``'background'``. When the background
        percentage is exactly zero the ratio is undefined and 0.0 is returned
        instead of dividing by zero.

    Examples
    --------
    >>> import numpy as np
    >>> annotationmaps = {'a': {'r1': [1, 0, 0, 1]},
    ...                   'b': {'r1': [1, 1, 0, 0]}}
    >>> looping_classes = {'r1': np.array([[''   , ''   , 'es' , 'ips'],
    ...                                    [''   , ''   , 'npc', 'npc'],
    ...                                    ['es' , 'npc', ''   , ''   ],
    ...                                    ['ips', 'npc', ''   , ''   ]],
    ...                                   dtype='U25')}
    >>> looping_classes['r1'][looping_classes['r1'] == ''] = 'background'
    >>> get_fold_change('a', 'b', 'r1', 'ips', annotationmaps, looping_classes,
    ...                 margin=0)
    3.0
    >>> get_fold_change('a', 'b', 'r1', 'npc', annotationmaps, looping_classes,
    ...                 margin=0)
    1.5
    """
    # evaluate the reference category first so a zero background can be
    # handled before any division takes place
    denominator = get_annotation_percentage(
        annotation_a, annotation_b, region, 'background', annotationmaps,
        looping_classes, threshold=threshold, margin=margin,
        asymmetric=asymmetric)
    if denominator == 0:
        # no background loop joins the two annotations: report no enrichment
        # rather than producing inf/nan and a divide-by-zero warning
        return 0.0
    numerator = get_annotation_percentage(
        annotation_a, annotation_b, region, category, annotationmaps,
        looping_classes, threshold=threshold, margin=margin,
        asymmetric=asymmetric)
    return numerator / denominator


@lru_cache(maxsize=None)
def get_fold_change_all(annotation_a, annotation_b, category, annotationmaps,
                        looping_classes, threshold=0, margin=1,
                        asymmetric=False):
    """
    Computes, pooled over all regions, how enriched the loops of one category
    are for connections between two annotations, relative to the special
    "background" reference category.

    Parameters
    ----------
    annotation_a : str
        Annotation expected at one end of the loop.
    annotation_b : str
        Annotation expected at the other end of the loop.
    category : str
        Loop category whose loops are considered.
    annotationmaps : dict of annotationmap
        Annotation hit counts, keyed first by annotation name and then by
        region name::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        so that ``annotationmaps['annotation_a']['region_r']`` is a list of
        ints giving how many ``'annotation_a'`` features fall in each bin of
        ``'region_r'``.
    looping_classes : dict of np.ndarray with str dtype
        Maps region names to square, symmetric arrays sized like the region,
        holding the loop category name at every categorized position.
        Reference positions are labeled ``'background'``.
    threshold : int
        A bin contains an annotation only when its window is hit strictly more
        than ``threshold`` times.
    margin : int
        Number of neighboring bins on each side that may contribute hits to a
        bin, acting as a "margin for error" in the intersection precision.
    asymmetric : bool
        Pass True to count only loops where A is upstream of B. Pass False to
        count loops joining A and B in either orientation.

    Returns
    -------
    float
        The pooled annotation percentage of ``category`` divided by that of
        ``'background'``. When the background percentage equals zero, 0 is
        returned without performing the division.

    Examples
    --------
    >>> import numpy as np
    >>> annotationmaps = {'a': {'r1': [0, 1, 2], 'r2': [1, 0]},
    ...                   'b': {'r1': [1, 1, 0], 'r2': [0, 1]}}
    >>> looping_classes = {'r1': np.array([['npc', ''   , 'es' ],
    ...                                    [''   , ''   , 'npc'],
    ...                                    ['es' , 'npc', ''   ]],
    ...                                   dtype='U25'),
    ...                    'r2': np.array([['ips', 'es' ],
    ...                                    ['es' , ''   ]],
    ...                                   dtype='U25')}
    >>> looping_classes['r1'][looping_classes['r1'] == ''] = 'background'
    >>> looping_classes['r2'][looping_classes['r2'] == ''] = 'background'
    >>> get_fold_change_all('a', 'b', 'es', annotationmaps, looping_classes,
    ...                     margin=0)
    2.0
    >>> get_fold_change_all('a', 'b', 'npc', annotationmaps, looping_classes,
    ...                     margin=0)
    1.0
    >>> get_fold_change_all('a', 'b', 'ips', annotationmaps, looping_classes,
    ...                     margin=0)
    0.0
    """
    background_fraction = get_annotation_percentage_all(
        annotation_a, annotation_b, 'background', annotationmaps,
        looping_classes, threshold=threshold, margin=margin,
        asymmetric=asymmetric)

    # a zero background fraction would make the ratio undefined
    if background_fraction == 0:
        return 0

    category_fraction = get_annotation_percentage_all(
        annotation_a, annotation_b, category, annotationmaps,
        looping_classes, threshold=threshold, margin=margin,
        asymmetric=asymmetric)
    return category_fraction / background_fraction


@lru_cache(maxsize=None)
def get_fisher_exact_pvalue(annotation_a, annotation_b, region, category,
                            annotationmaps, looping_classes, threshold=0,
                            margin=1, asymmetric=False):
    """
    Applies Fisher's exact test, within one region, to the null hypothesis
    that loops of the selected category overlap the selected annotations just
    as often as loops of the special "background" reference category do.

    Both one-sided tests ('less' and 'greater') are evaluated and the smaller
    of the two p-values is reported.

    Parameters
    ----------
    annotation_a : str
        Annotation expected at one end of the loop.
    annotation_b : str
        Annotation expected at the other end of the loop.
    region : str
        Region within which the p-value is computed.
    category : str
        Loop category whose loops are considered.
    annotationmaps : dict of annotationmap
        Annotation hit counts, keyed first by annotation name and then by
        region name::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        so that ``annotationmaps['annotation_a']['region_r']`` is a list of
        ints giving how many ``'annotation_a'`` features fall in each bin of
        ``'region_r'``.
    looping_classes : dict of np.ndarray with str dtype
        Maps region names to square, symmetric arrays sized like the region,
        holding the loop category name at every categorized position.
        Reference positions are labeled ``'background'``.
    threshold : int
        A bin contains an annotation only when its window is hit strictly more
        than ``threshold`` times.
    margin : int
        Number of neighboring bins on each side that may contribute hits to a
        bin, acting as a "margin for error" in the intersection precision.
    asymmetric : bool
        Pass True to count only loops where A is upstream of B. Pass False to
        count loops joining A and B in either orientation.

    Returns
    -------
    float
        The p-value. When neither the selected category nor the background
        has any loop joining the annotations, 0.5 is returned without running
        the test.

    Examples
    --------
    >>> import numpy as np
    >>> annotationmaps = {'a': {'r1': [1, 0, 0, 1]},
    ...                   'b': {'r1': [1, 1, 0, 0]}}
    >>> looping_classes = {'r1': np.array([[''   , ''   , 'es' , 'ips'],
    ...                                    [''   , ''   , 'npc', 'npc'],
    ...                                    ['es' , 'npc', ''   , ''   ],
    ...                                    ['ips', 'npc', ''   , ''   ]],
    ...                                   dtype='U25')}
    >>> looping_classes['r1'][looping_classes['r1'] == ''] = 'background'
    >>> get_fisher_exact_pvalue('a', 'b', 'r1', 'ips', annotationmaps,
    ...                         looping_classes, margin=0)
    0.428571428571428...
    >>> get_fisher_exact_pvalue('a', 'b', 'r1', 'npc', annotationmaps,
    ...                         looping_classes, margin=0)
    0.642857142857142...
    """
    def _hit_and_miss(loop_category):
        # split the loops of one category into those joining the annotations
        # and those that do not
        total = np.tril(looping_classes[region] == loop_category).sum()
        hit = count_intersections(
            annotation_a, annotation_b, region, loop_category,
            annotationmaps, looping_classes, threshold=threshold,
            margin=margin, asymmetric=asymmetric)
        return hit, total - hit

    selected_hit, selected_miss = _hit_and_miss(category)
    reference_hit, reference_miss = _hit_and_miss('background')

    # with no hits on either side, skip the test and report 0.5
    if selected_hit == 0 and reference_hit == 0:
        return 0.5

    # columns: selected category, background; rows: hit, not hit
    contingency = [[selected_hit, reference_hit],
                   [selected_miss, reference_miss]]

    p_less = stats.fisher_exact(contingency, alternative='less')[1]
    p_greater = stats.fisher_exact(contingency, alternative='greater')[1]
    return min(p_less, p_greater)


@lru_cache(maxsize=None)
def get_fisher_exact_pvalue_all(annotation_a, annotation_b, category,
                                annotationmaps, looping_classes, threshold=0,
                                margin=1, asymmetric=False):
    """
    Test, via Fisher's exact test, whether loops of one category overlap a pair
    of annotations at a different rate than "background" loops do, pooling the
    loop counts over every region in ``looping_classes``.

    A 2x2 contingency table is built with the selected category and the special
    ``'background'`` category as columns and "hit"/"not hit" as rows. Both
    single-tailed tests (``'less'`` and ``'greater'``) are run on that table and
    the smaller of the two p-values is returned.

    Parameters
    ----------
    annotation_a : str
        Annotation to look for on one side of the loop.
    annotation_b : str
        Annotation to look for on the other side of the loop.
    category : str
        The category of loops to consider.
    annotationmaps : dict of annotationmap
        A dict describing the annotations. In total, it should have the
        following structure::

            {
                'annotation_a_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                 },
                'annotation_b_name': {
                    'region_1_name': list of int,
                    'region_2_name': list of int,
                    ...
                },
                ...
            }

        where ``annotationmaps['annotation_a']['region_r']`` should be a list of
        ints describing the number of ``'annotation_a'``s present in each bin of
        ``'region_r'``.
    looping_classes : dict of np.ndarray with str dtype
        The keys should be region names as strings, the values should be square,
        symmetric arrays of the same size and shape as the indicated region,
        with string loop category names in the positions of categorized loops.
        Only the lower triangle (diagonal included) of each array is counted
        when totalling the loops of a category.
    threshold : int
        Bins are defined to contain an annotation if they are "hit" strictly
        more than ``threshold`` times by the annotation. Forwarded to
        ``count_intersections_all()``.
    margin : int
        A bin is defined to contain an annotation if any bin within ``margin``
        bins is "hit" by the annotation. Corresponds to a "margin for error" in
        the intersection precision. Forwarded to ``count_intersections_all()``.
    asymmetric : bool
        Pass True to only count situations when A is upstream of B. Pass False
        to count intersections regardless of order. Forwarded to
        ``count_intersections_all()``.

    Returns
    -------
    float
        The smaller of the two single-tailed p-values, or 0.5 when neither the
        selected category nor the background has any hit at all.

    Examples
    --------
    >>> import numpy as np
    >>> annotationmaps = {'a': {'r1': [0, 1, 2], 'r2': [1, 0]},
    ...                   'b': {'r1': [1, 1, 0], 'r2': [0, 1]}}
    >>> looping_classes = {'r1': np.array([['npc', ''   , 'es' ],
    ...                                    [''   , ''   , 'npc'],
    ...                                    ['es' , 'npc', ''   ]],
    ...                                   dtype='U25'),
    ...                    'r2': np.array([['ips', 'es' ],
    ...                                    ['es' , ''   ]],
    ...                                   dtype='U25')}
    >>> looping_classes['r1'][looping_classes['r1'] == ''] = 'background'
    >>> looping_classes['r2'][looping_classes['r2'] == ''] = 'background'
    >>> round(get_fisher_exact_pvalue_all('a', 'b', 'es', annotationmaps,
    ...                                   looping_classes, margin=0), 14)
    0.4
    >>> round(get_fisher_exact_pvalue_all('a', 'b', 'npc', annotationmaps,
    ...                                   looping_classes, margin=0), 14)
    0.8
    >>> round(get_fisher_exact_pvalue_all('a', 'b', 'ips', annotationmaps,
    ...                                   looping_classes, margin=0), 14)
    0.6
    """
    # tally (hit, not hit) loop counts, first for the requested category and
    # then for the 'background' reference, summing over all regions
    tallies = []
    for loop_class in (category, 'background'):
        # only the lower triangle is counted, since the arrays are symmetric
        n_loops = sum(
            np.tril(looping_classes[region] == loop_class).sum()
            for region in looping_classes)
        n_hit = count_intersections_all(
            annotation_a, annotation_b, loop_class, annotationmaps,
            looping_classes, threshold=threshold, margin=margin,
            asymmetric=asymmetric)
        tallies.append((n_hit, n_loops - n_hit))
    (category_hit, category_miss), (bkgd_hit, bkgd_miss) = tallies

    # with no hits anywhere there is nothing to compare, so bail out early
    nothing_hit = category_hit == 0 and bkgd_hit == 0
    if nothing_hit:
        return 0.5

    # rows are hit / not hit, columns are category / background
    contingency = [[category_hit, bkgd_hit],
                   [category_miss, bkgd_miss]]

    # run both single-tailed tests and keep whichever p-value is smaller
    tail_pvalues = [stats.fisher_exact(contingency, alternative=tail)[1]
                    for tail in ('less', 'greater')]
    return min(tail_pvalues)


def clear_enrichment_caches():
    """
    Empty the cache of every cached enrichment function in this module.

    This function is deprecated. It used to be required in scripts whenever the
    content of ``annotationmaps`` or ``looping_classes`` changed; with the
    current cache implementation the caches no longer have to be cleared in
    that situation.

    Examples
    --------
    >>> import numpy as np
    >>> clear_enrichment_caches()
    >>> annotationmaps = {'a': {'r1': [0, 0, 2, 1]},
    ...                   'b': {'r1': [1, 1, 0, 0]}}
    >>> looping_classes = {'r1': np.array([[''   , ''   , 'es' , 'ips'],
    ...                                    [''   , ''   , 'npc', 'npc'],
    ...                                    ['es' , 'npc', ''   , ''   ],
    ...                                    ['ips', 'npc', ''   , ''   ]],
    ...                                   dtype='U25')}
    >>> count_intersections('a', 'b', 'r1', 'es', annotationmaps,
    ...                     looping_classes, margin=0)
    1
    >>> looping_classes = {'r1': np.array([[''   , ''   , 'ips', 'ips'],
    ...                                    [''   , ''   , 'npc', 'npc'],
    ...                                    ['ips', 'npc', ''   , ''   ],
    ...                                    ['ips', 'npc', ''   , ''   ]],
    ...                                   dtype='U25')}
    >>> count_intersections('a', 'b', 'r1', 'es', annotationmaps,
    ...                     looping_classes, margin=0)
    0
    >>> clear_enrichment_caches()
    >>> count_intersections('a', 'b', 'r1', 'es', annotationmaps,
    ...                     looping_classes, margin=0)
    0
    """
    # every cached function in this module, cleared in this order
    cached_functions = (
        process_annotations,
        count_intersections,
        count_intersections_all,
        get_annotation_percentage,
        get_annotation_percentage_all,
        get_fold_change,
        get_fold_change_all,
        get_fisher_exact_pvalue,
        get_fisher_exact_pvalue_all,
    )
    for cached_function in cached_functions:
        cached_function.cache_clear()