Source code for lib5c.plotters.expected

"""
Module for visualization of one-dimensional distance-dependent expected models.

Two convenience functions are exposed:

* ``plot_bin_expected()``
* ``plot_fragment_expected()``

which are bin- and fragment-level wrappers around ``plot_log_log_expected()``.
The other functions are utility functions.

These functions are all overloaded so that an arg called ``distance_expected``
can be replaced with a dict of named distance expecteds. This will result in an
overlayed comparison of all the expected models in the dict.

The other functions in this module are private helper functions.
"""

from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np

from lib5c.util.parallelization import parallelize_regions
from lib5c.util.plotting import plotter


[docs]@parallelize_regions
def plot_bin_expected(obs_matrix, distance_expected, hexbin=False, kde=False,
                      color='r', semilog=False, linewidth=4,
                      title='1-D expected model', ylabel='log counts',
                      xlabel='log distance', legend=True, **kwargs):
    """
    Convenience function for plotting a visualization of a one-dimensional
    distance-dependent expected model defined over bin-level data.

    Parameters
    ----------
    obs_matrix : np.ndarray
        The matrix of real interaction data that the model will be compared to.
    distance_expected : List[float]
        The one-dimensional distance-dependence model. The ``i`` th element of
        the list should correspond to the expected value for interactions
        between loci separated by ``i`` bins. To compare multiple expected
        models to the same observed data, pass a dict or an OrderedDict whose
        keys are string names for the models.
    hexbin : bool
        Pass True to use a hexbin plot to represent the density of the real
        data.
    kde : bool
        Pass True to use a kernel density estimate to represent the density of
        the real data.
    color : str
        The color to draw the expected model line with. When comparing multiple
        models, this can be a dict or OrderedDict with the same keys as
        ``distance_expected``.
    semilog : bool
        Pass True to leave the distance axis unlogged.
    linewidth : float
        Line width to draw the model with.
    kwargs : kwargs
        Typical plotter kwargs.

    Returns
    -------
    pyplot axis
        The axis plotted on.
    """
    real_data = np.asarray(
        [[i - j, obs_matrix[i, j]]
         for i in range(len(obs_matrix))
         for j in range(i + 1)
         if np.isfinite(obs_matrix[i, j])]
    )
    exp_distances, exp_values = _bin_distance_expected_to_arrays(
        distance_expected)
    return plot_log_log_expected(
        real_data[:, 0],
        real_data[:, 1],
        exp_distances,
        exp_values,
        hexbin=hexbin, kde=kde, color=color, semilog=semilog,
        linewidth=linewidth, title=title, ylabel=ylabel, xlabel=xlabel,
        legend=legend, **kwargs)


[docs]@parallelize_regions
def plot_fragment_expected(obs_matrix, distance_expected, distance_matrix,
                           hexbin=False, kde=False, color='r', semilog=False,
                           linewidth=4, title='1-D expected model',
                           ylabel='log counts', xlabel='log distance',
                           legend=True, **kwargs):
    """
    Convenience function for plotting a visualization of a one-dimensional
    distance-dependent expected model defined over fragment-level data.

    Parameters
    ----------
    obs_matrix : np.ndarray
        The matrix of real interaction data that the model will be compared to.
    distance_expected : Dict[int, float]
        A mapping from interaction distances in units of base pairs to the
        expected value at that distance. To compare multiple expected models to
        the same observed data, pass a dict or an OrderedDict whose keys are
        string names for the models.
    distance_matrix : np.ndarray
        The pairwise distance matrix for the fragments in this region.
    hexbin : bool
        Pass True to use a hexbin plot to represent the density of the real
        data.
    kde : bool
        Pass True to use a kernel density estimate to represent the density of
        the real data.
    color : str
        The color to draw the expected model line with. When comparing multiple
        models, this can be a dict or OrderedDict with the same keys as
        ``distance_expected``.
    semilog : bool
        Pass True to leave the distance axis unlogged.
    linewidth : float
        Line width to draw the model with.
    kwargs : kwargs
        Typical plotter kwargs.

    Returns
    -------
    pyplot axis
        The axis plotted on.
    """
    real_data = np.asarray(
        [[distance_matrix[i, j], obs_matrix[i, j]]
         for i in range(len(obs_matrix))
         for j in range(i + 1)
         if np.isfinite(obs_matrix[i, j])]
    )
    exp_distances, exp_values = _fragment_distance_expected_to_arrays(
        distance_expected)
    return plot_log_log_expected(
        real_data[:, 0],
        real_data[:, 1],
        exp_distances,
        exp_values,
        hexbin=hexbin, kde=kde, color=color, semilog=semilog,
        linewidth=linewidth, title=title, ylabel=ylabel, xlabel=xlabel,
        legend=legend, **kwargs)


def _bin_distance_expected_to_arrays(distance_expected):
    if isinstance(distance_expected, dict):
        exp_distances = np.arange(
            len(distance_expected[list(distance_expected.keys())[0]]))
        exp_values = OrderedDict([(model, np.array(distance_expected[model]))
                                  for model in distance_expected])
    else:
        exp_distances = np.arange(len(distance_expected))
        exp_values = np.array(distance_expected)
    return exp_distances, exp_values


def _fragment_distance_expected_to_arrays(distance_expected):
    if isinstance(distance_expected[list(distance_expected.keys())[0]], dict):
        exp_distances = np.array(
            list(distance_expected[list(distance_expected.keys())[0]].keys()))
        exp_values = OrderedDict([
            (
                model,
                np.array(list(
                    distance_expected[model].values()))[exp_distances.argsort()]
            )
            for model in distance_expected
        ])
    else:
        exp_distances = np.array(list(distance_expected.keys()))
        exp_values = np.array(
            list(distance_expected.values()))[exp_distances.argsort()]
    exp_distances.sort()
    return exp_distances, exp_values


[docs]@plotter
def plot_log_log_expected(obs_distances, obs_values, exp_distances, exp_values,
                          hexbin=False, kde=False, color='r', pseudocount=1,
                          semilog=False, linewidth=4,
                          title='1-D expected model', ylabel='log counts',
                          xlabel='log distance', legend=True, **kwargs):
    """
    Plot a visualization of an expected model over real data.

    Parameters
    ----------
    obs_distances : np.ndarray
        Flat array of the distances of the ``obs_values``.
    obs_values : np.ndarray
        Flat array of the real data values.
    exp_distances : np.ndarray
        Flat array of the distances of the ``exp_values``.
    exp_values : np.ndarray
        Flat array of the expected data values predicted by the model. To
        compare multiple expected models to the same observed data, pass a dict
        or an OrderedDict whose keys are string names for the models.
    title : str
        Title to write on the plot.
    ylabel : str
        Label for the y-axis on the plot.
    xlabel : str
        Label for the x-axis on the plot.
    hexbin : bool
        Pass True to use a hexbin plot to represent the density of the real
        data.
    kde : bool
        Pass True to use a kernel density estimate to represent the density of
        the real data.
    color : str
        The color to draw the expected model line with. When comparing multiple
        models, this can be a dict or OrderedDict with the same keys as
        ``distance_expected``.
    pseudocount : int
        Pseudocount to add to distances if called with ``semilog=True``.
    semilog : bool
        Pass True to leave the distance axis unlogged.
    linewidth : float
        Line width to draw the model with.
    kwargs : kwargs
        Typical plotter kwargs.

    Returns
    -------
    pyplot axis
        The axis plotted on.
    """
    if not semilog:
        obs_distances = np.log(obs_distances + pseudocount)
        exp_distances = np.log(exp_distances + pseudocount)
    obs_values = np.log(obs_values + pseudocount)
    if isinstance(exp_values, dict):
        exp_values = OrderedDict([(model,
                                   np.log(exp_values[model] + pseudocount))
                                  for model in exp_values])
    else:
        exp_values = np.log(exp_values + pseudocount)
    if kde:
        import seaborn as sns
        sns.kdeplot(obs_distances, obs_values, cmap='Blues', shade=True,
                    shade_lowest=False, n_levels=30)
    elif hexbin:
        from seaborn.distributions import _freedman_diaconis_bins
        x_bins = _freedman_diaconis_bins(obs_distances)
        y_bins = _freedman_diaconis_bins(obs_values)
        gridsize = int(np.mean([x_bins, y_bins]))
        plt.hexbin(obs_distances, obs_values, cmap='Blues', bins='log',
                   gridsize=gridsize, label='real data')
        xlim = plt.xlim()
        ylim = plt.ylim()
    else:
        plt.scatter(obs_distances, obs_values, label='real data')
    if isinstance(exp_values, dict):
        if isinstance(color, dict):
            for model in exp_values:
                plt.plot(exp_distances[exp_values[model] >= 0],
                         exp_values[model][exp_values[model] >= 0],
                         c=color[model],
                         lw=linewidth,
                         label=model)
        else:
            ax = plt.gca()
            ax.set_prop_cycle(
                'color',
                plt.get_cmap('Set1')(np.linspace(0, 1,
                                                 max(len(exp_values), 8))))
            for model in exp_values:
                plt.plot(exp_distances[exp_values[model] >= 0],
                         exp_values[model][exp_values[model] >= 0],
                         lw=linewidth,
                         label=model)
    else:
        plt.plot(exp_distances, exp_values, c=color, lw=linewidth,
                 label='model')
    if hexbin:
        plt.xlim(xlim)
        plt.ylim(ylim)

    # unbelievable hack to dodge a strange ValueError on legend creation
    handles, _ = plt.gca().get_legend_handles_labels()
    if hasattr(handles[-1], '_original_edgecolor'):
        handles[-1]._original_edgecolor = None