Source code for lib5c.util.distributions

"""
Module containing utility functions for parametrizing statistical distributions.
"""

import numpy as np
import scipy.stats as stats

from lib5c.util.parallelization import parallelize_regions


[docs]def log_parameters(m, v):
    """
    Attempts to guess appropriate log-scale mean and variance parameters given
    non-log scale estimators of these quantities, under the assumptions of a
    lognormal model.

    Based on https://en.wikipedia.org/wiki/Log-normal_distribution#Notation

    Parameters
    ----------
    m : float
        The non-log scale mean.
    v : float
        The non-log scale variance.

    Returns
    -------
    float, float
        The first float is the log-scale mean, the second is the log-scale
        variance.

    Notes
    -----
    This function is array-safe.

    Examples
    --------
    >>> mu = 2  # mean of a normal random variable X
    >>> sigma_2 = 16  # variance of X
    >>> scale = np.exp(mu)  # parameter conversion for scipy.stats.lognorm
    >>> s = np.sqrt(sigma_2)  # ditto
    >>> y = stats.lognorm(s=s, scale=scale)  # a lognormal RV: exp(X) = Y
    >>> m, v = y.stats(moments='mv')  # mean and variance of Y
    >>> m, v
    (array(22026.4657948...), array(4.31123106e+15))
    >>> log_parameters(m, v)  # recover moments of X from moments of Y
    (2.0, 16.0)
    """
    return np.log(m / np.sqrt(1 + v/m**2)), np.log(1 + v/m**2)


[docs]def convert_parameters(mu, sigma_2, dist_gen, log=False):
    """
    Obtain correct scipy.stats parameterizations for selected one- and two-
    parameter distributions given a desired mean and variance.

    Parameters
    ----------
    mu : float
        The mean of the desired distribution.
    sigma_2 : float
        The variance of the desired distribution.
    dist_gen : scipy.stats.rv_generic
        The target distribution.
    log : bool
        Pass True to attempt to convert exp and var to log-scale.

    Returns
    -------
    tuple of float
        The appropriate scipy.stats parameters.
    """
    mu = np.array(mu) if hasattr(mu, '__len__') else np.array([mu])
    sigma_2 = np.array(sigma_2) if hasattr(sigma_2, '__len__')\
        else np.array([sigma_2])
    if log:
        mu, sigma_2 = log_parameters(mu, sigma_2)
    if dist_gen.name == 'norm':
        return mu, np.sqrt(sigma_2)
    elif dist_gen.name == 'logistic':
        return mu, np.sqrt(3 * sigma_2) / np.pi
    elif dist_gen.name == 'nbinom':
        # overwrite sigma_2's that are below mu (force to Poisson)
        idx = sigma_2 < mu + 0.001
        sigma_2[idx] = mu[idx] + 0.001
        # compute n and p
        return mu**2 / (sigma_2 - mu), 1 - (sigma_2 - mu) / sigma_2
    elif dist_gen.name == 'poisson':
        return (mu,)
    else:
        raise NotImplementedError(
            'distribution %s not supported' % dist_gen.name)


[docs]def freeze_distribution(dist_gen, mu, sigma_2, log=False):
    """
    Create a frozen distribution of a given type, given a mean and variance.

    Parameters
    ----------
    dist_gen : scipy.stats.rv_generic
        The distribution to use.
    mu : float
        The desired mean.
    sigma_2 : float
        The desired variance.
    log : bool
        Pass True to attempt to convert mu and sigma_2 to log-scale.

    Returns
    -------
    scipy.stats.rv_frozen
        A frozen distribution of the specified type with specified mean and
        variance.

    Notes
    -----
    This function does not perform any fitting, because it assumes that the
    first two moments directly and uniquely identify the desired distribution.

    Examples
    --------
    >>> frozen_dist = freeze_distribution(stats.poisson, 4.0, 4.0)
    >>> print('%s distribution with mean %.2f and variance %.2f'
    ...       % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
    poisson distribution with mean 4.00 and variance 4.00

    >>> frozen_dist = freeze_distribution(stats.nbinom, 4.0, 3.0)
    >>> print('%s distribution with mean %.2f and variance %.2f'
    ...       % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
    nbinom distribution with mean 4.00 and variance 4.00

    >>> frozen_dist = freeze_distribution(stats.nbinom, 3.0, 4.0)
    >>> print('%s distribution with mean %.2f and variance %.2f'
    ...       % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
    nbinom distribution with mean 3.00 and variance 4.00

    >>> frozen_dist = freeze_distribution(stats.norm, 4.0, 3.0)
    >>> print('%s distribution with mean %.2f and variance %.2f'
    ...       % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
    norm distribution with mean 4.00 and variance 3.00

    >>> frozen_dist = freeze_distribution(stats.logistic, 4.0, 3.0)
    >>> print('%s distribution with mean %.2f and variance %.2f'
    ...       % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
    logistic distribution with mean 4.00 and variance 3.00
    >>> mu = 2  # mean of a normal random variable X
    >>> sigma_2 = 16  # variance of X
    >>> scale = np.exp(mu)  # parameter conversion for scipy.stats.lognorm
    >>> s = np.sqrt(sigma_2)  # ditto
    >>> y = stats.lognorm(s=s, scale=scale)  # a lognormal RV: exp(X) = Y
    >>> m, v = y.stats(moments='mv')  # mean and variance of Y
    >>> m, v
    (array(22026.4657948...), array(4.31123106e+15))
    >>> frozen_dist = freeze_distribution(stats.logistic, m, v, log=True)
    >>> print('%s distribution with mean %.2f and variance %.2f'
    ...       % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
    logistic distribution with mean 2.00 and variance 16.00
    """
    return dist_gen(*convert_parameters(mu, sigma_2, dist_gen, log=log))


[docs]@parallelize_regions
def call_pvalues(obs, exp, var, dist_gen, log=False):
    """
    Call right-tail p-values for obs against a theoretical distribution whose
    family is specified by dist_gen an whose first two moments are specified by
    exp and var, respectively.

    Parameters
    ----------
    obs : float
        The observed value to call a p-value for.
    exp, var : float
        The first two moments of the null distribution.
    dist_gen : scipy.stats.rv_generic or str
        The null distribution to parameterize. If a string is passed this will
        be replaced with `getattr(scipy.stats, dist_gen)`.
    log : bool
        Pass True to attempt to convert exp and var to log-scale.

    Returns
    -------
    float
        The right-tail p-value.

    Notes
    -----
    This function is array-safe.
    """
    if type(dist_gen) == str:
        dist_gen = getattr(stats, dist_gen)
    params = convert_parameters(exp, var, dist_gen, log=log)
    if hasattr(dist_gen, 'pmf'):
        return dist_gen.sf(obs, *params) + dist_gen.pmf(obs, *params)
    return dist_gen.sf(obs, *params)