"""
Module containing utility functions for parametrizing statistical distributions.
"""
import numpy as np
import scipy.stats as stats
from lib5c.util.parallelization import parallelize_regions
[docs]def log_parameters(m, v):
"""
Attempts to guess appropriate log-scale mean and variance parameters given
non-log scale estimators of these quantities, under the assumptions of a
lognormal model.
Based on https://en.wikipedia.org/wiki/Log-normal_distribution#Notation
Parameters
----------
m : float
The non-log scale mean.
v : float
The non-log scale variance.
Returns
-------
float, float
The first float is the log-scale mean, the second is the log-scale
variance.
Notes
-----
This function is array-safe.
Examples
--------
>>> mu = 2 # mean of a normal random variable X
>>> sigma_2 = 16 # variance of X
>>> scale = np.exp(mu) # parameter conversion for scipy.stats.lognorm
>>> s = np.sqrt(sigma_2) # ditto
>>> y = stats.lognorm(s=s, scale=scale) # a lognormal RV: exp(X) = Y
>>> m, v = y.stats(moments='mv') # mean and variance of Y
>>> m, v
(array(22026.4657948...), array(4.31123106e+15))
>>> log_parameters(m, v) # recover moments of X from moments of Y
(2.0, 16.0)
"""
return np.log(m / np.sqrt(1 + v/m**2)), np.log(1 + v/m**2)
[docs]def convert_parameters(mu, sigma_2, dist_gen, log=False):
"""
Obtain correct scipy.stats parameterizations for selected one- and two-
parameter distributions given a desired mean and variance.
Parameters
----------
mu : float
The mean of the desired distribution.
sigma_2 : float
The variance of the desired distribution.
dist_gen : scipy.stats.rv_generic
The target distribution.
log : bool
Pass True to attempt to convert exp and var to log-scale.
Returns
-------
tuple of float
The appropriate scipy.stats parameters.
"""
mu = np.array(mu) if hasattr(mu, '__len__') else np.array([mu])
sigma_2 = np.array(sigma_2) if hasattr(sigma_2, '__len__')\
else np.array([sigma_2])
if log:
mu, sigma_2 = log_parameters(mu, sigma_2)
if dist_gen.name == 'norm':
return mu, np.sqrt(sigma_2)
elif dist_gen.name == 'logistic':
return mu, np.sqrt(3 * sigma_2) / np.pi
elif dist_gen.name == 'nbinom':
# overwrite sigma_2's that are below mu (force to Poisson)
idx = sigma_2 < mu + 0.001
sigma_2[idx] = mu[idx] + 0.001
# compute n and p
return mu**2 / (sigma_2 - mu), 1 - (sigma_2 - mu) / sigma_2
elif dist_gen.name == 'poisson':
return (mu,)
else:
raise NotImplementedError(
'distribution %s not supported' % dist_gen.name)
[docs]def freeze_distribution(dist_gen, mu, sigma_2, log=False):
"""
Create a frozen distribution of a given type, given a mean and variance.
Parameters
----------
dist_gen : scipy.stats.rv_generic
The distribution to use.
mu : float
The desired mean.
sigma_2 : float
The desired variance.
log : bool
Pass True to attempt to convert mu and sigma_2 to log-scale.
Returns
-------
scipy.stats.rv_frozen
A frozen distribution of the specified type with specified mean and
variance.
Notes
-----
This function does not perform any fitting, because it assumes that the
first two moments directly and uniquely identify the desired distribution.
Examples
--------
>>> frozen_dist = freeze_distribution(stats.poisson, 4.0, 4.0)
>>> print('%s distribution with mean %.2f and variance %.2f'
... % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
poisson distribution with mean 4.00 and variance 4.00
>>> frozen_dist = freeze_distribution(stats.nbinom, 4.0, 3.0)
>>> print('%s distribution with mean %.2f and variance %.2f'
... % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
nbinom distribution with mean 4.00 and variance 4.00
>>> frozen_dist = freeze_distribution(stats.nbinom, 3.0, 4.0)
>>> print('%s distribution with mean %.2f and variance %.2f'
... % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
nbinom distribution with mean 3.00 and variance 4.00
>>> frozen_dist = freeze_distribution(stats.norm, 4.0, 3.0)
>>> print('%s distribution with mean %.2f and variance %.2f'
... % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
norm distribution with mean 4.00 and variance 3.00
>>> frozen_dist = freeze_distribution(stats.logistic, 4.0, 3.0)
>>> print('%s distribution with mean %.2f and variance %.2f'
... % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
logistic distribution with mean 4.00 and variance 3.00
>>> mu = 2 # mean of a normal random variable X
>>> sigma_2 = 16 # variance of X
>>> scale = np.exp(mu) # parameter conversion for scipy.stats.lognorm
>>> s = np.sqrt(sigma_2) # ditto
>>> y = stats.lognorm(s=s, scale=scale) # a lognormal RV: exp(X) = Y
>>> m, v = y.stats(moments='mv') # mean and variance of Y
>>> m, v
(array(22026.4657948...), array(4.31123106e+15))
>>> frozen_dist = freeze_distribution(stats.logistic, m, v, log=True)
>>> print('%s distribution with mean %.2f and variance %.2f'
... % ((frozen_dist.dist.name,) + frozen_dist.stats(moments='mv')))
logistic distribution with mean 2.00 and variance 16.00
"""
return dist_gen(*convert_parameters(mu, sigma_2, dist_gen, log=log))
[docs]@parallelize_regions
def call_pvalues(obs, exp, var, dist_gen, log=False):
"""
Call right-tail p-values for obs against a theoretical distribution whose
family is specified by dist_gen an whose first two moments are specified by
exp and var, respectively.
Parameters
----------
obs : float
The observed value to call a p-value for.
exp, var : float
The first two moments of the null distribution.
dist_gen : scipy.stats.rv_generic or str
The null distribution to parameterize. If a string is passed this will
be replaced with `getattr(scipy.stats, dist_gen)`.
log : bool
Pass True to attempt to convert exp and var to log-scale.
Returns
-------
float
The right-tail p-value.
Notes
-----
This function is array-safe.
"""
if type(dist_gen) == str:
dist_gen = getattr(stats, dist_gen)
params = convert_parameters(exp, var, dist_gen, log=log)
if hasattr(dist_gen, 'pmf'):
return dist_gen.sf(obs, *params) + dist_gen.pmf(obs, *params)
return dist_gen.sf(obs, *params)