"""
Module providing utility functions related to sorting data.
"""
import numpy as np
[docs]def rankdata_plus(a, method='average'):
"""
Assign ranks to data, dealing with ties appropriately.
Slight modification of ``scipy.stats.rankdata()`` that returns the sorter in
addition to the ranks; this allows the sort information to be re-used
without having to sort again.
Ranks begin at 1. The `method` argument controls how ranks are assigned
to equal values. See [1]_ for further discussion of ranking methods.
Parameters
----------
a : array_like
The array of values to be ranked. The array is first flattened.
method : str, optional
The method used to assign ranks to tied elements.
The options are 'average', 'min', 'max', 'dense' and 'ordinal'.
'average':
The average of the ranks that would have been assigned to
all the tied values is assigned to each value.
'min':
The minimum of the ranks that would have been assigned to all
the tied values is assigned to each value. (This is also
referred to as "competition" ranking.)
'max':
The maximum of the ranks that would have been assigned to all
the tied values is assigned to each value.
'dense':
Like 'min', but the rank of the next highest element is assigned
the rank immediately after those assigned to the tied elements.
'ordinal':
All values are given a distinct rank, corresponding to the order
that the values occur in `a`.
The default is 'average'.
Returns
-------
ranks, sorter : ndarray, ndarray
Ranks is an array of length equal to the size of `a`, containing rank
scores. Sorter is the argsort result from the initial sorting.
References
----------
.. [1] "Ranking", http://en.wikipedia.org/wiki/Ranking
Examples
--------
>>> from lib5c.util.sorting import rankdata_plus
>>> np.array_equal(rankdata_plus([0, 2, 3, 2])[1], # the sorter
... np.array([0, 1, 3, 2]))
True
>>> np.array_equal(rankdata_plus([0, 2, 3, 2])[0], # the ranks
... np.array([1., 2.5, 4., 2.5]))
True
>>> np.array_equal(rankdata_plus([0, 2, 3, 2], 'min')[0], # the ranks
... np.array([1, 2, 4, 2]))
True
>>> np.array_equal(rankdata_plus([0, 2, 3, 2], 'max')[0], # the ranks
... np.array([1, 3, 4, 3]))
True
>>> np.array_equal(rankdata_plus([0, 2, 3, 2], 'dense')[0], # the ranks
... np.array([1, 2, 3, 2]))
True
>>> np.array_equal(rankdata_plus([0, 2, 3, 2], 'ordinal')[0], # the ranks
... np.array([1, 2, 4, 3]))
True
"""
if method not in ('average', 'min', 'max', 'dense', 'ordinal'):
raise ValueError('unknown method "{0}"'.format(method))
arr = np.ravel(np.asarray(a))
algo = 'mergesort' if method == 'ordinal' else 'quicksort'
sorter = np.argsort(arr, kind=algo)
inv = np.empty(sorter.size, dtype=np.intp)
inv[sorter] = np.arange(sorter.size, dtype=np.intp)
if method == 'ordinal':
return inv + 1, sorter
arr = arr[sorter]
obs = np.r_[True, arr[1:] != arr[:-1]]
dense = obs.cumsum()[inv]
if method == 'dense':
return dense, sorter
# cumulative counts of each unique value
count = np.r_[np.nonzero(obs)[0], len(obs)]
if method == 'max':
return count[dense], sorter
if method == 'min':
return count[dense - 1] + 1, sorter
# average method
return .5 * (count[dense] + count[dense - 1] + 1), sorter