Source code for lib5c.util.scales

"""
Module for determining various scales for visualization purposes.
"""

from copy import copy

import numpy as np

from lib5c.parsers.bed import load_features
from lib5c.util.counts import flatten_regional_counts


[docs]def compute_regional_obs_scale(counts_superdict, region, top_percentile=98):
    """
    Computes a typical scale for visualizing observed for a specified region.

    Parameters
    ----------
    counts_superdict : dict of dicts of 2d numpy arrays
        The keys of the outer dict are replicate names. The values are dicts
        corresponding to counts dicts (see
        ``lib5c.parsers.counts.load_counts()``), whose keys are region names and
        whose values are the arrays of counts values for that region. These
        arrays are square and symmetric.
    region : str
        The region for which the scale should be computed.
    top_percentile : int
        The upper percentile to use when determinig the max of the scale.

    Returns
    -------
    list of float
        The first element of this list is the minimum value of the computed
        scale. The second element of this list is the maximum value of the
        computed scale.

    Notes
    -----
    The returned scale is computed as ranging from 0 to the average of the 98th
    percentiles across the replicates.
    """
    mins = []
    top_percentiles = []
    for rep in counts_superdict.keys():
        flattened_counts = flatten_regional_counts(
            counts_superdict[rep][region], discard_nan=True)
        mins.append(np.min(flattened_counts))
        top_percentiles.append(np.percentile(flattened_counts, top_percentile))
    return [np.mean(mins), np.mean(top_percentiles)]


[docs]def compute_regional_obs_over_exp_scale(counts_superdict, region):
    """
    Computes a typical scale for visualizing observed over expected counts for a
    specified region.

    Parameters
    ----------
    counts_superdict : dict of dicts of 2d numpy arrays
        The keys of the outer dict are replicate names. The values are dicts
        corresponding to counts dicts (see
        ``lib5c.parsers.counts.load_counts()``), whose keys are region names and
        whose values are the arrays of counts values for that region. These
        arrays are square and symmetric.
    region : str
        The region for which the scale should be computed.

    Returns
    -------
    list of float
        The first element of this list is the minimum value of the computed
        scale. The second element of this list is the maximum value of the
        computed scale.

    Notes
    -----
    The returned scale is computed as the mean of the observed over expected
    counts for the selected region across all replicates, plus and minus two and
    a half times the mean of the standard deviations for the selected region
    across all replicates for the maximum and the minimum, respectively.
    """
    means = []
    stds = []
    for rep in counts_superdict.keys():
        flattened_counts = flatten_regional_counts(
            counts_superdict[rep][region], discard_nan=True)
        means.append(np.mean(flattened_counts))
        stds.append(np.std(flattened_counts))
    mean_of_means = np.mean(means)
    mean_of_stds = np.mean(stds)
    return [mean_of_means - 2.5*mean_of_stds, mean_of_means + 2.5*mean_of_stds]


[docs]def compute_track_scales(tracks, pixelmap, conditions=(),
                         filename_generator=lambda x: '%s.bed' % x):
    """
    Computes regional zero-to-max scales for visualizing ChIP-seq tracks.

    Parameters
    ----------
    tracks : list of str
        List of string identifiers for the tracks to compute scales for. The
        identifiers will be used to find the appropriate BED files on the disk
        according to filename_generator.
    pixelmap : pixelmap
        The pixelmap to use when identifying the region names and boundaries.
        See ``lib5c.parsers.bed.get_pixelmap()``.
    conditions : list of str
        List of string identifiers for the conditions. If this kwarg is passed,
        the tracks will be grouped by condition, and tracks that differ only in
        the condition identifier will be assigned the same scale. If this list
        is empty (as it is by default), no such grouping of tracks is performed.
    filename_generator : function str -> str
        When passed any string from tracks, this function should return a string
        reference to the BED file on the disk containing the BED features for
        that track.

    Returns
    -------
    dict of dict of numeric
        The keys of the outer dict are the elements of tracks. The values are
        dicts whose keys are region names and whose values are the maximum
        values within that region for that track.

    Notes
    -----
    Since this function computes a zero-to-max scale, the minimum of the scale
    is always implicitly taken to be zero.

    When conditions is not empty, the tracks are grouped by removing all
    instances of all condition identifiers from the track identifiers and then
    comparing them for equality. When the sequence of characters defined by a
    condition identifier appears elsewhere in the track name in a location not
    intended to identify the condition of the track, this may lead to failure to
    correctly group tracks by condition.
    """
    # load features for all tracks
    features = {track: load_features(filename_generator(track))
                for track in tracks}

    # compute regional maxima for all tracks
    maxima = {track: {region: max(
        [feature['value']
         for feature in features[track][pixelmap[region][0]['chrom']]
         if
         feature['chrom'] == pixelmap[region][0]['chrom'] and
         feature['end'] >= pixelmap[region][0]['start'] and
         feature['start'] <= pixelmap[region][-1]['end']])
        for region in pixelmap.keys()}
        for track in tracks}

    # trim conditions from track names
    trimmed_tracks = {}
    for track in tracks:
        trimmed_track = copy(track)
        for condition in conditions:
            trimmed_track = trimmed_track.replace(condition, '')
        trimmed_tracks[track] = trimmed_track

    # build comparison sets
    comparison_sets = []
    comparison_set_map = {}
    for track in tracks:
        assigned_flag = False
        for comparison_set in comparison_sets:
            for element in comparison_set:
                if trimmed_tracks[track] == trimmed_tracks[element]:
                    comparison_set_map[track] = \
                        comparison_sets.index(comparison_set)
                    comparison_set.append(track)
                    assigned_flag = True
                    break
        if not assigned_flag:
            comparison_set_map[track] = len(comparison_sets)
            comparison_sets.append([track])

    # set the track scale to the maximum of the appropriate comparison set
    track_scales = {track: {region: max(
        [maxima[compared_track][region]
         for compared_track in comparison_sets[comparison_set_map[track]]])
        for region in pixelmap.keys()}
        for track in tracks}
    return track_scales


# test client
[docs]def main():
    from lib5c.parsers.primers import get_pixelmap
    from lib5c.parsers.counts import load_counts

    # hardcode replicates
    replicates = ['ES_Rep1',
                  'NPC_Rep31']

    # hardcode conditions
    conditions = ['ES', 'NPC']

    # hardcode tracktypes
    tracktypes = ['Smc1', 'CTCF', 'H3K4me1', 'H3K27ac']

    # compute tracks
    tracks = ['%s_%s' % (tracktype, condition)
              for tracktype in tracktypes
              for condition in conditions]

    # get pixelmap
    pixelmap = get_pixelmap('test/bins_nuc.bed')

    # load counts
    counts_superdict = {
        replicate: load_counts('test/%s_obs_over_exp.counts' % replicate)
        for replicate in replicates}

    print('colorscales')
    for region in pixelmap.keys():
        print('%s: %s' %
              (region,
               compute_regional_obs_over_exp_scale(counts_superdict, region)))

    print('')
    print('track scales')
    track_scales = compute_track_scales(
        tracks, pixelmap, conditions,
        filename_generator=lambda x: 'test/tracks/%s.bed' % x)
    for track in track_scales.keys():
        print('%s:' % track)
        for region in track_scales[track].keys():
            print('\t%s: %s' % (region, track_scales[track][region]))


if __name__ == "__main__":
    main()