"""
Module for determining various scales for visualization purposes.
"""
from copy import copy
import numpy as np
from lib5c.parsers.bed import load_features
from lib5c.util.counts import flatten_regional_counts
def compute_regional_obs_scale(counts_superdict, region, top_percentile=98):
    """
    Computes a typical scale for visualizing observed counts for a specified
    region.

    Parameters
    ----------
    counts_superdict : dict of dicts of 2d numpy arrays
        The keys of the outer dict are replicate names. The values are dicts
        corresponding to counts dicts (see
        ``lib5c.parsers.counts.load_counts()``), whose keys are region names
        and whose values are the arrays of counts values for that region.
        These arrays are square and symmetric.
    region : str
        The region for which the scale should be computed.
    top_percentile : int
        The upper percentile to use when determining the max of the scale.

    Returns
    -------
    list of float
        The first element of this list is the minimum value of the computed
        scale. The second element of this list is the maximum value of the
        computed scale.

    Notes
    -----
    The returned scale ranges from the mean of the per-replicate minimum
    values to the mean of the per-replicate ``top_percentile`` th percentiles,
    computed across all replicates for the selected region.
    """
    mins = []
    top_percentiles = []
    for rep in counts_superdict.keys():
        # drop NaN entries so min/percentile are computed on real values only
        flattened_counts = flatten_regional_counts(
            counts_superdict[rep][region], discard_nan=True)
        mins.append(np.min(flattened_counts))
        top_percentiles.append(np.percentile(flattened_counts, top_percentile))
    return [np.mean(mins), np.mean(top_percentiles)]
def compute_regional_obs_over_exp_scale(counts_superdict, region):
    """
    Computes a typical scale for visualizing observed over expected counts for
    a specified region.

    Parameters
    ----------
    counts_superdict : dict of dicts of 2d numpy arrays
        The keys of the outer dict are replicate names. The values are dicts
        corresponding to counts dicts (see
        ``lib5c.parsers.counts.load_counts()``), whose keys are region names
        and whose values are the arrays of counts values for that region.
        These arrays are square and symmetric.
    region : str
        The region for which the scale should be computed.

    Returns
    -------
    list of float
        The first element of this list is the minimum value of the computed
        scale. The second element of this list is the maximum value of the
        computed scale.

    Notes
    -----
    The returned scale is centered on the mean (across replicates) of the
    per-replicate mean counts for the selected region, extending plus and
    minus two and a half times the mean (across replicates) of the
    per-replicate standard deviations.
    """
    means = []
    stds = []
    for rep in counts_superdict.keys():
        # drop NaN entries so mean/std are computed on real values only
        flattened_counts = flatten_regional_counts(
            counts_superdict[rep][region], discard_nan=True)
        means.append(np.mean(flattened_counts))
        stds.append(np.std(flattened_counts))
    mean_of_means = np.mean(means)
    mean_of_stds = np.mean(stds)
    return [mean_of_means - 2.5*mean_of_stds, mean_of_means + 2.5*mean_of_stds]
def compute_track_scales(tracks, pixelmap, conditions=(),
                         filename_generator=lambda x: '%s.bed' % x):
    """
    Computes regional zero-to-max scales for visualizing ChIP-seq tracks.

    Parameters
    ----------
    tracks : list of str
        List of string identifiers for the tracks to compute scales for. The
        identifiers will be used to find the appropriate BED files on the disk
        according to filename_generator.
    pixelmap : pixelmap
        The pixelmap to use when identifying the region names and boundaries.
        See ``lib5c.parsers.bed.get_pixelmap()``.
    conditions : list of str
        List of string identifiers for the conditions. If this kwarg is
        passed, the tracks will be grouped by condition, and tracks that
        differ only in the condition identifier will be assigned the same
        scale. If this list is empty (as it is by default), no such grouping
        of tracks is performed.
    filename_generator : function str -> str
        When passed any string from tracks, this function should return a
        string reference to the BED file on the disk containing the BED
        features for that track.

    Returns
    -------
    dict of dict of numeric
        The keys of the outer dict are the elements of tracks. The values are
        dicts whose keys are region names and whose values are the maximum
        values within that region for that track.

    Notes
    -----
    Since this function computes a zero-to-max scale, the minimum of the scale
    is always implicitly taken to be zero.

    When conditions is not empty, the tracks are grouped by removing all
    instances of all condition identifiers from the track identifiers and then
    comparing them for equality. When the sequence of characters defined by a
    condition identifier appears elsewhere in the track name in a location not
    intended to identify the condition of the track, this may lead to failure
    to correctly group tracks by condition.
    """
    # load features for all tracks
    features = {track: load_features(filename_generator(track))
                for track in tracks}
    # compute regional maxima for all tracks; only features on the region's
    # chromosome that overlap the region's span are considered
    maxima = {track: {region: max(
        [feature['value']
         for feature in features[track][pixelmap[region][0]['chrom']]
         if
         feature['chrom'] == pixelmap[region][0]['chrom'] and
         feature['end'] >= pixelmap[region][0]['start'] and
         feature['start'] <= pixelmap[region][-1]['end']])
        for region in pixelmap.keys()}
        for track in tracks}
    # trim condition identifiers from track names; tracks that trim to the
    # same string are considered condition-variants of the same track
    trimmed_tracks = {}
    for track in tracks:
        trimmed_track = track
        for condition in conditions:
            trimmed_track = trimmed_track.replace(condition, '')
        trimmed_tracks[track] = trimmed_track
    # build comparison sets, grouping tracks by trimmed name; a dict keyed on
    # the trimmed name replaces the original quadratic membership search
    comparison_sets = []
    comparison_set_map = {}
    set_index_by_trimmed = {}
    for track in tracks:
        trimmed = trimmed_tracks[track]
        if trimmed in set_index_by_trimmed:
            set_index = set_index_by_trimmed[trimmed]
            comparison_sets[set_index].append(track)
        else:
            set_index = len(comparison_sets)
            set_index_by_trimmed[trimmed] = set_index
            comparison_sets.append([track])
        comparison_set_map[track] = set_index
    # set the track scale to the maximum of the appropriate comparison set
    track_scales = {track: {region: max(
        [maxima[compared_track][region]
         for compared_track in comparison_sets[comparison_set_map[track]]])
        for region in pixelmap.keys()}
        for track in tracks}
    return track_scales
# test client
def main():
    """
    Test client: prints colorscales and track scales computed from hardcoded
    test fixtures under ``test/``.
    """
    from lib5c.parsers.primers import get_pixelmap
    from lib5c.parsers.counts import load_counts
    # hardcode replicates
    replicates = ['ES_Rep1',
                  'NPC_Rep31']
    # hardcode conditions
    conditions = ['ES', 'NPC']
    # hardcode tracktypes
    tracktypes = ['Smc1', 'CTCF', 'H3K4me1', 'H3K27ac']
    # compute tracks as every tracktype/condition combination
    tracks = ['%s_%s' % (tracktype, condition)
              for tracktype in tracktypes
              for condition in conditions]
    # get pixelmap
    pixelmap = get_pixelmap('test/bins_nuc.bed')
    # load counts
    counts_superdict = {
        replicate: load_counts('test/%s_obs_over_exp.counts' % replicate)
        for replicate in replicates}
    print('colorscales')
    for region in pixelmap.keys():
        print('%s: %s' %
              (region,
               compute_regional_obs_over_exp_scale(counts_superdict, region)))
    print('')
    print('track scales')
    track_scales = compute_track_scales(
        tracks, pixelmap, conditions,
        filename_generator=lambda x: 'test/tracks/%s.bed' % x)
    for track in track_scales.keys():
        print('%s:' % track)
        for region in track_scales[track].keys():
            print('\t%s: %s' % (region, track_scales[track][region]))


if __name__ == "__main__":
    main()