"""
Module for visualizing architectural clusters.
"""
from matplotlib import pyplot as plt
from lib5c.algorithms.clustering.util import clusters_to_array
from lib5c.util.scales import compute_regional_obs_over_exp_scale, \
compute_track_scales
from lib5c.plotters.heatmap import plot_heatmap
from lib5c.plotters.colorscales import obs_over_exp_colorscale
[docs]def compute_bounding_box(cluster_peaks):
"""
Computes a rectangular bounding box for a cluster.
Parameters
----------
cluster_peaks : list of dicts of ints
Each dict in the list represents a peak in the cluster and has the
form::
{
'x': int,
'y': int
}
where these ints represent the respective x- and y-coordinates of the
peak within the region.
Returns
-------
dict of ints
The returned dict represents the computed bounding box and has the
form::
{
'x_min': int,
'x_max': int,
'y_min': int,
'y_max': int
}
where the ints describe the maximal and minimal indices of pixels along
the x- and y-axes to be included within the bounding box, respectively.
"""
return {'x_min': min([peak['x'] for peak in cluster_peaks]),
'x_max': max([peak['x'] for peak in cluster_peaks]),
'y_min': min([peak['y'] for peak in cluster_peaks]),
'y_max': max([peak['y'] for peak in cluster_peaks])}
[docs]def make_zoom_window(bounding_box, region_size, padding=2, invert=False):
"""
Computes a zoom window around a target bounding box.
Parameters
----------
bounding_box : dict of ints
This dict represents the target bounding box and should have the form::
{
'x_min': int,
'x_max': int,
'y_min': int,
'y_max': int
}
where the ints describe the maximal and minimal indices of pixels along
the x- and y-axes included within the bounding box, respectively.
region_size : int
The side length of the square region within which the target bounding
box lies, in pixels.
padding : int
The number of pixels to include within the zoom window around the
bounding box. If the zoom window is at the edge of the region, it may be
impossible to pad by the specified number of pixels, in which case the
zoom window will be shifted away from the edge of the region.
invert : bool
If True, the zoom window will be reflected across the diagonal.
Returns
-------
dict of int
This dict represents the computed zoom window and has the following
structure::
{
'x_start': int,
'y_start': int,
'size': int
}
where x_start and y_start refer to the indices of the lowest-indexed
pixel to be included in the zoom window along each axis, and size
indicates the side length of the square zoom window in pixels.
"""
# figure out how large the window should be
zoom_window_size = max(
bounding_box['x_max'] - bounding_box['x_min'],
bounding_box['y_max'] - bounding_box['y_min']) + 1 + 2 * padding
# make a first guess at where the window should be
zoom_window = {'x_start': bounding_box['x_min'] - padding,
'y_start': bounding_box['y_min'] - padding,
'size': zoom_window_size}
if invert:
zoom_window = {'x_start': bounding_box['y_min'] - padding,
'y_start': bounding_box['x_min'] - padding,
'size': zoom_window_size}
# shift the window if we're close to the edge of the heatmap
if zoom_window['x_start'] + zoom_window['size'] - 1 >= region_size:
zoom_window['x_start'] -= zoom_window['x_start'] + zoom_window['size'] \
- region_size
if zoom_window['y_start'] + zoom_window['size'] - 1 >= region_size:
zoom_window['y_start'] -= zoom_window['y_start'] + zoom_window['size'] \
- region_size
if zoom_window['x_start'] < 0:
zoom_window['x_start'] -= zoom_window['x_start']
if zoom_window['y_start'] < 0:
zoom_window['y_start'] -= zoom_window['y_start']
return zoom_window
[docs]def plot_cluster(pixelmap, counts_superdict, cluster_peaks, cluster_region,
colorscales='auto', tracks=(),
track_filename_generator=lambda x: '%s.bed' % x, conditions=(),
track_scales='auto', zoom_window='auto', padding=2,
invert=False,
output_filename_generator=lambda x, y: 'output/%s_%s' % (x, y),
heatmap_kwargs='default'):
"""
Plots a contact probability heatmaps centered on a cluster.
Parameters
----------
pixelmap : pixelmap
The pixelmap to use for plotting the heatmap. See
``lib5c.parsers.bed.get_pixelmap()``.
counts_superdict : dict of counts dicts
The contact probability data for each replicate to be plotted. The keys
of the outer dict are replicate names as strings. The values of the
outer dict are counts dicts whose keys are region names as strings and
whose values are 2D square symmetric numpy arrays containing the counts
for that region in that replicate. See ``lib5c.util.counts_superdict``.
cluster_peaks : list of dicts of ints
Each dict in the list represents a peak in the cluster and has the
form::
{
'x': int,
'y': int
}
where these ints represent the respective x- and y-coordinates of the
peak within the region.
cluster_region : str
The name of the region in which the cluster lies.
colorscales : 'auto' or dict of lists of numerics
If 'auto' is passed, the regional colorscales for the heatmaps are
automatically computed using
``lib5c.util.scales.compute_regional_obs_over_exp_scale()``.
Alternatively, the colorscales may be specified as a dict
whose keys are region names and whose values are lists of numerics of
length two. The first element of this list will be the minumum value on
the colorscale, while the second element will be the maximum value on
the colorscale.
tracks : list of str
List of string identifiers for ChIP-seq tracks to include on the
heatmaps. The identifiers will be used to find the appropriate BED files
on the disk according to track_filename_generator.
track_filename_generator : function str -> str
When passed any string from tracks, this function should return a string
reference to the BED file on the disk containing the BED features for
that track.
conditions : list of str
List of string identifiers for distinct conditions in the dataset. If
this list is not empty, it will be used to group replicates and tracks
into distinct groups that contain the identifiers in this list as
substrings of their names. Replicates identified as belonging to a
certain condition will be plotted with the tracks identified as
belonging to that same condition. For example, consider a conditions
list ['ES', 'NPC'], replicates 'NPC_Rep1' and 'ES_Rep1', and tracks
'ES_CTCF' and 'NPC_CTCF'. The 'ES_Rep1' heatmap will be drawn with the
'ES_CTCF' track, and the 'NPC_Rep1' heatmap will be drawn with the
'NPC_CTCF' track. If this list is empty, all tracks will be drawn with
all heatmaps. Additionally, this kwarg is included in the call to
``lib5c.util.scales.compute_track_scales()``, which computes the track
scales if track_scales is 'auto' (see below).
track_scales : 'auto' or dict of numerics
If 'auto' is passed, the track scales will be computed automatically
using ``lib5c.util.scales.compute_track_scales()``. The conditions kwarg
will be included in this call. For example, consider a conditions list
['ES', 'NPC'], and tracks 'ES_CTCF' and 'NPC_CTCF'. Since the track
names differ only in their condition identifier, these two tracks will
be plotted using the same scale if track_scales is 'auto'.
Alternatively, the track scales may be specified as a dict whose keys
are the track names and whose values are numerics specifying the maximum
value of the scale for that track. The minumum value of all tracks is
assumed to be zero.
zoom_window : 'auto' or dict of int
If 'auto' is passed, a zoom window for the cluster will be automatically
computed. Alternatively, the zoom window may be specified as a dict with
the following structure::
{
'x_start': int,
'y_start': int,
'size': int
}
where x_start and y_start refer to the indices of the lowest-indexed
pixel to be included in the zoom window along each axis, and size
indicates the side length of the square zoom window in pixels.
padding : int
If zoom_window is 'auto', this kwarg specifies how much padding to
include around the cluster boundaries when computing the zoom window.
invert : bool
If zoom_window is 'auto', this kwarg specifies whether or not the
automatically computed zoom window should be reflected across the
diagonal, reversing the x- and y-axes on the resulting heatmaps.
output_filename_generator : function (str, str) -> str
This function should take in cluster_region and the name of a replicate
and return the output filename to write the corresponding heatmap to.
heatmap_kwargs : 'default' or dict
Passing a dict to this kwarg allows the specification of arbitrary
kwargs to be included in the final call to
``lib5c.plotters.heatmap.plot_heatmap()``. If 'default' is passed
instead of a dict, a default dict of typical kwargs will be used.
"""
# auto zoom window
if zoom_window == 'auto':
zoom_window = make_zoom_window(compute_bounding_box(cluster_peaks),
len(pixelmap[cluster_region]), padding,
invert=invert)
# auto colorscales
if colorscales == 'auto':
colorscales = {
region: compute_regional_obs_over_exp_scale(counts_superdict,
region)
for region in pixelmap.keys()}
# auto track scales
if track_scales == 'auto':
track_scales = compute_track_scales(tracks, pixelmap, conditions,
track_filename_generator)
# compute replicates by condition
if conditions:
replicates_by_condition = {
condition: [replicate for replicate in counts_superdict.keys()
if condition in replicate]
for condition in conditions
}
else:
replicates_by_condition = {
'any': [replicate for replicate in counts_superdict.keys()]
}
# compute tracks by condition
if conditions:
tracks_by_condition = {
condition: [track for track in tracks if condition in track]
for condition in conditions
}
else:
tracks_by_condition = {'any': [track for track in tracks]}
# sane defaults for heatmap_kwargs
if heatmap_kwargs == 'default':
heatmap_kwargs = {'show_colorscale': False,
'cmap': obs_over_exp_colorscale,
'cluster_colors': 'lime'}
for condition in replicates_by_condition.keys():
for rep in replicates_by_condition[condition]:
plot_heatmap(
counts_superdict[rep][cluster_region],
output_filename_generator(cluster_region, rep),
region=cluster_region,
pixelmap=pixelmap,
zoom_window=zoom_window,
colorscale=colorscales[cluster_region],
tracks=[track_filename_generator(track)
for track in tracks_by_condition[condition]],
track_scales=[track_scales[track][cluster_region]
for track in tracks_by_condition[condition]],
clusters=[cluster_peaks],
**heatmap_kwargs
)
[docs]def plot_cluster_indices(pixelmap, clusters,
output_filename_generator=lambda x: '%s_indices' % x,
heatmap_kwargs='default'):
"""
Plots clusters for each region where each cluster is in a different color
and each cluster is labeled with its index in the list of clusters for that
region.
Parameters
----------
pixelmap : pixelmap
The pixelmap to use for plotting the heatmap. See
``lib5c.parsers.bed.get_pixelmap()``.
clusters : dict of lists of lists of dicts of ints
The keys of the outer dict are region names. The values (the outer
lists) represent the list of clusters within each region. The inner
lists represent clusters. Each dict in each inner list represents a peak
in that cluster and has the form::
{
'x': int,
'y': int
}
where these ints represent the respective x- and y-coordinates of the
peak within the region. As an example, we should be able to index into
clusters as follows::
clusters[region_name : str][index of cluster within region : int]\
[index of peak within cluster : int] = \
{
'x': int,
'y': int
}
output_filename_generator : function str -> str
This function should take in a region name and return the output
filename to which the cluster index heatmap for that region should be
written.
heatmap_kwargs : 'default' or dict
Passing a dict to this kwarg allows the specification of arbitrary
kwargs to be included in the final call to
``lib5c.plotters.heatmap.plot_heatmap()``. If 'default' is passed
instead of a dict, a default dict of typical kwargs will be used.
"""
# deduce regions
regions = list(pixelmap.keys())
# sane defaults for heatmap_kwargs
if heatmap_kwargs == 'default':
heatmap_kwargs = {'show_colorscale': False,
'cmap': plt.get_cmap('gist_ncar')}
# plot heatmaps
for region in regions:
plot_heatmap(clusters_to_array(clusters[region], len(pixelmap[region])),
output_filename_generator(region),
pixelmap=pixelmap,
region=region,
clusters=clusters[region],
cluster_labels='indices',
**heatmap_kwargs)
# test client
[docs]def main():
import pickle
from lib5c.parsers.primers import load_primermap
from lib5c.parsers.counts import load_counts
# hardcode replicates
replicates = ['ES_Rep1',
'NPC_Rep31']
# hardcode conditions
conditions = ['ES', 'NPC']
# hardcode tracktypes
tracktypes = ['Smc1', 'CTCF', 'H3K4me1', 'H3K27ac']
# compute tracks
tracks = ['%s_%s' % (tracktype, condition)
for tracktype in tracktypes
for condition in conditions]
# get pixelmap
pixelmap = load_primermap('test/bins_nuc.bed')
# load counts
counts_superdict = {
replicate: load_counts('test/%s_obs_over_exp.counts' % replicate)
for replicate in replicates}
# a set of clusters
with open('test/clusters.pickle', 'rb') as handle:
clusters = pickle.load(handle)
# plot cluster indices
plot_cluster_indices(
pixelmap, clusters,
output_filename_generator=lambda x: 'test/clusters/%s_indices' % x)
# plot zoomin heatmaps
plot_cluster(
pixelmap, counts_superdict, clusters['Klf4'][1], 'Klf4',
colorscales='auto', tracks=tracks,
track_filename_generator=lambda x: 'test/tracks/%s.bed' % x,
conditions=conditions, track_scales='auto', zoom_window='auto',
padding=2, invert=False,
output_filename_generator=lambda x, y: 'test/clusters/%s_%s' %
(x, y),
heatmap_kwargs={'show_colorscale': True,
'cmap' : obs_over_exp_colorscale,
'cluster_colors' : 'pink'}
)
if __name__ == "__main__":
main()