Source code for lib5c.plotters.clustering

"""
Module for visualizing architectural clusters.
"""

from matplotlib import pyplot as plt

from lib5c.algorithms.clustering.util import clusters_to_array
from lib5c.util.scales import compute_regional_obs_over_exp_scale, \
    compute_track_scales
from lib5c.plotters.heatmap import plot_heatmap
from lib5c.plotters.colorscales import obs_over_exp_colorscale


[docs]def compute_bounding_box(cluster_peaks): """ Computes a rectangular bounding box for a cluster. Parameters ---------- cluster_peaks : list of dicts of ints Each dict in the list represents a peak in the cluster and has the form:: { 'x': int, 'y': int } where these ints represent the respective x- and y-coordinates of the peak within the region. Returns ------- dict of ints The returned dict represents the computed bounding box and has the form:: { 'x_min': int, 'x_max': int, 'y_min': int, 'y_max': int } where the ints describe the maximal and minimal indices of pixels along the x- and y-axes to be included within the bounding box, respectively. """ return {'x_min': min([peak['x'] for peak in cluster_peaks]), 'x_max': max([peak['x'] for peak in cluster_peaks]), 'y_min': min([peak['y'] for peak in cluster_peaks]), 'y_max': max([peak['y'] for peak in cluster_peaks])}
[docs]def make_zoom_window(bounding_box, region_size, padding=2, invert=False): """ Computes a zoom window around a target bounding box. Parameters ---------- bounding_box : dict of ints This dict represents the target bounding box and should have the form:: { 'x_min': int, 'x_max': int, 'y_min': int, 'y_max': int } where the ints describe the maximal and minimal indices of pixels along the x- and y-axes included within the bounding box, respectively. region_size : int The side length of the square region within which the target bounding box lies, in pixels. padding : int The number of pixels to include within the zoom window around the bounding box. If the zoom window is at the edge of the region, it may be impossible to pad by the specified number of pixels, in which case the zoom window will be shifted away from the edge of the region. invert : bool If True, the zoom window will be reflected across the diagonal. Returns ------- dict of int This dict represents the computed zoom window and has the following structure:: { 'x_start': int, 'y_start': int, 'size': int } where x_start and y_start refer to the indices of the lowest-indexed pixel to be included in the zoom window along each axis, and size indicates the side length of the square zoom window in pixels. """ # figure out how large the window should be zoom_window_size = max( bounding_box['x_max'] - bounding_box['x_min'], bounding_box['y_max'] - bounding_box['y_min']) + 1 + 2 * padding # make a first guess at where the window should be zoom_window = {'x_start': bounding_box['x_min'] - padding, 'y_start': bounding_box['y_min'] - padding, 'size': zoom_window_size} if invert: zoom_window = {'x_start': bounding_box['y_min'] - padding, 'y_start': bounding_box['x_min'] - padding, 'size': zoom_window_size} # shift the window if we're close to the edge of the heatmap if zoom_window['x_start'] + zoom_window['size'] - 1 >= region_size: zoom_window['x_start'] -= zoom_window['x_start'] + zoom_window['size'] \ - region_size if zoom_window['y_start'] + zoom_window['size'] - 1 >= region_size: zoom_window['y_start'] -= zoom_window['y_start'] + zoom_window['size'] \ - region_size if zoom_window['x_start'] < 0: zoom_window['x_start'] -= zoom_window['x_start'] if zoom_window['y_start'] < 0: zoom_window['y_start'] -= zoom_window['y_start'] return zoom_window
[docs]def plot_cluster(pixelmap, counts_superdict, cluster_peaks, cluster_region, colorscales='auto', tracks=(), track_filename_generator=lambda x: '%s.bed' % x, conditions=(), track_scales='auto', zoom_window='auto', padding=2, invert=False, output_filename_generator=lambda x, y: 'output/%s_%s' % (x, y), heatmap_kwargs='default'): """ Plots a contact probability heatmaps centered on a cluster. Parameters ---------- pixelmap : pixelmap The pixelmap to use for plotting the heatmap. See ``lib5c.parsers.bed.get_pixelmap()``. counts_superdict : dict of counts dicts The contact probability data for each replicate to be plotted. The keys of the outer dict are replicate names as strings. The values of the outer dict are counts dicts whose keys are region names as strings and whose values are 2D square symmetric numpy arrays containing the counts for that region in that replicate. See ``lib5c.util.counts_superdict``. cluster_peaks : list of dicts of ints Each dict in the list represents a peak in the cluster and has the form:: { 'x': int, 'y': int } where these ints represent the respective x- and y-coordinates of the peak within the region. cluster_region : str The name of the region in which the cluster lies. colorscales : 'auto' or dict of lists of numerics If 'auto' is passed, the regional colorscales for the heatmaps are automatically computed using ``lib5c.util.scales.compute_regional_obs_over_exp_scale()``. Alternatively, the colorscales may be specified as a dict whose keys are region names and whose values are lists of numerics of length two. The first element of this list will be the minumum value on the colorscale, while the second element will be the maximum value on the colorscale. tracks : list of str List of string identifiers for ChIP-seq tracks to include on the heatmaps. The identifiers will be used to find the appropriate BED files on the disk according to track_filename_generator. track_filename_generator : function str -> str When passed any string from tracks, this function should return a string reference to the BED file on the disk containing the BED features for that track. conditions : list of str List of string identifiers for distinct conditions in the dataset. If this list is not empty, it will be used to group replicates and tracks into distinct groups that contain the identifiers in this list as substrings of their names. Replicates identified as belonging to a certain condition will be plotted with the tracks identified as belonging to that same condition. For example, consider a conditions list ['ES', 'NPC'], replicates 'NPC_Rep1' and 'ES_Rep1', and tracks 'ES_CTCF' and 'NPC_CTCF'. The 'ES_Rep1' heatmap will be drawn with the 'ES_CTCF' track, and the 'NPC_Rep1' heatmap will be drawn with the 'NPC_CTCF' track. If this list is empty, all tracks will be drawn with all heatmaps. Additionally, this kwarg is included in the call to ``lib5c.util.scales.compute_track_scales()``, which computes the track scales if track_scales is 'auto' (see below). track_scales : 'auto' or dict of numerics If 'auto' is passed, the track scales will be computed automatically using ``lib5c.util.scales.compute_track_scales()``. The conditions kwarg will be included in this call. For example, consider a conditions list ['ES', 'NPC'], and tracks 'ES_CTCF' and 'NPC_CTCF'. Since the track names differ only in their condition identifier, these two tracks will be plotted using the same scale if track_scales is 'auto'. Alternatively, the track scales may be specified as a dict whose keys are the track names and whose values are numerics specifying the maximum value of the scale for that track. The minumum value of all tracks is assumed to be zero. zoom_window : 'auto' or dict of int If 'auto' is passed, a zoom window for the cluster will be automatically computed. Alternatively, the zoom window may be specified as a dict with the following structure:: { 'x_start': int, 'y_start': int, 'size': int } where x_start and y_start refer to the indices of the lowest-indexed pixel to be included in the zoom window along each axis, and size indicates the side length of the square zoom window in pixels. padding : int If zoom_window is 'auto', this kwarg specifies how much padding to include around the cluster boundaries when computing the zoom window. invert : bool If zoom_window is 'auto', this kwarg specifies whether or not the automatically computed zoom window should be reflected across the diagonal, reversing the x- and y-axes on the resulting heatmaps. output_filename_generator : function (str, str) -> str This function should take in cluster_region and the name of a replicate and return the output filename to write the corresponding heatmap to. heatmap_kwargs : 'default' or dict Passing a dict to this kwarg allows the specification of arbitrary kwargs to be included in the final call to ``lib5c.plotters.heatmap.plot_heatmap()``. If 'default' is passed instead of a dict, a default dict of typical kwargs will be used. """ # auto zoom window if zoom_window == 'auto': zoom_window = make_zoom_window(compute_bounding_box(cluster_peaks), len(pixelmap[cluster_region]), padding, invert=invert) # auto colorscales if colorscales == 'auto': colorscales = { region: compute_regional_obs_over_exp_scale(counts_superdict, region) for region in pixelmap.keys()} # auto track scales if track_scales == 'auto': track_scales = compute_track_scales(tracks, pixelmap, conditions, track_filename_generator) # compute replicates by condition if conditions: replicates_by_condition = { condition: [replicate for replicate in counts_superdict.keys() if condition in replicate] for condition in conditions } else: replicates_by_condition = { 'any': [replicate for replicate in counts_superdict.keys()] } # compute tracks by condition if conditions: tracks_by_condition = { condition: [track for track in tracks if condition in track] for condition in conditions } else: tracks_by_condition = {'any': [track for track in tracks]} # sane defaults for heatmap_kwargs if heatmap_kwargs == 'default': heatmap_kwargs = {'show_colorscale': False, 'cmap': obs_over_exp_colorscale, 'cluster_colors': 'lime'} for condition in replicates_by_condition.keys(): for rep in replicates_by_condition[condition]: plot_heatmap( counts_superdict[rep][cluster_region], output_filename_generator(cluster_region, rep), region=cluster_region, pixelmap=pixelmap, zoom_window=zoom_window, colorscale=colorscales[cluster_region], tracks=[track_filename_generator(track) for track in tracks_by_condition[condition]], track_scales=[track_scales[track][cluster_region] for track in tracks_by_condition[condition]], clusters=[cluster_peaks], **heatmap_kwargs )
[docs]def plot_cluster_indices(pixelmap, clusters, output_filename_generator=lambda x: '%s_indices' % x, heatmap_kwargs='default'): """ Plots clusters for each region where each cluster is in a different color and each cluster is labeled with its index in the list of clusters for that region. Parameters ---------- pixelmap : pixelmap The pixelmap to use for plotting the heatmap. See ``lib5c.parsers.bed.get_pixelmap()``. clusters : dict of lists of lists of dicts of ints The keys of the outer dict are region names. The values (the outer lists) represent the list of clusters within each region. The inner lists represent clusters. Each dict in each inner list represents a peak in that cluster and has the form:: { 'x': int, 'y': int } where these ints represent the respective x- and y-coordinates of the peak within the region. As an example, we should be able to index into clusters as follows:: clusters[region_name : str][index of cluster within region : int]\ [index of peak within cluster : int] = \ { 'x': int, 'y': int } output_filename_generator : function str -> str This function should take in a region name and return the output filename to which the cluster index heatmap for that region should be written. heatmap_kwargs : 'default' or dict Passing a dict to this kwarg allows the specification of arbitrary kwargs to be included in the final call to ``lib5c.plotters.heatmap.plot_heatmap()``. If 'default' is passed instead of a dict, a default dict of typical kwargs will be used. """ # deduce regions regions = list(pixelmap.keys()) # sane defaults for heatmap_kwargs if heatmap_kwargs == 'default': heatmap_kwargs = {'show_colorscale': False, 'cmap': plt.get_cmap('gist_ncar')} # plot heatmaps for region in regions: plot_heatmap(clusters_to_array(clusters[region], len(pixelmap[region])), output_filename_generator(region), pixelmap=pixelmap, region=region, clusters=clusters[region], cluster_labels='indices', **heatmap_kwargs)
# test client
[docs]def main(): import pickle from lib5c.parsers.primers import load_primermap from lib5c.parsers.counts import load_counts # hardcode replicates replicates = ['ES_Rep1', 'NPC_Rep31'] # hardcode conditions conditions = ['ES', 'NPC'] # hardcode tracktypes tracktypes = ['Smc1', 'CTCF', 'H3K4me1', 'H3K27ac'] # compute tracks tracks = ['%s_%s' % (tracktype, condition) for tracktype in tracktypes for condition in conditions] # get pixelmap pixelmap = load_primermap('test/bins_nuc.bed') # load counts counts_superdict = { replicate: load_counts('test/%s_obs_over_exp.counts' % replicate) for replicate in replicates} # a set of clusters with open('test/clusters.pickle', 'rb') as handle: clusters = pickle.load(handle) # plot cluster indices plot_cluster_indices( pixelmap, clusters, output_filename_generator=lambda x: 'test/clusters/%s_indices' % x) # plot zoomin heatmaps plot_cluster( pixelmap, counts_superdict, clusters['Klf4'][1], 'Klf4', colorscales='auto', tracks=tracks, track_filename_generator=lambda x: 'test/tracks/%s.bed' % x, conditions=conditions, track_scales='auto', zoom_window='auto', padding=2, invert=False, output_filename_generator=lambda x, y: 'test/clusters/%s_%s' % (x, y), heatmap_kwargs={'show_colorscale': True, 'cmap' : obs_over_exp_colorscale, 'cluster_colors' : 'pink'} )
if __name__ == "__main__": main()