"""
Module for the GeneExtendableHeatmap class, which adds gene track plotting
functionality for the extendable heatmap system.
"""
import os
import matplotlib.collections as collections
from matplotlib.colors import to_rgba
from lib5c.plotters.extendable import BaseExtendableHeatmap
from lib5c.parsers.genes import load_gene_table
from lib5c.util.bed import check_intersect
[docs]class GeneExtendableHeatmap(BaseExtendableHeatmap):
"""
ExtendableHeatmap mixin class providing gene track plotting functionality.
To deal with the fact that genes may overlap (e.g., where a gene has
multiple isoforms), this class uses the concept of "gene stacks". Each gene
track in a gene stack represents a separate axis added to the
ExtendableHeatmap. By packing a set of genes into separate "rows", functions
like ``add_gene_stack()`` can plot each row in the stack as a separate gene
track via ``add_gene_track()``.
Most commonly, we will want to add reference gene tracks corresponding to a
particular genome assembly. To make this easy, this class provides the
``add_refgene_stack()`` and ``add_refgene_stacks()`` functions.
"""
[docs] def add_gene_tracks(self, genes, size='3%', pad=0.0, axis_limits=(0, 1),
intron_height=0.05, exon_height=0.5, colors=None):
"""
Adds a gene track for a single row of genes to both the bottom and
left side of the heatmap by calling ``add_gene_track()`` twice.
Parameters
----------
genes : list of dict
Each dict should represent a gene and could have the following
keys::
{
'chrom' : str,
'start' : int,
'end' : int,
'name' : str,
'id' : str,
'strand': '+' or '-',
'blocks': list of dicts
}
Each block represents an exon as dicts with the following
structure::
{
'start': int,
'end' : int
}
The 'name' and 'id' keys are optional and are only used when color-
coding genes. See ``lib5c.parsers.genes.load_genes()``.
size : str
The size of the new axis as a percentage of the main heatmap width.
Should be passed as a string ending in '%'.
pad : float
The padding to use between the existing parts of the figure and the
newly added axis.
axis_limits : tuple of float
Axis limits for the non-genomic axis of the gene track.
intron_height : float
Controls thickness of gene introns. Pass a larger number for thicker
introns.
exon_height : float
Controls thickness of gene exons. Pass a larger number for thicker
exons.
colors : dict, optional
Pass a dict mapping gene names or id's to matplotlib colors to color
code those genes. Genes not in the dict will be colored black by
default. Using gene names as keys should color all isoforms, while
using gene id's as keys should color just the isoform matching the
specified id.
Returns
-------
list of pyplot axes
The newly added gene track axes.
"""
ax_h = self.add_gene_track(
genes, loc='bottom', size=size, pad=pad, new_ax_name='h_genes',
axis_limits=axis_limits, intron_height=intron_height,
exon_height=exon_height, colors=colors)
ax_v = self.add_gene_track(
genes, loc='left', size=size, pad=pad, new_ax_name='v_genes',
axis_limits=axis_limits, intron_height=intron_height,
exon_height=exon_height, colors=colors)
return [ax_h, ax_v]
[docs] def add_gene_track(self, genes, loc='bottom', size='3%', pad=0.0,
new_ax_name='genes', axis_limits=(0, 1),
intron_height=0.05, exon_height=0.5, colors=None):
"""
Adds one gene track (for one row of genes) along either the x- or y-axis
of the heatmap.
Parameters
----------
genes : list of dict
Each dict should represent a gene and could have the following
keys::
{
'chrom' : str,
'start' : int,
'end' : int,
'name' : str,
'id' : str,
'strand': '+' or '-',
'blocks': list of dicts
}
Each block represents an exon as dicts with the following
structure::
{
'start': int,
'end' : int
}
The 'name' and 'id' keys are optional and are only used when color-
coding genes. See ``lib5c.parsers.genes.load_genes()``.
loc : {'top', 'bottom', 'left', 'right'}
Which side of the heatmap to add the new gene track to.
size : str
The size of the new axis as a percentage of the main heatmap width.
Should be passed as a string ending in '%'.
pad : float
The padding to use between the existing parts of the figure and the
newly added axis.
new_ax_name : str
The name for the new axis. You can access the new axis later at
``h[name]`` where ``h`` is this ExtendableHeatmap instance.
axis_limits : tuple of float
Axis limits for the non-genomic axis of the gene track.
intron_height : float
Controls thickness of gene introns. Pass a larger number for thicker
introns.
exon_height : float
Controls thickness of gene exons. Pass a larger number for thicker
exons.
colors : dict, optional
Pass a dict mapping gene names or id's to matplotlib colors to color
code those genes. Genes not in the dict will be colored black by
default. Using gene names as keys should color all isoforms, while
using gene id's as keys should color just the isoform matching the
specified id.
Returns
-------
pyplot axis
The newly added gene track axis.
"""
# create new axis
ax = self.add_margin_ax(loc=loc, size=size, pad=pad,
new_ax_name=new_ax_name,
axis_limits=axis_limits)
# deduce orientation, either h or v, and save the correct grange
if loc in ['bottom', 'top']:
orientation = 'horizontal'
grange = self.grange_x
else:
orientation = 'vertical'
grange = self.grange_y
# deduce midpoint of the non-genomic axis
midpoint = (axis_limits[0] + axis_limits[1]) / 2.
# compute track_width and track_height, assuming horizontal orientation
track_width = grange['end'] - grange['start']
track_height = abs(axis_limits[1] - axis_limits[0])
# create patches for each gene
verts = []
polycolors = []
segments = []
linecolors = []
for gene in genes:
# skip genes that don't intersect the window
if not check_intersect(gene, grange):
continue
# determine color for this gene
color = to_rgba('k')
if colors is not None:
if gene['id'] in colors:
color = to_rgba(colors[gene['id']])
elif gene['name'] in colors:
color = to_rgba(colors[gene['name']])
# plot intron rectangle
l, r, t, b = (gene['start'], gene['end'],
midpoint + intron_height / 2.,
midpoint - intron_height / 2.)
hverts = [(l, b), (r, b), (r, t), (l, t)]
if orientation == 'vertical':
verts.append([(y, x) for x, y in hverts])
else:
verts.append(hverts)
polycolors.append(color)
# plot exon rectangles
for block in gene['blocks']:
l, r, t, b = (block['start'], block['end'],
midpoint + exon_height / 2.,
midpoint - exon_height / 2.)
hverts = [(l, b), (r, b), (r, t), (l, t)]
if orientation == 'vertical':
verts.append([(y, x) for x, y in hverts])
else:
verts.append(hverts)
polycolors.append(color)
# plot a little arrow based on the strand information if present
# TODO: figure out why this factor of 40 is necessary
arrow_size = (exon_height / 60.) * (track_width / track_height)
if gene['strand'] == '+':
upper_coords = [
(gene['start'], midpoint),
(gene['start'] - arrow_size, midpoint + exon_height / 2.)
]
lower_coords = [
(gene['start'], midpoint),
(gene['start'] - arrow_size, midpoint - exon_height / 2.)
]
elif gene['strand'] == '-':
upper_coords = [
(gene['end'], midpoint),
(gene['end'] + arrow_size, midpoint + exon_height / 2.)
]
lower_coords = [
(gene['end'], midpoint),
(gene['end'] + arrow_size, midpoint - exon_height / 2.)
]
else:
continue
if orientation == 'vertical':
upper_coords = [(y, x) for x, y in upper_coords]
lower_coords = [(y, x) for x, y in lower_coords]
segments.append(upper_coords)
linecolors.append(color)
segments.append(lower_coords)
linecolors.append(color)
ax.add_collection(
collections.LineCollection(segments, colors=linecolors))
ax.add_collection(
collections.PolyCollection(verts, edgecolors=polycolors,
facecolors=polycolors))
return ax
[docs] def add_gene_stacks(self, genes, size='3%', pad_before=0.0, pad_within=0.0,
axis_limits=(0, 1), intron_height=0.05, exon_height=0.5,
padding=1000, colors=None):
"""
Adds a gene stack for a set of genes to both the bottom and left side of
the heatmap by calling ``add_gene_stack()`` twice.
Parameters
----------
genes : list of dict
Each dict should represent a gene and could have the following
keys::
{
'chrom' : str,
'start' : int,
'end' : int,
'name' : str,
'id' : str,
'strand': '+' or '-',
'blocks': list of dicts
}
Each block represents an exon as dicts with the following
structure::
{
'start': int,
'end' : int
}
The 'name' and 'id' keys are optional and are only used when color-
coding genes. See ``lib5c.parsers.genes.load_genes()``.
size : str
The size of each new axis as a percentage of the main heatmap width.
Should be passed as a string ending in '%'.
pad_before : float
The padding to use between the existing parts of the figure and the
newly added gene tracks.
pad_within : float
The padding to use between each newly added gene track.
axis_limits : tuple of float
Axis limits for the non-genomic axis of each new gene track.
intron_height : float
Controls thickness of gene introns. Pass a larger number for thicker
introns.
exon_height : float
Controls thickness of gene exons. Pass a larger number for thicker
exons.
padding : int
The padding to use when packing genes into rows, in units of base
pairs. Genes that are within this many base pairs of each other will
get packed into different rows.
colors : dict, optional
Pass a dict mapping gene names or id's to matplotlib colors to color
code those genes. Genes not in the dict will be colored black by
default. Using gene names as keys should color all isoforms, while
using gene id's as keys should color just the isoform matching the
specified id.
Returns
-------
list of lists of pyplot axis
The first element of the outer list is a list of the newly added
horizontal gene track axes, one for each row of genes. The second
element is the same but for the newly added vertical gene track
axes.
"""
ax_hs = self.add_gene_stack(
genes, loc='bottom', size=size, pad_before=pad_before,
pad_within=pad_within, axis_limits=axis_limits,
intron_height=intron_height, exon_height=exon_height,
padding=padding, colors=colors)
ax_vs = self.add_gene_stack(
genes, loc='left', size=size, pad_before=pad_before,
pad_within=pad_within, axis_limits=axis_limits,
intron_height=intron_height, exon_height=exon_height,
padding=padding, colors=colors)
return [ax_hs, ax_vs]
[docs] def add_gene_stack(self, genes, loc='bottom', size='3%', pad_before=0.0,
pad_within=0.0, axis_limits=(0, 1), intron_height=0.05,
exon_height=0.5, padding=1000, colors=None):
"""
Adds one stack of gene tracks along either the x- or y-axis of the
heatmap by packing one set of genes into rows and calling
``add_gene_track()`` once for every row.
Parameters
----------
genes : list of dict
Each dict should represent a gene and could have the following
keys::
{
'chrom' : str,
'start' : int,
'end' : int,
'name' : str,
'id' : str,
'strand': '+' or '-',
'blocks': list of dicts
}
Each block represents an exon as dicts with the following
structure::
{
'start': int,
'end' : int
}
The 'name' and 'id' keys are optional and are only used when color-
coding genes. See ``lib5c.parsers.genes.load_genes()``.
loc : {'top', 'bottom', 'left', 'right'}
Which side of the heatmap to add the new gene tracks to.
size : str
The size of each new axis as a percentage of the main heatmap width.
Should be passed as a string ending in '%'.
pad_before : float
The padding to use between the existing parts of the figure and the
newly added gene tracks.
pad_within : float
The padding to use between each newly added gene track.
axis_limits : tuple of float
Axis limits for the non-genomic axis of each new gene track.
intron_height : float
Controls thickness of gene introns. Pass a larger number for thicker
introns.
exon_height : float
Controls thickness of gene exons. Pass a larger number for thicker
exons.
padding : int
The padding to use when packing genes into rows, in units of base
pairs. Genes that are within this many base pairs of each other will
get packed into different rows.
colors : dict, optional
Pass a dict mapping gene names or id's to matplotlib colors to color
code those genes. Genes not in the dict will be colored black by
default. Using gene names as keys should color all isoforms, while
using gene id's as keys should color just the isoform matching the
specified id.
Returns
-------
list of pyplot axis
The newly added gene track axes, one for each row of genes.
"""
# deduce orientation, either h or v, and save the correct grange
if loc in ['bottom', 'top']:
orientation = 'horizontal'
grange = self.grange_x
else:
orientation = 'vertical'
grange = self.grange_y
# initialize data structures for packing
row_cursors = []
rows = []
# initialize the first row
rows.append([])
row_cursors.append(-10000) # basically should be -Inf
# main loop for packing genes
for gene in genes:
# skip genes that don't intersect the window
if not check_intersect(gene, grange):
continue
gene_placed = False
for i in range(len(rows)):
if gene['start'] > row_cursors[i] + padding:
row_cursors[i] = gene['end']
rows[i].append(gene)
gene_placed = True
break
if not gene_placed:
rows.append([gene])
row_cursors.append(gene['end'])
# add a track for each row
return [
self.add_gene_track(
rows[i], loc=loc, size=size,
pad=pad_before if i == 0 else pad_within,
axis_limits=axis_limits, intron_height=intron_height,
exon_height=exon_height,
new_ax_name='%s_gene_track_%i' % (orientation, i),
colors=colors)
for i in range(len(rows))
]
[docs] def add_refgene_stacks(self, assembly, size='3%', pad_before=0.0,
pad_within=0.0, axis_limits=(0, 1),
intron_height=0.05, exon_height=0.5, padding=1000,
colors=None):
"""
Adds a gene stack for a set of genes to both the bottom and left side of
the heatmap by calling ``add_refgene_stack()`` twice.
Parameters
----------
assembly : {'hg18', 'hg19', 'hg38', 'mm9', 'mm10'}
The genome assembly to load reference genes for.
size : str
The size of each new axis as a percentage of the main heatmap width.
Should be passed as a string ending in '%'.
pad_before : float
The padding to use between the existing parts of the figure and the
newly added gene tracks.
pad_within : float
The padding to use between each newly added gene track.
axis_limits : tuple of float
Axis limits for the non-genomic axis of each new gene track.
intron_height : float
Controls thickness of gene introns. Pass a larger number for thicker
introns.
exon_height : float
Controls thickness of gene exons. Pass a larger number for thicker
exons.
padding : int
The padding to use when packing genes into rows, in units of base
pairs. Genes that are within this many base pairs of each other will
get packed into different rows.
colors : dict, optional
Pass a dict mapping gene names or id's to matplotlib colors to color
code those genes. Genes not in the dict will be colored black by
default. Using gene names as keys should color all isoforms, while
using gene id's as keys should color just the isoform matching the
specified id.
Returns
-------
list of lists of pyplot axis
The first element of the outer list is a list of the newly added
horizontal gene track axes, one for each row of genes. The second
element is the same but for the newly added vertical gene track
axes.
"""
ax_hs = self.add_refgene_stack(
assembly, loc='bottom', size=size, pad_before=pad_before,
pad_within=pad_within, axis_limits=axis_limits,
intron_height=intron_height, exon_height=exon_height,
padding=padding, colors=colors)
ax_vs = self.add_refgene_stack(
assembly, loc='left', size=size, pad_before=pad_before,
pad_within=pad_within, axis_limits=axis_limits,
intron_height=intron_height, exon_height=exon_height,
padding=padding, colors=colors)
return [ax_hs, ax_vs]
[docs] def add_refgene_stack(self, assembly, loc='bottom', size='3%',
pad_before=0.0, pad_within=0.0, axis_limits=(0, 1),
intron_height=0.05, exon_height=0.5, padding=1000,
colors=None):
"""
Adds a gene stack to either the x- or y-axis of the heatmap by getting a
set of reference genes for a specified genome assembly, and then passes
that set of genes to ``add_gene_stack()``.
Parameters
----------
assembly : {'hg18', 'hg19', 'hg38', 'mm9', 'mm10'}
The genome assembly to load reference genes for.
loc : {'top', 'bottom', 'left', 'right'}
Which side of the heatmap to add the new gene tracks to.
size : str
The size of each new axis as a percentage of the main heatmap width.
Should be passed as a string ending in '%'.
pad_before : float
The padding to use between the existing parts of the figure and the
newly added gene tracks.
pad_within : float
The padding to use between each newly added gene track.
axis_limits : tuple of float
Axis limits for the non-genomic axis of each new gene track.
intron_height : float
Controls thickness of gene introns. Pass a larger number for thicker
introns.
exon_height : float
Controls thickness of gene exons. Pass a larger number for thicker
exons.
padding : int
The padding to use when packing genes into rows, in units of base
pairs. Genes that are within this many base pairs of each other will
get packed into different rows.
colors : dict, optional
Pass a dict mapping gene names or id's to matplotlib colors to color
code those genes. Genes not in the dict will be colored black by
default. Using gene names as keys should color all isoforms, while
using gene id's as keys should color just the isoform matching the
specified id.
Returns
-------
list of pyplot axis
The newly added gene track axes, one for each row of genes.
"""
# deduce orientation and save the correct grange
if loc in ['bottom', 'top']:
grange = self.grange_x
else:
grange = self.grange_y
if assembly not in ['hg18', 'hg19', 'hg38', 'mm9', 'mm10']:
raise ValueError('unrecognized assembly %s' % assembly)
directory, filename = os.path.split(__file__)
if not directory:
directory = '.'
refgene_file_name = '%d/../gene_tracks/%g_refseq_genes.gz' \
.replace('%d', directory) \
.replace('%g', assembly) \
.replace('%c', grange['chrom'])
genes = load_gene_table(refgene_file_name)[grange['chrom']]
return self.add_gene_stack(genes, loc=loc, size=size,
pad_before=pad_before,
pad_within=pad_within,
axis_limits=axis_limits,
intron_height=intron_height,
exon_height=exon_height, padding=padding,
colors=colors)