Source code for lib5c.tools.heatmap
from lib5c.tools.parents import level_parser, simple_in_out_parser, \
parallelization_parser, region_parser, primerfile_parser
[docs]def add_heatmap_tool(parser):
heatmap_parser = parser.add_parser(
'heatmap',
prog='lib5c plot heatmap',
help='plot interaction frequency heatmaps',
parents=[primerfile_parser, level_parser, simple_in_out_parser,
region_parser, parallelization_parser]
)
heatmap_parser.add_argument(
'-c', '--colormap',
type=str,
default='obs',
help='''Specify the colormap to use. Special values include 'obs',
'abs_obs', 'is', 'obs_over_exp', and 'pvalue'. The default is 'obs'.''')
heatmap_parser.add_argument(
'-s', '--scale',
type=str,
help='''Specify the colorscale as a string literal of the form
'(min,max)'. You can write formulas which include the special symbols
'min', 'max', 'mu' (mean), 'sigma' (standard deviation),
'p95' (95th percentile), or 'p98' (98th percentile), such as
'(mu-2.5*sigma,mu+2.5*sigma)'. Alternatively, pass a path to a
colorscale file as produced by `lib5c colorscale`. Pass nothing to
visualize the heatmap on a percentile rank scale.''')
heatmap_parser.add_argument(
'-g', '--genes',
type=str,
help='''If plotting a bin-level heatmap, pass one of 'mm9', 'mm10',
'hg18', 'hg19', or 'hg38' to add gene tracks for the selected reference
genome.''')
heatmap_parser.add_argument(
'-C', '--colorbar',
action='store_true',
help='''Include a colorbar next to the heatmap.''')
heatmap_parser.add_argument(
'-R', '--rulers',
action='store_true',
help='''Include genomic coordinate rulers on the heatmap.''')
heatmap_parser.add_argument(
'-P', '--pvalue',
action='store_true',
help='''Shortcut for -c pvalue -s '(0,1)', useful for plotting
p-values.''')
heatmap_parser.add_argument(
'-T', '--tetris',
action='store_true',
help='''Pass this flag to draw classified interactions in different
colors. Overrides -c/--colormap and -s/--colorscale.''')
heatmap_parser.add_argument(
'-b', '--log_base',
type=str,
default='None',
help='''Pass this flag to log the input data using the given base before
visualizing. The default is None, which applies no logging.''')
heatmap_parser.add_argument(
'-e', '--pseudocount',
type=float,
default=1.0,
help='''Pass this flag to specify a psuedocount to use before logging
the data. The default is 1.0.''')
heatmap_parser.add_argument(
'-x', '--x_zoom',
type=str,
help='''Specify a genomic range to zoom in the x-axis on of the form
'chr:start-end'. You must also pass -r/--region.''')
heatmap_parser.add_argument(
'-y', '--y_zoom',
type=str,
help='''Specify a genomic range to zoom in the y-axis on of the form
'chr:start-end'. If you pass -x/--x_zoom but not -y/--y_zoom the zoom
window is assumed to be on-diagonal (x- and y-axes are the same).''')
heatmap_parser.add_argument(
'-t', '--tracks',
type=str,
help='''Pass a comma-separated list of bigwig files to plot as chipseq
tracks.''')
heatmap_parser.add_argument(
'-d', '--domains',
type=str,
help='''Pass a path to a bedfile of contact domains to outline them on
the heatmap.''')
heatmap_parser.set_defaults(func=heatmap_tool)
[docs]def heatmap_tool(parser, args):
import glob
import numpy as np
from lib5c.tools.helpers import resolve_level, resolve_parallel, \
split_self_regionally, resolve_primerfile
from lib5c.parsers.primers import load_primermap
from lib5c.parsers.counts import load_counts
from lib5c.parsers.genes import load_genes
from lib5c.parsers.bed import load_features
from lib5c.plotters.colormaps import get_colormap
from lib5c.plotters.heatmap import plot_heatmap
from lib5c.plotters.queried_counts_heatmap import \
plot_queried_counts_heatmap
from lib5c.util.counts import extract_queried_counts, flip_pvalues, \
regional_counts_to_pvalues, queried_counts_to_pvalues,\
flatten_regional_counts, log_regional_counts, parallel_log_counts
from lib5c.util.ast_eval import eval_expr
from lib5c.util.bed import parse_feature_from_string
from lib5c.util.slicing import slice_matrix_by_grange
from lib5c.parsers.config import parse_config
# resolve primerfile and level, load primermap
primerfile = resolve_primerfile(args.infile, args.primerfile)
primermap = load_primermap(primerfile)
resolved_level = resolve_level(primermap, args.level)
# -s shortcuts
if args.scale == 'obs':
args.scale = '(min,p98)'
if args.scale == 'obs_over_exp':
args.scale = '(mu-2.5*sigma,mu+2.5*sigma)'
# special check between resolving the level and resolving parallel:
# we will check to see if args.scale contains any of the keywords (min, max,
# mu, sigma), and if it does we will parse all the counts and evaluate the
# expression
if args.scale is not None and\
any(keyword in args.scale
for keyword in ['min', 'max', 'mu', 'sigma', 'p95', 'p98']):
if args.region is None:
split_self_regionally(
list(primermap.keys()), script='lib5c plot heatmap',
hang=args.hang)
print(('precomputing scale for region %s' % args.region))
expanded_infiles = glob.glob(args.infile.strip('\'"'))
regional_counts_superdict = {
infile: load_counts(infile, primermap)[args.region]
for infile in expanded_infiles
}
if args.log_base != 'None':
regional_counts_superdict = {
infile: log_regional_counts(regional_counts_superdict[infile],
pseudocount=args.pseudocount,
base=args.log_base)
for infile in expanded_infiles}
flattened_regional_counts = {
infile: flatten_regional_counts(regional_counts_superdict[infile],
discard_nan=True)
for infile in expanded_infiles}
mus = [np.mean(flattened_regional_counts[infile])
for infile in expanded_infiles]
sigmas = [np.std(flattened_regional_counts[infile])
for infile in expanded_infiles]
mins = [np.min(flattened_regional_counts[infile])
for infile in expanded_infiles]
maxs = [np.max(flattened_regional_counts[infile])
for infile in expanded_infiles]
p98s = [np.percentile(flattened_regional_counts[infile], 98)
for infile in expanded_infiles]
p95s = [np.percentile(flattened_regional_counts[infile], 95)
for infile in expanded_infiles]
variables = {'mu': np.mean(mus),
'sigma': np.mean(sigmas),
'min': np.mean(mins),
'max': np.mean(maxs),
'p95': np.mean(p95s),
'p98': np.mean(p98s)}
pieces = list(map(str.strip, args.scale.strip('()').split(',')))
left = eval_expr(pieces[0], variables=variables)
right = eval_expr(pieces[1], variables=variables)
args.scale = '(%s,%s)' % (left, right)
# resolve parallel
resolve_parallel(parser, args, subcommand='plot heatmap')
# load counts
print('loading counts')
counts = load_counts(args.infile, primermap)
# support logging
if args.log_base != 'None':
counts = parallel_log_counts(counts, pseudocount=args.pseudocount,
base=args.log_base)
print('preparing to plot')
# resolve region
if args.region is not None:
counts = counts[args.region]
primermap = primermap[args.region]
# compute queried counts if appropriate
if resolved_level == 'fragment':
counts, primermap_x, primermap_y = extract_queried_counts(counts,
primermap)
else:
primermap_x = primermap
primermap_y = primermap
# parse colorscale from the parameter value
if args.tetris:
colorscale = None
elif args.scale is None:
if resolved_level == 'bin':
counts = regional_counts_to_pvalues(counts)
else:
counts = queried_counts_to_pvalues(counts)
counts = flip_pvalues(counts)
colorscale = (0, 0.98)
elif '(' in args.scale and ')' in args.scale and ',' in args.scale:
colorscale = list(map(float, args.scale.strip('()').split(',')))
else:
colorscale = parse_config(args.scale, 'colorscales')
if args.region is not None:
colorscale = colorscale[args.region]
# resolve colormap
if args.tetris:
cmap = None
elif args.pvalue:
cmap = 'pvalue'
colorscale = (0, 1)
else:
cmap = get_colormap(args.colormap)
# resolve outfile
if args.region:
resolved_outfile = args.outfile
else:
resolved_outfile = {region: args.outfile.replace(r'%r', region)
for region in counts.keys()}
# resolve genes
if type(args.genes) == str and args.genes not in ['mm9', 'mm10', 'hg18',
'hg19', 'hg38']:
genes_dict = load_genes(args.genes)
if args.region:
resolved_genes = genes_dict[primermap[0]['chrom']]
else:
resolved_genes = {region: genes_dict[primermap[region][0]['chrom']]
for region in primermap}
else:
resolved_genes = args.genes
# resolve tracks
resolved_tracks = list(map(str.strip, args.tracks.strip('()').split(','))) \
if args.tracks is not None else None
# resolve domains
if args.domains is None:
resolved_domains = None
elif args.region is not None:
resolved_domains = load_features(
args.domains)[primermap[0]['chrom']]
else:
resolved_domains = {
region: load_features(args.domains)[primermap[region][0]['chrom']]
for region in primermap
}
# resolve zoom window query
if args.region is not None and args.x_zoom is not None:
grange_x = parse_feature_from_string(args.x_zoom)
if args.y_zoom is not None:
grange_y = parse_feature_from_string(args.y_zoom)
else:
grange_y = grange_x
# this works only because args.region is defined, which means that
# primermap_x and primermap_y are guaranteed to be regional here
counts, grange_x, grange_y = slice_matrix_by_grange(
counts, regional_primermap_x=primermap_x, grange_x=grange_x,
regional_primermap_y=primermap_y, grange_y=grange_y)
else:
# everything below is technically wrong for fragment-level heatmaps,
# but these variables are not passed to the fragment-level heatmap
# plotter anyway
if args.region is None:
grange_x = {region: {'chrom': primermap[region][0]['chrom'],
'start': primermap[region][0]['start'],
'end': primermap[region][-1]['end']}
for region in primermap}
grange_y = grange_x
else:
grange_x = {'chrom': primermap[0]['chrom'],
'start': primermap[0]['start'],
'end': primermap[-1]['end']}
grange_y = grange_x
# plot heatmap
print('plotting')
if resolved_level == 'bin':
plot_heatmap(
matrix=counts, grange_x=grange_x, grange_y=grange_y,
rulers=args.rulers, genes=resolved_genes, colorscale=colorscale,
colorbar=args.colorbar, colormap=cmap, tracks=resolved_tracks,
domains=resolved_domains, outfile=resolved_outfile)
else:
plot_queried_counts_heatmap(
counts, resolved_outfile, colorscale=colorscale,
colorbar=args.colorbar, cmap=cmap)