Source code for lib5c.tools.heatmap

from lib5c.tools.parents import level_parser, simple_in_out_parser, \
    parallelization_parser, region_parser, primerfile_parser


[docs]def add_heatmap_tool(parser):
    heatmap_parser = parser.add_parser(
        'heatmap',
        prog='lib5c plot heatmap',
        help='plot interaction frequency heatmaps',
        parents=[primerfile_parser, level_parser, simple_in_out_parser,
                 region_parser, parallelization_parser]
    )
    heatmap_parser.add_argument(
        '-c', '--colormap',
        type=str,
        default='obs',
        help='''Specify the colormap to use. Special values include 'obs',
        'abs_obs', 'is', 'obs_over_exp', and 'pvalue'. The default is 'obs'.''')
    heatmap_parser.add_argument(
        '-s', '--scale',
        type=str,
        help='''Specify the colorscale as a string literal of the form
        '(min,max)'. You can write formulas which include the special symbols
        'min', 'max', 'mu' (mean), 'sigma' (standard deviation),
        'p95' (95th percentile), or 'p98' (98th percentile), such as
        '(mu-2.5*sigma,mu+2.5*sigma)'. Alternatively, pass a path to a
        colorscale file as produced by `lib5c colorscale`. Pass nothing to
        visualize the heatmap on a percentile rank scale.''')
    heatmap_parser.add_argument(
        '-g', '--genes',
        type=str,
        help='''If plotting a bin-level heatmap, pass one of 'mm9', 'mm10',
        'hg18', 'hg19', or 'hg38' to add gene tracks for the selected reference
        genome.''')
    heatmap_parser.add_argument(
        '-C', '--colorbar',
        action='store_true',
        help='''Include a colorbar next to the heatmap.''')
    heatmap_parser.add_argument(
        '-R', '--rulers',
        action='store_true',
        help='''Include genomic coordinate rulers on the heatmap.''')
    heatmap_parser.add_argument(
        '-P', '--pvalue',
        action='store_true',
        help='''Shortcut for -c pvalue -s '(0,1)', useful for plotting
        p-values.''')
    heatmap_parser.add_argument(
        '-T', '--tetris',
        action='store_true',
        help='''Pass this flag to draw classified interactions in different
        colors. Overrides -c/--colormap and -s/--colorscale.''')
    heatmap_parser.add_argument(
        '-b', '--log_base',
        type=str,
        default='None',
        help='''Pass this flag to log the input data using the given base before
        visualizing. The default is None, which applies no logging.''')
    heatmap_parser.add_argument(
        '-e', '--pseudocount',
        type=float,
        default=1.0,
        help='''Pass this flag to specify a psuedocount to use before logging
        the data. The default is 1.0.''')
    heatmap_parser.add_argument(
        '-x', '--x_zoom',
        type=str,
        help='''Specify a genomic range to zoom in the x-axis on of the form
        'chr:start-end'. You must also pass -r/--region.''')
    heatmap_parser.add_argument(
        '-y', '--y_zoom',
        type=str,
        help='''Specify a genomic range to zoom in the y-axis on of the form
        'chr:start-end'. If you pass -x/--x_zoom but not -y/--y_zoom the zoom
        window is assumed to be on-diagonal (x- and y-axes are the same).''')
    heatmap_parser.add_argument(
        '-t', '--tracks',
        type=str,
        help='''Pass a comma-separated list of bigwig files to plot as chipseq
        tracks.''')
    heatmap_parser.add_argument(
        '-d', '--domains',
        type=str,
        help='''Pass a path to a bedfile of contact domains to outline them on
        the heatmap.''')
    heatmap_parser.set_defaults(func=heatmap_tool)


[docs]def heatmap_tool(parser, args):
    import glob

    import numpy as np

    from lib5c.tools.helpers import resolve_level, resolve_parallel, \
        split_self_regionally, resolve_primerfile
    from lib5c.parsers.primers import load_primermap
    from lib5c.parsers.counts import load_counts
    from lib5c.parsers.genes import load_genes
    from lib5c.parsers.bed import load_features
    from lib5c.plotters.colormaps import get_colormap
    from lib5c.plotters.heatmap import plot_heatmap
    from lib5c.plotters.queried_counts_heatmap import \
        plot_queried_counts_heatmap
    from lib5c.util.counts import extract_queried_counts, flip_pvalues, \
        regional_counts_to_pvalues, queried_counts_to_pvalues,\
        flatten_regional_counts, log_regional_counts, parallel_log_counts
    from lib5c.util.ast_eval import eval_expr
    from lib5c.util.bed import parse_feature_from_string
    from lib5c.util.slicing import slice_matrix_by_grange
    from lib5c.parsers.config import parse_config

    # resolve primerfile and level, load primermap
    primerfile = resolve_primerfile(args.infile, args.primerfile)
    primermap = load_primermap(primerfile)
    resolved_level = resolve_level(primermap, args.level)

    # -s shortcuts
    if args.scale == 'obs':
        args.scale = '(min,p98)'
    if args.scale == 'obs_over_exp':
        args.scale = '(mu-2.5*sigma,mu+2.5*sigma)'

    # special check between resolving the level and resolving parallel:
    # we will check to see if args.scale contains any of the keywords (min, max,
    # mu, sigma), and if it does we will parse all the counts and evaluate the
    # expression
    if args.scale is not None and\
            any(keyword in args.scale
                for keyword in ['min', 'max', 'mu', 'sigma', 'p95', 'p98']):
        if args.region is None:
            split_self_regionally(
                list(primermap.keys()), script='lib5c plot heatmap',
                hang=args.hang)
        print(('precomputing scale for region %s' % args.region))
        expanded_infiles = glob.glob(args.infile.strip('\'"'))
        regional_counts_superdict = {
            infile: load_counts(infile, primermap)[args.region]
            for infile in expanded_infiles
        }
        if args.log_base != 'None':
            regional_counts_superdict = {
                infile: log_regional_counts(regional_counts_superdict[infile],
                                            pseudocount=args.pseudocount,
                                            base=args.log_base)
                for infile in expanded_infiles}
        flattened_regional_counts = {
            infile: flatten_regional_counts(regional_counts_superdict[infile],
                                            discard_nan=True)
            for infile in expanded_infiles}
        mus = [np.mean(flattened_regional_counts[infile])
               for infile in expanded_infiles]
        sigmas = [np.std(flattened_regional_counts[infile])
                  for infile in expanded_infiles]
        mins = [np.min(flattened_regional_counts[infile])
                for infile in expanded_infiles]
        maxs = [np.max(flattened_regional_counts[infile])
                for infile in expanded_infiles]
        p98s = [np.percentile(flattened_regional_counts[infile], 98)
                for infile in expanded_infiles]
        p95s = [np.percentile(flattened_regional_counts[infile], 95)
                for infile in expanded_infiles]
        variables = {'mu': np.mean(mus),
                     'sigma': np.mean(sigmas),
                     'min': np.mean(mins),
                     'max': np.mean(maxs),
                     'p95': np.mean(p95s),
                     'p98': np.mean(p98s)}
        pieces = list(map(str.strip, args.scale.strip('()').split(',')))
        left = eval_expr(pieces[0], variables=variables)
        right = eval_expr(pieces[1], variables=variables)
        args.scale = '(%s,%s)' % (left, right)

    # resolve parallel
    resolve_parallel(parser, args, subcommand='plot heatmap')

    # load counts
    print('loading counts')
    counts = load_counts(args.infile, primermap)

    # support logging
    if args.log_base != 'None':
        counts = parallel_log_counts(counts, pseudocount=args.pseudocount,
                                     base=args.log_base)

    print('preparing to plot')
    # resolve region
    if args.region is not None:
        counts = counts[args.region]
        primermap = primermap[args.region]

    # compute queried counts if appropriate
    if resolved_level == 'fragment':
        counts, primermap_x, primermap_y = extract_queried_counts(counts,
                                                                  primermap)
    else:
        primermap_x = primermap
        primermap_y = primermap

    # parse colorscale from the parameter value
    if args.tetris:
        colorscale = None
    elif args.scale is None:
        if resolved_level == 'bin':
            counts = regional_counts_to_pvalues(counts)
        else:
            counts = queried_counts_to_pvalues(counts)
        counts = flip_pvalues(counts)
        colorscale = (0, 0.98)
    elif '(' in args.scale and ')' in args.scale and ',' in args.scale:
        colorscale = list(map(float, args.scale.strip('()').split(',')))
    else:
        colorscale = parse_config(args.scale, 'colorscales')
        if args.region is not None:
            colorscale = colorscale[args.region]

    # resolve colormap
    if args.tetris:
        cmap = None
    elif args.pvalue:
        cmap = 'pvalue'
        colorscale = (0, 1)
    else:
        cmap = get_colormap(args.colormap)

    # resolve outfile
    if args.region:
        resolved_outfile = args.outfile
    else:
        resolved_outfile = {region: args.outfile.replace(r'%r', region)
                            for region in counts.keys()}

    # resolve genes
    if type(args.genes) == str and args.genes not in ['mm9', 'mm10', 'hg18',
                                                      'hg19', 'hg38']:
        genes_dict = load_genes(args.genes)
        if args.region:
            resolved_genes = genes_dict[primermap[0]['chrom']]
        else:
            resolved_genes = {region: genes_dict[primermap[region][0]['chrom']]
                              for region in primermap}
    else:
        resolved_genes = args.genes

    # resolve tracks
    resolved_tracks = list(map(str.strip, args.tracks.strip('()').split(','))) \
        if args.tracks is not None else None

    # resolve domains
    if args.domains is None:
        resolved_domains = None
    elif args.region is not None:
        resolved_domains = load_features(
            args.domains)[primermap[0]['chrom']]
    else:
        resolved_domains = {
            region: load_features(args.domains)[primermap[region][0]['chrom']]
            for region in primermap
        }

    # resolve zoom window query
    if args.region is not None and args.x_zoom is not None:
        grange_x = parse_feature_from_string(args.x_zoom)
        if args.y_zoom is not None:
            grange_y = parse_feature_from_string(args.y_zoom)
        else:
            grange_y = grange_x
        # this works only because args.region is defined, which means that
        # primermap_x and primermap_y are guaranteed to be regional here
        counts, grange_x, grange_y = slice_matrix_by_grange(
            counts, regional_primermap_x=primermap_x, grange_x=grange_x,
            regional_primermap_y=primermap_y, grange_y=grange_y)
    else:
        # everything below is technically wrong for fragment-level heatmaps,
        # but these variables are not passed to the fragment-level heatmap
        # plotter anyway
        if args.region is None:
            grange_x = {region: {'chrom': primermap[region][0]['chrom'],
                                 'start': primermap[region][0]['start'],
                                 'end': primermap[region][-1]['end']}
                        for region in primermap}
            grange_y = grange_x
        else:
            grange_x = {'chrom': primermap[0]['chrom'],
                        'start': primermap[0]['start'],
                        'end': primermap[-1]['end']}
            grange_y = grange_x

    # plot heatmap
    print('plotting')
    if resolved_level == 'bin':
        plot_heatmap(
            matrix=counts, grange_x=grange_x, grange_y=grange_y,
            rulers=args.rulers, genes=resolved_genes, colorscale=colorscale,
            colorbar=args.colorbar, colormap=cmap, tracks=resolved_tracks,
            domains=resolved_domains, outfile=resolved_outfile)
    else:
        plot_queried_counts_heatmap(
            counts, resolved_outfile, colorscale=colorscale,
            colorbar=args.colorbar, cmap=cmap)