# Source code for lib5c.tools.hic_extract

def add_hic_extract_tool(parser):
    """
    Register the ``hic-extract`` subcommand and its arguments.

    Parameters
    ----------
    parser : argparse subparsers object
        Object exposing ``add_parser()`` (what
        ``ArgumentParser.add_subparsers()`` returns). The new subcommand is
        added to it in place.

    Notes
    -----
    The created subparser gets ``func=hic_extract_tool`` as a default, so the
    parsed namespace carries the function that implements the subcommand.
    """
    hic_extract_parser = parser.add_parser(
        'hic-extract',
        prog='lib5c hic-extract',
        help='extract chunks from Hi-C data'
    )
    # argparse %-formats help strings, so a literal "%c" must be written as
    # "%%c" in the help text below.
    hic_extract_parser.add_argument(
        'matrix',
        type=str,
        help='''Path to contact matrix file. %%c will be replaced by the
        chromosome name if multiple files are necessary.''')
    hic_extract_parser.add_argument(
        'range',
        type=str,
        help='''Genomic range to extract, in the form 'chrom:start-end'. Pass a
        path to a tab-separated file whose columns are region names and ranges
        to extract multiple named ranges.''')
    hic_extract_parser.add_argument(
        'output_countsfile',
        type=str,
        help='''Path to write extracted counts to.''')
    hic_extract_parser.add_argument(
        'output_bedfile',
        type=str,
        help='''Path to write information about extracted bins to.''')
    hic_extract_parser.add_argument(
        '-b', '--bias_vector_file',
        type=str,
        help='''Path to file containing bias vector that counts will be divided
        by before being written. %%c will be replaced by the chromosome name if
        multiple files are necessary.''')
    hic_extract_parser.set_defaults(func=hic_extract_tool)


def parse_range_string(range_string):
    """
    Parse a genomic range string of the form ``chrom:start-end``.

    Parameters
    ----------
    range_string : str
        The range to parse, e.g. ``'chr3:34100000-35100000'``. Surrounding
        whitespace is ignored and thousands separators (``','``) inside the
        coordinates are tolerated. The chromosome name is everything before
        the last ``':'``, so names that themselves contain colons are
        accepted.

    Returns
    -------
    dict
        A dict with the keys ``'chrom'`` (str), ``'start'`` (int) and
        ``'end'`` (int).

    Raises
    ------
    ValueError
        If ``range_string`` is not of the form ``chrom:start-end`` with
        integer coordinates.
    """
    message = ("invalid genomic range %r: expected the form "
               "'chrom:start-end'" % (range_string,))

    # split on the LAST colon so the coordinates are always the final field
    chrom, colon, start_end = range_string.strip().rpartition(':')
    if not colon or not chrom:
        raise ValueError(message)

    coords = start_end.split('-')
    if len(coords) != 2:
        raise ValueError(message)

    try:
        start = int(coords[0].replace(',', ''))
        end = int(coords[1].replace(',', ''))
    except ValueError:
        # re-raise with a message that names the offending input
        raise ValueError(message)

    return {'chrom': chrom, 'start': start, 'end': end}


def _read_named_ranges(path):
    """
    Read named genomic ranges from a tab-separated file.

    Each data line holds a region name in the first column and a
    ``chrom:start-end`` range string in the second. Lines starting with
    ``'#'`` and blank lines are skipped.

    Parameters
    ----------
    path : str
        Path to the tab-separated ranges file.

    Returns
    -------
    dict
        Maps each region name to the dict returned by
        ``parse_range_string()`` for that line's range.

    Raises
    ------
    ValueError
        If a data line has fewer than two tab-separated columns.
    """
    ranges = {}
    with open(path, 'r') as handle:
        for line_number, line in enumerate(handle, 1):
            stripped = line.strip()
            # blank lines (e.g. a trailing newline at the end of the file)
            # carry no range and must not be indexed into
            if line.startswith('#') or not stripped:
                continue
            pieces = stripped.split('\t')
            if len(pieces) < 2:
                raise ValueError(
                    '%s, line %d: expected a region name and a range '
                    'separated by a tab' % (path, line_number))
            ranges[pieces[0]] = parse_range_string(pieces[1])
    return ranges


def hic_extract_tool(parser, args):
    """
    Implement the ``hic-extract`` subcommand.

    Resolves the requested range(s), substitutes each region's chromosome
    name for ``%c`` in the matrix (and optional bias vector) path, loads the
    data via ``load_range_from_contact_matrix()`` and writes the results with
    ``write_counts()`` and ``write_primermap()``.

    Parameters
    ----------
    parser : object
        Unused by this function.
    args : argparse.Namespace
        Parsed arguments. The attributes read are ``range``, ``matrix``,
        ``bias_vector_file``, ``output_countsfile`` and ``output_bedfile``.

    Notes
    -----
    ``args.range`` is treated as a literal ``chrom:start-end`` range (stored
    under the region name ``'unnamed_region'``) when it contains ``':'`` and
    does not name an existing file; otherwise it is read as a tab-separated
    file of named ranges.
    """
    import os

    from lib5c.parsers.hic import load_range_from_contact_matrix
    from lib5c.writers.counts import write_counts
    from lib5c.writers.primers import write_primermap

    # resolve ranges
    # An existing file takes priority over the ':' heuristic, so paths that
    # contain a colon (e.g. Windows drive letters) are still read as files.
    if ':' in args.range and not os.path.isfile(args.range):
        ranges = {'unnamed_region': parse_range_string(args.range)}
    else:
        ranges = _read_named_ranges(args.range)

    # resolve matrix: one path per region, with %c -> chromosome name
    matrices = {region: args.matrix.replace('%c', ranges[region]['chrom'])
                for region in ranges}

    # resolve bias_vectors (left as None when no bias vector file was given)
    bias_vectors = None
    if args.bias_vector_file is not None:
        bias_vectors = {
            region: args.bias_vector_file.replace('%c', ranges[region]['chrom'])
            for region in ranges
        }

    # resolve region names: each region is named after its own key
    region_names = {region: region for region in ranges}

    # parse
    counts, pixelmap = load_range_from_contact_matrix(
        matrices, ranges, region_name=region_names, norm_file=bias_vectors)

    # write
    write_counts(counts, args.output_countsfile, pixelmap)
    write_primermap(pixelmap, args.output_bedfile)