Source code for lib5c.parsers.hic

"""
Module for parsing Hi-C data from the Rao et al. 2014 paper.
"""

import itertools
import warnings

import numpy as np

from lib5c.util.parallelization import parallelize_regions


[docs]@parallelize_regions def load_range_from_contact_matrix(contact_matrix_file, grange, region_name='', norm_file=None): """ Parses a chunk of contact information out of a Hi-C contact matrix file. The Hi-C contact matrix file format parsed by this function is the format used in the contact matrices uploaded to GEO for the Rao et al. 2014 paper. It is also the same format used by the Juicer tools dump command. Parameters ---------- contact_matrix_file : str String reference to the Hi-C contact matrix file to parse. grange : Dict[str, Any] The genomic range to extract contact information for. This should be specified as a dict with the following structure:: { 'chrom': str, 'start': int, 'end': int } region_name : Optional[str] Name for this genomic region. If passed, it will be used to name the bins in the returned pixelmap. norm_file : Optional[str] String reference to a file containing a Hi-C bias vector corresponding to the ``contact_matrix_file``. If passed, the data will be normalized using this vector. Returns ------- Tuple[np.ndarray, List[Dict[str, Any]]] The first element of the tuple is the extract counts matrix for the requested genomic range. The second element of the tuple is a pixelmap generated for this region describing the specific bins that were extracted. """ # load normalization vector if provided norm_vector = [] if norm_file is not None: with open(norm_file, 'r') as handle: for line in handle: norm_vector.append(float(line.strip())) with open(contact_matrix_file, 'r') as handle: # guess resolution based on first two lines lines = list(itertools.islice(handle, 2)) resolution = abs(int(lines[1].split('\t')[0]) - int(lines[0].split('\t')[0])) if resolution == 0: resolution = abs(int(lines[1].split('\t')[1]) - int(lines[0].split('\t')[1])) # warn if rounding genomic range to resolution if grange['start'] % resolution: warnings.warn('rounding %s to next lowest bin' % grange['start']) grange['start'] = (grange['start'] / resolution) * resolution if grange['end'] % resolution: warnings.warn('rounding %s to next largest bin' % grange['end']) grange['end'] = (grange['end'] / resolution + 1) * resolution # compute coordinate transformation size = int((grange['end'] - grange['start']) / resolution) # initialize array array = np.zeros([size, size]) # reset file position following islice call handle.seek(0) # parse for line in handle: pieces = line.strip().split('\t') left = int(pieces[0]) right = int(pieces[1]) if (grange['start'] <= left < grange['end']) and \ (grange['start'] <= right < grange['end']): value = float(pieces[2]) if norm_vector: factor = (norm_vector[int(left / resolution)] * norm_vector[int(right / resolution)]) if np.isfinite(factor) and factor > 0: value /= factor array[int((left - grange['start']) / resolution), int((right - grange['start']) / resolution)] = value array[int((right - grange['start']) / resolution), int((left - grange['start']) / resolution)] = value # make regional pixelmap regional_pixelmap = [{'name' : '%s_BIN_%03d' % (region_name, int((i - grange['start']) / resolution)), 'chrom': grange['chrom'], 'start': i, 'end' : i + resolution} for i in range(grange['start'], grange['end'], resolution)] return array, regional_pixelmap