Source code for lib5c.parsers.hic
"""
Module for parsing Hi-C data from the Rao et al. 2014 paper.
"""
import itertools
import warnings
import numpy as np
from lib5c.util.parallelization import parallelize_regions
[docs]@parallelize_regions
def load_range_from_contact_matrix(contact_matrix_file, grange, region_name='',
norm_file=None):
"""
Parses a chunk of contact information out of a Hi-C contact matrix file.
The Hi-C contact matrix file format parsed by this function is the format
used in the contact matrices uploaded to GEO for the Rao et al. 2014 paper.
It is also the same format used by the Juicer tools dump command.
Parameters
----------
contact_matrix_file : str
String reference to the Hi-C contact matrix file to parse.
grange : Dict[str, Any]
The genomic range to extract contact information for. This should be
specified as a dict with the following structure::
{
'chrom': str,
'start': int,
'end': int
}
region_name : Optional[str]
Name for this genomic region. If passed, it will be used to name the
bins in the returned pixelmap.
norm_file : Optional[str]
String reference to a file containing a Hi-C bias vector corresponding
to the ``contact_matrix_file``. If passed, the data will be normalized
using this vector.
Returns
-------
Tuple[np.ndarray, List[Dict[str, Any]]]
The first element of the tuple is the extract counts matrix for the
requested genomic range. The second element of the tuple is a pixelmap
generated for this region describing the specific bins that were
extracted.
"""
# load normalization vector if provided
norm_vector = []
if norm_file is not None:
with open(norm_file, 'r') as handle:
for line in handle:
norm_vector.append(float(line.strip()))
with open(contact_matrix_file, 'r') as handle:
# guess resolution based on first two lines
lines = list(itertools.islice(handle, 2))
resolution = abs(int(lines[1].split('\t')[0]) -
int(lines[0].split('\t')[0]))
if resolution == 0:
resolution = abs(int(lines[1].split('\t')[1]) -
int(lines[0].split('\t')[1]))
# warn if rounding genomic range to resolution
if grange['start'] % resolution:
warnings.warn('rounding %s to next lowest bin' % grange['start'])
grange['start'] = (grange['start'] / resolution) * resolution
if grange['end'] % resolution:
warnings.warn('rounding %s to next largest bin' % grange['end'])
grange['end'] = (grange['end'] / resolution + 1) * resolution
# compute coordinate transformation
size = int((grange['end'] - grange['start']) / resolution)
# initialize array
array = np.zeros([size, size])
# reset file position following islice call
handle.seek(0)
# parse
for line in handle:
pieces = line.strip().split('\t')
left = int(pieces[0])
right = int(pieces[1])
if (grange['start'] <= left < grange['end']) and \
(grange['start'] <= right < grange['end']):
value = float(pieces[2])
if norm_vector:
factor = (norm_vector[int(left / resolution)] *
norm_vector[int(right / resolution)])
if np.isfinite(factor) and factor > 0:
value /= factor
array[int((left - grange['start']) / resolution),
int((right - grange['start']) / resolution)] = value
array[int((right - grange['start']) / resolution),
int((left - grange['start']) / resolution)] = value
# make regional pixelmap
regional_pixelmap = [{'name' : '%s_BIN_%03d' %
(region_name,
int((i - grange['start']) / resolution)),
'chrom': grange['chrom'],
'start': i,
'end' : i + resolution}
for i in range(grange['start'],
grange['end'],
resolution)]
return array, regional_pixelmap