Source code for lib5c.contrib.interlap.util

"""
Module containing utilities for interfacing with InterLap objects from the
external ``interlap`` Python package, which provides efficient binary interval
search for finding overlapping genomic features.
"""

from interlap import InterLap


[docs]def features_to_interlaps(features, chroms=None): """ Converts feature dicts to InterLap objects. Parameters ---------- features : dict of list of dict The keys of the outer dict should be chromosome names as strings. The values of the outer dict represent lists of features found on that chromosome. The inner dicts represent individual genomic features, with at least the following keys:: { 'chrom': str, 'start': int, 'end' : int } See ``lib5c.parsers.load_features()`` for more information. chroms : list of str, optional To create InterLap objects for only specified chromosomes, pass a list of their names. Pass None to create InterLap objects for all chromosomes. Returns ------- dict of InterLap The keys are chromosome names as strings, the values are InterLap objects containing all features on the chromosome. The original feature dicts are saved in the data element of each interval in the InterLap. """ # resolve chroms if chroms is None: chroms = list(features.keys()) return {chrom: InterLap([(f['start'], f['end'] - 1, f) for f in features[chrom]]) for chrom in chroms}
[docs]def query_interlap(interlap, query_feature): """ Searches an InterLap object to find features that overlap a given query feature. Parameters ---------- interlap : InterLap The InterLap object to search. Each interval in the InterLap object must have a data element, see ``lib5c.contrib.interlap.util.features_to_interlaps()``. query_feature : dict Dict representing the genomic region in which to search for overlapping features. Must have at least the following keys:: { 'chrom': str, 'start': int, 'end' : int } Returns ------- list of dict Each dict in the list represents a feature found in the InterLap object that overlaps the query feature. """ hits = interlap.find((query_feature['start'], query_feature['end'] - 1)) return [hit[2] for hit in hits]