# Source code for lib5c.util.bed

"""
Module containing utilities for manipulating BED files and BED features.

BED features are commonly represented as dicts with the following structure::

    {
        'chrom': str,
        'start': int,
        'end'  : int,
    }

but may also contain additional fields.
"""

import re


# Matches genomic range strings of the form <chrom>:<start>-<end>, capturing
# the chromosome name, the start coordinate, and the end coordinate as three
# groups. A raw string is used so that the regex escapes (\w, \d) are passed
# to the regex engine verbatim instead of being parsed as (invalid) Python
# string escape sequences.
GRANGE_PATTERN = re.compile(r'(\w+):(\d+)-(\d+)')


def count_intersections(query_feature, feature_set):
    """
    Counts how many features in a set intersect a query feature.

    Parameters
    ----------
    query_feature : Dict[str, Any]
        The feature to count intersections for.
    feature_set : List[Dict[str, Any]]
        The features to test against the query feature.

    Returns
    -------
    int
        The number of features in ``feature_set`` that intersect
        ``query_feature`` according to ``check_intersect()``.

    Notes
    -----
    Features are represented as dicts with the following structure::

        {
            'chrom': str,
            'start': int,
            'end'  : int,
        }

    See ``lib5c.parsers.bed.load_features()``.
    """
    # one tally per intersecting feature; an empty feature_set yields 0
    return sum(
        1 for candidate in feature_set
        if check_intersect(query_feature, candidate)
    )
def check_intersect(a, b):
    """
    Tests whether two features overlap on the same chromosome.

    Parameters
    ----------
    a, b : Dict[str, Any]
        The two features to check for intersection.

    Returns
    -------
    bool
        True if both features have the same 'chrom' and each one ends strictly
        after the other starts, False otherwise. Features that merely touch
        (one's 'end' equals the other's 'start') do not intersect.

    Notes
    -----
    Features are represented as dicts with the following structure::

        {
            'chrom': str,
            'start': int,
            'end'  : int,
        }

    See ``lib5c.parsers.bed.load_features()``.
    """
    # features on different chromosomes can never intersect
    if not a['chrom'] == b['chrom']:
        return False

    # strict inequalities: abutting intervals are not counted as overlapping
    return bool(a['end'] > b['start'] and b['end'] > a['start'])
def flatten_features(features):
    """
    Flattens a by-chromosome features dict into a single list of features.

    BED features are typically kept in dicts organized by chromosome; this is
    the data structure returned by ``lib5c.parsers.bed.load_features()``. This
    function converts such a dict into one flat list.

    Parameters
    ----------
    features : Dict[str, List[Dict[str, Any]]]
        The keys are chromosome names. The values are lists of features for
        that chromosome. The features are represented as dicts with at least
        the following keys::

            {
                'start': int,
                'end'  : int
            }

    Returns
    -------
    List[Dict[str, Any]]
        The same feature dicts that were passed in (not copies), in the
        iteration order of ``features``, each with the following keys::

            {
                'chrom': str,
                'start': int,
                'end'  : int
            }

        as well as any additional keys that were already present on them.

    Notes
    -----
    The feature dicts are modified in place: each one has its 'chrom' key set
    to the chromosome name it was filed under, overwriting any existing value.
    """
    flat = []
    for chrom_name, chrom_features in features.items():
        for entry in chrom_features:
            # tag the feature with its chromosome; this mutates the input dict
            entry['chrom'] = chrom_name
            flat.append(entry)
    return flat
def get_midpoint(fragment, force_int=False):
    """
    Computes the midpoint of a fragment.

    Parameters
    ----------
    fragment : Dict[str, Any]
        The fragment to find the midpoint of. The fragment must be represented
        as a dict with at least the following keys::

            {
                'start': int,
                'end': int
            }

    force_int : bool
        Pass True to get the midpoint back as an int, truncated towards zero,
        instead of as a float.

    Returns
    -------
    float or int
        The midpoint of the fragment: a float by default, or an int truncated
        towards zero if force_int is True.

    Examples
    --------
    >>> fragment = {'start': 50, 'end': 100}
    >>> get_midpoint(fragment)
    75.0
    >>> get_midpoint({'start': 1, 'end': 4}, force_int=True)
    2
    """
    # dividing by 2.0 keeps this a float division
    midpoint = (fragment['start'] + fragment['end']) / 2.0
    return int(midpoint) if force_int else midpoint
def get_mid_to_mid_distance(fragment_a, fragment_b):
    """
    Computes the distance between the midpoints of two fragments.

    Parameters
    ----------
    fragment_a, fragment_b : Dict[str, Any]
        The fragments to find the distance between. The fragments must be
        represented as dicts with at least the following keys::

            {
                'start': int,
                'end': int
            }

    Returns
    -------
    float
        The absolute difference between the two fragments' midpoints, as
        computed by ``get_midpoint()``.
    """
    midpoint_a = get_midpoint(fragment_a)
    midpoint_b = get_midpoint(fragment_b)
    return abs(midpoint_a - midpoint_b)
def parse_feature_from_string(grange_string):
    """
    Builds a BED feature dict from a genomic range string.

    Parameters
    ----------
    grange_string : str
        The genomic range to parse, specified as a string of the form
        <chrom>:<start>-<end>. The interval is interpreted as a BED interval
        (0-based index, half-open interval).

    Returns
    -------
    dict
        The BED feature dict, which has keys 'chrom', 'start', and 'end'. The
        'start' and 'end' values are converted to int.

    Raises
    ------
    AttributeError
        If ``grange_string`` does not match ``GRANGE_PATTERN`` at its start
        (the match is None, so it has no ``groups()``).
    """
    # match() anchors at the start of the string only; trailing text after the
    # end coordinate is not rejected
    match = GRANGE_PATTERN.match(grange_string)
    fields = match.groups()
    return {
        'chrom': fields[0],
        'start': int(fields[1]),
        'end': int(fields[2]),
    }
# test client
def main():
    """
    Test client: loads two BED annotation files and prints the number of
    features from the first file that intersect the first 'chr7' feature of
    the second file.
    """
    from lib5c.parsers.bed import load_features

    # load the feature set to intersect against and the set supplying the query
    target_features = load_features(
        'test/annotations/V65EScells_CTCFMed12Smc1_2015.bed')
    query_features = load_features(
        'test/annotations/V65EScells_Superenhancers_1_2015.bed')

    # exercise count_intersections
    query = query_features['chr7'][0]
    print(count_intersections(query, flatten_features(target_features)))
# run the test client only when this module is executed as a script
if __name__ == "__main__":
    main()