"""
Module containing utilities for manipulating BED files and BED features.
BED features are commonly represented as dicts with the following structure::
{
'chrom': str
'start': int,
'end' : int,
}
but may also contain additional fields.
"""
import re
# Matches genomic range strings of the form <chrom>:<start>-<end>, capturing
# the three parts as groups. A raw string is used so that \w and \d reach the
# regex engine verbatim instead of being treated as (invalid) string escapes.
GRANGE_PATTERN = re.compile(r'(\w+):(\d+)-(\d+)')
def count_intersections(query_feature, feature_set):
    """
    Count how many features in a set intersect a query feature.

    Parameters
    ----------
    query_feature : Dict[str, Any]
        The feature whose intersections are being tallied.
    feature_set : List[Dict[str, Any]]
        The features to test against the query feature, one at a time.

    Returns
    -------
    int
        The number of features in ``feature_set`` for which
        ``check_intersect(query_feature, feature)`` is true. Zero if
        ``feature_set`` is empty.

    Notes
    -----
    Features are represented as dicts with the following structure::

        {
            'chrom': str,
            'start': int,
            'end'  : int,
        }

    See ``lib5c.parsers.bed.load_features()``.
    """
    # Each intersecting feature contributes exactly one to the total.
    return sum(
        1 for candidate in feature_set
        if check_intersect(query_feature, candidate)
    )
def check_intersect(a, b):
    """
    Decide whether two features overlap.

    Parameters
    ----------
    a, b : Dict[str, Any]
        The pair of features to compare.

    Returns
    -------
    bool
        True when both features lie on the same chromosome and each one
        ends strictly after the other one starts; False otherwise.

    Notes
    -----
    Features are represented as dicts with the following structure::

        {
            'chrom': str,
            'start': int,
            'end'  : int,
        }

    Because both comparisons are strict, features that merely touch (one's
    'end' equals the other's 'start') are not considered intersecting.

    See ``lib5c.parsers.bed.load_features()``.
    """
    # The chromosome test comes first so that the coordinate keys are only
    # looked up for features on the same chromosome.
    return bool(
        a['chrom'] == b['chrom']
        and a['end'] > b['start']
        and b['end'] > a['start']
    )
def flatten_features(features):
    """
    Collapse a per-chromosome features dict into a single flat list.

    BED features are typically stored in a dict keyed by chromosome, which
    is the data structure returned by ``lib5c.parsers.bed.load_features()``.
    This function produces a plain list of the same features for callers
    that do not want the chromosome grouping.

    Parameters
    ----------
    features : Dict[str, List[Dict[str, Any]]]
        The keys are chromosome names. The values are lists of features for
        that chromosome. The features are represented as dicts with at least
        the following keys::

            {
                'start': int,
                'end'  : int
            }

    Returns
    -------
    List[Dict[str, Any]]
        The very same feature dicts that were passed in (not copies), in
        the iteration order of ``features``, each now carrying these keys::

            {
                'chrom': str,
                'start': int,
                'end'  : int
            }

        as well as any additional keys that were already present.

    Notes
    -----
    The feature dicts are modified in place: every feature has its 'chrom'
    key set to the name of the chromosome it was filed under, replacing any
    'chrom' value it already had.
    """
    flat = []
    for chrom_name, chrom_features in features.items():
        # Stamp each feature with its chromosome before it loses the
        # grouping that previously carried that information.
        for entry in chrom_features:
            entry['chrom'] = chrom_name
        flat.extend(chrom_features)
    return flat
def get_midpoint(fragment, force_int=False):
    """
    Compute the midpoint of a fragment.

    Parameters
    ----------
    fragment : Dict[str, Any]
        The fragment whose midpoint is wanted. It must be a dict with at
        least the following keys::

            {
                'start': int,
                'end': int
            }

    force_int : bool
        When true, truncate the midpoint towards zero and return it as an
        int instead of a float.

    Returns
    -------
    float or int
        The average of the fragment's 'start' and 'end'. A float by
        default; an int truncated towards zero if ``force_int`` is true.

    Examples
    --------
    >>> fragment = {'start': 50, 'end': 100}
    >>> get_midpoint(fragment)
    75.0
    >>> get_midpoint({'start': 51, 'end': 100}, force_int=True)
    75
    """
    # Dividing by 2.0 keeps the result a float regardless of the division
    # semantics in effect for ints.
    midpoint = (fragment['start'] + fragment['end']) / 2.0
    return int(midpoint) if force_int else midpoint
def get_mid_to_mid_distance(fragment_a, fragment_b):
    """
    Compute the distance between the midpoints of two fragments.

    Parameters
    ----------
    fragment_a, fragment_b : Dict[str, Any]
        The two fragments to measure between. Each must be a dict with at
        least the following keys::

            {
                'start': int,
                'end': int
            }

    Returns
    -------
    float
        The absolute difference between the two midpoints, so the result
        does not depend on the order of the arguments.
    """
    midpoint_a = get_midpoint(fragment_a)
    midpoint_b = get_midpoint(fragment_b)
    return abs(midpoint_a - midpoint_b)
def parse_feature_from_string(grange_string):
    """
    Parses BED feature from a string specifying the genomic range.

    Parameters
    ----------
    grange_string : str
        The genomic range to parse, specified as a string of the form
        <chrom>:<start>-<end>. The interval is interpreted as a BED interval
        (0-based index, half-open interval).

    Returns
    -------
    dict
        The BED feature dict, which has keys 'chrom', 'start', and 'end'.
        'chrom' is a str; 'start' and 'end' are converted to int.

    Raises
    ------
    ValueError
        If ``grange_string`` does not begin with a genomic range recognized
        by ``GRANGE_PATTERN``.

    Notes
    -----
    The pattern is applied with ``match()``, which anchors only at the
    beginning of the string, so any text following the range is ignored.
    """
    match = GRANGE_PATTERN.match(grange_string)
    if match is None:
        # match() returns None when the input is malformed; fail with a
        # message naming the offending input rather than an AttributeError
        # from calling .groups() on None.
        raise ValueError(
            'could not parse genomic range %r; expected <chrom>:<start>-<end>'
            % (grange_string,)
        )
    chrom, start, end = match.groups()
    return {'chrom': chrom, 'start': int(start), 'end': int(end)}
# test client
def main():
    """
    Test client: counts intersections between two bundled annotation sets.

    Loads two BED files from ``test/annotations/``, takes the first 'chr7'
    feature from the second file as the query, and prints how many features
    from the first file intersect it.
    """
    # Imported here rather than at module level so that the parser is only
    # needed when the test client is actually run.
    from lib5c.parsers.bed import load_features

    # Load the two annotation sets.
    ctcf_features = load_features(
        'test/annotations/V65EScells_CTCFMed12Smc1_2015.bed')
    superenhancer_features = load_features(
        'test/annotations/V65EScells_Superenhancers_1_2015.bed')

    # Exercise count_intersections on one query against the flattened set.
    query = superenhancer_features['chr7'][0]
    targets = flatten_features(ctcf_features)
    print(count_intersections(query, targets))
# Run the test client only when this module is executed as a script, not
# when it is imported.
if __name__ == "__main__":
    main()