Source code for lib5c.parsers.bed

"""
Module for parsing .bed files.
"""

from lib5c.util.bed import count_intersections


[docs]def load_features(bedfile, id_index=None, value_index=None, boundaries=None,
                  strict=True):
    """
    Loads the features from a .bed file into dicts and returns them.

    Parameters
    ----------
    bedfile : str
        String reference to location of .bed file to load features from.
    id_index : int
        If passed, indicates the column index of the id field.
    value_index : int
        If passed, indicates the column index of the value field.
    boundaries : list of dicts
        If passed, features will only be loaded if they intersect at least one
        of the features in this list. The features should be represented as
        dicts with the following structure::

            {
                'chrom': str,
                'start': int,
                'end'  : int
            }

    strict : boolean
        If True, there must not be any incomplete lines in the bedfile.

    Returns
    -------
    dict of lists of dicts
        The keys are chromosome names. The values are lists of features for that
        chromosome. The features are represented as dicts with the following
        structure::

            {
                'chrom': str,
                'start': int,
                'end'  : int,
                'id'   : str or None,
                'value': float or None
            }

        The 'id' and 'value' fields may be None if no feature ID's were provided
        in the BED file, but the keys will always be present in the returned
        dict.

    Notes
    -----
    The parser will attempt to guess the column indices of the id and value
    fields based on the number of columns and the types of the column entries.
    """

    # dict to store features
    features = {}

    # parse bedfile
    with open(bedfile, 'r') as handle:
        for line in handle:
            # skip comments and track line
            if line.startswith('#') or line.startswith('track'):
                continue

            # split line
            pieces = line.strip().split('\t')
            if len(pieces) < 3 and not strict:
                continue  # skip incomplete lines when strict=False

            # parse chromosome
            chromosome = pieces[0]

            # add chromosome to dict if this is a new one
            if chromosome not in features:
                features[chromosome] = []

            # parse feature information
            start = int(pieces[1])
            end = int(pieces[2])
            feature_id = None
            value = None

            # user-specified column schema
            if id_index is not None:
                feature_id = pieces[id_index]
            if value_index is not None:
                value = float(pieces[value_index])

            # intelligent column schema guessing
            if not id_index and not value_index:
                if len(pieces) >= 4:
                    try:
                        value = float(pieces[3])
                    except ValueError:
                        feature_id = pieces[3]
                if len(pieces) >= 5:
                    try:
                        value = float(pieces[4])
                    except ValueError:
                        feature_id = pieces[4]

            # dict to represent this feature
            peak_dict = {'chrom': chromosome,
                         'start': start,
                         'end'  : end}

            # add id and value if parsed
            if feature_id is not None:
                peak_dict['id'] = feature_id
            if value is not None:
                peak_dict['value'] = value

            # parse strand information if present
            if len(pieces) >= 6:
                peak_dict['strand'] = pieces[5]

            # add this feature to the list
            if (boundaries is None) or \
                    (count_intersections(peak_dict, boundaries) > 0):
                features[chromosome].append(peak_dict)

    return features


# test client
[docs]def main():
    peaks = load_features('test/peaks.bed', id_index=3, value_index=4)['chr1']
    print(len(peaks))
    print(peaks[0])
    peaks = load_features('test/peaks.bed')['chr1']
    print(len(peaks))
    print(peaks[0])
    peaks = load_features('test/reduced_bedgraph.bed', value_index=3)['chr4']
    print(len(peaks))
    print(peaks[0])
    peaks = load_features('test/reduced_bedgraph.bed')['chr4']
    print(len(peaks))
    print(peaks[0])


if __name__ == "__main__":
    main()