Source code for lib5c.parsers.primers

"""
Module for parsing .bed files containing 5C primer and bin information.
"""

from lib5c.parsers.util import parse_field
from lib5c.parsers.primer_names import default_bin_parser, \
    guess_primer_name_parser


[docs]def load_primermap(bedfile, name_parser=None, strand_index=5,
                   region_index=None, column_names=None):
    """
    Parameters
    ----------
    bedfile : str
        String reference to a primer bedfile to use to generate the primermap.
    name_parser : Optional[Callable[[str], Dict[str, Any]]]
        Function that takes in the primer name column of the bedfile (the fourth
        column) and returns a dict containing key-value pairs to be added to the
        dict that represents that primer. At a minimum, this dict must have the
        following structure::

            {
                'region': string
            }

        If the dict includes any keys that are already typically included in the
        primer dict, the values returned by this function will overwrite the
        usual values. If None is passed, an appropriate name parser will be
        guessed based on the primer/bin names.
    strand_index : Optional[int]
        If an int is passed, the column with that index will be used to
        determine strand information for the primer. If ``None`` is passed, the
        algorithm will try to guess which column contains this information. If
        this fails, strand information will not be included in the primer dict.
        Acceptable strings to indicate primer strand are 'F'/'R', 'FOR'/'REV',
        and '+'/'-'. Primers on the + strand will be assumed to be oriented in
        the 3' direction, and primers on the - strand will be assumed to be
        oriented in the 5' direction, unless an 'orientation' key is provided in
        the dict returned by ``name_parser``.
    region_index : Optional[int]
        If an int is passed, the column with that index will be used to
        determine the region the primer is in. This makes specifying
        ``region_parser`` optional and overrides the region it returns.
    column_names : Optional[List[str]]
        Pass a list of strings equal to the number of columns in the bedfile,
        describing the columns. The first four elements will be ignored. Special
        values include 'strand', which will set ``strand_index``, and 'region',
        which will override ``region_index``. All other values will end up as
        keys in the primer dicts. If this is not passed, this function will look
        for a header line in the primerfile, and if one is not found, a default
        header will be assumed.

    Returns
    -------
    Dict[str, List[Dict[str, Any]]]
        The keys of the outer dict are region names. The values are lists, where
        the :math:`i` th entry represents the :math:`i` th primer in that
        region. Primers are represented as dicts with the following structure::

            {
                'region'     : str
                'chrom'      : str,
                'start'      : int,
                'end'        : int,
                'name'       : str,
                'strand'     : '+' or '-',
                'orientation': "3'" or "5'"
            }

        though strand and orientation may not be present, and additional keys
        may be present if returned by ``name_parser``, passed in
        ``column_names``, or if a header line is present.

    Notes
    -----
    A primermap is a mapping from primers (specified by a region name and primer
    index) to the genomic range covered by those primers.
    """
    # acceptable strand identifiers
    plus_strand_identifiers = ['F', 'FOR', '+']
    minus_strand_identifiers = ['R', 'REV', '-']

    # dict to store the primermap
    primermap = {}

    # parse column_names
    if column_names is not None:
        try:
            strand_index = column_names.index('strand')
        except ValueError:
            pass
        try:
            region_index = column_names.index('region')
        except ValueError:
            pass

    # parse bedfile
    with open(bedfile, 'r') as handle:
        # parse the bedfile
        for line in handle:
            # skip comments, unless they contain bedtools nuc information
            if line.startswith('#') or line.startswith('track'):
                if column_names is None and not line.startswith('#track'):
                    pieces = line.strip().strip('#').split('\t')
                    if len(pieces) > 4:
                        column_names = pieces
                        try:
                            strand_index = column_names.index('strand')
                        except ValueError:
                            pass
                        try:
                            region_index = column_names.index('region')
                        except ValueError:
                            pass
                continue

            # split bedfile line
            feature_columns = line.strip().split('\t')

            # parse bed feature information
            chrom = feature_columns[0]
            start = int(feature_columns[1])
            end = int(feature_columns[2])
            name = feature_columns[3]

            # guess name_parser if not passed
            if name_parser is None:
                name_parser = guess_primer_name_parser(name)

            # if we failed to guess the name_parser, fall back to chromosomes
            if name_parser is None and region_index is None:
                region_index = 0

            # try parsing region using region_index
            region = None
            if region_index is not None and region_index < len(feature_columns):
                region = feature_columns[region_index]

            # try parsing strand using strand_index
            # or looping through all columns as a fallback
            strand = None
            if strand_index is not None and strand_index < len(feature_columns):
                strand_string = feature_columns[strand_index]
                if strand_string in plus_strand_identifiers:
                    strand = '+'
                elif strand_string in minus_strand_identifiers:
                    strand = '-'
            else:
                for feature_column in feature_columns:
                    if feature_column in plus_strand_identifiers:
                        strand = '+'
                    elif feature_column in minus_strand_identifiers:
                        strand = '-'

            # assemble the dict describing this primer
            # always-present, required fields
            primer_dict = {'chrom': chrom,
                           'start': start,
                           'end'  : end,
                           'name' : name}

            # if region_index or strand_index already succeeded, add them now
            if region is not None:
                primer_dict['region'] = region
            if strand is not None:
                primer_dict['strand'] = strand

            # add additional fields from name_parser to primer_dict
            if name_parser is not None:
                name_fields = name_parser(name)
                primer_dict.update(name_fields)

            # add arbitrary fields from column_names or parsed from header
            if column_names is not None and len(column_names) > 4:
                for i in range(4, len(column_names)):
                    primer_dict[column_names[i]] = parse_field(
                        feature_columns[i])

            # if 'strand' exists but 'orientation' doesn't, fill it in
            if 'strand' in primer_dict and 'orientation' not in primer_dict:
                primer_dict['orientation'] = "3'"\
                    if primer_dict['strand'] == '+' else "5'"

            # if we haven't figured out the region by now raise an error
            if 'region' not in primer_dict:
                raise ValueError(
                    'region could not be identfied (must pass one of '
                    'name_parser, region_index, or column_names with "region" '
                    'column)')

            # if this is a new region, make a new list for it
            if primer_dict['region'] not in primermap:
                primermap[primer_dict['region']] = []

            # add this primer to the primermap
            primermap[primer_dict['region']].append(primer_dict)

    for region in primermap.keys():
        # sort primers within each region
        primermap[region].sort(key=lambda x: x['start'])

        # post-check: if the primermap has 1bp gaps, we assume that the bedfile
        # is formatted incorrectly and that we should extend the end coordinate
        # by 1
        has_gaps = False
        for i in range(len(primermap[region])):
            if i == len(primermap[region]) - 1:
                break
            gap = primermap[region][i+1]['start'] - primermap[region][i]['end']
            if gap == 0:
                break
            if gap == 1:
                has_gaps = True
                break
        if has_gaps:
            for primer in primermap[region]:
                primer['end'] += 1

    return primermap


[docs]def get_pixelmap_legacy(bedfile, name_parser=default_bin_parser):
    """
    Parameters
    ----------
    bedfile : str
        String reference to a binned primer bedfile to use to generate the
        pixelmap.
    name_parser : Optional[Callable[[str], Dict[str, Any]]]
        Function that takes in the bin name column of the bedfile (the fourth
        column) and returns a dict containing key-value pairs to be added to the
        dict that represents that bin. At a minimum, this dict must have the
        following structure::

            {
                'region': str
            }

        If the dict includes any keys that are already typically included in the
        bin dict, the values returned by this function will overwrite the usual
        values.

    Returns
    -------
    Dict[str, List[Dict[str, Any]]]
        The keys of the outer dict are region names. The values are lists, where
        the :math:`i` th entry represents the :math:`i` th bin in that region.
        Bins are represented as dicts with the following structure::

            {
                'chrom': str,
                'start': int,
                'end'  : int,
                'name' : str
            }

        Additional keys may be present if returned by ``name_parser``.

    Notes
    -----
    A pixelmap is a mapping from bins (specified by a region name and bin or
    primer index) to the genomic range covered by those bins.
    """

    # dict to store the pixelmap
    pixelmap = {}

    # parse bedfile
    with open(bedfile, 'r') as handle:
        # parse the bedfile
        for line in handle:
            # skip comments
            if line.startswith('#'):
                continue

            # split line on tab
            pieces = line.strip().split('\t')

            # parse genomic coordinate information
            chrom = pieces[0]
            start = int(pieces[1])
            end = int(pieces[2])

            # parse bin name
            name_fields = name_parser(pieces[3])
            region = name_fields['region']

            # if this is a new region, make a new list for it
            if region not in pixelmap:
                pixelmap[region] = []

            # assemble the dict describing this primer
            # always-present fields
            bin_dict = {'chrom': chrom,
                        'start': start,
                        'end'  : end,
                        'name' : pieces[3]}

            # add additional fields from parse_name
            bin_dict.update(name_fields)

            # add this region to the map
            pixelmap[region].append(bin_dict)

    # sort bins within each region
    for region in pixelmap.keys():
        pixelmap[region].sort(key=lambda x: x['start'])

    return pixelmap


# test client
[docs]def main():
    print(load_primermap('test/bins.bed')['Nestin'][0])
    print(load_primermap('test/primers.bed')['Nestin'][0])
    print(load_primermap('test/primers_nuc.bed')['Nestin'][0])


if __name__ == "__main__":
    main()