"""
Module for parsing .bed files containing 5C primer and bin information.
"""
from lib5c.parsers.util import parse_field
from lib5c.parsers.primer_names import default_bin_parser, \
guess_primer_name_parser
[docs]def load_primermap(bedfile, name_parser=None, strand_index=5,
region_index=None, column_names=None):
"""
Parameters
----------
bedfile : str
String reference to a primer bedfile to use to generate the primermap.
name_parser : Optional[Callable[[str], Dict[str, Any]]]
Function that takes in the primer name column of the bedfile (the fourth
column) and returns a dict containing key-value pairs to be added to the
dict that represents that primer. At a minimum, this dict must have the
following structure::
{
'region': string
}
If the dict includes any keys that are already typically included in the
primer dict, the values returned by this function will overwrite the
usual values. If None is passed, an appropriate name parser will be
guessed based on the primer/bin names.
strand_index : Optional[int]
If an int is passed, the column with that index will be used to
determine strand information for the primer. If ``None`` is passed, the
algorithm will try to guess which column contains this information. If
this fails, strand information will not be included in the primer dict.
Acceptable strings to indicate primer strand are 'F'/'R', 'FOR'/'REV',
and '+'/'-'. Primers on the + strand will be assumed to be oriented in
the 3' direction, and primers on the - strand will be assumed to be
oriented in the 5' direction, unless an 'orientation' key is provided in
the dict returned by ``name_parser``.
region_index : Optional[int]
If an int is passed, the column with that index will be used to
determine the region the primer is in. This makes specifying
``region_parser`` optional and overrides the region it returns.
column_names : Optional[List[str]]
Pass a list of strings equal to the number of columns in the bedfile,
describing the columns. The first four elements will be ignored. Special
values include 'strand', which will set ``strand_index``, and 'region',
which will override ``region_index``. All other values will end up as
keys in the primer dicts. If this is not passed, this function will look
for a header line in the primerfile, and if one is not found, a default
header will be assumed.
Returns
-------
Dict[str, List[Dict[str, Any]]]
The keys of the outer dict are region names. The values are lists, where
the :math:`i` th entry represents the :math:`i` th primer in that
region. Primers are represented as dicts with the following structure::
{
'region' : str
'chrom' : str,
'start' : int,
'end' : int,
'name' : str,
'strand' : '+' or '-',
'orientation': "3'" or "5'"
}
though strand and orientation may not be present, and additional keys
may be present if returned by ``name_parser``, passed in
``column_names``, or if a header line is present.
Notes
-----
A primermap is a mapping from primers (specified by a region name and primer
index) to the genomic range covered by those primers.
"""
# acceptable strand identifiers
plus_strand_identifiers = ['F', 'FOR', '+']
minus_strand_identifiers = ['R', 'REV', '-']
# dict to store the primermap
primermap = {}
# parse column_names
if column_names is not None:
try:
strand_index = column_names.index('strand')
except ValueError:
pass
try:
region_index = column_names.index('region')
except ValueError:
pass
# parse bedfile
with open(bedfile, 'r') as handle:
# parse the bedfile
for line in handle:
# skip comments, unless they contain bedtools nuc information
if line.startswith('#') or line.startswith('track'):
if column_names is None and not line.startswith('#track'):
pieces = line.strip().strip('#').split('\t')
if len(pieces) > 4:
column_names = pieces
try:
strand_index = column_names.index('strand')
except ValueError:
pass
try:
region_index = column_names.index('region')
except ValueError:
pass
continue
# split bedfile line
feature_columns = line.strip().split('\t')
# parse bed feature information
chrom = feature_columns[0]
start = int(feature_columns[1])
end = int(feature_columns[2])
name = feature_columns[3]
# guess name_parser if not passed
if name_parser is None:
name_parser = guess_primer_name_parser(name)
# if we failed to guess the name_parser, fall back to chromosomes
if name_parser is None and region_index is None:
region_index = 0
# try parsing region using region_index
region = None
if region_index is not None and region_index < len(feature_columns):
region = feature_columns[region_index]
# try parsing strand using strand_index
# or looping through all columns as a fallback
strand = None
if strand_index is not None and strand_index < len(feature_columns):
strand_string = feature_columns[strand_index]
if strand_string in plus_strand_identifiers:
strand = '+'
elif strand_string in minus_strand_identifiers:
strand = '-'
else:
for feature_column in feature_columns:
if feature_column in plus_strand_identifiers:
strand = '+'
elif feature_column in minus_strand_identifiers:
strand = '-'
# assemble the dict describing this primer
# always-present, required fields
primer_dict = {'chrom': chrom,
'start': start,
'end' : end,
'name' : name}
# if region_index or strand_index already succeeded, add them now
if region is not None:
primer_dict['region'] = region
if strand is not None:
primer_dict['strand'] = strand
# add additional fields from name_parser to primer_dict
if name_parser is not None:
name_fields = name_parser(name)
primer_dict.update(name_fields)
# add arbitrary fields from column_names or parsed from header
if column_names is not None and len(column_names) > 4:
for i in range(4, len(column_names)):
primer_dict[column_names[i]] = parse_field(
feature_columns[i])
# if 'strand' exists but 'orientation' doesn't, fill it in
if 'strand' in primer_dict and 'orientation' not in primer_dict:
primer_dict['orientation'] = "3'"\
if primer_dict['strand'] == '+' else "5'"
# if we haven't figured out the region by now raise an error
if 'region' not in primer_dict:
raise ValueError(
'region could not be identfied (must pass one of '
'name_parser, region_index, or column_names with "region" '
'column)')
# if this is a new region, make a new list for it
if primer_dict['region'] not in primermap:
primermap[primer_dict['region']] = []
# add this primer to the primermap
primermap[primer_dict['region']].append(primer_dict)
for region in primermap.keys():
# sort primers within each region
primermap[region].sort(key=lambda x: x['start'])
# post-check: if the primermap has 1bp gaps, we assume that the bedfile
# is formatted incorrectly and that we should extend the end coordinate
# by 1
has_gaps = False
for i in range(len(primermap[region])):
if i == len(primermap[region]) - 1:
break
gap = primermap[region][i+1]['start'] - primermap[region][i]['end']
if gap == 0:
break
if gap == 1:
has_gaps = True
break
if has_gaps:
for primer in primermap[region]:
primer['end'] += 1
return primermap
[docs]def get_pixelmap_legacy(bedfile, name_parser=default_bin_parser):
"""
Parameters
----------
bedfile : str
String reference to a binned primer bedfile to use to generate the
pixelmap.
name_parser : Optional[Callable[[str], Dict[str, Any]]]
Function that takes in the bin name column of the bedfile (the fourth
column) and returns a dict containing key-value pairs to be added to the
dict that represents that bin. At a minimum, this dict must have the
following structure::
{
'region': str
}
If the dict includes any keys that are already typically included in the
bin dict, the values returned by this function will overwrite the usual
values.
Returns
-------
Dict[str, List[Dict[str, Any]]]
The keys of the outer dict are region names. The values are lists, where
the :math:`i` th entry represents the :math:`i` th bin in that region.
Bins are represented as dicts with the following structure::
{
'chrom': str,
'start': int,
'end' : int,
'name' : str
}
Additional keys may be present if returned by ``name_parser``.
Notes
-----
A pixelmap is a mapping from bins (specified by a region name and bin or
primer index) to the genomic range covered by those bins.
"""
# dict to store the pixelmap
pixelmap = {}
# parse bedfile
with open(bedfile, 'r') as handle:
# parse the bedfile
for line in handle:
# skip comments
if line.startswith('#'):
continue
# split line on tab
pieces = line.strip().split('\t')
# parse genomic coordinate information
chrom = pieces[0]
start = int(pieces[1])
end = int(pieces[2])
# parse bin name
name_fields = name_parser(pieces[3])
region = name_fields['region']
# if this is a new region, make a new list for it
if region not in pixelmap:
pixelmap[region] = []
# assemble the dict describing this primer
# always-present fields
bin_dict = {'chrom': chrom,
'start': start,
'end' : end,
'name' : pieces[3]}
# add additional fields from parse_name
bin_dict.update(name_fields)
# add this region to the map
pixelmap[region].append(bin_dict)
# sort bins within each region
for region in pixelmap.keys():
pixelmap[region].sort(key=lambda x: x['start'])
return pixelmap
# test client
[docs]def main():
print(load_primermap('test/bins.bed')['Nestin'][0])
print(load_primermap('test/primers.bed')['Nestin'][0])
print(load_primermap('test/primers_nuc.bed')['Nestin'][0])
if __name__ == "__main__":
main()