"""
Module for parsing .bed files containing gene track information.
"""
import io
import gzip
[docs]def load_genes(bedfile):
"""
Loads information for genes from a .bed file into dicts and returns them.
Parameters
----------
bedfile : str
String reference to location of .bed file to load genes from.
Returns
-------
dict of lists of dicts
The keys are chromosome names. The values are lists of genes for that
chromosome. The genes are represented as dicts with the following
structure::
{
'start' : int,
'end' : int,
'name' : str,
'strand': '+' or '-',
'blocks': list of dicts
}
Blocks typically represent exons and are represented as dicts with the
following structure::
{
'start': int,
'end' : int
}
"""
# dict to store genes
genes = {}
# parse bedfile
with open(bedfile, 'r') as handle:
for line in handle:
# skip comments and track line
if line.startswith('#') or line.startswith('track'):
continue
# split line
pieces = line.strip().split('\t')
# parse chromosome
chromosome = pieces[0]
# add chromosome to dict if this is a new one
if chromosome not in genes:
genes[chromosome] = []
# parse gene information
start = int(pieces[1])
end = int(pieces[2])
name = pieces[3]
strand = pieces[5]
thick_start, thick_end = None, None
if len(pieces) > 6:
thick_start = int(pieces[6])
thick_end = int(pieces[7])
block_sizes = [int(piece)
for piece in pieces[10].strip(',').split(',')]
block_starts = [int(piece)
for piece in pieces[11].strip(',').split(',')]
# compute block information
blocks = []
if thick_start and thick_end and not thick_start == thick_end:
for i in range(len(block_starts)):
block = {}
block['start'] = max(start + block_starts[i], thick_start)
block['end'] = min(start + block_starts[i] + block_sizes[i],
thick_end)
if block['end'] > block['start']:
blocks.append(block)
# add this gene to the list
genes[chromosome].append({'chrom' : chromosome,
'start' : start,
'end' : end,
'name' : name,
'strand': strand,
'blocks': blocks})
for chrom in genes:
genes[chrom].sort(key=lambda x: x['start'])
return genes
[docs]def load_gene_table(tablefile):
"""
Similar to ``load_genes()``, but reads in a gzipped UCSC table file instead.
The main advantage of this approach is that genes parsed this way include
human-readable gene symbols.
Parameters
----------
tablefile : str
String reference to location of the gzipped table file to read.
Returns
-------
dict of lists of dicts
The keys are chromosome names. The values are lists of genes for that
chromosome. The genes are represented as dicts with the following
structure::
{
'start' : int,
'end' : int,
'name' : str,
'id': str,
'strand': '+' or '-',
'blocks': list of dicts
}
Blocks typically represent exons and are represented as dicts with the
following structure::
{
'start': int,
'end' : int
}
"""
# dict to store genes
genes = {}
# parse bedfile
with gzip.open(tablefile, 'r') as gz:
gz.read1 = gz.read
with io.TextIOWrapper(gz, encoding='utf8') as handle:
for line in handle:
# skip header
if line.startswith('#'):
continue
# split line
pieces = line.split('\t')
# parse chromosome
chromosome = pieces[2].strip()
# add chromosome to dict if this is a new one
if chromosome not in genes:
genes[chromosome] = []
# parse gene information
start = int(pieces[4])
end = int(pieces[5])
id = pieces[1].strip()
name = pieces[12]
strand = pieces[3]
# parse block information if applicable
blocks = []
cds_start = int(pieces[6])
cds_end = int(pieces[7])
if cds_start != cds_end:
block_starts = [
int(piece)
for piece in pieces[9].strip(',').split(',')
]
block_ends = [
int(piece)
for piece in pieces[10].strip(',').split(',')
]
for i in range(len(block_starts)):
block = {'start': max(block_starts[i], cds_start),
'end': min(block_ends[i], cds_end)}
if block['end'] > block['start']:
blocks.append(block)
# add this gene to the list
genes[chromosome].append({'chrom' : chromosome,
'start' : start,
'end' : end,
'name' : name,
'id' : id,
'strand': strand,
'blocks': blocks})
for chrom in genes:
genes[chrom].sort(key=lambda x: x['start'])
return genes
# test client
[docs]def main():
genes = load_genes('test/genes.bed')['chr3']
print(len(genes))
print(genes[0])
if __name__ == "__main__":
main()