Source code for lib5c.parsers.genes

"""
Module for parsing .bed files containing gene track information.
"""

import io
import gzip


[docs]def load_genes(bedfile): """ Loads information for genes from a .bed file into dicts and returns them. Parameters ---------- bedfile : str String reference to location of .bed file to load genes from. Returns ------- dict of lists of dicts The keys are chromosome names. The values are lists of genes for that chromosome. The genes are represented as dicts with the following structure:: { 'start' : int, 'end' : int, 'name' : str, 'strand': '+' or '-', 'blocks': list of dicts } Blocks typically represent exons and are represented as dicts with the following structure:: { 'start': int, 'end' : int } """ # dict to store genes genes = {} # parse bedfile with open(bedfile, 'r') as handle: for line in handle: # skip comments and track line if line.startswith('#') or line.startswith('track'): continue # split line pieces = line.strip().split('\t') # parse chromosome chromosome = pieces[0] # add chromosome to dict if this is a new one if chromosome not in genes: genes[chromosome] = [] # parse gene information start = int(pieces[1]) end = int(pieces[2]) name = pieces[3] strand = pieces[5] thick_start, thick_end = None, None if len(pieces) > 6: thick_start = int(pieces[6]) thick_end = int(pieces[7]) block_sizes = [int(piece) for piece in pieces[10].strip(',').split(',')] block_starts = [int(piece) for piece in pieces[11].strip(',').split(',')] # compute block information blocks = [] if thick_start and thick_end and not thick_start == thick_end: for i in range(len(block_starts)): block = {} block['start'] = max(start + block_starts[i], thick_start) block['end'] = min(start + block_starts[i] + block_sizes[i], thick_end) if block['end'] > block['start']: blocks.append(block) # add this gene to the list genes[chromosome].append({'chrom' : chromosome, 'start' : start, 'end' : end, 'name' : name, 'strand': strand, 'blocks': blocks}) for chrom in genes: genes[chrom].sort(key=lambda x: x['start']) return genes
[docs]def load_gene_table(tablefile): """ Similar to ``load_genes()``, but reads in a gzipped UCSC table file instead. The main advantage of this approach is that genes parsed this way include human-readable gene symbols. Parameters ---------- tablefile : str String reference to location of the gzipped table file to read. Returns ------- dict of lists of dicts The keys are chromosome names. The values are lists of genes for that chromosome. The genes are represented as dicts with the following structure:: { 'start' : int, 'end' : int, 'name' : str, 'id': str, 'strand': '+' or '-', 'blocks': list of dicts } Blocks typically represent exons and are represented as dicts with the following structure:: { 'start': int, 'end' : int } """ # dict to store genes genes = {} # parse bedfile with gzip.open(tablefile, 'r') as gz: gz.read1 = gz.read with io.TextIOWrapper(gz, encoding='utf8') as handle: for line in handle: # skip header if line.startswith('#'): continue # split line pieces = line.split('\t') # parse chromosome chromosome = pieces[2].strip() # add chromosome to dict if this is a new one if chromosome not in genes: genes[chromosome] = [] # parse gene information start = int(pieces[4]) end = int(pieces[5]) id = pieces[1].strip() name = pieces[12] strand = pieces[3] # parse block information if applicable blocks = [] cds_start = int(pieces[6]) cds_end = int(pieces[7]) if cds_start != cds_end: block_starts = [ int(piece) for piece in pieces[9].strip(',').split(',') ] block_ends = [ int(piece) for piece in pieces[10].strip(',').split(',') ] for i in range(len(block_starts)): block = {'start': max(block_starts[i], cds_start), 'end': min(block_ends[i], cds_end)} if block['end'] > block['start']: blocks.append(block) # add this gene to the list genes[chromosome].append({'chrom' : chromosome, 'start' : start, 'end' : end, 'name' : name, 'id' : id, 'strand': strand, 'blocks': blocks}) for chrom in genes: genes[chrom].sort(key=lambda x: x['start']) return genes
# test client
[docs]def main(): genes = load_genes('test/genes.bed')['chr3'] print(len(genes)) print(genes[0])
if __name__ == "__main__": main()