Source code for lib5c.tools.express

from lib5c.tools.parents import level_parser, simple_in_out_parser, \
    parallelization_parser, primerfile_parser


[docs]def add_express_tool(parser): express_parser = parser.add_parser( 'express', prog='lib5c express', help='express normalization', parents=[primerfile_parser, level_parser, simple_in_out_parser, parallelization_parser] ) express_parser.add_argument( '-J', '--joint_normalize', action='store_true', help='''Normalize all input files using a single shared bias vector.''') express_parser.add_argument( '-B', '--output_bias', action='store_true', help='''If this flag is present, the bias vectors will be written to .bias files located next to the output .counts files.''') express_parser.add_argument( '-e', '--epsilon', type=float, default=1e-4, help='''The relative improvement in the residual before declaring convergence. The default is 1e-4.''') express_parser.add_argument( '-i', '--max_iterations', type=int, default=3000, help='''Maximum number of iterations. The default is 3000.''') express_parser.add_argument( '-o', '--plot_outfile', type=str, help='''Pass a string reference to a path to which to write visualizations of the residual as a function of number of iterations. This string should include one %%r, which will be replaced with the region name.''') express_parser.add_argument( '-m', '--expected_model', type=str, help='''Pass a string reference to a countsfile containing the expected model to be used for express balancing. If this is not passed, a linear log-log fit will be used.''') express_parser.set_defaults(func=express_tool)
[docs]def express_tool(parser, args): import os import glob import numpy as np from lib5c.tools.helpers import resolve_level, resolve_parallel, \ infer_replicate_names, resolve_primerfile, resolve_expected_models from lib5c.algorithms.express import express_normalize_matrix, \ joint_express_normalize from lib5c.algorithms.expected import \ make_poly_log_log_binned_expected_matrix, \ make_poly_log_log_fragment_expected_matrix from lib5c.parsers.primers import load_primermap from lib5c.parsers.counts import load_counts from lib5c.writers.counts import write_counts from lib5c.writers.bias import write_cis_bias_vector from lib5c.util.system import check_outdir # resolve primerfile and parallel primerfile = resolve_primerfile(args.infile, args.primerfile) if not args.joint_normalize: resolve_parallel(parser, args, subcommand='express') # load counts print('loading counts') primermap = load_primermap(primerfile) if not args.joint_normalize: observed_counts = load_counts(args.infile, primermap) else: infiles = glob.glob(args.infile.strip('\'"')) observed_counts = [load_counts(infile, primermap) for infile in infiles] # resolve expected model if not args.joint_normalize: if args.expected_model is not None: print('loading expected model from %s' % args.expected_model) expected_counts = load_counts(args.expected_model, primermap) else: print('precomputing expected model') if resolve_level(primermap, args.level) == 'fragment': expected_counts = make_poly_log_log_fragment_expected_matrix( observed_counts, primermap) else: expected_counts = make_poly_log_log_binned_expected_matrix( observed_counts) else: expected_counts = resolve_expected_models( args.expected_model, observed_counts, primermap, level=args.level) # express normalize if not args.joint_normalize: print('express normalizing') balanced_counts, bias_factors, errs = express_normalize_matrix( observed_counts, expected_counts, max_iter=args.max_iterations, eps=args.epsilon) else: print('joint express normalizing') # deduce regions regions = list(observed_counts[0].keys()) # reshape reshaped_observed_counts = { region: [observed_counts[i][region] for i in range(len(observed_counts))] for region in regions} reshaped_expected_counts = { region: [expected_counts[i][region] for i in range(len(observed_counts))] for region in regions} # joint normalize raw_balanced_counts, bias_factors, errs = joint_express_normalize( reshaped_observed_counts, reshaped_expected_counts, max_iter=args.max_iterations, eps=args.epsilon) # reshape balanced_counts = [{region: raw_balanced_counts[region][i] for region in regions} for i in range(len(observed_counts))] # plot error if args.plot_outfile: print('plotting output') for region in errs.keys(): resolved_outfile = args.plot_outfile.replace(r'%r', region) check_outdir(resolved_outfile) import matplotlib.pyplot as plt plt.clf() plt.plot(np.arange(1, len(errs[region])), np.log(errs[region][1:])) plt.title('%s convergence' % region) plt.ylabel('log(L1 error)') plt.xlabel('iteration number') plt.savefig(resolved_outfile) # write counts print('writing counts') if not args.joint_normalize: write_counts(balanced_counts, args.outfile, primermap) if args.output_bias: biasfile = '%s.bias' % os.path.splitext(args.outfile)[0] print('writing bias vector') write_cis_bias_vector(bias_factors, primermap, biasfile) else: replicate_names = infer_replicate_names( infiles, pattern=args.infile if '*' in args.infile else None) outfiles = [args.outfile.replace(r'%s', rep) for rep in replicate_names] for i in range(len(balanced_counts)): write_counts(balanced_counts[i], outfiles[i], primermap) if args.output_bias: print('writing bias vectors') for i in range(len(balanced_counts)): biasfile = '%s.bias' % os.path.splitext(outfiles[i])[0] write_cis_bias_vector(bias_factors, primermap, biasfile)