Source code for lib5c.tools.threshold

import argparse

from lib5c.tools.parents import primerfile_parser


[docs]def add_threshold_tool(parser):
    threshold_parser = parser.add_parser(
        'threshold',
        prog='lib5c threshold',
        help='threshold p-values',
        parents=[primerfile_parser]
    )
    threshold_parser.add_argument(
        '-c', '--conditions',
        type=str,
        help='A comma-separated list of condition names.')
    threshold_parser.add_argument(
        '-t', '--significance_threshold',
        default=0.05,
        type=float,
        help='''The p-value or q-value to threshold significance with. Default
        is 0.05.''')
    threshold_parser.add_argument(
        '-B', '--bh_fdr',
        action='store_true',
        help='''Pass this flag to apply BH-FDR adjustment to p-values before
        applying the significance threshold.''')
    threshold_parser.add_argument(
        '-T', '--two_tail',
        action='store_true',
        help='''Pass this in combination with -B/--bh_fdr to apply HH-FDR to the
        two-tailed p-values, but only report the significant right-tail events
        as loops. Note that two-tailed p-values are only accurate if p-values
        were called using a continuous distribution.''')
    threshold_parser.add_argument(
        '-C', '--concordant',
        action='store_true',
        help='''Pass this flag to report only those interactions which are
        significant in all replicates for a given condition. Skip this flag to
        combine evidence from all replicates in a given condition.''')
    threshold_parser.add_argument(
        '-s', '--size_threshold',
        default=3,
        type=int,
        help='''Interactions within connected components smaller than this will
        not be called. Default is 3.''')
    threshold_parser.add_argument(
        '-d', '--distance_threshold',
        default=24000,
        type=int,
        help='''Interactions with interaction distance (in bp) shorter than this
        will not be called. Default is 24000.''')
    threshold_parser.add_argument(
        '-b', '--background_threshold',
        default=0.6,
        type=float,
        help='''The p-value threshold to use to call a background loop class.
        Default is 0.6.''')
    threshold_parser.add_argument(
        '-o', '--dataset_outfile',
        type=str,
        help='''Pass a filename to write the complete thresholded Dataset to
        disk.''')
    threshold_parser.add_argument(
        '-k', '--kappa_confusion_outfile',
        type=str,
        help='''Pass a filename to write kappa metrics and confusion matrices
        to.''')
    threshold_parser.add_argument(
        'outfile',
        type=str,
        help='''Filename to write final calls to.''')
    threshold_parser.add_argument(
        'pvalue_countsfiles',
        type=str,
        nargs=argparse.REMAINDER,
        help='''P-value countsfiles to threshold.''')
    threshold_parser.set_defaults(func=threshold_tool)


[docs]def threshold_tool(parser, args):
    import glob

    from lib5c.tools.helpers import resolve_primerfile, \
        infer_replicate_names
    from lib5c.parsers.primers import load_primermap
    from lib5c.parsers.counts import load_counts
    from lib5c.writers.counts import write_counts
    from lib5c.algorithms.thresholding import two_way_thresholding, kappa, \
        concordance_confusion, color_confusion, count_clusters

    # resolve conditions
    conditions = list(map(str.strip, args.conditions.strip('()').split(','))) \
        if args.conditions else None

    # resolve primerfile
    primerfile = resolve_primerfile(args.pvalue_countsfiles, args.primerfile)

    # expand infiles
    expanded_infiles = []
    for infile in args.pvalue_countsfiles:
        expanded_infiles.extend(glob.glob(infile))

    # compute rep_names
    rep_names = infer_replicate_names(
        expanded_infiles, as_dict=True, pattern=args.pvalue_countsfiles[0]
        if len(args.pvalue_countsfiles) == 1
        and '*' in args.pvalue_countsfiles[0] else None)

    # load counts
    print('loading counts')
    primermap = load_primermap(primerfile)
    pvalues_superdict = {rep_name: load_counts(file_name, primermap)
                         for file_name, rep_name in rep_names.items()}

    # perform thresholding
    print('thresholding')
    d = two_way_thresholding(
        pvalues_superdict,
        primermap,
        conditions=conditions,
        significance_threshold=args.significance_threshold,
        bh_fdr=args.bh_fdr,
        two_tail=args.two_tail,
        concordant=args.concordant,
        distance_threshold=args.distance_threshold,
        size_threshold=args.size_threshold,
        background_threshold=args.background_threshold
    )

    # write
    print('writing results')
    write_counts(d.counts(name='color'), args.outfile, primermap,
                 skip_zeros=True)

    if args.dataset_outfile is not None:
        d.save(args.dataset_outfile)

    if args.kappa_confusion_outfile is not None:
        with open(args.kappa_confusion_outfile, 'w') as handle:
            if all([len(d.cond_reps[cond]) == 2 for cond in d.conditions]):
                handle.write("Cohen's kappa within reps\n")
                handle.write('-------------------------\n')
                kappa_dict = kappa(d)
                for cond in kappa_dict:
                    handle.write('%s: %s\n' % (cond, kappa_dict[cond]))
                handle.write('\n')
                handle.write('concordance confusion matrices\n')
                handle.write('------------------------------\n')
                confusion_dict = concordance_confusion(d)
                for cond in confusion_dict:
                    handle.write('%s:\n' % cond)
                    handle.write(str(confusion_dict[cond]) + '\n')
                handle.write('\n')
            if len(d.conditions) == 2:
                handle.write('color confusion matrix\n')
                handle.write('----------------------\n')
                handle.write(str(color_confusion(d)) + '\n')
                handle.write('\n')
            handle.write('cluster counts\n')
            handle.write('--------------\n')
            color_counts = count_clusters(d)
            for color, count in color_counts.items():
                handle.write('%s: %i\n' % (color, count))
            handle.write('\n')