Source code for lib5c.tools.threshold
import argparse
from lib5c.tools.parents import primerfile_parser
[docs]def add_threshold_tool(parser):
threshold_parser = parser.add_parser(
'threshold',
prog='lib5c threshold',
help='threshold p-values',
parents=[primerfile_parser]
)
threshold_parser.add_argument(
'-c', '--conditions',
type=str,
help='A comma-separated list of condition names.')
threshold_parser.add_argument(
'-t', '--significance_threshold',
default=0.05,
type=float,
help='''The p-value or q-value to threshold significance with. Default
is 0.05.''')
threshold_parser.add_argument(
'-B', '--bh_fdr',
action='store_true',
help='''Pass this flag to apply BH-FDR adjustment to p-values before
applying the significance threshold.''')
threshold_parser.add_argument(
'-T', '--two_tail',
action='store_true',
help='''Pass this in combination with -B/--bh_fdr to apply HH-FDR to the
two-tailed p-values, but only report the significant right-tail events
as loops. Note that two-tailed p-values are only accurate if p-values
were called using a continuous distribution.''')
threshold_parser.add_argument(
'-C', '--concordant',
action='store_true',
help='''Pass this flag to report only those interactions which are
significant in all replicates for a given condition. Skip this flag to
combine evidence from all replicates in a given condition.''')
threshold_parser.add_argument(
'-s', '--size_threshold',
default=3,
type=int,
help='''Interactions within connected components smaller than this will
not be called. Default is 3.''')
threshold_parser.add_argument(
'-d', '--distance_threshold',
default=24000,
type=int,
help='''Interactions with interaction distance (in bp) shorter than this
will not be called. Default is 24000.''')
threshold_parser.add_argument(
'-b', '--background_threshold',
default=0.6,
type=float,
help='''The p-value threshold to use to call a background loop class.
Default is 0.6.''')
threshold_parser.add_argument(
'-o', '--dataset_outfile',
type=str,
help='''Pass a filename to write the complete thresholded Dataset to
disk.''')
threshold_parser.add_argument(
'-k', '--kappa_confusion_outfile',
type=str,
help='''Pass a filename to write kappa metrics and confusion matrices
to.''')
threshold_parser.add_argument(
'outfile',
type=str,
help='''Filename to write final calls to.''')
threshold_parser.add_argument(
'pvalue_countsfiles',
type=str,
nargs=argparse.REMAINDER,
help='''P-value countsfiles to threshold.''')
threshold_parser.set_defaults(func=threshold_tool)
[docs]def threshold_tool(parser, args):
import glob
from lib5c.tools.helpers import resolve_primerfile, \
infer_replicate_names
from lib5c.parsers.primers import load_primermap
from lib5c.parsers.counts import load_counts
from lib5c.writers.counts import write_counts
from lib5c.algorithms.thresholding import two_way_thresholding, kappa, \
concordance_confusion, color_confusion, count_clusters
# resolve conditions
conditions = list(map(str.strip, args.conditions.strip('()').split(','))) \
if args.conditions else None
# resolve primerfile
primerfile = resolve_primerfile(args.pvalue_countsfiles, args.primerfile)
# expand infiles
expanded_infiles = []
for infile in args.pvalue_countsfiles:
expanded_infiles.extend(glob.glob(infile))
# compute rep_names
rep_names = infer_replicate_names(
expanded_infiles, as_dict=True, pattern=args.pvalue_countsfiles[0]
if len(args.pvalue_countsfiles) == 1
and '*' in args.pvalue_countsfiles[0] else None)
# load counts
print('loading counts')
primermap = load_primermap(primerfile)
pvalues_superdict = {rep_name: load_counts(file_name, primermap)
for file_name, rep_name in rep_names.items()}
# perform thresholding
print('thresholding')
d = two_way_thresholding(
pvalues_superdict,
primermap,
conditions=conditions,
significance_threshold=args.significance_threshold,
bh_fdr=args.bh_fdr,
two_tail=args.two_tail,
concordant=args.concordant,
distance_threshold=args.distance_threshold,
size_threshold=args.size_threshold,
background_threshold=args.background_threshold
)
# write
print('writing results')
write_counts(d.counts(name='color'), args.outfile, primermap,
skip_zeros=True)
if args.dataset_outfile is not None:
d.save(args.dataset_outfile)
if args.kappa_confusion_outfile is not None:
with open(args.kappa_confusion_outfile, 'w') as handle:
if all([len(d.cond_reps[cond]) == 2 for cond in d.conditions]):
handle.write("Cohen's kappa within reps\n")
handle.write('-------------------------\n')
kappa_dict = kappa(d)
for cond in kappa_dict:
handle.write('%s: %s\n' % (cond, kappa_dict[cond]))
handle.write('\n')
handle.write('concordance confusion matrices\n')
handle.write('------------------------------\n')
confusion_dict = concordance_confusion(d)
for cond in confusion_dict:
handle.write('%s:\n' % cond)
handle.write(str(confusion_dict[cond]) + '\n')
handle.write('\n')
if len(d.conditions) == 2:
handle.write('color confusion matrix\n')
handle.write('----------------------\n')
handle.write(str(color_confusion(d)) + '\n')
handle.write('\n')
handle.write('cluster counts\n')
handle.write('--------------\n')
color_counts = count_clusters(d)
for color, count in color_counts.items():
handle.write('%s: %i\n' % (color, count))
handle.write('\n')