Source code for lib5c.tools.pca

import argparse

from lib5c.tools.parents import primerfile_parser


[docs]def add_pca_tool(parser): pca_parser = parser.add_parser( 'pca', prog='lib5c plot pca', help='perform principal component analysis', parents=[primerfile_parser] ) pca_parser.add_argument( '-c', '--csv', type=str, help='''Specify an output file to write output as csv.''') pca_parser.add_argument( '-i', '--image', type=str, help='''Specify an output file to write output as image.''') pca_parser.add_argument( '-s', '--separate_colors', type=str, help='''Specify a shell-quoted, comma-separated list of class names (which must be substrings of the replicate names) to color-code the output with. For example, 'ES,NPC'.''') pca_parser.add_argument( '-x', '--x_component', type=int, default=0, help='''Principal component to plot on the x-axis (zero-indexed). The default is 0.''') pca_parser.add_argument( '-y', '--y_component', type=int, default=1, help='''Principal component to plot on the y-axis (zero-indexed). The default is 1.''') pca_parser.add_argument( '-n', '--n_components', type=int, help='''Specify a number of components to plot in a pairwise grid instead of specifying only one x_component, y_component pair.''') pca_parser.add_argument( '-S', '--scale', action='store_true', help='''Pass this flag to scale the features to unit variance.''') pca_parser.add_argument( '-L', '--log', action='store_true', help='''Pass this flag to log the features.''') pca_parser.add_argument( '-a', '--algorithm', type=str, default='pca', choices=['pca', 'ica', 'fa', 'mds'], help='''Specify what algorithm to use. The default is 'pca' for standard principal component analysis.''') pca_parser.add_argument( '-f', '--polynomial_features', type=int, default=1, help='''Specify the degree of pure polynomial features to add to the design matrix. The default is 1, which includes only the original, linear features.''') pca_parser.add_argument( '-k', '--kernel', type=str, choices=['none', 'linear', 'poly', 'rbf', 'sigmoid', 'cosine'], help='''Specify a kernel to perform kernel PCA with. The default is 'none', which does not perform kernel PCA.''') pca_parser.add_argument( '-d', '--degree', type=int, default=3, help='''Specify the degree to use when performing polynomial KPCA. The default is 3.''') pca_parser.add_argument( '-g', '--gamma', type=float, help='''Specify the value of the kernel coefficient gamma to use when performing radial basis function or polynomial KPCA. The default is to use 1/n_features.''') pca_parser.add_argument( '-b', '--bias_term', type=float, help='''Specify the independent term to use when performing polynomial or sigmoid KPCA. The default is 1 for polynomial kernels and 0 for sigmoid kernels.''') pca_parser.add_argument( 'countsfiles', type=str, nargs=argparse.REMAINDER, help='''Countsfiles to perform PCA on.''') pca_parser.set_defaults(func=pca_tool)
[docs]def pca_tool(parser, args): import glob from lib5c.tools.helpers import resolve_primerfile, infer_replicate_names,\ infer_level_mapping from lib5c.parsers.primers import load_primermap from lib5c.parsers.counts import load_counts from lib5c.algorithms.pca import compute_pca_from_counts_superdict from lib5c.writers.pca import write_pca from lib5c.plotters.pca import plot_pca, plot_multi_pca # resolve primerfile primerfile = resolve_primerfile(args.countsfiles, args.primerfile) # expand infiles expanded_infiles = [] for infile in args.countsfiles: expanded_infiles.extend(glob.glob(infile.strip('\'"'))) # load counts print('loading counts') primermap = load_primermap(primerfile) counts_superdict = {expanded_infile: load_counts(expanded_infile, primermap) for expanded_infile in expanded_infiles} # compute labels rep_names = infer_replicate_names( expanded_infiles, pattern=args.countsfiles[0] if len(args.countsfiles) == 1 and '*' in args.countsfiles[0] else None) # determine levels levels = None if args.separate_colors is not None: levels = infer_level_mapping( rep_names, list(map(str.strip, args.separate_colors.split(',')))) # resolve kernel_kwargs kernel_kwargs = {} if args.degree is not None: kernel_kwargs['degree'] = args.degree if args.gamma is not None: kernel_kwargs['gamma'] = args.gamma if args.bias_term is not None: kernel_kwargs['coef0'] = args.bias_term # perform PCA proj, pve, pcs = compute_pca_from_counts_superdict( counts_superdict, rep_order=expanded_infiles, scaled=args.scale, logged=args.log, kernel=args.kernel, kernel_kwargs=kernel_kwargs, variant=args.algorithm, pf=args.polynomial_features) # write csv if args.csv is not None: write_pca(args.csv, proj, rep_names=rep_names, pve=pve) # plot if args.image is not None: if args.n_components is not None: plot_multi_pca(proj, pcs=args.n_components, labels=rep_names, levels=levels, outfile=args.image) else: plot_pca(proj, pcs=(args.x_component, args.y_component), labels=rep_names, levels=levels, outfile=args.image)