Source code for lib5c.tools.pca
import argparse
from lib5c.tools.parents import primerfile_parser
[docs]def add_pca_tool(parser):
pca_parser = parser.add_parser(
'pca',
prog='lib5c plot pca',
help='perform principal component analysis',
parents=[primerfile_parser]
)
pca_parser.add_argument(
'-c', '--csv',
type=str,
help='''Specify an output file to write output as csv.''')
pca_parser.add_argument(
'-i', '--image',
type=str,
help='''Specify an output file to write output as image.''')
pca_parser.add_argument(
'-s', '--separate_colors',
type=str,
help='''Specify a shell-quoted, comma-separated list of class names
(which must be substrings of the replicate names) to color-code the
output with. For example, 'ES,NPC'.''')
pca_parser.add_argument(
'-x', '--x_component',
type=int,
default=0,
help='''Principal component to plot on the x-axis (zero-indexed). The
default is 0.''')
pca_parser.add_argument(
'-y', '--y_component',
type=int,
default=1,
help='''Principal component to plot on the y-axis (zero-indexed). The
default is 1.''')
pca_parser.add_argument(
'-n', '--n_components',
type=int,
help='''Specify a number of components to plot in a pairwise grid
instead of specifying only one x_component, y_component pair.''')
pca_parser.add_argument(
'-S', '--scale',
action='store_true',
help='''Pass this flag to scale the features to unit variance.''')
pca_parser.add_argument(
'-L', '--log',
action='store_true',
help='''Pass this flag to log the features.''')
pca_parser.add_argument(
'-a', '--algorithm',
type=str,
default='pca',
choices=['pca', 'ica', 'fa', 'mds'],
help='''Specify what algorithm to use. The default is 'pca' for standard
principal component analysis.''')
pca_parser.add_argument(
'-f', '--polynomial_features',
type=int,
default=1,
help='''Specify the degree of pure polynomial features to add to the
design matrix. The default is 1, which includes only the original,
linear features.''')
pca_parser.add_argument(
'-k', '--kernel',
type=str,
choices=['none', 'linear', 'poly', 'rbf', 'sigmoid', 'cosine'],
help='''Specify a kernel to perform kernel PCA with. The default is
'none', which does not perform kernel PCA.''')
pca_parser.add_argument(
'-d', '--degree',
type=int,
default=3,
help='''Specify the degree to use when performing polynomial KPCA. The
default is 3.''')
pca_parser.add_argument(
'-g', '--gamma',
type=float,
help='''Specify the value of the kernel coefficient gamma to use when
performing radial basis function or polynomial KPCA. The default is to
use 1/n_features.''')
pca_parser.add_argument(
'-b', '--bias_term',
type=float,
help='''Specify the independent term to use when performing polynomial
or sigmoid KPCA. The default is 1 for polynomial kernels and 0 for
sigmoid kernels.''')
pca_parser.add_argument(
'countsfiles',
type=str,
nargs=argparse.REMAINDER,
help='''Countsfiles to perform PCA on.''')
pca_parser.set_defaults(func=pca_tool)
[docs]def pca_tool(parser, args):
import glob
from lib5c.tools.helpers import resolve_primerfile, infer_replicate_names,\
infer_level_mapping
from lib5c.parsers.primers import load_primermap
from lib5c.parsers.counts import load_counts
from lib5c.algorithms.pca import compute_pca_from_counts_superdict
from lib5c.writers.pca import write_pca
from lib5c.plotters.pca import plot_pca, plot_multi_pca
# resolve primerfile
primerfile = resolve_primerfile(args.countsfiles, args.primerfile)
# expand infiles
expanded_infiles = []
for infile in args.countsfiles:
expanded_infiles.extend(glob.glob(infile.strip('\'"')))
# load counts
print('loading counts')
primermap = load_primermap(primerfile)
counts_superdict = {expanded_infile: load_counts(expanded_infile, primermap)
for expanded_infile in expanded_infiles}
# compute labels
rep_names = infer_replicate_names(
expanded_infiles, pattern=args.countsfiles[0]
if len(args.countsfiles) == 1 and '*' in args.countsfiles[0] else None)
# determine levels
levels = None
if args.separate_colors is not None:
levels = infer_level_mapping(
rep_names, list(map(str.strip, args.separate_colors.split(','))))
# resolve kernel_kwargs
kernel_kwargs = {}
if args.degree is not None:
kernel_kwargs['degree'] = args.degree
if args.gamma is not None:
kernel_kwargs['gamma'] = args.gamma
if args.bias_term is not None:
kernel_kwargs['coef0'] = args.bias_term
# perform PCA
proj, pve, pcs = compute_pca_from_counts_superdict(
counts_superdict, rep_order=expanded_infiles, scaled=args.scale,
logged=args.log, kernel=args.kernel, kernel_kwargs=kernel_kwargs,
variant=args.algorithm, pf=args.polynomial_features)
# write csv
if args.csv is not None:
write_pca(args.csv, proj, rep_names=rep_names, pve=pve)
# plot
if args.image is not None:
if args.n_components is not None:
plot_multi_pca(proj, pcs=args.n_components, labels=rep_names,
levels=levels, outfile=args.image)
else:
plot_pca(proj, pcs=(args.x_component, args.y_component),
labels=rep_names, levels=levels, outfile=args.image)