Source code for lib5c.algorithms.pca

import numpy as np
from sklearn.decomposition import PCA, KernelPCA, FastICA, FactorAnalysis
from sklearn.manifold import MDS
from sklearn.preprocessing import scale

from lib5c.util.counts_superdict import counts_superdict_to_matrix
from lib5c.algorithms.correlation import \
    make_pairwise_correlation_matrix_from_counts_matrix


[docs]def compute_pca_from_counts_superdict(counts_superdict, rep_order=None, **kwargs): """ Convenience function for performing PCA on a counts superdict data structure. Parameters ---------- counts_superdict : Dict[str, Dict[str, np.ndarray]] The counts superdict structure to compute PCA on. rep_order : Optional[List[str]] The order in which the replicates in ``counts_superdict`` should be considered when filling in the rows of the design matrix. kwargs : Dict[str, Any] Additional kwargs to be passed to ``compute_pca()``. Returns ------- Tuple[np.ndarray] The first element is the matrix of PCA-projected replicates. The second element is the PVE for each component, or None if the PCA method selected doesn't provide a PVE estimate. The third element is a matrix of the principle component vectors, or None if the PCA method selected doesn't provide a set of principle component vectors. """ matrix = counts_superdict_to_matrix(counts_superdict, rep_order=rep_order, discard_nan=True) return compute_pca(matrix, **kwargs)
[docs]def compute_pca(matrix, scaled=True, logged=False, kernel=None, kernel_kwargs=None, variant='pca', pf=1): """ Performs PCA on a matrix. Parameters ---------- matrix : np.ndarray The design matrix, whose rows are observations (replicates) and whose columns are features (interaction values at each position). scaled : bool Pass True to scale the features to unit variance. logged : bool Pass True to log the features before PCA. kernel : Optional[str] Pass a kernel accepted by ``sklearn.decomposition.KernelPCA()`` to perform KPCA. kernel_kwargs : Optional[Dict[str, Any]] Kwargs to use for the kernel. variant : {'pca', 'ica', 'fa', 'mds'} Select which variant of PCA to use. pf : int Specify an integer number of pure polynomial features to use in the PCA. Returns ------- Tuple[np.ndarray] The first element is the matrix of PCA-projected replicates. The second element is the PVE for each component, or None if the PCA method selected doesn't provide a PVE estimate. The third element is a matrix of the principle component vectors, or None if the PCA method selected doesn't provide a set of principle component vectors. """ # adjust matrix if variant != 'pca': matrix = matrix[:, ~np.all(matrix == matrix[0], axis=0)] if pf != 1: matrix = np.concatenate([np.power(matrix, p) for p in range(1, pf + 1)], axis=1) if logged: matrix = np.log2(matrix + 1) if scaled: matrix = scale(matrix) if variant == 'pca': print('performing pca') if kernel is not None: default_kernel_kwargs = {'degree': 3, 'gamma': 1.0 / matrix.shape[1], 'coef0': 0 if kernel == 'sigmoid' else 1} if kernel_kwargs is None: kernel_kwargs = {} default_kernel_kwargs.update(kernel_kwargs) kernel_kwargs = default_kernel_kwargs kernel_kwargs['n_components'] = len(matrix) kernel_kwargs['kernel'] = kernel pca_object = KernelPCA(**kernel_kwargs) proj = pca_object.fit(matrix).transform(matrix) pve = None pcs = pca_object.alphas_ else: pca_object = PCA() proj = pca_object.fit(matrix).transform(matrix) pve = pca_object.explained_variance_ratio_ pcs = pca_object.components_ elif variant == 'ica': print('performing ica') ica_object = FastICA() proj = ica_object.fit(matrix).transform(matrix) pve = None pcs = ica_object.components_ elif variant == 'fa': print('performing factor analysis') fa_object = FactorAnalysis(n_components=len(matrix), svd_method='lapack') proj = fa_object.fit(matrix).transform(matrix) pve = None pcs = fa_object.components_ elif variant == 'mds': print('performing multidimensional scaling') dissimilarity = 1 - make_pairwise_correlation_matrix_from_counts_matrix( matrix) mds_object = MDS(dissimilarity='precomputed') proj = mds_object.fit_transform(dissimilarity) pve = None pcs = None else: raise ValueError('variant %s not supported' % variant) return proj, pve, pcs