Source code for lib5c.algorithms.trimming

"""
Module for trimming low or "dead" 5C fragments away from 5C datasets.
"""

import numpy as np

from lib5c.util.parallelization import parallelize_regions


@parallelize_regions
def trim_primers(primermap, counts_superdict, min_sum=100.0, min_frac=0.5):
    """
    Trim a primermap using counts information from many replicates.

    Every replicate is checked independently: a primer is removed as soon as
    any single replicate fails the ``min_sum`` or the ``min_frac`` criterion
    for it.

    Parameters
    ----------
    primermap : List[Dict[str, Any]]
        The primermap to trim. See ``lib5c.parsers.primers.get_primermap()``.
    counts_superdict : Dict[str, np.ndarray]
        The keys are replicate names, the values are the counts for that rep.
        Column ``i`` of each matrix is evaluated for ``primermap[i]``.
    min_sum : Optional[float]
        Primers with a total cis sum lower than this value will be trimmed.
        NaN entries are ignored when summing. Pass a falsy value (``None`` or
        0) to skip this check.
    min_frac : Optional[float]
        Primers with fewer than this fraction of nonzero interactions out of
        all their finite interactions will be trimmed. A primer with no
        finite interactions at all is treated as having a fraction of zero.
        Pass a falsy value (``None`` or 0) to skip this check.

    Returns
    -------
    Tuple[List[Dict[str, Any]], Set[int]]
        The first element is the trimmed primermap, the second is the set of
        indices of the original primermap which were removed.

    Raises
    ------
    IndexError
        If a counts matrix has fewer columns than there are primers.
    """
    n_primers = len(primermap)

    # keep track of removed indices
    removed_indices = set()

    # each replicate gets its own chance to remove each primer
    for counts in counts_superdict.values():
        if counts.shape[1] < n_primers:
            raise IndexError(
                'counts matrix has %d columns but primermap has %d primers'
                % (counts.shape[1], n_primers))

        # only the columns that have a matching primermap entry are checked
        columns = counts[:, :n_primers]

        # check min_sum
        if min_sum:
            column_sums = np.nansum(columns, axis=0)
            removed_indices.update(
                np.flatnonzero(column_sums < min_sum).tolist())

        # check min_frac
        if min_frac:
            finite = np.isfinite(columns)
            total_interactions = finite.sum(axis=0)
            nonzero_interactions = (finite & (columns != 0)).sum(axis=0)
            # A column with no finite entries has total == nonzero == 0.
            # Clamping the denominator to 1 gives it a fraction of 0 instead
            # of dividing by zero.
            fraction = (nonzero_interactions /
                        np.maximum(total_interactions, 1).astype(float))
            removed_indices.update(
                np.flatnonzero(fraction < min_frac).tolist())

    # construct trimmed primermap
    trimmed_primermap = [primer for i, primer in enumerate(primermap)
                         if i not in removed_indices]

    return trimmed_primermap, removed_indices
@parallelize_regions
def wipe_counts(counts, indices, wipe_value=np.nan):
    """
    Overwrite selected rows and columns of a counts matrix with one value.

    The matrix passed in is modified in place; the same object is returned
    for convenience.

    Parameters
    ----------
    counts : np.ndarray
        The square symmetric counts matrix to wipe.
    indices : Iterable[int]
        Positions whose entire row and entire column should be overwritten.
    wipe_value : Optional[float]
        The value written into every wiped cell.

    Returns
    -------
    np.ndarray
        The wiped counts matrix.
    """
    for position in indices:
        # blank out the column first, then the matching row
        counts[:, position] = wipe_value
        counts[position, :] = wipe_value

    return counts
@parallelize_regions
def trim_counts(counts, indices):
    """
    Drop selected rows and columns from a counts matrix.

    The same set of positions is deleted along both axes. The input matrix is
    left untouched because ``np.delete()`` returns a new array.

    Parameters
    ----------
    counts : np.ndarray
        The square symmetric counts matrix to trim.
    indices : Iterable[int]
        Positions of the rows and columns to remove.

    Returns
    -------
    np.ndarray
        The trimmed counts matrix.
    """
    # materialise once so the same positions can be used for both axes
    positions = list(indices)
    without_rows = np.delete(counts, positions, axis=0)
    return np.delete(without_rows, positions, axis=1)
def wipe_counts_superdict(counts_superdict, indices, wipe_value=np.nan):
    """
    Applies ``wipe_counts()`` to each replicate in a ``counts_superdict``.

    Parameters
    ----------
    counts_superdict : Dict[str, np.ndarray]
        The keys are replicate names, the values are the counts for that rep.
    indices : Iterable[int]
        The indices to wipe. A one-shot iterator (e.g. a generator) is
        accepted; it is consumed once and reused for every replicate.
    wipe_value : Optional[float]
        The value to wipe the selected indices with.

    Returns
    -------
    Dict[str, np.ndarray]
        The keys are replicate names, the values are the wiped counts for that
        rep. The dict itself is new; ``wipe_counts()`` assigns into the array
        it receives, so the input matrices are presumably modified in place
        (NOTE(review): confirm the ``parallelize_regions`` wrapper does not
        copy).
    """
    # An iterator is its own ``iter()``. Left as-is it would be drained by the
    # first replicate, leaving every later replicate unwiped. Containers and
    # mappings are passed through unchanged.
    if iter(indices) is indices:
        indices = list(indices)

    return {rep: wipe_counts(counts, indices, wipe_value)
            for rep, counts in counts_superdict.items()}
def trim_counts_superdict(counts_superdict, indices):
    """
    Applies ``trim_counts()`` to each replicate in a ``counts_superdict``.

    Parameters
    ----------
    counts_superdict : Dict[str, np.ndarray]
        The keys are replicate names, the values are the counts for that rep.
    indices : Iterable[int]
        The indices to trim. A one-shot iterator (e.g. a generator) is
        accepted; it is consumed once and reused for every replicate.

    Returns
    -------
    Dict[str, np.ndarray]
        The keys are replicate names, the values are the trimmed counts for
        that rep.
    """
    # An iterator is its own ``iter()``. Left as-is it would be drained by the
    # first replicate, so later replicates would come back untrimmed and with
    # a different shape. Containers and mappings are passed through unchanged.
    if iter(indices) is indices:
        indices = list(indices)

    return {rep: trim_counts(counts, indices)
            for rep, counts in counts_superdict.items()}