"""
Module for trimming low or "dead" 5C fragments away from 5C datasets.
"""
import numpy as np
from lib5c.util.parallelization import parallelize_regions
[docs]@parallelize_regions
def trim_primers(primermap, counts_superdict, min_sum=100.0, min_frac=0.5):
"""
Trim a primermap using counts information from many replicates.
Parameters
----------
primermap : List[Dict[str, Any]]
The primermap to trim. See ``lib5c.parsers.primers.get_primermap()``.
counts_superdict : Dict[str, np.ndarray]
The keys are replicate names, the values are the counts for that rep.
min_sum : Optional[float]
Primers with a total cis sum lower than this value will be trimmed.
min_frac : Optional[float]
Primers with fewer than this fraction of nonzero interactions out of all
their finite interactions will be trimmed.
Returns
-------
Tuple[List[Dict[str, Any]], Set[int]]
The first element is the trimmed primermap, the second is the set of
indices of the original primermap which were removed.
"""
# keep track of removed indices
removed_indices = set()
# each replicate gets its own chance to remove each primer
for rep in counts_superdict.keys():
# check min_sum
if min_sum:
for i in range(len(primermap)):
column_sum = np.nansum(counts_superdict[rep][:, i])
if column_sum < min_sum:
removed_indices.add(i)
# check min_frac
if min_frac:
for i in range(len(primermap)):
column = counts_superdict[rep][:, i]
total_interactions = np.count_nonzero(~np.isnan(column))
nonzero_interactions = np.count_nonzero(
column[np.isfinite(column)])
fraction = nonzero_interactions / float(total_interactions)
if fraction < min_frac:
removed_indices.add(i)
# construct trimmed primermap
trimmed_primermap = [primermap[i]
for i in range(len(primermap))
if i not in removed_indices]
return trimmed_primermap, removed_indices
[docs]@parallelize_regions
def wipe_counts(counts, indices, wipe_value=np.nan):
"""
Wipes specified rows and columns of the counts matrix with a specified
value.
Parameters
----------
counts : np.ndarray
The square symmetric counts matrix to wipe.
indices : Iterable[int]
The indices of the rows and columns to wipe.
wipe_value : Optional[float]
The value to wipe the selected indices with.
Returns
-------
np.ndarray
The wiped counts matrix.
"""
for i in indices:
counts[:, i] = wipe_value
counts[i, :] = wipe_value
return counts
[docs]@parallelize_regions
def trim_counts(counts, indices):
"""
Removes specified rows and columns from the counts matrix.
Parameters
----------
counts : np.ndarray
The square symmetric counts matrix to trim.
indices : Iterable[int]
The indices to wipe
Returns
-------
np.ndarray
The trimmed counts matrix.
"""
indices = list(indices)
counts = np.delete(counts, indices, axis=0)
counts = np.delete(counts, indices, axis=1)
return counts
[docs]def wipe_counts_superdict(counts_superdict, indices, wipe_value=np.nan):
"""
Applies ``wipe_counts()`` to each replicate in a ``counts_superdict``.
Parameters
----------
counts_superdict : Dict[str, np.ndarray]
The keys are replicate names, the values are the counts for that rep.
indices : Iterable[int]
The indices to wipe
wipe_value : Optional[float]
The value to wipe the selected indices with.
Returns
-------
Dict[str, np.ndarray]
The keys are replicate names, the values are the wiped counts for that
rep.
"""
return {rep: wipe_counts(counts_superdict[rep], indices, wipe_value)
for rep in counts_superdict.keys()}
[docs]def trim_counts_superdict(counts_superdict, indices):
"""
Applies ``trim_counts()`` to each replicate in a ``counts_superdict``.
Parameters
----------
counts_superdict : Dict[str, np.ndarray]
The keys are replicate names, the values are the counts for that rep.
indices : Iterable[int]
The indices to trim.
Returns
-------
Dict[str, np.ndarray]
The keys are replicate names, the values are the trimmed counts for that
rep.
"""
return {rep: trim_counts(counts_superdict[rep], indices)
for rep in counts_superdict.keys()}