# Source code for lib5c.structures.dataset

"""
Module for the Dataset class, which provides a wrapper around a pandas DataFrame
allowing for representation of 5C data across replicates and stages of data
processing both on disk and in memory.
"""

import os
import inspect

import numpy as np
import pandas as pd

from lib5c.parsers.primers import load_primermap
from lib5c.parsers.util import null_value
from lib5c.writers.primers import write_primermap
from lib5c.util.system import check_outdir
from lib5c.util.bed import get_mid_to_mid_distance
from import infer_level_mapping

class Dataset(object):
    """
    Wrapper around a Pandas DataFrame.

    Attributes
    ----------
    df : pd.DataFrame
        Contains the core data in the Dataset. Columns should be either not
        hierarchical, or hierarchical with the lower level of the hierarchy
        matching the replicate names. The row index of this DataFrame must be
        '<upstream_fragment_name>_<downstream_fragment_name>'.
    pixelmap : pixelmap, optional
        A pixelmap to provide information about the fragments.
    reverse_pixelmap : dict or None
        Maps each fragment name to a ``(region, index)`` tuple locating it in
        ``pixelmap``. None when no pixelmap is bound.
    repinfo : pd.DataFrame, optional
        Its row index should be the replicate names, its columns can provide
        arbitrary information about each replicate, such as its condition,
        etc. A 'condition' column is read by the constructor.
    reps : pd.Index or None
        The row index of ``repinfo``, or None when no repinfo is bound.
    conditions : list or None
        Sorted unique values of ``repinfo['condition']``, or None.
    cond_reps : dict or None
        Maps each condition to the index of the replicates in that condition,
        or None.
    """

    def __init__(self, df, pixelmap=None, repinfo=None):
        """
        Base constructor.

        When a pixelmap is passed, the reverse pixelmap is built and 'region'
        and 'distance' columns are added to ``df`` in place if they are not
        already present.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe that makes up the Dataset.
        pixelmap : pixelmap, optional
            A pixelmap to bind to the Dataset.
        repinfo : repinfo-style pd.Dataframe, optional
            Repinfo to bind to the Dataset.
        """
        self.df = df
        self.pixelmap = pixelmap
        self.reverse_pixelmap = None
        self.repinfo = repinfo

        if repinfo is None:
            self.reps = None
            self.conditions = None
            self.cond_reps = None
        else:
            self.reps = repinfo.index
            self.conditions = sorted(repinfo['condition'].unique())
            self.cond_reps = {
                cond: repinfo[repinfo['condition'] == cond].index
                for cond in self.conditions}

        if pixelmap is not None:
            self._add_reverse_pixelmap()
            self._add_region_column()
            self._add_distance_column()

    def save(self, filename, sep=None):
        """
        Writes this Dataset to disk as a .csv/.tsv, and optionally writes the
        pixelmap and/or repinfo files to disk right next to it if either or
        both of these data structures exist in the Dataset.

        The pixelmap is written to '<base>_map.bed' and the repinfo to
        '<base>_repinfo<ext>', where '<base>' and '<ext>' come from splitting
        the extension off ``filename``.

        Parameters
        ----------
        filename : str
            The filename to write to.
        sep : str, optional
            The separator to use when writing the file. If ``filename`` ends
            with .csv or .tsv and ``sep`` is None, the separator will be
            determined automatically by the extension, but you can pass a
            value here to override it.
        """
        # check outdir
        check_outdir(filename)

        # resolve sep
        sep = self._resolve_sep(filename, sep)

        # base filename will be useful
        base_fname, ext = os.path.splitext(filename)

        # write pixelmap if we have one
        if self.pixelmap is not None:
            pixelmap_fname = '%s_map.bed' % base_fname
            # the extra columns are inferred from the first fragment of the
            # first region; sorted so that the output is deterministic
            first_region = list(self.pixelmap.keys())[0]
            extra_column_names = sorted(
                set(self.pixelmap[first_region][0].keys()) -
                {'chrom', 'start', 'end', 'name', 'region', 'number'})
            write_primermap(self.pixelmap, pixelmap_fname, extra_column_names)

        # write repinfo if we have one
        if self.repinfo is not None:
            repinfo_fname = '%s_repinfo%s' % (base_fname, ext)
            self.repinfo.to_csv(repinfo_fname, sep=sep)

        # write the main dataframe
        self.df.to_csv(filename, sep=sep)

    @classmethod
    def load(cls, filename, sep=None):
        """
        Loads a Dataset from disk.

        Parameters
        ----------
        filename : str
            The .csv or .tsv file to load the Dataset from. If a pixelmap or
            repinfo file is found next to this file, these files will also be
            loaded into the Dataset.
        sep : str, optional
            The separator to use when parsing the .csv/.tsv. Pass None to
            deduce this automatically from the file extension.

        Returns
        -------
        Dataset
            The loaded Dataset.
        """
        # resolve sep
        sep = cls._resolve_sep(filename, sep)

        # base filename will be useful
        base_fname, ext = os.path.splitext(filename)

        # load core dataframe; blank header cells come back from the parser
        # with 'Unnamed...' labels, which are mapped back to ''
        df = pd.read_csv(filename, sep=sep, header=[0, 1], index_col=0)
        df.rename(columns=lambda x: '' if 'Unnamed' in x else x, inplace=True)

        # these are the names supplementary files should have if they exist
        pixelmap_fname = '%s_map.bed' % base_fname
        repinfo_fname = '%s_repinfo%s' % (base_fname, ext)

        # load pixelmap if it exists
        pixelmap = None
        if os.path.exists(pixelmap_fname):
            pixelmap = load_primermap(pixelmap_fname)

        # load repinfo if it exists
        repinfo = None
        if os.path.exists(repinfo_fname):
            repinfo = pd.read_csv(repinfo_fname, sep=sep, index_col=0)
            # NOTE(review): the right-hand side of this assignment was lost
            # from the source; casting the replicate names to str keeps them
            # comparable with the column labels of df, which are parsed from
            # the header rows as strings - confirm against upstream
            repinfo.index = repinfo.index.astype(str)

        # return the loaded instance
        return cls(df, pixelmap=pixelmap, repinfo=repinfo)

    @classmethod
    def from_table_file(cls, table_file, name='counts', sep=None,
                        pixelmap=None, repinfo=None):
        """
        Creates a Dataset from a table file.

        The first column of the table file should be a FFLJ ID. The remaining
        columns should be count values for each replicate. The first row
        should specify the replicate names for each column.

        Parameters
        ----------
        table_file : str
            The table file to read counts from.
        name : str
            Top-level column name for the data.
        sep : str
            The separator to use when parsing the table file. '\\t' for tsv
            tables, ',' for csv tables. Pass None to guess this from the
            filename.
        pixelmap : pixelmap, optional
            A pixelmap to bind to the Dataset.
        repinfo : repinfo-style pd.Dataframe, optional
            Repinfo to bind to the Dataset.

        Returns
        -------
        Dataset
            The new Dataset.
        """
        # resolve sep
        sep = cls._resolve_sep(table_file, sep)

        # read csv
        df = pd.read_csv(table_file, sep=sep, index_col=0)

        # make column hierarchy
        df.columns = pd.MultiIndex.from_arrays(
            [[name] * len(df.columns), df.columns])

        # return new Dataset
        return cls(df, pixelmap=pixelmap, repinfo=repinfo)

    @classmethod
    def from_counts_superdict(cls, counts_superdict, pixelmap, name='counts',
                              repinfo=None, rep_order=None):
        """
        Creates a Dataset from a counts_superdict and associated pixelmap.

        One row is created for every pair of fragment indices ``(i, j)`` with
        ``j <= i`` within each region.

        Parameters
        ----------
        counts_superdict : counts_superdict
            Contains the data that will be put into the Dataset.
        pixelmap : pixelmap
            Needed to establish the row index on the Dataset.
        name : str
            Top-level column name for the data.
        repinfo : repinfo-style pd.Dataframe or list of str, optional
            Repinfo to bind to the Dataset. Pass a list of condition names to
            automatically create a repinfo object.
        rep_order : list of str, optional
            Pass this to guarantee the order of the columns for the
            replicates. Pass None to use the sorted replicate names.

        Returns
        -------
        Dataset
            The new Dataset.
        """
        # resolve rep_order
        if rep_order is None:
            rep_order = sorted(counts_superdict.keys())

        # establish regions
        region_order = list(counts_superdict[rep_order[0]].keys())

        # parallel lists to be filled in by the loop
        rows = []
        fflj_ids = []

        # loop to fill in the parallel lists
        for region in region_order:
            fragments = pixelmap[region]
            for i in range(len(fragments)):
                for j in range(i + 1):
                    fflj_ids.append(
                        '%s_%s' % (fragments[i]['name'], fragments[j]['name']))
                    rows.append({rep: counts_superdict[rep][region][i, j]
                                 for rep in rep_order})

        # use the parallel lists to create a dataframe with fflj_id index;
        # columns is passed explicitly so that rep_order is always honored
        df = pd.DataFrame(rows, index=pd.Series(fflj_ids, name='fflj_id'),
                          columns=rep_order)

        # make column hierarchy
        df.columns = pd.MultiIndex.from_arrays(
            [[name] * len(df.columns), df.columns])

        # resolve repinfo: anything that is not already a DataFrame is treated
        # as a list of condition names (indexing a DataFrame with [0] to sniff
        # its type would raise a KeyError)
        if repinfo is not None and not isinstance(repinfo, pd.DataFrame):
            repinfo = cls._make_repinfo(rep_order, list(repinfo))

        # return new Dataset
        return cls(df, pixelmap=pixelmap, repinfo=repinfo)

    def _add_reverse_pixelmap(self):
        """
        Reverses self.pixelmap, binding the result to self.reverse_pixelmap.

        Raises
        ------
        ValueError
            If no pixelmap is bound to this Dataset.
        """
        if self.pixelmap is None:
            raise ValueError('Dataset must have pixelmap bound to add reverse '
                             'pixelmap')
        self.reverse_pixelmap = {
            fragment['name']: (region, i)
            for region in self.pixelmap
            for i, fragment in enumerate(self.pixelmap[region])}

    def _add_region_column(self):
        """
        Adds a 'region' column to self.df if one doesn't exist yet.

        Assumes that all interactions are cis: the region is looked up from
        the first fragment of each FFLJ ID only.

        Raises
        ------
        ValueError
            If no reverse pixelmap is bound to this Dataset.
        """
        # check if column already exists
        if 'region' in self.df.columns:
            return

        # check for reverse_pixelmap
        if self.reverse_pixelmap is None:
            raise ValueError('Dataset must have reverse_pixelmap bound to add '
                             'region column')

        self.df['region'] = [
            self.reverse_pixelmap[self._split_index(fflj_id)[0]][0]
            for fflj_id in self.df.index]

    def _distance(self, fflj_id):
        """
        Returns the mid-to-mid distance of an interaction given its FFLJ ID.

        Parameters
        ----------
        fflj_id : str
            The FFLJ ID of the interaction.

        Returns
        -------
        int
            Its mid-to-mid interaction distance in units of base pairs.

        Raises
        ------
        ValueError
            If no reverse pixelmap is bound to this Dataset.
        """
        # check for reverse_pixelmap
        if self.reverse_pixelmap is None:
            raise ValueError('Dataset must have reverse_pixelmap bound to '
                             'compute interaction distances')

        left_name, right_name = self._split_index(fflj_id)
        left_region, left_index = self.reverse_pixelmap[left_name]
        right_region, right_index = self.reverse_pixelmap[right_name]
        return get_mid_to_mid_distance(self.pixelmap[left_region][left_index],
                                       self.pixelmap[right_region][right_index])

    def _add_distance_column(self):
        """
        Adds a 'distance' column to self.df if one doesn't exist yet.

        The values are the mid-to-mid distances computed by ``_distance()``
        for every row of self.df.
        """
        # check if column already exists
        if 'distance' in self.df.columns:
            return

        self.df['distance'] = [self._distance(fflj_id)
                               for fflj_id in self.df.index]

    def add_column_from_counts(self, counts, name):
        """
        Adds a new column to this Dataset's df.

        The counts dict passed is assumed to match the pixelmap bound on this
        Dataset. If no pixelmap is bound, a ValueError will be raised.

        Parameters
        ----------
        counts : dict of np.ndarray
            Should contain the values that will make up the new column.
        name : str
            The name of the new column.

        Raises
        ------
        ValueError
            If no reverse pixelmap is bound to this Dataset.
        """
        # check for reverse_pixelmap
        if self.reverse_pixelmap is None:
            raise ValueError('Dataset must have reverse_pixelmap bound to '
                             'add column from counts')

        # deduce dtype from the first matrix in the counts dict
        dtype = counts[list(counts.keys())[0]].dtype

        def lookup(fflj_id):
            # split index
            left_name, right_name = self._split_index(fflj_id)

            # look up names in the reverse pixelmap
            region, left_index = self.reverse_pixelmap[left_name]
            other_region, right_index = self.reverse_pixelmap[right_name]

            # trans contacts get the null value for this dtype
            if region != other_region:
                return null_value(dtype)

            return counts[region][left_index, right_index]

        self.df[name] = np.array([lookup(fflj_id) for fflj_id in self.df.index],
                                 dtype=dtype)

    def add_columns_from_counts_superdict(self, counts_superdict, name,
                                          rep_order=None):
        """
        Adds a new group of columns to the Dataset from a counts superdict
        structure.

        Parameters
        ----------
        counts_superdict : dict of dict of np.ndarray
            The outer keys are replicate names as strings, the inner keys are
            region names as strings, and the values are square, symmetric
            arrays of values for each replicate and region.
        name : str
            The name to use for the new group of columns.
        rep_order : list of str, optional
            Pass a list of replicate names to load the listed replicates in a
            specific order. Pass None to use the iteration order of the outer
            keys of ``counts_superdict``.
        """
        # resolve rep_order
        if rep_order is None:
            rep_order = list(counts_superdict.keys())

        for rep in rep_order:
            self.add_column_from_counts(counts_superdict[rep], (name, rep))

    def select(self, name='counts', rep=None, region=None):
        """
        Get a subset of this Dataset's DataFrame corresponding to a desired
        column, replicate, and/or region.

        Parameters
        ----------
        name : str
            The column name of a hierarchical or non-hierarchical column.
        rep : str, optional
            If ``name`` refers to a hierarchical column, you must specify which
            replicate you want to select data from by passing its name here.
        region : str, optional
            To select data from only one region, pass its name here. Pass None
            to select data from all regions.

        Returns
        -------
        pd.Series or pd.DataFrame
            The result of ``self.df.loc[rows, columns]`` for the selection.
        """
        if region:
            idx = self.df[self.df['region'] == region].index
        else:
            idx = self.df.index
        cols = (name, rep) if rep else name
        return self.df.loc[idx, cols]

    def counts(self, name='counts', rep=None, region=None, fill_value=None,
               dtype=None):
        """
        Converts this Dataset to a regional_counts matrix, a counts dict, a
        counts_superdict, or a regional_counts_superdict.

        Parameters
        ----------
        name : str
            The top-level column name to extract.
        rep : str, optional
            If name corresponds to a hierarchical column, pass a rep name to
            extract only one rep (return type will be a counts dict). Pass None
            to return a counts_superdict with all reps. If name corresponds to
            a normal column, this kwarg will be ignored.
        region : str, optional
            Pass a region name as a string to extract data for only one region.
            If name corresponds to a hierarchical column and rep was not
            passed, the return type will be a regional_counts_superdict.
            Otherwise, the return type will be a regional_counts matrix. Pass
            None to extract data for all regions.
        fill_value : any, optional
            The fill value for the counts_superdict (for entries not present in
            the Dataset). Pass None to pick a default from the dtype: '' for
            string dtypes, 0 for integer dtypes, False for bool, and np.nan
            otherwise.
        dtype : dtype, optional
            The dtype to use for the np.array's in the counts_superdict. Pass
            None to guess them from the Dataset. If ``str`` is passed, 'U25'
            will be assumed.

        Returns
        -------
        regional_counts matrix, counts dict, counts_superdict, or
        regional_counts_superdict
            The data requested. See Parameters for explanation of return type.
            The general philosophy is that a counts_superdict will be returned,
            but any single-key levels will be squeezed.

        Raises
        ------
        ValueError
            If no reverse pixelmap is bound to this Dataset.
        """
        # check for reverse_pixelmap
        if self.reverse_pixelmap is None:
            raise ValueError('Dataset must have reverse_pixelmap bound to '
                             'create counts_superdict')

        # check if name is hierarchical
        is_hierarchical = self._is_hierarchical(name)

        # short-circuit: function calls itself repeatedly over reps
        if is_hierarchical and rep is None:
            return {r: self.counts(name=name, rep=r, region=region,
                                   fill_value=fill_value, dtype=dtype)
                    for r in self.df[name].columns}

        # establish regions, honoring the region kwarg
        if region is None:
            region_order = list(self.pixelmap.keys())
        else:
            region_order = [region]

        # grab the relevant slice
        column = (name, rep) if is_hierarchical else name
        if region is None:
            df_slice = self.df[column]
        else:
            df_slice = self.df[self.df['region'] == region][column]

        # resolve dtype
        if dtype is None:
            dtype = df_slice.values.dtype
        # TODO: understand if this works in py3
        if dtype == str:
            dtype = 'U25'

        # resolve fill_value from the kind of the dtype, so that every integer
        # width (not only the platform default int) gets an integer fill
        if fill_value is None:
            kind = np.dtype(dtype).kind
            if kind in 'US':
                fill_value = ''
            elif kind in 'iu':
                fill_value = 0
            elif kind == 'b':
                fill_value = False
            else:
                fill_value = np.nan

        # set up counts dict: one square matrix per region
        counts = {}
        for r in region_order:
            size = len(self.pixelmap[r])
            counts[r] = np.full((size, size), fill_value, dtype=dtype)

        # fill counts dict; Series.items() is used because
        # Series.iteritems() was removed in pandas 2.0
        for fflj_id, value in df_slice.items():
            # resolve left and right fragment names
            left_name, right_name = self._split_index(fflj_id)

            # look up names in the reverse pixelmap
            left_region, left_index = self.reverse_pixelmap[left_name]
            right_region, right_index = self.reverse_pixelmap[right_name]

            # skip trans contacts
            if left_region != right_region:
                continue

            # fill value symmetrically
            counts[left_region][left_index, right_index] = value
            counts[left_region][right_index, left_index] = value

        # return just the counts for the requested region as a matrix if the
        # region kwarg passed; all other return type possibilities are handled
        # by the short-circuit above
        if region is not None:
            return counts[region]
        return counts

    @staticmethod
    def _split_index(index):
        """
        Splits a row index value (FFLJ ID) into the individual fragment names.

        Currently this assumes the separator is '_' and that each fragment name
        has an equal number of '_'s, but this function should be used to do the
        splitting in case this changes in the future.

        Parameters
        ----------
        index : str
            The row index value (FFLJ ID) to split.

        Returns
        -------
        (str, str)
            The names of the interacting fragments.
        """
        pieces = index.split('_')
        halfway = len(pieces) // 2
        return '_'.join(pieces[:halfway]), '_'.join(pieces[halfway:])

    @staticmethod
    def _resolve_sep(filename='', sep=None):
        """
        Utility method to resolve tabular file separator.

        Parameters
        ----------
        filename : str
            The filename.
        sep : str, optional
            The separator if one was specified, None otherwise.

        Returns
        -------
        str
            The resolved separator: ``sep`` if it was passed, ',' if
            ``filename`` ends with 'csv', and a tab otherwise.
        """
        if sep is not None:
            return sep
        if filename.endswith('csv') and not filename.endswith('tsv'):
            return ','
        return '\t'

    @staticmethod
    def _make_repinfo(reps, conditions):
        """
        Create a repinfo object given a list of rep names and condition names.

        Each replicate is assigned a condition using the mapping returned by
        ``infer_level_mapping(reps, conditions)``.

        Parameters
        ----------
        reps : list of str
            The replicate names as strings.
        conditions : list of str
            The condition names as strings.

        Returns
        -------
        pd.DataFrame
            The repinfo object, indexed by 'replicate' with a single
            'condition' column.
        """
        condition_map = infer_level_mapping(reps, conditions)
        rep_conditions = [condition_map[rep] for rep in reps]
        # materialize the zip: a bare iterator is not a reliable DataFrame
        # input across pandas versions
        return pd.DataFrame(list(zip(reps, rep_conditions)),
                            columns=['replicate', 'condition']) \
            .set_index('replicate')

    @staticmethod
    def _accepts_arg(fn, arg_name):
        """
        Checks whether a callable accepts a given named argument.

        Parameters
        ----------
        fn : Callable
            The callable to inspect.
        arg_name : str
            The argument name to look for.

        Returns
        -------
        bool
            True if ``fn`` declares a parameter called ``arg_name`` that can
            be passed by keyword, False otherwise (including when the
            signature of ``fn`` cannot be introspected).
        """
        try:
            signature = inspect.signature
        except AttributeError:
            # Python 2 has no inspect.signature
            try:
                return arg_name in inspect.getargspec(fn)[0]
            except TypeError:
                return False
        try:
            param = signature(fn).parameters.get(arg_name)
        except (TypeError, ValueError):
            # e.g. some builtins do not expose a signature
            return False
        return param is not None and param.kind in (
            param.POSITIONAL_OR_KEYWORD, param.KEYWORD_ONLY)

    def _is_hierarchical(self, column):
        """
        Checks whether selecting ``column`` from self.df yields an object with
        a ``columns`` attribute (i.e., a group of sub-columns rather than a
        single column).
        """
        return hasattr(self.df[column], 'columns')

    def _find_reps(self, inputs):
        """
        Returns the sub-columns of the first hierarchical column in ``inputs``.

        Raises
        ------
        ValueError
            If none of the input columns are hierarchical.
        """
        for column in inputs:
            selected = self.df[column]
            if hasattr(selected, 'columns'):
                return selected.columns
        raise ValueError('none of the input columns are hierarchical')

    def apply_per_region(self, fn, inputs, outputs, initial_values=0.0,
                         **kwargs):
        """
        Apply a function over the Dataset on a per-region basis.

        Parameters
        ----------
        fn : Callable
            The function to apply. It should take in pd.Series's or
            pd.DataFrames as its args, in the same order as inputs, and it
            should return 1D vectors, in the same order as outputs. If it
            accepts an argument called ``region``, the region name is passed
            to it by keyword.
        inputs : list of (str or tuple of str)
            The list of columns to pass as inputs to fn. Use a tuple of strings
            to access hierarchical columns. Omit the second level of a
            hierarchical column to pass all replicates to fn as a single
            pd.DataFrame. A single string or tuple will be wrapped in a list
            automatically.
        outputs : list of (str or tuple of str)
            Names of output columns to be added to the Dataset. Use a tuple of
            strings to create hierarchical columns.
        initial_values : list of any
            The values with which the new columns will be temporarily
            initialized. This should control the dtype of the new columns.
        kwargs : kwargs
            Passed through to ``fn``.
        """
        # promote all singleton args to list (tuples name hierarchical
        # columns, so only real lists are left alone)
        if not isinstance(inputs, list):
            inputs = [inputs]
        if not isinstance(outputs, list):
            outputs = [outputs]
        if not isinstance(initial_values, list):
            initial_values = [initial_values] * len(outputs)

        # initialize output columns
        for output, initial_value in zip(outputs, initial_values):
            self.df[output] = initial_value

        # decide once whether fn wants the region kwarg
        inject_region = self._accepts_arg(fn, 'region')

        # apply per region
        for region in self.pixelmap:
            k = dict(kwargs)
            if inject_region:
                k['region'] = region

            mask = self.df['region'] == region

            # call fn
            results = fn(*[self.df.loc[mask, column] for column in inputs],
                         **k)
            if type(results) not in [tuple, list]:
                results = [results]
            for i, output in enumerate(outputs):
                self.df.loc[mask, output] = results[i]

    def apply_per_replicate(self, fn, inputs, outputs, **kwargs):
        """
        Applies a function over the Dataset on a per-replicate basis.

        Parameters
        ----------
        fn : Callable
            The function to apply. It should take in pd.Series's as its args,
            in the same order as inputs, and it should return 1D vectors, in
            the same order as outputs. If it accepts an argument called
            ``rep``, the replicate name is passed to it by keyword.
        inputs : list of (str or tuple of str)
            The list of columns to pass as inputs to fn. Use a tuple of strings
            to access hierarchical columns. At least one input must refer to
            the top level of a hierarchical column, the first such column
            encountered will be used to determine the replicates to apply over.
            Non-hierarchical columns, or hierarchical columns fully specified
            by a tuple of strings will be broadcast across all replicates.
        outputs : list of str
            Names of top-level output columns to be added to the Dataset. The
            second level will be automatically filled in with the replicate
            names.
        kwargs : kwargs
            Passed through to ``fn``.

        Raises
        ------
        ValueError
            If none of the input columns are hierarchical.
        """
        # promote all singleton args to list
        if not isinstance(inputs, list):
            inputs = [inputs]
        if not isinstance(outputs, list):
            outputs = [outputs]

        # identify reps (sub-columns of first hierarchical column)
        reps = self._find_reps(inputs)
        hierarchical = [self._is_hierarchical(column) for column in inputs]
        inject_rep = self._accepts_arg(fn, 'rep')

        # apply per replicate
        for rep in reps:
            k = dict(kwargs)
            if inject_rep:
                k['rep'] = rep

            # call fn
            results = fn(*[self.df.loc[:, (column, rep)] if is_hier
                           else self.df.loc[:, column]
                           for column, is_hier in zip(inputs, hierarchical)],
                         **k)
            if type(results) not in [tuple, list]:
                results = [results]
            for i, output in enumerate(outputs):
                self.df.loc[:, (output, rep)] = results[i]

    def apply_per_replicate_per_region(self, fn, inputs, outputs,
                                       initial_values=0.0, **kwargs):
        """
        Applies a function over the Dataset on a per-replicate, per-region
        basis.

        Parameters
        ----------
        fn : Callable
            The function to apply. It should take in pd.Series's as its args,
            in the same order as inputs, and it should return 1D vectors, in
            the same order as outputs. If it accepts arguments called ``rep``
            and/or ``region``, the replicate and/or region name is passed to
            it by keyword.
        inputs : list of (str or tuple of str)
            The list of columns to pass as inputs to fn. Use a tuple of strings
            to access hierarchical columns. At least one input must refer to
            the top level of a hierarchical column, the first such column
            encountered will be used to determine the replicates to apply over.
            Non-hierarchical columns, or hierarchical columns fully specified
            by a tuple of strings will be broadcast across all replicates.
        outputs : list of str
            Names of top-level output columns to be added to the Dataset. The
            second level will be automatically filled in with the replicate
            names.
        initial_values : list of any
            The values with which the new columns will be temporarily
            initialized. This should control the dtype of the new columns.
        kwargs : kwargs
            Passed through to ``fn``.

        Raises
        ------
        ValueError
            If none of the input columns are hierarchical.
        """
        # promote inputs and outputs to list
        if not isinstance(inputs, list):
            inputs = [inputs]
        if not isinstance(outputs, list):
            outputs = [outputs]

        # identify reps and hierarchical columns
        reps = self._find_reps(inputs)
        hierarchical = [self._is_hierarchical(column) for column in inputs]
        inject_rep = self._accepts_arg(fn, 'rep')

        # loop over reps
        for rep in reps:
            k = dict(kwargs)
            if inject_rep:
                k['rep'] = rep

            # apply per region, with every hierarchical input and every output
            # pinned to the current replicate
            self.apply_per_region(
                fn,
                [(column, rep) if is_hier else column
                 for column, is_hier in zip(inputs, hierarchical)],
                [(output, rep) for output in outputs],
                initial_values=initial_values,
                **k)

    def apply_across_replicates(self, fn, inputs, outputs, **kwargs):
        """
        Applies a matrix-to-matrix function over the Dataset.

        This is useful for functions that don't operate independently on each
        replicate of the Dataset.

        The main advantage of this function is that it handles the unboxing of
        the replicates after a matrix-to-matrix function is applied. If you are
        looking to apply a matrix-to-vector function over the Dataset, you can
        do it with a one-liner, assigning the vector result(s) to the new
        column(s) immediately.

        Parameters
        ----------
        fn : Callable
            The function to apply. It should take in np.ndarrays as its inputs
            and return np.ndarrays with the same size and shape. If some inputs
            are specified as individual columns, they will be passed to fn as
            np.ndarrays shaped as column vectors.
        inputs : list of (str or tuple of str)
            The list of columns to pass as inputs to fn. Use a tuple of strings
            to access hierarchical columns. At least one input must refer to
            the top level of a hierarchical column, the first such column
            encountered will be used to determine the replicates to apply over.
            Non-hierarchical columns, or hierarchical columns fully specified
            by a tuple of strings will be passed to fn as column vectors.
        outputs : list of str
            Names of top-level output columns to be added to the Dataset. The
            second level will be automatically filled in with the replicate
            names.
        kwargs : kwargs
            Passed through to ``fn``.

        Raises
        ------
        ValueError
            If none of the input columns are hierarchical.
        """
        # promote all singleton args to list
        if not isinstance(inputs, list):
            inputs = [inputs]
        if not isinstance(outputs, list):
            outputs = [outputs]

        # identify reps (sub-columns of first hierarchical column)
        reps = self._find_reps(inputs)

        # apply fn: hierarchical inputs go in as matrices, everything else as
        # column vectors
        results = fn(*[self.df.loc[:, column].values
                       if self._is_hierarchical(column)
                       else self.df.loc[:, column].values[:, np.newaxis]
                       for column in inputs],
                     **kwargs)

        # promote results to list
        if type(results) not in [tuple, list]:
            results = [results]

        # unbox results: column j of each result matrix belongs to rep j
        for i, output in enumerate(outputs):
            for j, rep in enumerate(reps):
                self.df.loc[:, (output, rep)] = results[i][:, j]

    def apply_across_replicates_per_region(self, fn, inputs, outputs,
                                           initial_values=0.0, **kwargs):
        """
        Applies a matrix-to-matrix function over the Dataset in a per-region
        manner.

        This is useful for functions that don't operate independently on each
        replicate of the Dataset, but which operate independently on each
        region of the Dataset.

        The main advantage of this function is that it handles the unboxing of
        the replicates after a matrix-to-matrix function is applied. If you are
        looking to apply a matrix-to-vector function over the Dataset in a
        per-region manner, you can do it with apply_per_region(), feeding a
        hierarchical column as an input.

        Parameters
        ----------
        fn : Callable
            The function to apply. It should take in np.ndarrays as its inputs
            and return np.ndarrays with the same size and shape. If some inputs
            are specified as individual columns, they will be passed to fn as
            np.ndarrays shaped as column vectors. If it accepts an argument
            called ``region``, the region name is passed to it by keyword.
        inputs : list of (str or tuple of str)
            The list of columns to pass as inputs to fn. Use a tuple of strings
            to access hierarchical columns. At least one input must refer to
            the top level of a hierarchical column, the first such column
            encountered will be used to determine the replicates to apply over.
            Non-hierarchical columns, or hierarchical columns fully specified
            by a tuple of strings will be passed to fn as column vectors.
        outputs : list of str
            Names of top-level output columns to be added to the Dataset. The
            second level will be automatically filled in with the replicate
            names.
        initial_values : list of any
            The values with which the new columns will be temporarily
            initialized. This should control the dtype of the new columns.
        kwargs : kwargs
            Passed through to ``fn``.

        Raises
        ------
        ValueError
            If none of the input columns are hierarchical.
        """
        # promote all singleton args to list
        if not isinstance(inputs, list):
            inputs = [inputs]
        if not isinstance(outputs, list):
            outputs = [outputs]
        if not isinstance(initial_values, list):
            initial_values = [initial_values] * len(outputs)

        # identify reps (sub-columns of first hierarchical column)
        reps = self._find_reps(inputs)
        hierarchical = [self._is_hierarchical(column) for column in inputs]

        # initialize output columns
        for output, initial_value in zip(outputs, initial_values):
            for rep in reps:
                self.df[output, rep] = initial_value

        # decide once whether fn wants the region kwarg
        inject_region = self._accepts_arg(fn, 'region')

        # apply per region
        for region in self.pixelmap:
            k = dict(kwargs)
            if inject_region:
                k['region'] = region

            mask = self.df['region'] == region

            # call fn
            results = fn(*[self.df.loc[mask, column].values if is_hier
                           else self.df.loc[mask, column].values[:, np.newaxis]
                           for column, is_hier in zip(inputs, hierarchical)],
                         **k)
            if type(results) not in [tuple, list]:
                results = [results]
            for i, output in enumerate(outputs):
                for j, rep in enumerate(reps):
                    self.df.loc[mask, (output, rep)] = results[i][:, j]

    def dropna(self, name='counts', reps=None):
        """
        Drops NA's from the underlying dataframe.

        Parameters
        ----------
        name : str
            The name of the column to decide to drop based on.
        reps : list of str, optional
            If name refers to a hierarchical column, pass a list of rep names
            to only drop based on these reps. Pass None to drop based on the
            presence of an NA in any rep. If name does not refer to a
            hierarchical column this kwarg is ignored.
        """
        # check for hierarchical column
        if self._is_hierarchical(name):
            # resolve reps for hierarchical column
            if reps is None:
                reps = self.df[name].columns

            # generate subset
            subset = [(name, rep) for rep in reps]
        else:
            subset = [name]

        # drop
        self.df.dropna(subset=subset, inplace=True)