"""
Module for the Dataset class, which provides a wrapper around a pandas DataFrame
allowing for representation of 5C data across replicates and stages of data
processing both on disk and in memory.
"""
import os
import inspect
import numpy as np
import pandas as pd
from lib5c.parsers.primers import load_primermap
from lib5c.parsers.util import null_value
from lib5c.writers.primers import write_primermap
from lib5c.util.system import check_outdir
from lib5c.util.bed import get_mid_to_mid_distance
from lib5c.tools.helpers import infer_level_mapping
class Dataset(object):
    """
    Wrapper around a pandas DataFrame representing 5C data.

    Attributes
    ----------
    df : pd.DataFrame
        The core data of the Dataset. Columns are either flat or hierarchical;
        when hierarchical, the lower level of the hierarchy matches the
        replicate names. The row index must be
        '<upstream_fragment_name>_<downstream_fragment_name>'.
    pixelmap : pixelmap, optional
        Provides information about the fragments.
    repinfo : pd.DataFrame, optional
        Indexed by replicate name; columns carry arbitrary per-replicate
        information (e.g., condition).
    """
    def __init__(self, df, pixelmap=None, repinfo=None):
        """
        Base constructor.

        Parameters
        ----------
        df : pd.DataFrame
            The dataframe that makes up the Dataset.
        pixelmap : pixelmap, optional
            A pixelmap to bind to the Dataset.
        repinfo : repinfo-style pd.DataFrame, optional
            Repinfo to bind to the Dataset.
        """
        self.df = df
        self.pixelmap = pixelmap
        self.reverse_pixelmap = None
        self.repinfo = repinfo
        if repinfo is None:
            # no replicate information available
            self.reps = None
            self.conditions = None
            self.cond_reps = None
        else:
            self.reps = repinfo.index
            self.conditions = sorted(repinfo['condition'].unique())
            # map each condition to the index of replicates in that condition
            self.cond_reps = {
                condition: repinfo[repinfo['condition'] == condition].index
                for condition in self.conditions}
        if pixelmap is not None:
            # derive lookup structures and convenience columns from the map
            self._add_reverse_pixelmap()
            self._add_region_column()
            self._add_distance_column()
[docs] def save(self, filename, sep=None):
"""
Writes this Dataset to disk as a .csv/.tsv, and optionally writes the
pixelmap and/or repinfo files to disk right next to it if either or both
of these data structures exist in the Dataset.
Parameters
----------
filename : str
The filename to write to.
sep : str, optional
The separator to use when writing the file. If ``filename`` ends
with .csv or .tsv and ``sep`` is None, the separator will be
determined automatically by the extension, but you can pass a value
here to override it.
"""
# check outdir
check_outdir(filename)
# resolve sep
sep = self._resolve_sep(filename, sep)
# base filename will be useful
base_fname, ext = os.path.splitext(filename)
# write pixelmap if we have one
if self.pixelmap is not None:
pixelmap_fname = '%s_map.bed' % base_fname
extra_column_names = \
list(set(self.pixelmap[list(self.pixelmap.keys())[0]][0].keys())
- {'chrom', 'start', 'end', 'name', 'region', 'number'})
write_primermap(self.pixelmap, pixelmap_fname, extra_column_names)
# write repinfo if we have one
if self.repinfo is not None:
repinfo_fname = '%s_repinfo%s' % (base_fname, ext)
self.repinfo.to_csv(repinfo_fname, sep=sep)
# write the main dataframe
self.df.to_csv(filename, sep=sep)
[docs] @classmethod
def load(cls, filename, sep=None):
"""
Loads a Dataset from disk.
Parameters
----------
filename : str
The .csv or .tsv file to load the Dataset from. If a pixelmap or
repinfo file is found next to this file, these files will also be
loaded into the Dataset.
sep : str, optional
The separator to use when parsing the .csv/.tsv. Pass None to deduce
this automatically from the file extension.
Returns
-------
Dataset
The loaded Dataset.
"""
# resolve sep
sep = cls._resolve_sep(filename, sep)
# base filename will be useful
base_fname, ext = os.path.splitext(filename)
# load core dataframe
df = pd.read_csv(filename, sep=sep, header=[0, 1], index_col=0)
df.rename(columns=lambda x: '' if 'Unnamed' in x else x, inplace=True)
# these are the names supplementary files should have if they exist
pixelmap_fname = '%s_map.bed' % base_fname
repinfo_fname = '%s_repinfo%s' % (base_fname, ext)
# load pixelmap if it exists
pixelmap = None
if os.path.exists(pixelmap_fname):
pixelmap = load_primermap(pixelmap_fname)
# load repinfo if it exists
repinfo = None
if os.path.exists(repinfo_fname):
repinfo = pd.read_csv(repinfo_fname, sep=sep, index_col=0)
repinfo.index = repinfo.index.map(str)
# return the loaded instance
return cls(df, pixelmap=pixelmap, repinfo=repinfo)
[docs] @classmethod
def from_table_file(cls, table_file, name='counts', sep=None, pixelmap=None,
repinfo=None):
"""
Creates a Dataset from a table file.
The first column of the table file should be a FFLJ ID.
The remaining columns should be count values for each replicate.
The first row should specify the replicate names for each column.
Parameters
----------
table_file : str
The table file to read counts from.
name : str
Top-level column name for the data.
sep : str
The separator to use when parsing the table file.'\t' for tsv
tables, ',' for csv tables. Pass None to guess this from the
filename.
pixelmap : pixelmap, optional
A pixelmap to bind to the Dataset.
repinfo : repinfo-style pd.Dataframe, optional
Repinfo to bind to the Dataset.
Returns
-------
Dataset
The new Dataset.
"""
# resolve sep
sep = cls._resolve_sep(table_file, sep)
# read csv
df = pd.read_csv(table_file, sep=sep, index_col=0)
# make column hierarchy
df.columns = pd.MultiIndex.from_arrays(
[[name] * len(df.columns), df.columns])
# return new Dataset
return cls(df, pixelmap=pixelmap, repinfo=repinfo)
[docs] @classmethod
def from_counts_superdict(cls, counts_superdict, pixelmap, name='counts',
repinfo=None, rep_order=None):
"""
Creates a Datset from a counts_superdict and associated pixelmap.
Parameters
----------
counts_superdict : counts_superdict
Contains the data that will be put into the Dataset.
pixelmap : pixelmap
Needed to establish the row index on the Dataset.
name : str
Top-level column name for the data.
repinfo : repinfo-style pd.Dataframe or list of str, optional
Repinfo to bind to the Dataset. Pass a list of condition names to
automatically create a repinfo object.
rep_order : list of str, optional
Pass this to guarantee the order of the columns for the replicates.
Pass None to accept a random order.
Returns
-------
Dataset
The new Dataset.
"""
# resolve rep_order
if rep_order is None:
rep_order = sorted(counts_superdict.keys())
# establish regions
region_order = list(counts_superdict[rep_order[0]].keys())
# parallel lists to be filled in by the loop
list_of_dict = []
list_of_fflj_id = []
# loop to fill in the parallel lists
for region in region_order:
for i in range(len(pixelmap[region])):
for j in range(i + 1):
list_of_fflj_id.append(
'%s_%s' % (pixelmap[region][i]['name'],
pixelmap[region][j]['name']))
list_of_dict.append(
{rep: counts_superdict[rep][region][i, j]
for rep in rep_order})
# use the parallel lists to create a dataframe with fflj_id index
df = pd.DataFrame(list_of_dict,
index=pd.Series(list_of_fflj_id, name='fflj_id'))
# make column hierarchy
df.columns = pd.MultiIndex.from_arrays(
[[name] * len(df.columns), df.columns])
# resolve repinfo
if repinfo is not None and type(repinfo[0]) == str:
repinfo = Dataset._make_repinfo(rep_order, repinfo)
# return new Dataset
return cls(df, pixelmap=pixelmap, repinfo=repinfo)
def _add_reverse_pixelmap(self):
"""
Reverses self.pixelmap, binding the result to self.reverse_pixelmap.
"""
if self.pixelmap is None:
raise ValueError('Dataset must have pixelmap bound to add reverse '
'pixelmap')
self.reverse_pixelmap = {self.pixelmap[region][i]['name']: (region, i)
for region in self.pixelmap
for i in range(len(self.pixelmap[region]))}
def _add_region_column(self):
"""
Adds a 'region' column to self.df if one doesn't exist yet.
Assumes that all interactions are cis.
"""
# check if column already exists
if 'region' in self.df.columns:
return
# check for reverse_pixelmap
if self.reverse_pixelmap is None:
raise ValueError('Dataset must have reverse_pixelmap bound to add '
'region column')
self.df['region'] = [
self.reverse_pixelmap[self._split_index(fflj_id)[0]][0]
for fflj_id in self.df.index]
def _distance(self, fflj_id):
"""
Returns the mid-to-mid distance of an interaction given its FFLJ ID.
Parameters
----------
fflj_id : str
The FFLJ ID of the interaction.
Returns
-------
int
Its mid-to-mid interaction distance in units of base pairs.
"""
# check for reverse_pixelmap
if self.reverse_pixelmap is None:
raise ValueError('Dataset must have reverse_pixelmap bound to '
'compute interaction distances')
left_name, right_name = self._split_index(fflj_id)
left_region, left_index = self.reverse_pixelmap[left_name]
right_region, right_index = self.reverse_pixelmap[right_name]
return get_mid_to_mid_distance(self.pixelmap[left_region][left_index],
self.pixelmap[right_region][right_index])
def _add_distance_column(self):
"""
Adds a 'distance' (in units of bins) column to self.df if one doesn't
exist yet.
Assumes that all interactions are cis.
"""
# check if column already exists
if 'distance' in self.df.columns:
return
self.df['distance'] = [self._distance(fflj_id)
for fflj_id in self.df.index]
[docs] def add_column_from_counts(self, counts, name):
"""
Adds a new column to this Dataset's df.
The counts dict passed is assumed to match the pixelmap bound on this
Dataset. If no pixelmap is bound, an ValueError will be raised.
Parameters
----------
counts : dict of np.ndarray
Should contain the values that will make up the new column.
name : str
The name of the new column.
"""
# check for reverse_pixelmap
if self.reverse_pixelmap is None:
raise ValueError('Dataset must have reverse_pixelmap bound to '
'add column from counts')
# deduce dtype
dtype = counts[list(counts.keys())[0]].dtype
# define inner function
def inner_fn(index):
# split index
left_name, right_name = self._split_index(index)
# look up names in the reverse pixelmap
region, left_index = self.reverse_pixelmap[left_name]
other_region, right_index = self.reverse_pixelmap[right_name]
# skip trans contacts
if region != other_region:
return null_value(dtype)
return counts[region][left_index, right_index]
self.df[name] = np.array([inner_fn(i) for i in self.df.index],
dtype=dtype)
[docs] def add_columns_from_counts_superdict(self, counts_superdict, name,
rep_order=None):
"""
Adds a new group of columns to the Dataset from a counts superdict
structure.
Parameters
----------
counts_superdict : dict of dict of np.ndarray
The outer keys are replicate names as strings, the inner keys are
region names as strings, and the values are square, symmetric arrays
of values for each replicate and region.
name : str
The name to use for the new group of columns.
rep_order : list of str, optional
Pass a list of replicate names to load the listed replicates in a
specific order. Pass None to use the random order of the outer keys
of ``counts_superdict``.
"""
# resolve rep_order
if rep_order is None:
rep_order = list(counts_superdict.keys())
for rep in rep_order:
self.add_column_from_counts(counts_superdict[rep], (name, rep))
[docs] def select(self, name='counts', rep=None, region=None):
"""
Get a subset of this Dataset's DataFrame corresponding to a desired
column, replicate, and/or region.
Parameters
----------
name : str
The column name of a hierarchical or non-hierarchical column.
rep : str, optional
If ``name`` refers to a hierarchical column, you must specify which
replicate you want to select data from by passing its name here.
region : str, optional
To select data from only one region, pass its name here. Pass None
to select data from all regions.
"""
idx = self.df[self.df['region'] == region].index if region\
else self.df.index
cols = (name, rep) if rep else name
return self.df.loc[idx, cols]
[docs] def counts(self, name='counts', rep=None, region=None, fill_value=None,
dtype=None):
"""
Converts this Dataset to a regional_counts matrix, a counts dict, a
counts_superdict, or a regional_counts_superdict.
Parameters
----------
name : str
The top-level column name to extract.
rep : str, optional
If name corresponds to a hierarchical column, pass a rep name to
extract only one rep (return type will be a counts dict). Pass None
to return a counts_superdict with all reps. If name corresponds to
a normal column, this kwarg will be ignored.
region : str, optional
Pass a region name as a string to extract data for only one region.
If name corresponds to a hierarchical column and rep was not passed,
the return type will be a regional_counts_superdict. Otherwise, the
return type will be a regional_counts matrix. Pass None to extract
data for all regions.
fill_value : any, optional
The fill value for the counts_superdict (for entries not present in
the Dataset). Pass None to use np.nan.
dtype : dtype, optional
The dtype to use for the np.array's in the counts_superdict. Pass
None to guess them from the Dataset. If the data being extracted is
strings, 'U25' will be assumed.
Returns
-------
regional_counts matrix, counts dict, counts_superdict, or
regional_counts_superdict
The data requested. See Parameters for explanation of return type.
The general philosophy is that a counts_superdict will be returned,
but any single-key levels will be squeezed.
"""
# check for reverse_pixelmap
if self.reverse_pixelmap is None:
raise ValueError('Dataset must have reverse_pixelmap bound to '
'create counts_superdict')
# check if name is hierachical
is_hierarchical = hasattr(self.df[name], 'columns')
# short-circuit: function calls itself repeatedly over reps
if is_hierarchical and rep is None:
return {rep: self.counts(name=name, rep=rep, region=region,
fill_value=fill_value, dtype=dtype)
for rep in self.df[name].columns}
# establish default regions
region_order = list(self.pixelmap.keys())
# honor region kwarg
if region is not None:
region_order = [region]
# grab the relevant slice
if is_hierarchical:
if region is None:
df_slice = self.df[name, rep]
else:
df_slice = self.df[self.df['region'] == region][name, rep]
else:
if region is None:
df_slice = self.df[name]
else:
df_slice = self.df[self.df['region'] == region][name]
# resolve dtype
if dtype is None:
dtype = df_slice.values.dtype
# TODO: understand if this works in py3
if dtype == str:
dtype = 'U25'
# resolve fill_value
if fill_value is None:
fill_value = np.nan
# TODO: understand if this works in py3
if any(c in str(dtype) for c in ['U', 'S']):
fill_value = ''
if dtype == int:
fill_value = 0
if dtype == bool:
fill_value = False
# set up counts dict
counts = {r: np.tile(np.array(fill_value, dtype=dtype),
[len(self.pixelmap[r]),
len(self.pixelmap[r])])
for r in region_order}
# fill counts dict
for fflj_id, value in df_slice.iteritems():
# resolve left and right fragment names
left_name, right_name = self._split_index(fflj_id)
# look up names in the reverse pixelmap
left_region, left_index = self.reverse_pixelmap[left_name]
right_region, right_index = self.reverse_pixelmap[right_name]
# skip trans contacts
if left_region != right_region:
continue
# fill value
counts[left_region][left_index, right_index] = value
counts[left_region][right_index, left_index] = value
# return just the counts for the requested region as a matrix if the
# region kwarg passed; all other return type possibilities are handled
# by the short-circuit above
if region is not None:
return counts[region]
return counts
@staticmethod
def _split_index(index):
"""
Splits a row index value (FFLJ ID) into the individual fragment names.
Currently this assumes the separator is '_' and that each fragment name
has an equal number of '_'s, but this function should be used to do the
splitting in case this changes in the future.
Parameters
----------
index : str
The row index value (FFLJ ID) to split.
Returns
-------
(str, str)
The names of the interacting fragments.
"""
pieces = index.split('_')
halfway = int(len(pieces)/2)
return ('_'.join(pieces[:halfway]),
'_'.join(pieces[halfway:]))
@staticmethod
def _resolve_sep(filename='', sep=None):
"""
Utility method to resolve tablular file separator.
Parameters
----------
filename : str
The filename.
sep : str, optional
The separator if one was specified, None otherwise.
Returns
-------
str
The resolved separator.
"""
if sep is not None:
return sep
if filename.endswith('tsv'):
return '\t'
if filename.endswith('csv'):
return ','
return '\t'
@staticmethod
def _make_repinfo(reps, conditions):
"""
Create a repinfo object given a list of rep names and condition names.
Parameters
----------
reps : list of str
The replicate names as strings.
conditions : list of str
The condition names as strings.
Returns
-------
pd.DataFrame
The repinfo object.
"""
condition_map = infer_level_mapping(reps, conditions)
conditions = list(map(condition_map.__getitem__, reps))
return pd.DataFrame(zip(reps, conditions),
columns=['replicate', 'condition']) \
.set_index('replicate')
[docs] def apply_per_region(self, fn, inputs, outputs, initial_values=0.0,
**kwargs):
"""
Apply a function over the Dataset on a per-region basis.
Parameters
----------
fn : Callable
The function to apply. It should take in pd.Series's or
pd.DataFrames as its args, in the same order as inputs, and it
should return 1D vectors, in the same order as outputs.
inputs : list of (str or tuple of str)
The list of columns to pass as inputs to fn. Use a tuple of strings
to access hierarchical columns. Omit the secound level of a
hierarchical column to pass all replicates to fn as a single
pd.DataFrame. A single string or tuple will be wrapped in a list
automatically.
outputs : list of (str or tuple of str)
Names of output columns to be added to the Dataset. Use a tuple of
strings to create hierarchical columns.
initial_values : list of any
The values with which the new columns will be temporarily
initialized. This should control the dtype of the new columns.
"""
# promote all singleton args to list
if type(inputs) != list:
inputs = [inputs]
if type(outputs) != list:
outputs = [outputs]
if type(initial_values) != list:
initial_values = [initial_values] * len(outputs)
# initialize output columns
for i in range(len(outputs)):
self.df[outputs[i]] = initial_values[i]
# apply per region
for region in self.pixelmap:
# inject region kwarg if fn accepts it
k = dict(kwargs)
if 'region' in inspect.getargspec(fn)[0]:
k['region'] = region
# call fn
results = fn(*[self.df.loc[self.df.region == region, inputs[i]]
for i in range(len(inputs))], **k)
if type(results) not in [tuple, list]:
results = [results]
for i in range(len(outputs)):
self.df.loc[self.df.region == region, outputs[i]] = results[i]
[docs] def apply_per_replicate(self, fn, inputs, outputs, **kwargs):
"""
Applies a function over the Dataset on a per-replicate basis.
Parameters
----------
fn : Callable
The function to apply. It should take in pd.Series's as its args, in
the same order as inputs, and it should return 1D vectors, in the
same order as outputs.
inputs : list of (str or tuple of str)
The list of columns to pass as inputs to fn. Use a tuple of strings
to access hierarchical columns. At least one input must refer to the
top level of a hierarchical column, the first such column
encountered will be used to determine the replicates to apply over.
Non-hierarchical columns, or hierarchical columns fully specified by
a tuple of strings will be broadcast across all replicates.
outputs : list of str
Names of top-level output columns to be added to the Dataset. The
second level will be automatically filled in with the replicate
names.
"""
# promote all singleton args to list
if type(inputs) != list:
inputs = [inputs]
if type(outputs) != list:
outputs = [outputs]
# identify reps (sub-columns of first hierarchical column)
reps = None
for i in range(len(inputs)):
if hasattr(self.df[inputs[i]], 'columns'):
reps = self.df[inputs[i]].columns
break
if reps is None:
raise ValueError('none of the input columns are hierarchical')
# apply per replicate
for rep in reps:
# inject rep kwarg if fn accepts it
k = dict(kwargs)
if 'rep' in inspect.getargspec(fn)[0]:
k['rep'] = rep
# call fn
results = fn(*[self.df.loc[:, (inputs[i], rep)]
if hasattr(self.df[inputs[i]], 'columns')
else self.df.loc[:, inputs[i]]
for i in range(len(inputs))], **k)
if type(results) not in [tuple, list]:
results = [results]
for i in range(len(outputs)):
self.df.loc[:, (outputs[i], rep)] = results[i]
[docs] def apply_per_replicate_per_region(self, fn, inputs, outputs,
initial_values=0.0, **kwargs):
"""
Applies a function over the Dataset on a per-replicate, per-region
basis.
Parameters
----------
fn : Callable
The function to apply. It should take in pd.Series's as its args, in
the same order as inputs, and it should return 1D vectors, in the
same order as outputs.
inputs : list of (str or tuple of str)
The list of columns to pass as inputs to fn. Use a tuple of strings
to access hierarchical columns. At least one input must refer to the
top level of a hierarchical column, the first such column
encountered will be used to determine the replicates to apply over.
Non-hierarchical columns, or hierarchical columns fully specified by
a tuple of strings will be broadcast across all replicates.
outputs : list of str
Names of top-level output columns to be added to the Dataset. The
second level will be automatically filled in with the replicate
names.
initial_values : list of any
The values with which the new columns will be temporarily
initialized. This should control the dtype of the new columns.
"""
# promote inputs and outputs to list
if type(inputs) != list:
inputs = [inputs]
if type(outputs) != list:
outputs = [outputs]
# identify reps and hierarchical columns
reps = None
for i in range(len(inputs)):
if hasattr(self.df[inputs[i]], 'columns'):
reps = self.df[inputs[i]].columns
break
if reps is None:
raise ValueError('none of the input columns are hierarchical')
# loop over reps
for rep in reps:
# inject rep kwarg if fn accepts it
k = dict(kwargs)
if 'rep' in inspect.getargspec(fn)[0]:
k['rep'] = rep
# apply per region
self.apply_per_region(
fn,
[(inputs[i], rep)
if hasattr(self.df[inputs[i]], 'columns')
else inputs[i]
for i in range(len(inputs))],
[(outputs[i], rep)
for i in range(len(outputs))],
initial_values=initial_values,
**k)
[docs] def apply_across_replicates(self, fn, inputs, outputs, **kwargs):
"""
Applies a matrix-to-matrix function over the Dataset.
This is useful for functions that don't operate independently on each
replicate of the Dataset.
The main advantage of this function is that it handles the unboxing of
the replicates after a matrix-to-matrix function is applied. If you are
looking to apply a matrix-to-vector function over the Dataset, you can
do it with a one-liner, assigning the vector result(s) to the new
column(s) immediately.
Parameters
----------
fn : Callable
The function to apply. It should take in np.ndarrays as its inputs
and return np.ndarrays with the same size and shape. If some inputs
are specified as individual columns, they will be passed to fn as
np.ndarrays shaped as column vectors.
inputs : list of (str or tuple of str)
The list of columns to pass as inputs to fn. Use a tuple of strings
to access hierarchical columns. At least one input must refer to the
top level of a hierarchical column, the first such column
encountered will be used to determine the replicates to apply over.
Non-hierarchical columns, or hierarchical columns fully specified by
a tuple of strings will be passed to fn as column vectors.
outputs : list of str
Names of top-level output columns to be added to the Dataset. The
second level will be automatically filled in with the replicate
names.
"""
# promote all singleton args to list
if type(inputs) != list:
inputs = [inputs]
if type(outputs) != list:
outputs = [outputs]
# identify reps (sub-columns of first hierarchical column)
reps = None
for i in range(len(inputs)):
if hasattr(self.df[inputs[i]], 'columns'):
reps = self.df[inputs[i]].columns
break
if reps is None:
raise ValueError('none of the input columns are hierarchical')
# apply fn
results = fn(*[self.df.loc[:, inputs[i]].values
if hasattr(self.df[inputs[i]], 'columns')
else self.df.loc[:, inputs[i]].values[:, np.newaxis]
for i in range(len(inputs))], **kwargs)
# promote results to list
if type(results) not in [tuple, list]:
results = [results]
# unbox results
for i in range(len(outputs)):
for j in range(len(reps)):
self.df.loc[:, (outputs[i], reps[j])] = results[i][:, j]
[docs] def apply_across_replicates_per_region(self, fn, inputs, outputs,
initial_values=0.0, **kwargs):
"""
Applies a matrix-to-matrix function over the Dataset in a per-region
manner.
This is useful for functions that don't operate independently on each
replicate of the Dataset, but which operate independently on each region
of the Dataset.
The main advantage of this function is that it handles the unboxing of
the replicates after a matrix-to-matrix function is applied. If you are
looking to apply a matrix-to-vector function over the Dataset in a
per-region manner, you can do it with apply_per_region(), feeding a
hierarchical column as an input.
Parameters
----------
fn : Callable
The function to apply. It should take in np.ndarrays as its inputs
and return np.ndarrays with the same size and shape. If some inputs
are specified as individual columns, they will be passed to fn as
np.ndarrays shaped as column vectors.
inputs : list of (str or tuple of str)
The list of columns to pass as inputs to fn. Use a tuple of strings
to access hierarchical columns. At least one input must refer to the
top level of a hierarchical column, the first such column
encountered will be used to determine the replicates to apply over.
Non-hierarchical columns, or hierarchical columns fully specified by
a tuple of strings will be passed to fn as column vectors.
outputs : list of str
Names of top-level output columns to be added to the Dataset. The
second level will be automatically filled in with the replicate
names.
initial_values : list of any
The values with which the new columns will be temporarily
initialized. This should control the dtype of the new columns.
"""
# promote all singleton args to list
if type(inputs) != list:
inputs = [inputs]
if type(outputs) != list:
outputs = [outputs]
if type(initial_values) != list:
initial_values = [initial_values] * len(outputs)
# identify reps (sub-columns of first hierarchical column)
reps = None
for i in range(len(inputs)):
if hasattr(self.df[inputs[i]], 'columns'):
reps = self.df[inputs[i]].columns
break
if reps is None:
raise ValueError('none of the input columns are hierarchical')
# initialize output columns
for i in range(len(outputs)):
for j in range(len(reps)):
self.df[outputs[i], reps[j]] = initial_values[i]
# apply per region
for region in self.pixelmap:
# inject region kwarg if fn accepts it
k = dict(kwargs)
if 'region' in inspect.getargspec(fn)[0]:
k['region'] = region
# call fn
results = fn(*[
self.df.loc[self.df.region == region, inputs[i]].values
if hasattr(self.df[inputs[i]], 'columns')
else (self.df.loc[self.df.region == region, inputs[i]]
.values[:, np.newaxis])
for i in range(len(inputs))], **k)
if type(results) not in [tuple, list]:
results = [results]
for i in range(len(outputs)):
for j in range(len(reps)):
self.df.loc[self.df.region == region,
(outputs[i], reps[j])] = results[i][:, j]
[docs] def dropna(self, name='counts', reps=None):
"""
Drops NA's from the underlying dataframe.
Parameters
----------
name : str
The name of the column to decide to drop based on.
reps : list of str, optional
If name refers to a hierarchial column, pass a list of rep names to
only drop based on these reps. Pass None to drop based on the
presence of an NA in any rep. If name does not refer to a
hierarchical column this kwarg is ignored.
"""
# check for hierarchical column
if hasattr(self.df[name], 'columns'):
# resolve reps for hierarchical column
if reps is None:
reps = self.df[name].columns
# generate subset
subset = [(name, rep) for rep in reps]
else:
subset = [name]
# drop
self.df.dropna(subset=subset, inplace=True)