Source code for scprep.select

from . import utils
from scipy import sparse

import numbers
import numpy as np
import pandas as pd
import re
import sys
import warnings

# ``re.Pattern`` is only available as a public name from Python 3.7 onwards;
# on older interpreters the compiled-pattern type is recovered from an
# instance. The structured ``sys.version_info`` tuple is compared (rather than
# parsing the free-form ``sys.version`` string) so that the major version is
# taken into account as well as the minor one.
if sys.version_info < (3, 7):
    _re_pattern = type(re.compile(""))
else:
    _re_pattern = re.Pattern


def _is_1d(data):
    """Return True if `data` is one-dimensional.

    Objects without a ``shape`` attribute (e.g. plain lists) are treated
    as one-dimensional.
    """
    try:
        n_dims = len(data.shape)
    except AttributeError:
        # no `shape` to inspect: assume a flat sequence
        return True
    return n_dims == 1


def _check_idx_1d(idx, silent=False):
    """Check that `idx` is effectively one-dimensional.

    `idx` passes if `_is_1d` accepts it, or if the product of its shape
    equals its largest dimension (i.e. at most one axis is longer than 1).

    Parameters
    ----------
    idx : array-like
        Index object to check
    silent : bool, optional (default: False)
        If True, return False on failure instead of raising

    Returns
    -------
    is_1d : bool

    Raises
    ------
    ValueError : if `idx` is not 1D and `silent` is False
    """
    if _is_1d(idx) or np.prod(idx.shape) == np.max(idx.shape):
        return True
    if silent:
        return False
    raise ValueError("Expected idx to be 1D. Got shape {}".format(idx.shape))


def _get_columns(data):
    """Return the column labels of a DataFrame, or the index of anything else."""
    if isinstance(data, pd.DataFrame):
        return data.columns
    return data.index


def _get_column_length(data):
    """Return the number of columns in `data`.

    Falls back to ``len(data)`` when `data` has no ``shape`` or its shape
    has no second dimension.
    """
    try:
        n_columns = data.shape[1]
    except (IndexError, AttributeError):
        n_columns = len(data)
    return n_columns


def _get_row_length(data):
    """Return the number of rows in `data`.

    Falls back to ``len(data)`` when `data` has no ``shape`` or its shape
    is empty.
    """
    try:
        n_rows = data.shape[0]
    except (IndexError, AttributeError):
        n_rows = len(data)
    return n_rows


def _check_columns_compatible(*data):
    """Check that every input has the same columns as the first one.

    All inputs must have the same number of columns; where both the first
    input and another input are pandas objects, their column labels must
    match as well.

    Raises
    ------
    ValueError : if column counts or pandas column names differ
    """
    pandas_types = (pd.DataFrame, pd.Series)
    for other in data:
        if _get_column_length(other) != _get_column_length(data[0]):
            raise ValueError(
                "Expected `data` and `extra_data` to have the same number of "
                "columns. Got {}".format([_get_column_length(d) for d in data])
            )
        both_pandas = isinstance(other, pandas_types) and isinstance(
            data[0], pandas_types
        )
        if both_pandas and not np.all(_get_columns(data[0]) == _get_columns(other)):
            raise ValueError(
                "Expected `data` and `extra_data` pandas inputs to have "
                "the same column names. Fix with "
                "`scprep.select.select_cols(*extra_data, idx=data.columns)`"
            )


def _check_rows_compatible(*data):
    """Check that every input has the same rows as the first one.

    All inputs must have the same number of rows; where both the first
    input and another input are pandas objects, their indices must match
    as well.

    Raises
    ------
    ValueError : if row counts or pandas indices differ
    """
    for d in data:
        if not _get_row_length(d) == _get_row_length(data[0]):
            # Report lengths with the same helper used for the comparison so
            # that inputs without a `shape` (e.g. lists) still produce the
            # intended ValueError rather than an AttributeError.
            raise ValueError(
                "Expected `data` and `extra_data` to have the same number of "
                "rows. Got {}".format([_get_row_length(d) for d in data])
            )
        if isinstance(d, (pd.DataFrame, pd.Series)) and isinstance(
            data[0], (pd.DataFrame, pd.Series)
        ):
            if not np.all(data[0].index == d.index):
                raise ValueError(
                    "Expected `data` and `extra_data` pandas inputs to have "
                    "the same index. Fix with "
                    "`scprep.select.select_rows(*extra_data, idx=data.index)`"
                )


def _convert_dataframe_1d(idx, silent=False):
    """Collapse an effectively-1D DataFrame to a single column or row.

    If `idx` passes `_check_idx_1d`, its only column is returned when it
    has exactly one column, otherwise its first row. If the check fails
    without raising (``silent=True``), `idx` is returned unchanged.
    """
    if not _check_idx_1d(idx, silent=silent):
        return idx
    if idx.shape[1] == 1:
        return idx.iloc[:, 0]
    return idx.iloc[0, :]


def _string_vector_match(data, match, fun, dtype=str):
    """Get a boolean match array from a vector.

    Parameters
    ----------
    data : list-like
        Vector to be matched against
    match : `dtype` or list-like
        Match criteria. If list-like, an element of `data` matches when it
        matches any of the criteria.
    fun : callable(x, match)
        Function that returns True if `match` matches `x`
    dtype : type, optional (default: str)
        Expected type(match) (if not list-like)

    Returns
    -------
    data_match : list-like, dtype=bool
    """
    if isinstance(match, dtype):
        # Declare the output type explicitly: without `otypes`, np.vectorize
        # must call `fun` once to infer it and raises ValueError on empty
        # input.
        vectorized = np.vectorize(fun, otypes=[bool])
        return vectorized(data, match)
    # list-like criteria: evaluate each one and OR the results element-wise
    return np.any(
        [_string_vector_match(data, m, fun, dtype=dtype) for m in match], axis=0
    )


def _exact_word_regex(word):
    """Build a regex that matches `word` only as a delimited word.

    The (escaped) word must be both preceded and followed by one or more
    delimiters: parentheses, square brackets, period, comma, ``!``, ``?``,
    a space, or the start / end of the string.
    """
    delimiters = ("\\(", "\\)", "\\[", "\\]", "\\.", ",", "!", "\\?", " ", "^", "$")
    boundary = "(" + "|".join(delimiters) + ")+"
    return boundary + re.escape(word) + boundary


def _get_string_subset_mask(
    data, starts_with=None, ends_with=None, exact_word=None, regex=None
):
    """Get a boolean mask selecting a subset of a string array.

    Every condition that is not None must hold for a name to be selected.
    Where a condition is list-like, a name satisfies it if it matches any
    of the listed values.

    Parameters
    ----------
    data : array-like
        Names to be matched
    starts_with : str, list-like or None, optional (default: None)
        If not None, only select names that start with this prefix.
    ends_with : str, list-like or None, optional (default: None)
        If not None, only select names that end with this suffix.
    exact_word : str, list-like or None, optional (default: None)
        If not None, only select names that contain this exact word.
    regex : str, list-like or None, optional (default: None)
        If not None, only select names that match this regular expression.

    Returns
    -------
    mask : array-like, dtype=bool
        True where the corresponding entry of `data` satisfies all conditions
    """
    selected = np.full_like(data, True, dtype=bool)

    if starts_with is not None:
        has_prefix = _string_vector_match(
            data, starts_with, lambda name, prefix: name.startswith(prefix)
        )
        selected = np.logical_and(selected, has_prefix)

    if ends_with is not None:
        has_suffix = _string_vector_match(
            data, ends_with, lambda name, suffix: name.endswith(suffix)
        )
        selected = np.logical_and(selected, has_suffix)

    if exact_word is not None:
        # exact-word matching is delegated to the regex branch
        if isinstance(exact_word, str):
            word_patterns = _exact_word_regex(exact_word)
        else:
            word_patterns = [_exact_word_regex(word) for word in exact_word]
        has_word = _get_string_subset_mask(data, regex=word_patterns)
        selected = np.logical_and(selected, has_word)

    if regex is not None:
        if isinstance(regex, str):
            compiled = re.compile(regex)
        else:
            compiled = [re.compile(pattern) for pattern in regex]
        matches_regex = _string_vector_match(
            data,
            compiled,
            lambda name, pattern: bool(pattern.search(name)),
            dtype=_re_pattern,
        )
        selected = np.logical_and(selected, matches_regex)

    return selected


def _get_string_subset(
    data, starts_with=None, ends_with=None, exact_word=None, regex=None
):
    """Get the entries of a string array that satisfy all given conditions.

    Parameters
    ----------
    data : array-like
        Names to be matched
    starts_with : str, list-like or None, optional (default: None)
        If not None, only return names that start with this prefix.
    ends_with : str, list-like or None, optional (default: None)
        If not None, only return names that end with this suffix.
    exact_word : str, list-like or None, optional (default: None)
        If not None, only return names that contain this exact word.
    regex : str, list-like or None, optional (default: None)
        If not None, only return names that match this regular expression.

    Returns
    -------
    data : list-like
        Matching strings, after conversion of `data` with `utils.toarray`
    """
    names = utils.toarray(data)
    conditions = dict(
        starts_with=starts_with,
        ends_with=ends_with,
        exact_word=exact_word,
        regex=regex,
    )
    keep = _get_string_subset_mask(names, **conditions)
    return names[keep]


def get_gene_set(data, starts_with=None, ends_with=None, exact_word=None, regex=None):
    """Get a list of genes from data.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features] or [n_features]
        Input pd.DataFrame, or list of gene names
    starts_with : str, list-like or None, optional (default: None)
        If not None, only return gene names that start with this prefix.
    ends_with : str, list-like or None, optional (default: None)
        If not None, only return gene names that end with this suffix.
    exact_word : str, list-like or None, optional (default: None)
        If not None, only return gene names that contain this exact word.
    regex : str, list-like or None, optional (default: None)
        If not None, only return gene names that match this regular expression.

    Returns
    -------
    genes : list-like, shape<=[n_features]
        List of matching genes

    Raises
    ------
    TypeError : if `data` is not 1D and has no `columns` attribute
    UserWarning : if no selection conditions are given
    """
    if not _is_1d(data):
        # 2D input: gene names are taken from the column labels
        try:
            data = data.columns.to_numpy()
        except AttributeError:
            raise TypeError(
                "data must be a list of gene names or a pandas "
                "DataFrame. Got {}".format(type(data).__name__)
            )
    conditions = (starts_with, ends_with, regex, exact_word)
    if all(condition is None for condition in conditions):
        warnings.warn(
            "No selection conditions provided. Returning all genes.", UserWarning
        )
    return _get_string_subset(
        data,
        starts_with=starts_with,
        ends_with=ends_with,
        exact_word=exact_word,
        regex=regex,
    )
def get_cell_set(data, starts_with=None, ends_with=None, exact_word=None, regex=None):
    """Get a list of cells from data.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features] or [n_samples]
        Input pd.DataFrame, or list of cell names
    starts_with : str, list-like or None, optional (default: None)
        If not None, only return cell names that start with this prefix.
    ends_with : str, list-like or None, optional (default: None)
        If not None, only return cell names that end with this suffix.
    exact_word : str, list-like or None, optional (default: None)
        If not None, only return cell names that contain this exact word.
    regex : str, list-like or None, optional (default: None)
        If not None, only return cell names that match this regular expression.

    Returns
    -------
    cells : list-like, shape<=[n_samples]
        List of matching cells

    Raises
    ------
    TypeError : if `data` is not 1D and has no `index` attribute
    UserWarning : if no selection conditions are given
    """
    if not _is_1d(data):
        # 2D input: cell names are taken from the row index
        try:
            data = data.index.to_numpy()
        except AttributeError:
            raise TypeError(
                "data must be a list of cell names or a pandas "
                "DataFrame. Got {}".format(type(data).__name__)
            )
    conditions = (starts_with, ends_with, regex, exact_word)
    if all(condition is None for condition in conditions):
        warnings.warn(
            "No selection conditions provided. Returning all cells.", UserWarning
        )
    return _get_string_subset(
        data,
        starts_with=starts_with,
        ends_with=ends_with,
        exact_word=exact_word,
        regex=regex,
    )
def select_cols(
    data,
    *extra_data,
    idx=None,
    starts_with=None,
    ends_with=None,
    exact_word=None,
    regex=None,
):
    """Select columns from a data matrix.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[any, n_features], optional
        Optional additional data objects from which to select the same columns
    idx : list-like, shape=[m_features]
        Integer indices or string column names to be selected
    starts_with : str, list-like or None, optional (default: None)
        If not None, select columns that start with this prefix.
    ends_with : str, list-like or None, optional (default: None)
        If not None, select columns that end with this suffix.
    exact_word : str, list-like or None, optional (default: None)
        If not None, select columns that contain this exact word.
    regex : str, list-like or None, optional (default: None)
        If not None, select columns that match this regular expression.

    Returns
    -------
    data : array-like, shape=[n_samples, m_features]
        Subsetted output data.
    extra_data : array-like, shape=[any, m_features]
        Subsetted extra data, if passed.

    Examples
    --------
    data_subset = scprep.select.select_cols(
        data,
        idx=np.random.choice([True, False], data.shape[1])
    )
    data_subset, metadata_subset = scprep.select.select_cols(
        data, metadata, starts_with="MT"
    )

    Raises
    ------
    ValueError : if name-based selection is requested on non-DataFrame input
    UserWarning : if no columns are selected, or no conditions are given
    """
    if extra_data:
        _check_columns_compatible(data, *extra_data)

    conditions = (idx, starts_with, ends_with, exact_word, regex)
    if all(condition is None for condition in conditions):
        warnings.warn(
            "No selection conditions provided. Returning all columns.", UserWarning
        )
        if extra_data:
            return tuple([data] + list(extra_data))
        return data

    if idx is None:
        # name-based selection: resolve the conditions to column names
        if not isinstance(data, pd.DataFrame):
            raise ValueError(
                "Can only select based on column names with DataFrame input. "
                "Please set `idx` to select specific columns."
            )
        idx = get_gene_set(
            data,
            starts_with=starts_with,
            ends_with=ends_with,
            exact_word=exact_word,
            regex=regex,
        )

    # normalise idx to a scalar, a pandas Series or a flat array
    if isinstance(idx, pd.DataFrame):
        idx = _convert_dataframe_1d(idx)
    elif not isinstance(idx, (numbers.Integral, str)):
        idx = utils.toarray(idx)
        _check_idx_1d(idx)
        idx = idx.flatten()

    if utils.is_SparseDataFrame(data):
        # deprecated SparseDataFrame: convert via utils.SparseDataFrame
        data = utils.SparseDataFrame(data)

    input_1d = _is_1d(data)

    if isinstance(data, pd.DataFrame):
        try:
            if isinstance(idx, (numbers.Integral, str)):
                data = data.loc[:, idx]
            else:
                if np.issubdtype(idx.dtype, np.dtype(bool).type):
                    # temporary workaround for pandas error: send boolean
                    # masks straight to the positional fallback below
                    raise TypeError
                data = data.loc[:, idx]
        except (KeyError, TypeError):
            if isinstance(idx, str):
                raise
            positional = (
                isinstance(idx, numbers.Integral)
                or np.issubdtype(idx.dtype, np.dtype(int))
                or np.issubdtype(idx.dtype, np.dtype(bool))
            )
            if not positional:
                raise
            # label lookup failed: treat idx as positions / a mask over columns
            data = data.loc[:, np.array(data.columns)[idx]]
    elif isinstance(data, pd.Series):
        try:
            if np.issubdtype(idx.dtype, np.dtype(bool).type):
                # temporary workaround for pandas error
                raise TypeError
            data = data.loc[idx]
        except (KeyError, TypeError):
            positional = (
                isinstance(idx, numbers.Integral)
                or np.issubdtype(idx.dtype, np.dtype(int))
                or np.issubdtype(idx.dtype, np.dtype(bool))
            )
            if not positional:
                raise
            data = data.loc[np.array(data.index)[idx]]
    elif _is_1d(data):
        if isinstance(data, list):
            # can't numpy index a list
            data = np.array(data)
        data = data[idx]
    else:
        csr_convertible = (
            sparse.coo_matrix,
            sparse.bsr_matrix,
            sparse.lil_matrix,
            sparse.dia_matrix,
        )
        if isinstance(data, csr_convertible):
            data = data.tocsr()
        if isinstance(idx, pd.Series):
            idx = utils.toarray(idx)
        data = data[:, idx]

    if _get_column_length(data) == 0:
        warnings.warn("Selecting 0 columns.", UserWarning)
    elif isinstance(data, pd.DataFrame) and not input_1d:
        # convert to series if possible
        data = _convert_dataframe_1d(data, silent=True)

    if extra_data:
        # apply the resolved idx to every extra data object
        data = tuple([data] + [select_cols(d, idx=idx) for d in extra_data])
    return data
def select_rows(
    data,
    *extra_data,
    idx=None,
    starts_with=None,
    ends_with=None,
    exact_word=None,
    regex=None,
):
    """Select rows from a data matrix.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    idx : list-like, shape=[m_samples], optional (default: None)
        Integer indices or string index names to be selected
    starts_with : str, list-like or None, optional (default: None)
        If not None, select rows that start with this prefix.
    ends_with : str, list-like or None, optional (default: None)
        If not None, select rows that end with this suffix.
    exact_word : str, list-like or None, optional (default: None)
        If not None, select rows that contain this exact word.
    regex : str, list-like or None, optional (default: None)
        If not None, select rows that match this regular expression.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Subsetted output data
    extra_data : array-like, shape=[m_samples, any]
        Subsetted extra data, if passed.

    Examples
    --------
    data_subset = scprep.select.select_rows(
        data,
        idx=np.random.choice([True, False], data.shape[0])
    )
    data_subset, labels_subset = scprep.select.select_rows(
        data, labels, ends_with="batch1"
    )

    Raises
    ------
    ValueError : if name-based selection is requested on non-DataFrame input
    UserWarning : if no rows are selected, or no conditions are given
    """
    if extra_data:
        _check_rows_compatible(data, *extra_data)

    conditions = (idx, starts_with, ends_with, exact_word, regex)
    if all(condition is None for condition in conditions):
        warnings.warn(
            "No selection conditions provided. Returning all rows.", UserWarning
        )
        if extra_data:
            return tuple([data] + list(extra_data))
        return data

    if idx is None:
        # name-based selection: resolve the conditions to row names
        if not isinstance(data, pd.DataFrame):
            raise ValueError(
                "Can only select based on row names with DataFrame input. "
                "Please set `idx` to select specific rows."
            )
        idx = get_cell_set(
            data,
            starts_with=starts_with,
            ends_with=ends_with,
            exact_word=exact_word,
            regex=regex,
        )

    # normalise idx to a scalar, a pandas Series or a flat array
    if isinstance(idx, pd.DataFrame):
        idx = _convert_dataframe_1d(idx)
    elif not isinstance(idx, (numbers.Integral, str)):
        idx = utils.toarray(idx)
        _check_idx_1d(idx)
        idx = idx.flatten()

    if utils.is_SparseDataFrame(data):
        # deprecated SparseDataFrame: convert via utils.SparseDataFrame
        data = utils.SparseDataFrame(data)

    input_1d = _is_1d(data)

    if isinstance(data, (pd.DataFrame, pd.Series)):
        try:
            if isinstance(idx, (numbers.Integral, str)):
                data = data.loc[idx]
            else:
                if np.issubdtype(idx.dtype, np.dtype(bool).type):
                    # temporary workaround for pandas error: send boolean
                    # masks straight to the positional fallback below
                    raise TypeError
                with warnings.catch_warnings():
                    # escalate this pandas warning so it is handled by the
                    # positional fallback as well
                    warnings.filterwarnings("error", "Passing list-likes to .loc")
                    data = data.loc[idx]
        except (KeyError, TypeError, FutureWarning):
            if isinstance(idx, str):
                raise
            positional = (
                isinstance(idx, numbers.Integral)
                or np.issubdtype(idx.dtype, np.dtype(int))
                or np.issubdtype(idx.dtype, np.dtype(bool))
            )
            if not positional:
                raise
            # label lookup failed: treat idx as positions / a mask over rows
            data = data.loc[np.array(data.index)[idx]]
    elif _is_1d(data):
        if isinstance(data, list):
            # can't numpy index a list
            data = np.array(data)
        data = data[idx]
    else:
        csr_convertible = (sparse.coo_matrix, sparse.bsr_matrix, sparse.dia_matrix)
        if isinstance(data, csr_convertible):
            data = data.tocsr()
        if isinstance(idx, pd.Series):
            idx = utils.toarray(idx)
        data = data[idx, :]

    if _get_row_length(data) == 0:
        warnings.warn("Selecting 0 rows.", UserWarning)
    elif isinstance(data, pd.DataFrame) and not input_1d:
        # convert to series if possible
        data = _convert_dataframe_1d(data, silent=True)

    if extra_data:
        # apply the resolved idx to every extra data object
        data = tuple([data] + [select_rows(d, idx=idx) for d in extra_data])
    return data
def subsample(*data, n=10000, seed=None):
    """Subsample the number of points in a dataset.

    Selects a random subset of (optionally multiple) datasets.
    Helpful for plotting, or for methods with computational
    constraints.

    Note that this function reseeds numpy's global random number generator
    via ``np.random.seed(seed)``.

    Parameters
    ----------
    data : array-like, shape=[n_samples, any]
        Input data. Any number of datasets can be passed at once,
        so long as `n_samples` remains the same.
    n : int, optional (default: 10000)
        Number of samples to retain. Must be less than or equal to
        `n_samples`.
    seed : int, optional (default: None)
        Random seed

    Returns
    -------
    data : array-like, shape=[n, any]
        Subsampled data. A tuple of subsampled datasets if more than one
        dataset was passed.

    Raises
    ------
    ValueError : if `n` is greater than `n_samples`

    Examples
    --------
    data_subsample, labels_subsample = scprep.select.subsample(
        data, labels, n=1000
    )
    """
    # Use the shared helper so inputs without a `shape` (e.g. lists) are
    # measured the same way as in the row-compatibility check.
    N = _get_row_length(data[0])
    if len(data) > 1:
        _check_rows_compatible(*data)
    if N < n:
        raise ValueError("Expected n ({}) <= n_samples ({})".format(n, N))
    np.random.seed(seed)
    # boolean mask over rows: selected rows keep their original order
    select_idx = np.isin(np.arange(N), np.random.choice(N, n, replace=False))
    data = [select_rows(d, idx=select_idx) for d in data]
    return tuple(data) if len(data) > 1 else data[0]
def highly_variable_genes(
    data, *extra_data, kernel_size=0.05, smooth=5, cutoff=None, percentile=80
):
    """Select genes with high variability.

    Variability is computed as the deviation from a loess fit
    to the rolling median of the mean-variance curve

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[any, n_features], optional
        Optional additional data objects from which to select the same columns
    kernel_size : float or int, optional (default: 0.05)
        Width of rolling median window. If a float between 0 and 1, the width is given
        by kernel_size * data.shape[1]. Otherwise should be an odd integer
    smooth : int, optional (default: 5)
        Amount of smoothing to apply to the median filter
    cutoff : float, optional (default: None)
        Variability above which expression is deemed significant
    percentile : int, optional (Default: 80)
        Percentile above or below which to remove genes.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.

    Returns
    -------
    data : array-like, shape=[n_samples, m_features]
        Filtered output data, where m_features <= n_features
    extra_data : array-like, shape=[any, m_features]
        Filtered extra data, if passed.
    """
    # imported inside the function, as in the original implementation
    from . import measure

    variability = measure.gene_variability(
        data, kernel_size=kernel_size, smooth=smooth
    )
    keep_genes_idx = utils._get_filter_idx(
        variability, cutoff, percentile, keep_cells="above"
    )
    return select_cols(data, *extra_data, idx=keep_genes_idx)