Source code for scprep.filter

from . import measure
from . import select
from . import utils
from scipy import sparse

import numbers
import numpy as np
import pandas as pd
import warnings


def remove_empty_genes(data, *extra_data):  # noqa
    """Deprecated alias of `filter_empty_genes`.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged.

    Parameters
    ----------
    data : array-like
        Input data, passed straight to `filter_empty_genes`.
    extra_data : array-like, optional
        Additional data objects, passed straight to `filter_empty_genes`.

    Returns
    -------
    Whatever `filter_empty_genes(data, *extra_data)` returns.
    """
    warnings.warn(
        "`scprep.filter.remove_empty_genes` is deprecated. "
        "Use `scprep.filter.filter_empty_genes` instead.",
        DeprecationWarning,
        # Point the warning at the caller's line, not at this shim, so users
        # can locate the call they need to migrate.
        stacklevel=2,
    )
    return filter_empty_genes(data, *extra_data)


def remove_rare_genes(data, *extra_data, cutoff=0, min_cells=5):  # noqa
    """Deprecated alias of `filter_rare_genes`.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged.

    Parameters
    ----------
    data : array-like
        Input data, passed straight to `filter_rare_genes`.
    extra_data : array-like, optional
        Additional data objects, passed straight to `filter_rare_genes`.
    cutoff : float, optional (default: 0)
        Forwarded as the `cutoff` keyword.
    min_cells : int, optional (default: 5)
        Forwarded as the `min_cells` keyword.

    Returns
    -------
    Whatever `filter_rare_genes` returns for the same arguments.
    """
    warnings.warn(
        "`scprep.filter.remove_rare_genes` is deprecated. "
        "Use `scprep.filter.filter_rare_genes` instead.",
        DeprecationWarning,
        # Point the warning at the caller's line, not at this shim, so users
        # can locate the call they need to migrate.
        stacklevel=2,
    )
    return filter_rare_genes(data, *extra_data, cutoff=cutoff, min_cells=min_cells)


def remove_empty_cells(data, *extra_data, sample_labels=None):  # noqa
    """Deprecated alias of `filter_empty_cells`.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged.

    Parameters
    ----------
    data : array-like
        Input data, passed straight to `filter_empty_cells`.
    extra_data : array-like, optional
        Additional data objects, passed straight to `filter_empty_cells`.
    sample_labels : optional (default: None)
        Forwarded as the `sample_labels` keyword.

    Returns
    -------
    Whatever `filter_empty_cells` returns for the same arguments.
    """
    warnings.warn(
        "`scprep.filter.remove_empty_cells` is deprecated. "
        "Use `scprep.filter.filter_empty_cells` instead.",
        DeprecationWarning,
        # Point the warning at the caller's line, not at this shim, so users
        # can locate the call they need to migrate.
        stacklevel=2,
    )
    return filter_empty_cells(data, *extra_data, sample_labels=sample_labels)


def remove_duplicates(data, *extra_data, sample_labels=None):  # noqa
    """Deprecated alias of `filter_duplicates`.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged.

    Parameters
    ----------
    data : array-like
        Input data, passed straight to `filter_duplicates`.
    extra_data : array-like, optional
        Additional data objects, passed straight to `filter_duplicates`.
    sample_labels : optional (default: None)
        Forwarded as the `sample_labels` keyword.

    Returns
    -------
    Whatever `filter_duplicates` returns for the same arguments.
    """
    warnings.warn(
        "`scprep.filter.remove_duplicates` is deprecated. "
        "Use `scprep.filter.filter_duplicates` instead.",
        DeprecationWarning,
        # Point the warning at the caller's line, not at this shim, so users
        # can locate the call they need to migrate.
        stacklevel=2,
    )
    return filter_duplicates(data, *extra_data, sample_labels=sample_labels)


def filter_empty_genes(data, *extra_data):
    """Drop every gene whose summed counts over all cells are not positive.

    This is equivalent to `filter_rare_genes(data, cutoff=0, min_cells=1)`
    but should be faster.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[any, n_features], optional
        Optional additional data objects from which to select the same genes

    Returns
    -------
    data : array-like, shape=[n_samples, m_features]
        Filtered output data, where m_features <= n_features
    extra_data : array-like, shape=[any, m_features]
        Filtered extra data, if passed.
    """
    # Column sums may come back as a matrix-like object; flatten them to 1-D
    # so the comparison yields one boolean per gene.
    per_gene_total = np.array(utils.matrix_sum(data, axis=0)).flatten()
    return select.select_cols(data, *extra_data, idx=per_gene_total > 0)
def filter_rare_genes(data, *extra_data, cutoff=0, min_cells=5):
    """Drop genes that are expressed above `cutoff` in fewer than `min_cells` cells.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[any, n_features], optional
        Optional additional data objects from which to select the same genes
    cutoff : float, optional (default: 0)
        Number of counts above which expression is deemed non-negligible
    min_cells : int, optional (default: 5)
        Minimum number of cells above `cutoff` in order to retain a gene

    Returns
    -------
    data : array-like, shape=[n_samples, m_features]
        Filtered output data, where m_features <= n_features
    extra_data : array-like, shape=[any, m_features]
        Filtered extra data, if passed.
    """
    cells_per_gene = measure.gene_capture_count(data, cutoff=cutoff)
    frequent_enough = cells_per_gene >= min_cells
    return select.select_cols(data, *extra_data, idx=frequent_enough)
def filter_empty_cells(data, *extra_data, sample_labels=None):
    """Drop every cell whose library size is not positive.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    sample_labels : Deprecated
        If given, a ``DeprecationWarning`` is issued and the value is
        appended to `extra_data`.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.
    """
    if sample_labels is not None:
        warnings.warn(
            "`sample_labels` is deprecated. Passing `sample_labels` as `extra_data`.",
            DeprecationWarning,
        )
        # Legacy keyword: treat the labels as one more object to subset.
        extra_data = [*extra_data, sample_labels]
    library_sizes = measure.library_size(data)
    non_empty = library_sizes > 0
    return select.select_rows(data, *extra_data, idx=non_empty)
def filter_values(
    data,
    *extra_data,
    values=None,
    cutoff=None,
    percentile=None,
    keep_cells="above",
    return_values=False,
    sample_labels=None,
    filter_per_sample=None,
):
    """Remove all cells with `values` above or below a certain threshold.

    It is recommended to use :func:`~scprep.plot.histogram` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    values : list-like, shape=[n_samples]
        Value upon which to filter. Required, despite the `None` default.
    cutoff : float or tuple of floats, optional (default: None)
        Value above or below which to retain cells. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int or tuple of ints, optional (Default: None)
        Percentile above or below which to retain cells.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below', 'between'} or None, optional (default: 'above')
        Keep cells above, below or between the cutoff. The value is handed
        unchanged to the internal cutoff helper; wrappers in this module
        may pass None, which is documented there as resolving to 'above'
        for a single cutoff and 'between' for two cutoffs.
    return_values : bool, optional (default: False)
        If True, also return the values corresponding to the retained cells
    sample_labels : Deprecated
        If given, a ``DeprecationWarning`` is issued and the value is
        appended to `extra_data`.
    filter_per_sample : Deprecated
        If given, a ``DeprecationWarning`` is issued and the value is
        otherwise ignored.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    filtered_values : list-like, shape=[m_samples]
        Values corresponding to retained samples,
        returned only if return_values is True
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.

    Raises
    ------
    AssertionError
        If `values` is None.
    """
    if sample_labels is not None:
        warnings.warn(
            "`sample_labels` is deprecated. "
            "Passing `sample_labels` as `extra_data`.",
            DeprecationWarning,
        )
        extra_data = list(extra_data) + [sample_labels]
    if filter_per_sample is not None:
        warnings.warn(
            "`filter_per_sample` is deprecated. " "Filtering as a single sample.",
            DeprecationWarning,
        )
    if values is None:
        # Explicit check rather than a bare `assert`: asserts are stripped
        # under `python -O`, which would let None reach the cutoff helper.
        # AssertionError is kept so existing callers see the same exception.
        raise AssertionError("`values` must be provided to `filter_values`.")
    keep_cells_idx = utils._get_filter_idx(values, cutoff, percentile, keep_cells)
    if return_values:
        # Subset the values alongside the data; they come first among the
        # extra outputs.
        extra_data = [values] + list(extra_data)
    data = select.select_rows(data, *extra_data, idx=keep_cells_idx)
    return data
def filter_library_size(
    data,
    *extra_data,
    cutoff=None,
    percentile=None,
    keep_cells=None,
    return_library_size=False,
    sample_labels=None,
    filter_per_sample=None,
):
    """Filter cells by library size, keeping those on one side of a threshold.

    The library size of every cell is computed and used as the `values`
    argument of `filter_values`; all other options are forwarded.

    It is recommended to use :func:`~scprep.plot.plot_library_size` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    cutoff : float or tuple of floats, optional (default: None)
        Library size above or below which to retain a cell. Only one of
        `cutoff` and `percentile` should be specified.
    percentile : int or tuple of ints, optional (Default: None)
        Percentile above or below which to retain a cell.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
        Keep cells above, below or between the cutoff.
        If None, defaults to 'above' when a single cutoff is given and
        'between' when two cutoffs are given.
    return_library_size : bool, optional (default: False)
        If True, also return the library sizes corresponding to the retained cells
    sample_labels : Deprecated
    filter_per_sample : Deprecated

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    filtered_library_size : list-like, shape=[m_samples]
        Library sizes corresponding to retained samples,
        returned only if return_library_size is True
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.
    """
    library_sizes = measure.library_size(data)
    # Everything except the values themselves is forwarded untouched.
    forwarded = dict(
        cutoff=cutoff,
        percentile=percentile,
        keep_cells=keep_cells,
        return_values=return_library_size,
        sample_labels=sample_labels,
        filter_per_sample=filter_per_sample,
    )
    return filter_values(data, *extra_data, values=library_sizes, **forwarded)
def filter_gene_set_expression(
    data,
    *extra_data,
    genes=None,
    starts_with=None,
    ends_with=None,
    exact_word=None,
    regex=None,
    cutoff=None,
    percentile=None,
    library_size_normalize=False,
    keep_cells=None,
    return_expression=False,
    sample_labels=None,
    filter_per_sample=None,
):
    """Filter cells by the total expression of a gene set.

    The per-cell expression of the selected gene set is computed and used
    as the `values` argument of `filter_values`.

    It is recommended to use :func:`~scprep.plot.plot_gene_set_expression` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    genes : list-like, optional (default: None)
        Integer column indices or string gene names included in gene set
    starts_with : str or None, optional (default: None)
        If not None, select genes that start with this prefix
    ends_with : str or None, optional (default: None)
        If not None, select genes that end with this suffix
    exact_word : str, list-like or None, optional (default: None)
        If not None, select genes that contain this exact word.
    regex : str or None, optional (default: None)
        If not None, select genes that match this regular expression
    cutoff : float or tuple of floats, optional (default: None)
        Expression value above or below which to remove cells. Only one of
        `cutoff` and `percentile` should be specified.
    percentile : int or tuple of ints, optional (Default: None)
        Percentile above or below which to retain a cell.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    library_size_normalize : bool, optional (default: False)
        Divide gene set expression by library size
    keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
        Keep cells above or below the cutoff. If None, defaults to
        'below' for one cutoff and 'between' for two.
    return_expression : bool, optional (default: False)
        If True, also return the values corresponding to the retained cells
    sample_labels : Deprecated
    filter_per_sample : Deprecated

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    filtered_expression : list-like, shape=[m_samples]
        Gene set expression corresponding to retained samples,
        returned only if return_expression is True
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.
    """
    # A scalar threshold with no explicit direction means "keep the cells
    # below it"; any other case is left as None for `filter_values`.
    scalar_threshold = any(
        isinstance(threshold, numbers.Number) for threshold in (cutoff, percentile)
    )
    if keep_cells is None and scalar_threshold:
        keep_cells = "below"

    gene_selection = dict(
        genes=genes,
        starts_with=starts_with,
        ends_with=ends_with,
        exact_word=exact_word,
        regex=regex,
    )
    set_expression = measure.gene_set_expression(
        data, library_size_normalize=library_size_normalize, **gene_selection
    )
    return filter_values(
        data,
        *extra_data,
        values=set_expression,
        cutoff=cutoff,
        percentile=percentile,
        keep_cells=keep_cells,
        return_values=return_expression,
        sample_labels=sample_labels,
        filter_per_sample=filter_per_sample,
    )
def _unique_sparse_rows(data):
    """Return sorted indices of the first occurrence of each distinct sparse row.

    Parameters
    ----------
    data : scipy.sparse matrix, shape=[n_samples, n_features]
        Input data

    Returns
    -------
    unique_idx : np.ndarray
        Sorted integer array of row indices
    """
    lil = data.tolil()
    seen = set()
    first_seen = []
    for row_idx, (cols, vals) in enumerate(zip(lil.rows, lil.data)):
        # Column positions and values must be compared together: two rows
        # are duplicates only if they hold the same value in the same column.
        # Explicitly stored zeros are skipped so they cannot make otherwise
        # identical rows look different.
        signature = tuple((col, val) for col, val in zip(cols, vals) if val != 0)
        if signature not in seen:
            seen.add(signature)
            first_seen.append(row_idx)
    return np.array(first_seen, dtype=int)


def _find_unique_cells(data):
    """Identify unique cells.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data. Supported types are `np.ndarray`, SciPy sparse matrices,
        `pd.DataFrame` and the sparse DataFrame flavours recognised by
        `utils.is_SparseDataFrame` / `utils.is_sparse_dataframe`.

    Returns
    -------
    unique_idx : np.ndarray or pd.Series
        For arrays and sparse inputs, the sorted array of indices of the
        first occurrence of each distinct row. For a dense `pd.DataFrame`,
        a boolean Series that is True for rows that are not duplicates of
        an earlier row.

    Raises
    ------
    TypeError
        If `data` is none of the supported types.
    """
    # ndarray and SciPy sparse inputs are dispatched first; presumably neither
    # can satisfy the sparse-DataFrame predicates below (NOTE(review): confirm
    # against `utils`).
    if isinstance(data, np.ndarray):
        _, first_idx = np.unique(data, axis=0, return_index=True)
        return np.sort(first_idx)
    if sparse.issparse(data):
        return _unique_sparse_rows(data)
    # Sparse DataFrames must be tested before the generic DataFrame branch.
    if utils.is_SparseDataFrame(data):
        return _find_unique_cells(data.to_coo())
    if utils.is_sparse_dataframe(data):
        return _find_unique_cells(data.sparse.to_coo())
    if isinstance(data, pd.DataFrame):
        return ~data.duplicated()
    raise TypeError(
        "Cannot identify unique cells in data of type {}".format(type(data).__name__)
    )
def filter_duplicates(data, *extra_data, sample_labels=None):
    """Drop cells that duplicate another cell, as identified by `_find_unique_cells`.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    sample_labels : Deprecated
        If given, a ``DeprecationWarning`` is issued and the value is
        appended to `extra_data`.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.
    """
    if sample_labels is not None:
        warnings.warn(
            "`sample_labels` is deprecated. Passing `sample_labels` as `extra_data`.",
            DeprecationWarning,
        )
        # Legacy keyword: treat the labels as one more object to subset.
        extra_data = [*extra_data, sample_labels]
    rows_to_keep = _find_unique_cells(data)
    return select.select_rows(data, *extra_data, idx=rows_to_keep)