from . import measure
from . import select
from . import utils
from scipy import sparse
import numbers
import numpy as np
import pandas as pd
import warnings
def remove_empty_genes(data, *extra_data):  # noqa
    """Deprecated alias of `filter_empty_genes`.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged.

    Parameters
    ----------
    data : array-like
        Input data, forwarded to `filter_empty_genes`
    extra_data : array-like, optional
        Additional data objects, forwarded to `filter_empty_genes`

    Returns
    -------
    Whatever `filter_empty_genes` returns for the same arguments.
    """
    warnings.warn(
        "`scprep.filter.remove_empty_genes` is deprecated. "
        "Use `scprep.filter.filter_empty_genes` instead.",
        DeprecationWarning,
        # Report the warning at the caller's line rather than inside this
        # module, so the user sees which of their calls needs updating.
        stacklevel=2,
    )
    return filter_empty_genes(data, *extra_data)
def remove_rare_genes(data, *extra_data, cutoff=0, min_cells=5):  # noqa
    """Deprecated alias of `filter_rare_genes`.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged.

    Parameters
    ----------
    data : array-like
        Input data, forwarded to `filter_rare_genes`
    extra_data : array-like, optional
        Additional data objects, forwarded to `filter_rare_genes`
    cutoff : float, optional (default: 0)
        Forwarded to `filter_rare_genes` as `cutoff`
    min_cells : int, optional (default: 5)
        Forwarded to `filter_rare_genes` as `min_cells`

    Returns
    -------
    Whatever `filter_rare_genes` returns for the same arguments.
    """
    warnings.warn(
        "`scprep.filter.remove_rare_genes` is deprecated. "
        "Use `scprep.filter.filter_rare_genes` instead.",
        DeprecationWarning,
        # Report the warning at the caller's line rather than inside this
        # module, so the user sees which of their calls needs updating.
        stacklevel=2,
    )
    return filter_rare_genes(data, *extra_data, cutoff=cutoff, min_cells=min_cells)
def remove_empty_cells(data, *extra_data, sample_labels=None):  # noqa
    """Deprecated alias of `filter_empty_cells`.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged.

    Parameters
    ----------
    data : array-like
        Input data, forwarded to `filter_empty_cells`
    extra_data : array-like, optional
        Additional data objects, forwarded to `filter_empty_cells`
    sample_labels : optional (default: None)
        Forwarded to `filter_empty_cells` as `sample_labels`

    Returns
    -------
    Whatever `filter_empty_cells` returns for the same arguments.
    """
    warnings.warn(
        "`scprep.filter.remove_empty_cells` is deprecated. "
        "Use `scprep.filter.filter_empty_cells` instead.",
        DeprecationWarning,
        # Report the warning at the caller's line rather than inside this
        # module, so the user sees which of their calls needs updating.
        stacklevel=2,
    )
    return filter_empty_cells(data, *extra_data, sample_labels=sample_labels)
def remove_duplicates(data, *extra_data, sample_labels=None):  # noqa
    """Deprecated alias of `filter_duplicates`.

    Emits a ``DeprecationWarning`` and forwards every argument unchanged.

    Parameters
    ----------
    data : array-like
        Input data, forwarded to `filter_duplicates`
    extra_data : array-like, optional
        Additional data objects, forwarded to `filter_duplicates`
    sample_labels : optional (default: None)
        Forwarded to `filter_duplicates` as `sample_labels`

    Returns
    -------
    Whatever `filter_duplicates` returns for the same arguments.
    """
    warnings.warn(
        "`scprep.filter.remove_duplicates` is deprecated. "
        "Use `scprep.filter.filter_duplicates` instead.",
        DeprecationWarning,
        # Report the warning at the caller's line rather than inside this
        # module, so the user sees which of their calls needs updating.
        stacklevel=2,
    )
    return filter_duplicates(data, *extra_data, sample_labels=sample_labels)
def filter_empty_genes(data, *extra_data):
    """Drop every gene whose summed counts over all cells are not positive.

    Intended as a faster equivalent of
    `filter_rare_genes(data, cutoff=0, min_cells=1)`.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[any, n_features], optional
        Optional additional data objects from which to select the same genes

    Returns
    -------
    data : array-like, shape=[n_samples, m_features]
        Filtered output data, where m_features <= n_features
    extra_data : array-like, shape=[any, m_features]
        Filtered extra data, if passed.
    """
    # Flatten the per-column totals to 1-D so the comparison below yields a
    # flat boolean mask with one entry per gene.
    per_gene_total = np.array(utils.matrix_sum(data, axis=0)).reshape(-1)
    return select.select_cols(data, *extra_data, idx=per_gene_total > 0)
def filter_rare_genes(data, *extra_data, cutoff=0, min_cells=5):
    """Drop genes that are expressed above `cutoff` in fewer than `min_cells` cells.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[any, n_features], optional
        Optional additional data objects from which to select the same genes
    cutoff : float, optional (default: 0)
        Number of counts above which expression is deemed non-negligible
    min_cells : int, optional (default: 5)
        Minimum number of cells above `cutoff` in order to retain a gene

    Returns
    -------
    data : array-like, shape=[n_samples, m_features]
        Filtered output data, where m_features <= n_features
    extra_data : array-like, shape=[any, m_features]
        Filtered extra data, if passed.
    """
    # Per-gene count as reported by `measure.gene_capture_count`; a gene is
    # retained when that count reaches `min_cells`.
    cells_per_gene = measure.gene_capture_count(data, cutoff=cutoff)
    return select.select_cols(data, *extra_data, idx=cells_per_gene >= min_cells)
def filter_empty_cells(data, *extra_data, sample_labels=None):
    """Remove all cells with zero library size.

    A cell is retained when `measure.library_size(data)` is strictly
    positive for it.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    sample_labels : Deprecated
        If given, a ``DeprecationWarning`` is emitted and the value is
        appended to `extra_data`.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.
    """
    if sample_labels is not None:
        warnings.warn(
            "`sample_labels` is deprecated. "
            "Passing `sample_labels` as `extra_data`.",
            DeprecationWarning,
            # Point the warning at the caller's line instead of this module.
            stacklevel=2,
        )
        # Treat the legacy argument as one more object to be row-filtered.
        extra_data = list(extra_data) + [sample_labels]
    cell_sums = measure.library_size(data)
    keep_cells_idx = cell_sums > 0
    data = select.select_rows(data, *extra_data, idx=keep_cells_idx)
    return data
def filter_values(
    data,
    *extra_data,
    values=None,
    cutoff=None,
    percentile=None,
    keep_cells="above",
    return_values=False,
    sample_labels=None,
    filter_per_sample=None,
):
    """Remove all cells with `values` above or below a certain threshold.

    It is recommended to use :func:`~scprep.plot.histogram` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    values : list-like, shape=[n_samples]
        Value upon which to filter. Required despite the `None` default.
    cutoff : float or tuple of floats, optional (default: None)
        Value above or below which to retain cells. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int or tuple of ints, optional (Default: None)
        Percentile above or below which to retain cells.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below', 'between'} or None, optional (default: 'above')
        Keep cells above, below or between the cutoff.
        The value is forwarded as-is to `utils._get_filter_idx`; when None,
        that helper is expected to choose 'above' for a single cutoff and
        'between' for two cutoffs.
    return_values : bool, optional (default: False)
        If True, also return the values corresponding to the retained cells
    sample_labels : Deprecated
        If given, a ``DeprecationWarning`` is emitted and the value is
        appended to `extra_data`.
    filter_per_sample : Deprecated
        If given, a ``DeprecationWarning`` is emitted and the value is ignored.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    filtered_values : list-like, shape=[m_samples]
        Values corresponding to retained samples,
        returned only if return_values is True
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.

    Raises
    ------
    AssertionError
        If `values` is None.
    """
    if sample_labels is not None:
        warnings.warn(
            "`sample_labels` is deprecated. "
            "Passing `sample_labels` as `extra_data`.",
            DeprecationWarning,
            # Point the warning at the caller's line instead of this module.
            stacklevel=2,
        )
        extra_data = list(extra_data) + [sample_labels]
    if filter_per_sample is not None:
        warnings.warn(
            "`filter_per_sample` is deprecated. " "Filtering as a single sample.",
            DeprecationWarning,
            stacklevel=2,
        )
    # Kept as an assert so the exception type seen by existing callers does
    # not change; the message makes the failure self-explanatory.
    assert values is not None, "`values` must be provided"
    keep_cells_idx = utils._get_filter_idx(values, cutoff, percentile, keep_cells)
    if return_values:
        # Row-filter `values` too, placed first so it is returned right
        # after `data` and before any other extra data.
        extra_data = [values] + list(extra_data)
    data = select.select_rows(data, *extra_data, idx=keep_cells_idx)
    return data
def filter_library_size(
    data,
    *extra_data,
    cutoff=None,
    percentile=None,
    keep_cells=None,
    return_library_size=False,
    sample_labels=None,
    filter_per_sample=None,
):
    """Filter cells by thresholding their library size.

    Thin wrapper around `filter_values` that uses
    `measure.library_size(data)` as the values to filter on.
    It is recommended to use :func:`~scprep.plot.plot_library_size` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    cutoff : float or tuple of floats, optional (default: None)
        Library size above or below which to retain a cell. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int or tuple of ints, optional (Default: None)
        Percentile above or below which to retain a cell.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
        Keep cells above, below or between the cutoff.
        If None, defaults to 'above' when a single cutoff is given and
        'between' when two cutoffs are given.
    return_library_size : bool, optional (default: False)
        If True, also return the library sizes corresponding to the retained cells
    sample_labels : Deprecated
    filter_per_sample : Deprecated

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    filtered_library_size : list-like, shape=[m_samples]
        Library sizes corresponding to retained samples,
        returned only if return_library_size is True
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.
    """
    # Everything except the values themselves is delegated to `filter_values`,
    # including handling of the deprecated arguments.
    return filter_values(
        data,
        *extra_data,
        values=measure.library_size(data),
        cutoff=cutoff,
        percentile=percentile,
        keep_cells=keep_cells,
        return_values=return_library_size,
        sample_labels=sample_labels,
        filter_per_sample=filter_per_sample,
    )
def filter_gene_set_expression(
    data,
    *extra_data,
    genes=None,
    starts_with=None,
    ends_with=None,
    exact_word=None,
    regex=None,
    cutoff=None,
    percentile=None,
    library_size_normalize=False,
    keep_cells=None,
    return_expression=False,
    sample_labels=None,
    filter_per_sample=None,
):
    """Filter cells by thresholding the total expression of a gene set.

    The per-cell expression is computed with `measure.gene_set_expression`
    and the thresholding is delegated to `filter_values`.
    It is recommended to use :func:`~scprep.plot.plot_gene_set_expression` to
    choose a cutoff prior to filtering.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    genes : list-like, optional (default: None)
        Integer column indices or string gene names included in gene set
    starts_with : str or None, optional (default: None)
        If not None, select genes that start with this prefix
    ends_with : str or None, optional (default: None)
        If not None, select genes that end with this suffix
    exact_word : str, list-like or None, optional (default: None)
        If not None, select genes that contain this exact word.
    regex : str or None, optional (default: None)
        If not None, select genes that match this regular expression
    cutoff : float or tuple of floats, optional (default: None)
        Expression value above or below which to remove cells. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int or tuple of ints, optional (Default: None)
        Percentile above or below which to retain a cell.
        Must be an integer between 0 and 100. Only one of `cutoff`
        and `percentile` should be specified.
    library_size_normalize : bool, optional (default: False)
        Divide gene set expression by library size
    keep_cells : {'above', 'below', 'between'} or None, optional (default: None)
        Keep cells above or below the cutoff. If None, defaults to
        'below' for one cutoff and 'between' for two.
    return_expression : bool, optional (default: False)
        If True, also return the values corresponding to the retained cells
    sample_labels : Deprecated
    filter_per_sample : Deprecated

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    filtered_expression : list-like, shape=[m_samples]
        Gene set expression corresponding to retained samples,
        returned only if return_expression is True
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.
    """
    # With a single numeric threshold the default here is to keep cells
    # *below* it. In every other case `keep_cells` is forwarded unchanged
    # (possibly still None) and resolved downstream.
    has_scalar_threshold = any(
        isinstance(threshold, numbers.Number) for threshold in (cutoff, percentile)
    )
    if keep_cells is None and has_scalar_threshold:
        keep_cells = "below"

    expression = measure.gene_set_expression(
        data,
        genes=genes,
        starts_with=starts_with,
        ends_with=ends_with,
        exact_word=exact_word,
        regex=regex,
        library_size_normalize=library_size_normalize,
    )
    return filter_values(
        data,
        *extra_data,
        values=expression,
        cutoff=cutoff,
        percentile=percentile,
        keep_cells=keep_cells,
        return_values=return_expression,
        sample_labels=sample_labels,
        filter_per_sample=filter_per_sample,
    )
def _find_unique_cells(data):
    """Identify unique cells (rows), keeping the first occurrence of each.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data. Supported types are `np.ndarray`, scipy sparse matrices,
        `pd.DataFrame` and the sparse dataframe types recognised by
        `utils.is_SparseDataFrame` / `utils.is_sparse_dataframe`.

    Returns
    -------
    unique_idx : np.ndarray or pd.Series
        For `np.ndarray` and sparse input, a sorted integer array holding the
        index of the first occurrence of every distinct row. For a dense
        `pd.DataFrame`, a boolean Series that is True for rows which are not
        duplicates of an earlier row.

    Raises
    ------
    TypeError
        If `data` is of none of the supported types.
    """
    # The cheap, unambiguous type checks come first; an ndarray or scipy
    # sparse matrix is not a DataFrame, so this does not change which branch
    # a given input reaches.
    if isinstance(data, np.ndarray):
        _, unique_idx = np.unique(data, axis=0, return_index=True)
        return np.sort(unique_idx)
    if sparse.issparse(data):
        # A row is a duplicate only if BOTH its column pattern and its values
        # match one single earlier row, so the two are compared jointly as
        # one key. De-duplicating them independently drops rows that share
        # their columns with one row and their values with another.
        lil = data.tolil()
        seen = set()
        unique_idx = []
        for row_number, (cols, vals) in enumerate(zip(lil.rows, lil.data)):
            # Skip explicitly stored zeros so they compare equal to
            # implicit ones.
            key = tuple((col, val) for col, val in zip(cols, vals) if val != 0)
            if key not in seen:
                seen.add(key)
                unique_idx.append(row_number)
        return np.array(unique_idx, dtype=int)
    # Sparse dataframes must be tested before the generic DataFrame branch.
    if utils.is_SparseDataFrame(data):
        return _find_unique_cells(data.to_coo())
    if utils.is_sparse_dataframe(data):
        return _find_unique_cells(data.sparse.to_coo())
    if isinstance(data, pd.DataFrame):
        return ~data.duplicated()
    raise TypeError(
        "Cannot identify unique cells for data of type {}".format(
            type(data).__name__
        )
    )
def filter_duplicates(data, *extra_data, sample_labels=None):
    """Filter all duplicate cells.

    The rows to retain are determined by `_find_unique_cells(data)`.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    extra_data : array-like, shape=[n_samples, any], optional
        Optional additional data objects from which to select the same rows
    sample_labels : Deprecated
        If given, a ``DeprecationWarning`` is emitted and the value is
        appended to `extra_data`.

    Returns
    -------
    data : array-like, shape=[m_samples, n_features]
        Filtered output data, where m_samples <= n_samples
    extra_data : array-like, shape=[m_samples, any]
        Filtered extra data, if passed.
    """
    if sample_labels is not None:
        warnings.warn(
            "`sample_labels` is deprecated. "
            "Passing `sample_labels` as `extra_data`.",
            DeprecationWarning,
            # Point the warning at the caller's line instead of this module.
            stacklevel=2,
        )
        # Treat the legacy argument as one more object to be row-filtered.
        extra_data = list(extra_data) + [sample_labels]
    unique_idx = _find_unique_cells(data)
    data = select.select_rows(data, *extra_data, idx=unique_idx)
    return data