Source code for scprep.measure

from . import select
from . import utils
from scipy import sparse

import numpy as np
import pandas as pd
import scipy.signal


[docs]def library_size(data):
    """Measure the library size of each cell.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data

    Returns
    -------
    library_size : list-like, shape=[n_samples]
        Sum over all genes for each cell
    """
    library_size = utils.matrix_sum(data, axis=1)
    if isinstance(library_size, pd.Series):
        library_size.name = "library_size"
    return library_size


[docs]def gene_set_expression(
    data,
    genes=None,
    library_size_normalize=False,
    starts_with=None,
    ends_with=None,
    exact_word=None,
    regex=None,
):
    """Measure the expression of a set of genes in each cell.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    genes : list-like, shape<=[n_features], optional (default: None)
        Integer column indices or string gene names included in gene set
    library_size_normalize : bool, optional (default: False)
        Divide gene set expression by library size
    starts_with : str or None, optional (default: None)
        If not None, select genes that start with this prefix
    ends_with : str or None, optional (default: None)
        If not None, select genes that end with this suffix
    exact_word : str, list-like or None, optional (default: None)
        If not None, select genes that contain this exact word.
    regex : str or None, optional (default: None)
        If not None, select genes that match this regular expression

    Returns
    -------
    gene_set_expression : list-like, shape=[n_samples]
        Sum over genes for each cell
    """
    if library_size_normalize:
        from .normalize import library_size_normalize

        data = library_size_normalize(data)
    gene_data = select.select_cols(
        data,
        idx=genes,
        starts_with=starts_with,
        ends_with=ends_with,
        exact_word=exact_word,
        regex=regex,
    )
    if len(gene_data.shape) > 1:
        gene_set_expression = library_size(gene_data)
    else:
        gene_set_expression = gene_data
    if isinstance(gene_set_expression, pd.Series):
        gene_set_expression.name = "expression"
    return gene_set_expression


[docs]def gene_variability(data, kernel_size=0.005, smooth=5, return_means=False):
    """Measure the variability of each gene in a dataset.

    Variability is computed as the deviation from
    the rolling median of the mean-variance curve

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    kernel_size : float or int, optional (default: 0.005)
        Width of rolling median window. If a float between 0 and 1, the width is given
        by kernel_size * data.shape[1]. Otherwise should be an odd integer
    smooth : int, optional (default: 5)
        Amount of smoothing to apply to the median filter
    return_means : boolean, optional (default: False)
        If True, return the gene means

    Returns
    -------
    variability : list-like, shape=[n_samples]
        Variability for each gene
    """
    columns = data.columns if isinstance(data, pd.DataFrame) else None
    data = utils.to_array_or_spmatrix(data)
    if isinstance(data, sparse.dia_matrix):
        data = data.tocsc()
    data_std = utils.matrix_std(data, axis=0) ** 2
    data_mean = utils.toarray(data.mean(axis=0)).flatten()

    if kernel_size < 1:
        kernel_size = 2 * (int(kernel_size * len(data_std)) // 2) + 1

    order = np.argsort(data_mean)
    data_std_med = np.empty_like(data_std)
    data_std_order = data_std[order]
    # handle overhang with reflection
    data_std_order = np.r_[
        data_std_order[kernel_size::-1],
        data_std_order,
        data_std_order[:-kernel_size:-1],
    ]
    medfilt = scipy.signal.medfilt(data_std_order, kernel_size=kernel_size)[
        kernel_size:-kernel_size
    ]

    # apply a little smoothing
    for i in range(smooth):
        medfilt = np.r_[(medfilt[1:] + medfilt[:-1]) / 2, medfilt[-1]]

    data_std_med[order] = medfilt
    result = data_std - data_std_med

    if columns is not None:
        result = pd.Series(result, index=columns, name="variability")
        data_mean = pd.Series(data_mean, index=columns, name="mean")
    if return_means:
        result = result, data_mean
    return result


[docs]def gene_capture_count(data, cutoff=0):
    """Measure the number of cells in which each gene has non-negligible counts.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    cutoff : float, optional (default: 0)
        Number of counts above which expression is deemed non-negligible

    Returns
    -------
    capture-count : list-like, shape=[m_features]
        Capture count for each gene
    """
    gene_sums = np.array(utils.matrix_sum(data > cutoff, axis=0)).reshape(-1)
    if isinstance(data, pd.DataFrame):
        gene_sums = pd.Series(gene_sums, index=data.columns, name="capture_count")
    return gene_sums