Source code for scprep.measure

from . import select
from . import utils
from scipy import sparse

import numpy as np
import pandas as pd
import scipy.signal


[docs]def library_size(data): """Measure the library size of each cell. Parameters ---------- data : array-like, shape=[n_samples, n_features] Input data Returns ------- library_size : list-like, shape=[n_samples] Sum over all genes for each cell """ library_size = utils.matrix_sum(data, axis=1) if isinstance(library_size, pd.Series): library_size.name = "library_size" return library_size
[docs]def gene_set_expression( data, genes=None, library_size_normalize=False, starts_with=None, ends_with=None, exact_word=None, regex=None, ): """Measure the expression of a set of genes in each cell. Parameters ---------- data : array-like, shape=[n_samples, n_features] Input data genes : list-like, shape<=[n_features], optional (default: None) Integer column indices or string gene names included in gene set library_size_normalize : bool, optional (default: False) Divide gene set expression by library size starts_with : str or None, optional (default: None) If not None, select genes that start with this prefix ends_with : str or None, optional (default: None) If not None, select genes that end with this suffix exact_word : str, list-like or None, optional (default: None) If not None, select genes that contain this exact word. regex : str or None, optional (default: None) If not None, select genes that match this regular expression Returns ------- gene_set_expression : list-like, shape=[n_samples] Sum over genes for each cell """ if library_size_normalize: from .normalize import library_size_normalize data = library_size_normalize(data) gene_data = select.select_cols( data, idx=genes, starts_with=starts_with, ends_with=ends_with, exact_word=exact_word, regex=regex, ) if len(gene_data.shape) > 1: gene_set_expression = library_size(gene_data) else: gene_set_expression = gene_data if isinstance(gene_set_expression, pd.Series): gene_set_expression.name = "expression" return gene_set_expression
[docs]def gene_variability(data, kernel_size=0.005, smooth=5, return_means=False): """Measure the variability of each gene in a dataset. Variability is computed as the deviation from the rolling median of the mean-variance curve Parameters ---------- data : array-like, shape=[n_samples, n_features] Input data kernel_size : float or int, optional (default: 0.005) Width of rolling median window. If a float between 0 and 1, the width is given by kernel_size * data.shape[1]. Otherwise should be an odd integer smooth : int, optional (default: 5) Amount of smoothing to apply to the median filter return_means : boolean, optional (default: False) If True, return the gene means Returns ------- variability : list-like, shape=[n_samples] Variability for each gene """ columns = data.columns if isinstance(data, pd.DataFrame) else None data = utils.to_array_or_spmatrix(data) if isinstance(data, sparse.dia_matrix): data = data.tocsc() data_std = utils.matrix_std(data, axis=0) ** 2 data_mean = utils.toarray(data.mean(axis=0)).flatten() if kernel_size < 1: kernel_size = 2 * (int(kernel_size * len(data_std)) // 2) + 1 order = np.argsort(data_mean) data_std_med = np.empty_like(data_std) data_std_order = data_std[order] # handle overhang with reflection data_std_order = np.r_[ data_std_order[kernel_size::-1], data_std_order, data_std_order[:-kernel_size:-1], ] medfilt = scipy.signal.medfilt(data_std_order, kernel_size=kernel_size)[ kernel_size:-kernel_size ] # apply a little smoothing for i in range(smooth): medfilt = np.r_[(medfilt[1:] + medfilt[:-1]) / 2, medfilt[-1]] data_std_med[order] = medfilt result = data_std - data_std_med if columns is not None: result = pd.Series(result, index=columns, name="variability") data_mean = pd.Series(data_mean, index=columns, name="mean") if return_means: result = result, data_mean return result
[docs]def gene_capture_count(data, cutoff=0): """Measure the number of cells in which each gene has non-negligible counts. Parameters ---------- data : array-like, shape=[n_samples, n_features] Input data cutoff : float, optional (default: 0) Number of counts above which expression is deemed non-negligible Returns ------- capture-count : list-like, shape=[m_features] Capture count for each gene """ gene_sums = np.array(utils.matrix_sum(data > cutoff, axis=0)).reshape(-1) if isinstance(data, pd.DataFrame): gene_sums = pd.Series(gene_sums, index=data.columns, name="capture_count") return gene_sums