from . import measure
from . import utils
from scipy import sparse
from sklearn.preprocessing import normalize
import numbers
import numpy as np
import pandas as pd
import warnings
def _get_scaled_libsize(data, rescale=10000, return_library_size=False):
if return_library_size or rescale in ["median", "mean"]:
libsize = measure.library_size(data)
else:
libsize = None
if rescale == "median":
rescale = np.median(utils.toarray(libsize))
if rescale == 0:
warnings.warn(
"Median library size is zero. " "Rescaling to mean instead.",
UserWarning,
)
rescale = np.mean(utils.toarray(libsize))
elif rescale == "mean":
rescale = np.mean(utils.toarray(libsize))
elif isinstance(rescale, numbers.Number):
pass
elif rescale is None:
rescale = 1
else:
raise ValueError(
"Expected rescale in ['median', 'mean'], a number "
"or `None`. Got {}".format(rescale)
)
return rescale, libsize
def library_size_normalize(data, rescale=10000, return_library_size=False):
    """Perform L1 normalization on input data.

    Each row (cell) is divided by its library size so that its expression
    values sum to 1, and is then multiplied by a common rescaling factor so
    that all cells are scaled as if they were sampled evenly.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    rescale : {'mean', 'median'}, float or `None`, optional (default: 10000)
        Rescaling strategy. If 'mean' or 'median', normalized cells are scaled
        back up to the mean or median library size. If a float,
        normalized cells are scaled up to the given value. If `None`, no
        rescaling is done and all cells will have normalized library size of 1.
    return_library_size : bool, optional (default: False)
        If True, also return the library size pre-normalization

    Returns
    -------
    data_norm : array-like, shape=[n_samples, n_features]
        Library size normalized output data. DataFrame input is returned as
        a DataFrame carrying the original index and columns.
    library_size : list-like, shape=[n_samples]
        Library size of cells pre-normalization,
        returned only if return_library_size is True

    Raises
    ------
    ValueError
        If `rescale` is not 'median', 'mean', a number or `None`.
    """
    # pandas support: remember the labels and operate on the raw matrix
    columns, index = None, None
    if isinstance(data, pd.DataFrame):
        columns, index = data.columns, data.index
        if utils.is_sparse_dataframe(data):
            data = data.sparse.to_coo()
        elif utils.is_SparseDataFrame(data):
            data = data.to_coo()
        else:
            # dense data
            data = data.to_numpy()

    # For sparse input the library size is computed explicitly when it is
    # requested or when the matrix holds more than 2**31 stored values.
    # NOTE(review): presumably sklearn's `normalize` cannot serve these
    # sparse cases - confirm.
    calc_libsize = sparse.issparse(data) and (return_library_size or data.nnz > 2**31)
    rescale, libsize = _get_scaled_libsize(data, rescale, calc_libsize)

    if libsize is not None:
        divisor = utils.toarray(libsize)
        # substitute 1 for zero library sizes to avoid division by zero
        data_norm = utils.matrix_vector_elementwise_multiply(
            data, 1 / np.where(divisor == 0, 1, divisor), axis=0
        )
    elif return_library_size:
        data_norm, libsize = normalize(data, norm="l1", axis=1, return_norm=True)
    else:
        data_norm = normalize(data, norm="l1", axis=1)

    data_norm = data_norm * rescale

    if columns is not None:
        # pandas dataframe: restore the original labels
        if sparse.issparse(data_norm):
            data_norm = utils.SparseDataFrame(data_norm, default_fill_value=0.0)
        else:
            data_norm = pd.DataFrame(data_norm)
        data_norm.columns = columns
        data_norm.index = index
        if return_library_size:
            # only build the Series when it is actually returned; libsize
            # may be None otherwise
            libsize = pd.Series(
                libsize, index=index, name="library_size", dtype="float64"
            )

    if return_library_size:
        return data_norm, libsize
    return data_norm
def batch_mean_center(data, sample_idx=None):
    """Perform batch mean-centering on the data.

    The features of the data are all centered such that
    the column means are zero. Each batch is centered separately.

    Floating-point arrays and DataFrames are modified in place and also
    returned. Integer and boolean NumPy arrays cannot hold the centered
    values, so they are first converted to a new float array; in that case
    the input is left untouched and the new array is returned.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    sample_idx : list-like, optional
        Batch indices. If `None`, data is assumed to be a single batch

    Returns
    -------
    data : array-like, shape=[n_samples, n_features]
        Batch mean-centered output data.

    Raises
    ------
    ValueError
        If `data` is sparse.
    """
    if (
        sparse.issparse(data)
        or utils.is_SparseDataFrame(data)
        or utils.is_sparse_dataframe(data)
    ):
        raise ValueError(
            "Cannot mean center sparse data. Convert to dense matrix first."
        )

    if sample_idx is None:
        # a single batch: every row gets the same label
        sample_idx = np.ones(len(data))
    else:
        sample_idx = utils.toarray(sample_idx).flatten()

    if isinstance(data, np.ndarray) and data.dtype.kind in "biu":
        # Assigning float means back into an integer/bool array would
        # silently truncate the centered values, so promote to float first.
        data = data.astype(float)

    for sample in np.unique(sample_idx):
        idx = sample_idx == sample
        if isinstance(data, pd.DataFrame):
            feature_means = data.iloc[idx].mean(axis=0)
            data.iloc[idx] = data.iloc[idx] - feature_means
        else:
            feature_means = np.mean(data[idx], axis=0)
            data[idx] = data[idx] - feature_means[None, :]
    return data