from decorator import decorator
from scipy import sparse
import importlib
import numbers
import numpy as np
import pandas as pd
import re
import warnings
# Compatibility shim: make sure the name `ModuleNotFoundError` can be used in
# the `except` clauses and `raise` statements further down in this module.
try:
    ModuleNotFoundError
except NameError:
    # python 3.5: the name is not defined there, so alias it to ImportError
    ModuleNotFoundError = ImportError

# Set of (pkg, min_version) pairs that `_with_pkg` has already passed through
# `check_version`, so each requirement is only checked once.
__imported_pkgs = set()
def _try_import(pkg):
    """Import a module by name, returning None when it is not installed.

    Parameters
    ----------
    pkg : str
        Name of the module, as accepted by `importlib.import_module`.

    Returns
    -------
    module : module or None
        The imported module, or None if importing raised
        `ModuleNotFoundError`.
    """
    try:
        module = importlib.import_module(pkg)
    except ModuleNotFoundError:
        module = None
    return module
def _version_check(version, min_version=None):
    """Check whether `version` satisfies a minimum major.minor requirement.

    Only the first two numeric fields (major and minor) are compared; any
    further fields (patch, pre-release tags, ...) are ignored.

    Parameters
    ----------
    version : str
        Installed version string, e.g. ``"1.4.2"``.
    min_version : str, int, float or None, optional (default: None)
        Minimum required version. None means no requirement.

    Returns
    -------
    ok : bool
        True if `version` is at least `min_version`.

    Raises
    ------
    ValueError
        If either version string contains no numeric field at all.
    """
    if min_version is None:
        # no requirement
        return True

    def _major_minor(raw):
        # Keep only the numeric fields. Empty strings produced by a leading
        # or trailing non-digit (e.g. "v1.2") are dropped rather than being
        # passed to int().
        fields = [int(f) for f in re.split(r"[^0-9]+", str(raw)) if f]
        if not fields:
            raise ValueError("Could not parse version {!r}".format(raw))
        # A missing minor field counts as 0, so "1" compares like "1.0".
        minor = fields[1] if len(fields) > 1 else 0
        return fields[0], minor

    # Tuple comparison: major first, minor only as a tie-breaker.
    return _major_minor(version) >= _major_minor(min_version)
def check_version(pkg, min_version=None):
    """Ensure a package is importable and meets a minimum version.

    Parameters
    ----------
    pkg : str
        Name of the package to import.
    min_version : str, int, float or None, optional (default: None)
        Minimum required version, passed on to `_version_check`.

    Raises
    ------
    ModuleNotFoundError
        If `pkg` cannot be imported.
    ImportError
        If `pkg.__version__` does not satisfy `min_version`.
    """
    try:
        module = importlib.import_module(pkg)
    except ModuleNotFoundError:
        not_found = (
            "{0} not found. "
            "Please install it with e.g. `pip install --user {0}`".format(pkg)
        )
        raise ModuleNotFoundError(not_found)

    installed = module.__version__
    if _version_check(installed, min_version):
        return

    too_old = (
        "{0}>={1} is required (installed: {2}). "
        "Please upgrade it with e.g."
        " `pip install --user --upgrade {0}`".format(pkg, min_version, installed)
    )
    raise ImportError(too_old)
@decorator
def _with_pkg(fun, pkg=None, min_version=None, *args, **kwargs):
    """Run `check_version` for a package requirement, then call `fun`.

    Each distinct (pkg, min_version) pair is checked at most once; pairs
    that have passed are remembered in the module-level `__imported_pkgs`.

    NOTE(review): wrapped with `decorator.decorator`; presumably applied as
    ``@_with_pkg(pkg=..., min_version=...)`` -- confirm against callers.
    """
    global __imported_pkgs
    requirement = (pkg, min_version)
    if requirement not in __imported_pkgs:
        # raises if the package is missing or too old, so a failing
        # requirement is never cached
        check_version(pkg, min_version=min_version)
        __imported_pkgs.add(requirement)
    return fun(*args, **kwargs)
def _get_percentile_cutoff(data, cutoff=None, percentile=None, required=False):
    """Get a cutoff for a dataset.

    Parameters
    ----------
    data : array-like
        Values from which a percentile is computed (flattened first).
    cutoff : float or None, optional (default: None)
        Absolute cutoff value. Only one of cutoff and percentile may be given
    percentile : float, list-like of floats or None, optional (default: None)
        Percentile cutoff value between 0 and 100. A non-numeric value is
        treated as an iterable of percentiles and a list of cutoffs is
        returned. Only one of cutoff and percentile may be given
    required : bool, optional (default: False)
        If True, one of cutoff and percentile must be given.

    Returns
    -------
    cutoff : float, list of floats or None
        Absolute cutoff value. Can only be None if required is False and
        cutoff and percentile are both None.

    Raises
    ------
    ValueError
        If both `cutoff` and `percentile` are given, or if neither is given
        and `required` is True.
    """
    if percentile is None:
        if cutoff is None and required:
            raise ValueError("One of either `cutoff` or `percentile` must be given.")
        return cutoff

    if cutoff is not None:
        raise ValueError(
            "Only one of `cutoff` and `percentile` should be given. "
            "Got cutoff={}, percentile={}".format(cutoff, percentile)
        )
    if not isinstance(percentile, numbers.Number):
        # several percentiles requested: resolve each one separately
        return [_get_percentile_cutoff(data, percentile=p) for p in percentile]
    if 0 < percentile < 1:
        # Most likely a fraction was passed by mistake. Exactly 0 is a
        # legitimate percentile (the minimum), so it does not warn.
        warnings.warn(
            "`percentile` expects values between 0 and 100. "
            "Got {}. Did you mean {}?".format(percentile, percentile * 100),
            UserWarning,
        )
    return np.percentile(np.array(data).reshape(-1), percentile)
def _get_filter_idx(values, cutoff, percentile, keep_cells):
    """Build a boolean mask selecting cells relative to a cutoff.

    Parameters
    ----------
    values : list-like, shape=[n_samples]
        Value upon which to filter
    cutoff : float or tuple of floats
        Value above or below which to retain cells. Only one of `cutoff`
        and `percentile` should be specified.
    percentile : int or tuple of ints
        Percentile above or below which to retain cells.
        Only one of `cutoff` and `percentile` should be specified.
    keep_cells : {'above', 'below', 'between'} or None
        Keep cells above, below or between the cutoff.
        If None, defaults to 'above' when a single cutoff is given and
        'between' otherwise.

    Returns
    -------
    keep_cells_idx : list-like
        Boolean retention array (strict inequalities are used).

    Raises
    ------
    ValueError
        If neither cutoff nor percentile is given, if the number of cutoffs
        does not match `keep_cells`, or if `keep_cells` is not recognised.
    """
    cutoff = _get_percentile_cutoff(values, cutoff, percentile, required=True)
    single_cutoff = isinstance(cutoff, numbers.Number)
    if keep_cells is None:
        keep_cells = "above" if single_cutoff else "between"

    if keep_cells == "above":
        if not single_cutoff:
            raise ValueError(
                "Expected a single cutoff with keep_cells='above'."
                " Got {}".format(cutoff)
            )
        return values > cutoff

    if keep_cells == "below":
        if not single_cutoff:
            raise ValueError(
                "Expected a single cutoff with keep_cells='below'."
                " Got {}".format(cutoff)
            )
        return values < cutoff

    if keep_cells == "between":
        if single_cutoff or len(cutoff) != 2:
            raise ValueError(
                "Expected cutoff of length 2 with keep_cells='between'."
                " Got {}".format(cutoff)
            )
        # the two cutoffs may be given in either order
        lower = np.min(cutoff)
        upper = np.max(cutoff)
        return np.logical_and(values > lower, values < upper)

    raise ValueError(
        "Expected `keep_cells` in ['above', 'below', 'between']. "
        "Got {}".format(keep_cells)
    )
def _check_numpy_dtype(x):
    """Choose the `dtype` argument for `np.array(x)` on a list of items.

    Parameters
    ----------
    x : list
        Items about to be packed into a numpy array.

    Returns
    -------
    dtype : None or object
        None when numpy can infer the dtype (no sequences, or sequences all
        of the same length); `object` when the items are ragged, mix
        sequences with non-sequences, or include scipy sparse matrices.
    """
    # Inspect the items structurally instead of matching the text of the
    # TypeError raised by len(), which depends on library wording.
    if any(sparse.issparse(xi) for xi in x):
        # list contains sparse matrices; must be object
        return object
    has_len = [hasattr(xi, "__len__") for xi in x]
    if not any(has_len):
        # no sequences (or empty list); infer
        return None
    if not all(has_len):
        # some sequences and some not; must be object
        return object
    first_len = len(x[0])
    if all(len(xi) == first_len for xi in x):
        # all sequences of the same length; infer dtype
        return None
    # sequences of different lengths; object dtype is forced
    return object
def toarray(x):
    """Convert an array-like to a np.ndarray.

    Parameters
    ----------
    x : array-like
        Array-like to be converted. Lists are converted element by element
        first; numbers and existing ndarrays are returned unchanged.

    Returns
    -------
    x : np.ndarray

    Raises
    ------
    TypeError
        If `x` is none of the supported types.
    """
    if is_SparseDataFrame(x):
        return x.to_coo().toarray()
    if is_SparseSeries(x):
        return x.to_dense().to_numpy()
    if isinstance(x, (pd.DataFrame, pd.Series, pd.Index)):
        return x.to_numpy()
    if isinstance(x, sparse.spmatrix):
        return x.toarray()
    # np.matrix is an ndarray subclass, so it must be tested before ndarray
    if isinstance(x, np.matrix):
        return x.A
    if isinstance(x, list):
        converted = []
        for item in x:
            try:
                item = toarray(item)
            except TypeError:
                # recursed too far: keep the element as it is
                pass
            converted.append(item)
        # convert from list to array
        return np.array(converted, dtype=_check_numpy_dtype(converted))
    if isinstance(x, (np.ndarray, numbers.Number)):
        return x
    raise TypeError("Expected array-like. Got {}".format(type(x)))
def to_array_or_spmatrix(x):
    """Convert an array-like to a np.ndarray or scipy.sparse.spmatrix.

    Sparse inputs stay sparse; everything else is handed to `toarray`.

    Parameters
    ----------
    x : array-like
        Array-like to be converted

    Returns
    -------
    x : np.ndarray or scipy.sparse.spmatrix
    """
    if is_SparseDataFrame(x):
        return x.to_coo()
    if is_sparse_dataframe(x) or is_sparse_series(x):
        # NOTE(review): pandas documents `Series.sparse.to_coo` as requiring
        # a MultiIndex and returning a tuple -- confirm this branch behaves
        # as intended for sparse Series input.
        return x.sparse.to_coo()

    already_ok = isinstance(x, (sparse.spmatrix, np.ndarray, numbers.Number))
    if already_ok and not isinstance(x, np.matrix):
        return x

    if isinstance(x, list):
        converted = []
        for item in x:
            try:
                item = to_array_or_spmatrix(item)
            except TypeError:
                # recursed too far: keep the element as it is
                pass
            converted.append(item)
        # convert from list to array
        return np.array(converted, dtype=_check_numpy_dtype(converted))

    # np.matrix and all remaining types
    return toarray(x)
def is_SparseSeries(X):
    """Check whether `X` is a legacy `pd.SparseSeries`.

    Returns False when the installed pandas has no `SparseSeries`
    attribute. The FutureWarning about the class's removal is suppressed
    while it is looked up.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "The SparseSeries class is removed from pandas. Accessing it from the "
            "top-level namespace will also be removed in the next version",
            FutureWarning,
        )
        try:
            legacy_cls = pd.SparseSeries
        except AttributeError:
            return False
        return isinstance(X, legacy_cls)
def is_SparseDataFrame(X):
    """Check whether `X` is a legacy `pd.SparseDataFrame`.

    Returns False when the installed pandas has no `SparseDataFrame`
    attribute. The FutureWarning about the class's removal is suppressed
    while it is looked up.
    """
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            "The SparseDataFrame class is removed from pandas. Accessing it from the "
            "top-level namespace will also be removed in the next version",
            FutureWarning,
        )
        try:
            legacy_cls = pd.SparseDataFrame
        except AttributeError:
            return False
        return isinstance(X, legacy_cls)
def is_sparse_dataframe(x):
    """Check whether `x` is a DataFrame exposing the `.sparse` accessor.

    Legacy `pd.SparseDataFrame` objects are excluded; they are detected by
    `is_SparseDataFrame` instead.
    """
    if not isinstance(x, pd.DataFrame) or is_SparseDataFrame(x):
        return False
    try:
        # accessing the accessor raises AttributeError when unavailable
        x.sparse
    except AttributeError:
        return False
    return True
def is_sparse_series(x):
    """Check whether `x` is a Series exposing the `.sparse` accessor.

    Legacy `pd.SparseSeries` objects are excluded; they are detected by
    `is_SparseSeries` instead.
    """
    if not isinstance(x, pd.Series) or is_SparseSeries(x):
        return False
    try:
        # accessing the accessor raises AttributeError when unavailable
        x.sparse
    except AttributeError:
        return False
    return True
def dataframe_to_sparse(x, fill_value=0.0):
    """Convert a DataFrame to a pandas sparse-accessor DataFrame.

    Parameters
    ----------
    x : pd.DataFrame
        Input data; its index and columns are carried over.
    fill_value : float, optional (default: 0.0)
        Value assigned to `.sparse.fill_value` on the result.

    Returns
    -------
    x : pd.DataFrame
        DataFrame built with `pd.DataFrame.sparse.from_spmatrix`.
    """
    coo = sparse.coo_matrix(x.values)
    sparse_df = pd.DataFrame.sparse.from_spmatrix(
        coo, index=x.index, columns=x.columns
    )
    sparse_df.sparse.fill_value = fill_value
    return sparse_df
def SparseDataFrame(X, columns=None, index=None, default_fill_value=0.0):
    """Build a pandas sparse-accessor DataFrame from matrix-like input.

    Parameters
    ----------
    X : scipy sparse matrix, pd.DataFrame or other DataFrame-compatible data
        Input data.
    columns : list-like or None, optional (default: None)
        If given, replaces the column labels of the result.
    index : list-like or None, optional (default: None)
        If given, replaces the index of the result.
    default_fill_value : float, optional (default: 0.0)
        Fill value assigned to the result.

    Returns
    -------
    X : pd.DataFrame
    """
    if not sparse.issparse(X):
        # anything that is not already a plain DataFrame goes through the
        # DataFrame constructor first
        if is_SparseDataFrame(X) or not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        X = dataframe_to_sparse(X, fill_value=default_fill_value)
    else:
        X = pd.DataFrame.sparse.from_spmatrix(X)
        X.sparse.fill_value = default_fill_value

    if columns is not None:
        X.columns = columns
    if index is not None:
        X.index = index
    return X
def fillna(data, fill, copy=True):
    """Replace NaN entries in a dense or scipy-sparse matrix.

    Parameters
    ----------
    data : array-like or scipy sparse matrix
        Input data. For sparse input only stored entries are inspected.
    fill : float
        Value written in place of NaN.
    copy : bool, optional (default: True)
        If True, work on a copy. If False, `data` is modified in place.

    Returns
    -------
    data : same type as input
        Data with NaN replaced by `fill`.

    Raises
    ------
    AssertionError
        If `copy` is False for a lil or dok matrix. These are converted to
        csr and back, so the result can never be the input object.
    """
    return_cls = None
    if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
        return_cls = type(data)
        if not copy:
            # Raised explicitly rather than via `assert` so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(f"Cannot fillna in-place for {return_cls.__name__}")
        # tocsr() already yields a new object, so no extra copy is needed
        data = data.tocsr()
    elif copy:
        data = data.copy()

    if sparse.issparse(data):
        data.data[np.isnan(data.data)] = fill
        if return_cls is not None:
            # convert back to the caller's original sparse format
            data = return_cls(data)
    else:
        data[np.isnan(data)] = fill
    return data
def _nansum(data, axis=None):
    """Sum `data` along `axis`, treating NaN as zero.

    Dense input is handled by `np.nansum`; sparse input has its NaN
    entries replaced by 0 (on a copy, via `fillna`) before `np.sum`.
    """
    if not sparse.issparse(data):
        return np.nansum(data, axis=axis)
    return np.sum(fillna(data, 0), axis=axis)
def matrix_sum(data, axis=None, ignore_nan=False):
    """Get the column-wise, row-wise, or total sum of values in a matrix.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    axis : int or None, optional (default: None)
        Axis across which to sum. axis=0 gives column sums,
        axis=1 gives row sums. None gives the total sum.
    ignore_nan : bool, optional (default: False)
        If True, NaN entries are treated as zero.

    Returns
    -------
    sums : array-like or float
        Sums along desired axis. For sparse pandas input with an axis, a
        `pd.Series` labelled by the remaining axis.

    Raises
    ------
    ValueError
        If `axis` is not 0, 1 or None.
    """
    if axis not in [0, 1, None]:
        raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))
    sum_fn = _nansum if ignore_nan else np.sum

    if not isinstance(data, pd.DataFrame):
        sums = sum_fn(data, axis=axis)
    else:
        # sparse frames are summed through their COO representation
        if is_SparseDataFrame(data):
            coo = data.to_coo()
        elif is_sparse_dataframe(data):
            coo = data.sparse.to_coo()
        else:
            coo = None

        if coo is None:
            if axis is None:
                sums = sum_fn(data.to_numpy())
            else:
                sums = sum_fn(data, axis)
        elif axis is None:
            sums = sum_fn(coo)
        else:
            labels = data.index if axis == 1 else data.columns
            sums = pd.Series(np.array(sum_fn(coo, axis)).flatten(), index=labels)

    if isinstance(sums, np.matrix):
        # scipy sparse reductions return np.matrix; hand back a flat array
        sums = np.array(sums).flatten()
    return sums
def matrix_std(data, axis=None):
    """Get the column-wise, row-wise, or total standard deviation of a matrix.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    axis : int or None, optional (default: None)
        Axis across which to calculate standard deviation.
        axis=0 gives column standard deviation,
        axis=1 gives row standard deviation.
        None gives the total standard deviation.

    Returns
    -------
    std : array-like or float
        Standard deviation along desired axis. A `pd.Series` named "std"
        when `data` is a DataFrame and `axis` is given.

    Raises
    ------
    ValueError
        If `axis` is not 0, 1 or None.
    """
    if axis not in [0, 1, None]:
        raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))

    def _sparse_std(mat):
        # sqrt(E[x^2] - E[x]^2), squaring only the stored entries
        squared = mat.copy()
        squared.data = squared.data**2
        return np.sqrt(squared.mean() - mat.mean() ** 2)

    # remember pandas labels before the data is converted
    labels = None
    if axis is not None and isinstance(data, pd.DataFrame):
        labels = data.index if axis == 1 else data.columns

    data = to_array_or_spmatrix(data)
    if not sparse.issparse(data):
        std = np.std(data, axis=axis)
    elif axis is None:
        if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix)):
            # converted so that `.data` can be squared as a flat array
            data = data.tocoo()
        std = _sparse_std(data)
    else:
        if axis == 0:
            data = data.tocsc()
            take_slice, n_slices = data.getcol, data.shape[1]
        else:
            data = data.tocsr()
            take_slice, n_slices = data.getrow, data.shape[0]
        std = np.array([_sparse_std(take_slice(i)) for i in range(n_slices)])

    if labels is not None:
        std = pd.Series(std, index=labels, name="std")
    return std
def matrix_vector_elementwise_multiply(data, multiplier, axis=None):
    """Elementwise multiply a matrix by a vector.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    multiplier : array-like, shape=[n_samples, 1] or [1, n_features]
        Vector by which to multiply `data`
    axis : int or None, optional (default: None)
        Axis across which to multiply. axis=0 multiplies each column,
        axis=1 multiplies each row. None guesses based on dimensions

    Returns
    -------
    product : array-like
        Multiplied matrix

    Raises
    ------
    ValueError
        If `axis` is not 0, 1 or None, or if the length of `multiplier`
        does not match the relevant dimension of `data`.
    RuntimeError
        If `axis` is None and `data` is square, so the axis cannot be
        guessed.
    """
    if axis not in [0, 1, None]:
        raise ValueError("Expected axis in [0, 1, None]. Got {}".format(axis))

    # Convert first so that `.shape` is available below for any array-like
    # multiplier (e.g. a plain list), including when the axis is guessed.
    multiplier = toarray(multiplier)

    if axis is None:
        if data.shape[0] == data.shape[1]:
            raise RuntimeError(
                "`data` is square, cannot guess axis from input. "
                "Please provide `axis=0` to multiply along rows or "
                "`axis=1` to multiply along columns."
            )
        elif np.prod(multiplier.shape) == data.shape[0]:
            axis = 0
        elif np.prod(multiplier.shape) == data.shape[1]:
            axis = 1
        else:
            raise ValueError(
                "Expected `multiplier` to be a vector of length "
                "`data.shape[0]` ({}) or `data.shape[1]` ({}). Got {}".format(
                    data.shape[0], data.shape[1], multiplier.shape
                )
            )

    # Shape the multiplier so that it broadcasts along the chosen axis.
    if axis == 0:
        if not np.prod(multiplier.shape) == data.shape[0]:
            raise ValueError(
                "Expected `multiplier` to be a vector of length "
                "`data.shape[0]` ({}). Got {}".format(data.shape[0], multiplier.shape)
            )
        multiplier = multiplier.reshape(-1, 1)
    else:
        if not np.prod(multiplier.shape) == data.shape[1]:
            raise ValueError(
                "Expected `multiplier` to be a vector of length "
                "`data.shape[1]` ({}). Got {}".format(data.shape[1], multiplier.shape)
            )
        multiplier = multiplier.reshape(1, -1)

    if is_SparseDataFrame(data) or is_sparse_dataframe(data):
        data = data.copy()
        multiplier = multiplier.flatten()
        if axis == 0:
            # Scale only the stored values of each column, using the row
            # positions of those values to pick the matching multipliers.
            for col in data.columns:
                try:
                    mult_indices = data[col].values.sp_index.indices
                except AttributeError:
                    # this index type has no `.indices`; convert it first
                    mult_indices = data[col].values.sp_index.to_int_index().indices
                new_data = data[col].values.sp_values * multiplier[mult_indices]
                data[col].values.sp_values.put(
                    np.arange(data[col].sparse.npoints), new_data
                )
        else:
            for col, mult in zip(data.columns, multiplier):
                data[col] = data[col] * mult
    elif isinstance(data, pd.DataFrame):
        data = data.mul(multiplier.flatten(), axis=axis)
    elif sparse.issparse(data):
        if isinstance(
            data,
            (
                sparse.lil_matrix,
                sparse.dok_matrix,
                sparse.coo_matrix,
                sparse.bsr_matrix,
                sparse.dia_matrix,
            ),
        ):
            # these formats are converted to csr before multiplying
            data = data.tocsr()
        data = data.multiply(multiplier)
    else:
        data = data * multiplier
    return data
def sparse_series_min(data):
    """Get the minimum value from a pandas sparse series.

    The minimum is taken over the stored values together with the fill
    value; the fill value is always included.

    Parameters
    ----------
    data : pd.Series[SparseArray]
        Input data

    Returns
    -------
    minimum : float
        Minimum entry in `data`.
    """
    accessor = data.sparse
    candidates = np.concatenate([accessor.sp_values, [accessor.fill_value]])
    return candidates.min()
def matrix_min(data):
    """Get the minimum value from a data matrix.

    Pandas SparseDataFrame does not handle np.min.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data

    Returns
    -------
    minimum : float
        Minimum entry in `data`.
    """
    if isinstance(data, (sparse.lil_matrix, sparse.dok_matrix, sparse.dia_matrix)):
        # Reduce through csr, whose `min` accounts for implicit zeros only
        # when some entries are actually unstored. This also copes with
        # lil matrices that contain empty rows.
        data = data.tocsr().min()
    elif is_SparseDataFrame(data):
        data = [np.min(data[col]) for col in data.columns]
    elif is_sparse_dataframe(data):
        data = [sparse_series_min(data[col]) for col in data.columns]
    elif isinstance(data, pd.DataFrame):
        data = np.min(data)
    return np.min(data)
def matrix_non_negative(data, allow_equal=True):
    """Check if all values in a matrix are non-negative.

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_features]
        Input data
    allow_equal : bool, optional (default: True)
        If True, min(data) can be equal to 0; otherwise it must be
        strictly positive.

    Returns
    -------
    is_non_negative : bool
    """
    if allow_equal:
        return matrix_min(data) >= 0
    return matrix_min(data) > 0
def matrix_any(condition):
    """Check if a condition is true anywhere in a data matrix.

    np.any doesn't handle matrices of type pd.DataFrame

    Parameters
    ----------
    condition : array-like
        Boolean matrix

    Returns
    -------
    any : bool
        True if condition contains any True values, False otherwise
    """
    # summing twice collapses whatever the first reduction leaves behind
    partial = np.sum(condition)
    n_true = np.sum(partial)
    return n_true > 0
def matrix_transpose(X):
    """Transpose a matrix in a memory-efficient manner.

    Sparse pandas dataframes are transposed through their COO
    representation and rebuilt with `SparseDataFrame`; any other input is
    transposed with `X.T`.

    Parameters
    ----------
    X : array-like, shape=[n,m]
        Input data

    Returns
    -------
    X_T : array-like, shape=[m,n]
        Transposed input data

    Raises
    ------
    TypeError
        If `X` is a sparse dataframe whose columns do not all share the
        same fill value.
    """
    if not is_sparse_dataframe(X):
        return X.T

    fill_values = np.array([dtype.fill_value for dtype in X.dtypes])
    common_fill = fill_values[0]
    if not np.all(fill_values == common_fill):
        raise TypeError(
            "Can only transpose sparse dataframes with constant fill value. "
            "If you wish to proceed, first convert the data to dense with "
            "scprep.utils.toarray."
        )
    transposed = X.sparse.to_coo().T
    # row and column labels swap roles in the result
    return SparseDataFrame(
        transposed, index=X.columns, columns=X.index, default_fill_value=common_fill
    )
def check_consistent_columns(data, common_columns_only=True):
    """Ensure that a set of data matrices have consistent columns.

    Parameters
    ----------
    data : list of array-likes
        List of matrices to be checked. The type of the first entry decides
        whether column names (pandas) or only column counts are compared.
        The sequence itself is never modified.
    common_columns_only : bool, optional (default: True)
        With pandas inputs, drop any columns that are not common to
        all matrices. If False, inconsistent pandas inputs only trigger a
        warning and are returned unchanged.

    Returns
    -------
    data : list of array-likes
        List of matrices with consistent columns, subsetted if necessary

    Raises
    ------
    ValueError
        Raised if data has inconsistent number of columns and does not
        have column names for subsetting
    """
    matrix_type = type(data[0])
    n_columns = data[0].shape[1]

    if issubclass(matrix_type, pd.DataFrame):
        # Names are only compared once the widths agree, so the elementwise
        # column comparison always sees equal-length indexes.
        consistent = np.all([d.shape[1] == n_columns for d in data[1:]]) and np.all(
            [data[0].columns == d.columns for d in data]
        )
        if not consistent:
            if common_columns_only:
                common_genes = data[0].columns.values
                for d in data[1:]:
                    common_genes = common_genes[np.isin(common_genes, d.columns.values)]
                warnings.warn(
                    "Input data has inconsistent column names. "
                    "Subsetting to {} common columns. "
                    "To retain all columns, use "
                    "`common_columns_only=False`.".format(len(common_genes)),
                    UserWarning,
                )
                # Build a new list instead of overwriting the caller's
                # entries in place; this also accepts tuples.
                data = [d[common_genes] for d in data]
            else:
                columns = [d.columns.values for d in data]
                all_columns = np.unique(np.concatenate(columns))
                warnings.warn(
                    "Input data has inconsistent column names. "
                    "Padding with zeros to {} total columns.".format(len(all_columns)),
                    UserWarning,
                )
    else:
        for d in data[1:]:
            if not d.shape[1] == n_columns:
                shapes = ", ".join([str(d.shape[1]) for d in data])
                raise ValueError(
                    "Expected data all with the same number of "
                    "columns. Got {}".format(shapes)
                )
    return data
def combine_batches(
    data, batch_labels, append_to_cell_names=None, common_columns_only=True
):
    """Combine data matrices from multiple batches and store a batch label.

    Parameters
    ----------
    data : list of array-like, shape=[n_batch]
        All matrices must be of the same format and have the same number of
        columns (or genes.)
    batch_labels : list of `str`, shape=[n_batch]
        List of names assigned to each batch
    append_to_cell_names : bool, optional (default: None)
        If input is a pandas dataframe, add the batch label corresponding to
        each cell to its existing index (or cell name / barcode.)
        Default behavior is `True` for dataframes (unless every index is a
        `pd.RangeIndex`) and `False` otherwise.
    common_columns_only : bool, optional (default: True)
        With pandas inputs, drop any columns that are not common to
        all data matrices

    Returns
    -------
    data : data matrix, shape=[n_samples, n_features]
        Number of samples is the sum of numbers of samples of all batches.
        Number of features is the same as each of the batches.
    sample_labels : list-like, shape=[n_samples]
        Batch labels corresponding to each sample

    Raises
    ------
    ValueError
        If `data` and `batch_labels` differ in length, or if the first
        matrix is not a DataFrame, scipy sparse matrix or numpy array.
    TypeError
        If the matrices are not all of the same class.
    """
    if not len(data) == len(batch_labels):
        raise ValueError(
            "Expected data ({}) and batch_labels ({}) to be the "
            "same length.".format(len(data), len(batch_labels))
        )

    # check consistent type
    matrix_type = type(data[0])
    if is_SparseDataFrame(data[0]):
        matrix_type = pd.DataFrame
    if not issubclass(matrix_type, (np.ndarray, pd.DataFrame, sparse.spmatrix)):
        raise ValueError(
            "Expected data to contain pandas DataFrames, "
            "scipy sparse matrices or numpy arrays. "
            "Got {}".format(matrix_type.__name__)
        )
    for d in data[1:]:
        if not isinstance(d, matrix_type):
            types = ", ".join([type(d).__name__ for d in data])
            raise TypeError(
                "Expected data all of the same class. " "Got {}".format(types)
            )

    data = check_consistent_columns(data, common_columns_only=common_columns_only)

    # check append_to_cell_names
    if append_to_cell_names and not issubclass(matrix_type, pd.DataFrame):
        warnings.warn(
            "append_to_cell_names only valid for pd.DataFrame input."
            " Got {}".format(matrix_type.__name__),
            UserWarning,
        )
    elif append_to_cell_names is None:
        if issubclass(matrix_type, pd.DataFrame):
            if all([isinstance(d.index, pd.RangeIndex) for d in data]):
                # rangeindex should still be a rangeindex
                append_to_cell_names = False
            else:
                append_to_cell_names = True
        else:
            append_to_cell_names = False

    # concatenate labels
    sample_labels = np.concatenate(
        [np.repeat(batch_labels[i], d.shape[0]) for i, d in enumerate(data)]
    )

    # concatenate data
    if issubclass(matrix_type, pd.DataFrame):
        # outer join keeps every column; missing entries become 0
        data_combined = pd.concat(data, axis=0, sort=True, join="outer").fillna(0)
        if append_to_cell_names:
            # `np.char.add` is the public spelling of the string-add
            # routine; the `np.core` namespace is private.
            index = np.concatenate(
                [
                    np.char.add(
                        np.array(d.index, dtype=str), "_" + str(batch_labels[i])
                    )
                    for i, d in enumerate(data)
                ]
            )
            data_combined.index = index
        elif all([isinstance(d.index, pd.RangeIndex) for d in data]):
            # rangeindex should still be a rangeindex
            data_combined = data_combined.reset_index(drop=True)
        sample_labels = pd.Series(
            sample_labels, index=data_combined.index, name="sample_labels"
        )
    elif issubclass(matrix_type, sparse.spmatrix):
        data_combined = sparse.vstack(data)
    elif issubclass(matrix_type, np.ndarray):
        data_combined = np.vstack(data)
    return data_combined, sample_labels
def select_cols(data, idx):
    """Deprecated stub: always raises, pointing to `scprep.select.select_cols`."""
    message = (
        "`scprep.utils.select_cols` is deprecated. Use "
        "`scprep.select.select_cols` instead."
    )
    raise RuntimeError(message)
def select_rows(data, idx):
    """Deprecated stub: always raises, pointing to `scprep.select.select_rows`."""
    message = (
        "`scprep.utils.select_rows` is deprecated. Use "
        "`scprep.select.select_rows` instead."
    )
    raise RuntimeError(message)
def get_gene_set(data, starts_with=None, ends_with=None, regex=None):
    """Deprecated stub: always raises, pointing to `scprep.select.get_gene_set`."""
    message = (
        "`scprep.utils.get_gene_set` is deprecated. Use "
        "`scprep.select.get_gene_set` instead."
    )
    raise RuntimeError(message)
def get_cell_set(data, starts_with=None, ends_with=None, regex=None):
    """Deprecated stub: always raises, pointing to `scprep.select.get_cell_set`."""
    message = (
        "`scprep.utils.get_cell_set` is deprecated. Use "
        "`scprep.select.get_cell_set` instead."
    )
    raise RuntimeError(message)
def subsample(*data, n=10000, seed=None):
    """Deprecated stub: always raises, pointing to `scprep.select.subsample`."""
    message = (
        "`scprep.utils.subsample` is deprecated. Use "
        "`scprep.select.subsample` instead."
    )
    raise RuntimeError(message)
def sort_clusters_by_values(clusters, values):
    """Sort `clusters` in increasing order of `values`.

    Parameters
    ----------
    clusters : array-like
        An array of cluster assignments, like the output of
        a `fit_predict()` call.
    values : array-like
        An associated value for each index in `clusters` to use
        for sorting the clusters.

    Returns
    -------
    new_clusters : array-likes
        Reordered cluster assignments: clusters are relabelled 0, 1, 2, ...
        in increasing order of their mean of `values`.

    Raises
    ------
    ValueError
        If `clusters` and `values` differ in length.
    """
    clusters = toarray(clusters)
    values = toarray(values)
    if len(clusters) != len(values):
        raise ValueError(
            "Expected clusters ({}) and values ({}) to be the "
            "same length.".format(len(clusters), len(values))
        )

    uniq_clusters = np.unique(clusters)
    cluster_means = np.array(
        [np.mean(values[clusters == label]) for label in uniq_clusters]
    )
    # old labels listed from smallest to largest mean value
    ordered_labels = uniq_clusters[np.argsort(cluster_means)]
    relabel = {}
    for new_label, old_label in enumerate(ordered_labels):
        relabel[old_label] = new_label
    return np.array([relabel[label] for label in clusters])