Source code for scprep.run.slingshot

from .. import utils
from . import r_function

import numpy as np
import pandas as pd
import warnings


[docs]def install(site_repository=None, update=False, version=None, verbose=True):
    """Install the required R packages to run Slingshot.

    Parameters
    ----------
    site_repository : string, optional (default: None)
        additional repository in which to look for packages to install.
        This repository will be prepended to the default repositories
    update : boolean, optional (default: False)
        When False, don't attempt to update old packages.
        When True, update old packages automatically.
    version : string, optional (default: None)
        Bioconductor version to install, e.g., version = "3.8".
        The special symbol version = "devel" installs the current 'development' version.
        If None, installs from the current version.
    verbose : boolean, optional (default: True)
        Install script verbosity.
    """
    r_function.install_bioconductor(
        "slingshot",
        site_repository=site_repository,
        update=update,
        version=version,
        verbose=verbose,
    )


_Slingshot = r_function.RFunction(
    setup="""
        library(slingshot)
    """,
    args="""
        data, cluster_labels,
        start_cluster = NULL, end_cluster = NULL,
        distance = NULL, omega = NULL, lineages = list(), shrink = TRUE,
        extend = "y", reweight = TRUE, reassign = TRUE, thresh = 0.001,
        max_iter = 15, stretch = 2,
        smoother = "smooth.spline", shrink_method = "cosine",
        allow_breaks = TRUE, seed = NULL, ...
    """,
    body="""
        set.seed(seed)
        data <- as.matrix(data)
        cluster_labels <- as.factor(cluster_labels)

        # Run Slingshot
        sling <- slingshot(data, clusterLabels = cluster_labels,
                         start.clus = start_cluster, end.clus = end_cluster,
                         dist.fun = distance, omega = omega, lineages = lineages,
                         shrink = shrink, extend = extend, reweight = reweight,
                         reassign = reassign, thresh = thresh,
                         maxit = max_iter, stretch = stretch,
                         smoother = smoother, shrink.method = shrink_method,
                         allow.breaks = allow_breaks, ...)
        list(pseudotime = slingPseudotime(sling),
             curves = lapply(sling@curves, function(curve) curve$s[curve$ord,]))
    """,
)


[docs]def Slingshot(
    data,
    cluster_labels,
    start_cluster=None,
    end_cluster=None,
    distance=None,
    omega=None,
    shrink=True,
    extend="y",
    reweight=True,
    reassign=True,
    thresh=0.001,
    max_iter=15,
    stretch=2,
    smoother="smooth.spline",
    shrink_method="cosine",
    allow_breaks=True,
    seed=None,
    verbose=1,
    **kwargs,
):
    """Perform lineage inference with Slingshot.

    Given a reduced-dimensional data matrix n by p and a vector of cluster labels
    (or matrix of soft cluster assignments, potentially including a -1 label for
    "unclustered"), this function performs lineage inference using a cluster-based
    minimum spanning tree and constructing simulatenous principal curves for branching
    paths through the tree.

    For more details, read about Slingshot on GitHub_ and Bioconductor_.

    .. _GitHub: https://github.com/kstreet13/slingshot
    .. _Bioconductor: https://bioconductor.org/packages/release/bioc/html/slingshot.html

    Parameters
    ----------
    data : array-like, shape=[n_samples, n_dimensions]
        matrix of (reduced dimension) coordinates
        to be used for lineage inference.
    cluster_labels : list-like, shape=[n_samples]
        a vector of cluster labels, optionally including -1's for "unclustered."
    start_cluster : string, optional (default: None)
        indicates the cluster(s) of origin.
        Lineages will be represented by paths coming out of this cluster.
    end_cluster : string, optional (default: None)
        indicates the cluster(s) which will be forced leaf nodes.
        This introduces a constraint on the MST algorithm.
    distance : callable, optional (default: None)
        method for calculating distances between clusters.
        Must take two matrices as input, corresponding to subsets of reduced_dim.
        If the minimum cluster size is larger than the number dimensions,
        the default is to use the joint covariance matrix to find squared distance
        between cluster centers. If not, the default is to use the diagonal of the
        joint covariance matrix. Not currently implemented
    omega : float, optional (default: None)
        this granularity parameter determines the distance between every
        real cluster and the artificial cluster.
        It is parameterized such that this distance is omega / 2,
        making omega the maximum distance between two connected clusters.
        By default, omega = Inf.
    shrink : boolean or float, optional (default: True)
        boolean or numeric between 0 and 1, determines whether and how much to shrink
        branching lineages toward their average prior to the split.
    extend : {'y', 'n', 'pc1'}, optional (default: "y")
        how to handle root and leaf clusters of lineages when
        constructing the initial, piece-wise linear curve.
    reweight : boolean, optional (default: True)
        whether to allow cells shared between lineages to be reweighted during
        curve-fitting. If True, cells shared between lineages will be iteratively
        reweighted based on the quantiles of their projection distances to each curve.
    reassign : boolean, optional (default: True)
        whether to reassign cells to lineages at each iteration.
        If True, cells will be added to a lineage when their
        projection distance to the curve is less than the median
        distance for all cells currently assigned to the lineage.
        Additionally, shared cells will be removed from a lineage if
        their projection distance to the curve is above the 90th
        percentile and their weight along the curve is less than 0.1.
    thresh : float, optional (default: 0.001)
        determines the convergence criterion. Percent change in the
        total distance from cells to their projections along curves
        must be less than thresh.
    max_iter : int, optional (default: 15)
        maximum number of iterations
    stretch : int, optional (default: 2)
        factor between 0 and 2 by which curves can be extrapolated beyond endpoints
    smoother : {"smooth.spline", "lowess", "periodic_lowess"},
        optional (default: "smooth.spline")
        choice of smoother. "periodic_lowess" allows one to fit closed
        curves. Beware, you may want to use iter = 0 with "lowess".
    shrink_method : string, optional (default: "cosine")
        how to determine the appropriate amount of shrinkage for a
        branching lineage. Accepted values: "gaussian", "rectangular",
        "triangular", "epanechnikov", "biweight", "triweight",
        "cosine", "optcosine", "density".
    allow_breaks : boolean, optional (default: True)
        determines whether curves that branch very close to the origin
        should be allowed to have different starting points.
    seed : int or None, optional (default: None)
        Seed to use for generating random numbers.
    verbose : int, optional (default: 1)
        Logging verbosity between 0 and 2.

    Returns
    -------
    slingshot : dict
        Contains the following keys:
    pseudotime : array-like, shape=[n_samples, n_curves]
        Pseudotime projection of each cell onto each principal curve.
        Value is `np.nan` if the cell does not lie on the curve
    branch : list-like, shape=[n_samples]
        Branch assignment for each cell
    curves : array_like, shape=[n_curves, n_samples, n_dimensions]
        Coordinates of each principle curve in the reduced dimension

    Examples
    --------
    >>> import scprep
    >>> import phate
    >>> data, clusters = phate.tree.gen_dla(n_branch=4, n_dim=200, branch_length=200)
    >>> phate_op = phate.PHATE()
    >>> data_phate = phate_op.fit_transform(data)
    >>> slingshot = scprep.run.Slingshot(data_phate, clusters)
    >>> ax = scprep.plot.scatter2d(
    ...     data_phate,
    ...     c=slingshot['pseudotime'][:,0],
    ...     cmap='magma',
    ...     legend_title='Branch 1'
    ... )
    >>> scprep.plot.scatter2d(
    ...     data_phate,
    ...     c=slingshot['pseudotime'][:,1],
    ...     cmap='viridis',
    ...     ax=ax,
    ...     ticks=False,
    ...     label_prefix='PHATE',
    ...     legend_title='Branch 2'
    ...     )
    >>> for curve in slingshot['curves']:
    ...     ax.plot(curve[:,0], curve[:,1], c='black')
    >>> ax = scprep.plot.scatter2d(data_phate, c=slingshot['branch'],
    ...                        legend_title='Branch', ticks=False, label_prefix='PHATE')
    >>> for curve in slingshot['curves']:
    ...     ax.plot(curve[:,0], curve[:,1], c='black')
    """
    if seed is None:
        seed = np.random.randint(2**16 - 1)
    if distance is not None:
        raise NotImplementedError("distance argument not currently implemented")
    np.random.seed(seed)

    index = data.index if isinstance(data, pd.DataFrame) else None

    data = utils.toarray(data)
    if data.shape[1] > 3:
        warnings.warn(
            "Expected data to be low-dimensional. "
            "Got data.shape[1] = {}".format(data.shape[1]),
            UserWarning,
        )
    cluster_labels = utils.toarray(cluster_labels).flatten()
    if not cluster_labels.shape[0] == data.shape[0]:
        raise ValueError(
            "Expected len(cluster_labels) ({}) to equal "
            "data.shape[0] ({})".format(cluster_labels.shape[0], data.shape[0])
        )

    if start_cluster is not None:
        kwargs["start_cluster"] = start_cluster
    if end_cluster is not None:
        kwargs["end_cluster"] = end_cluster
    if omega is not None:
        kwargs["omega"] = omega

    slingshot = _Slingshot(
        data=data,
        cluster_labels=cluster_labels,
        shrink=shrink,
        extend=extend,
        reweight=reweight,
        reassign=reassign,
        thresh=thresh,
        max_iter=max_iter,
        stretch=stretch,
        smoother=smoother,
        shrink_method=shrink_method,
        allow_breaks=allow_breaks,
        **kwargs,
        seed=seed,
        rpy_verbose=verbose,
    )
    slingshot["curves"] = np.array(list(slingshot["curves"].values()))

    membership = (~np.isnan(slingshot["pseudotime"])).astype(int)
    branch = np.sum(membership * (2 ** np.arange(membership.shape[1])), axis=1)
    # reorder based on pseudotime
    branch_ids = np.unique(branch)
    branch_means = [
        np.nanmean(slingshot["pseudotime"][branch == id])
        if not np.all(np.isnan(slingshot["pseudotime"][branch == id]))
        else np.nan
        for id in branch_ids
    ]
    branch_order = np.argsort(branch_means)
    branch_old = branch.copy()
    for i in range(len(branch_order)):
        j = branch_order[i]
        if np.isnan(branch_means[j]):
            branch[branch_old == branch_ids[j]] = -1
        else:
            branch[branch_old == branch_ids[j]] = i
    slingshot["branch"] = branch

    if index is not None:
        slingshot["pseudotime"] = pd.DataFrame(slingshot["pseudotime"], index=index)
        slingshot["branch"] = pd.Series(slingshot["branch"], name="branch", index=index)
    return slingshot