Source code for scprep.plot.marker

from .. import select
from .. import stats
from .. import utils
from .tools import label_axis
from .utils import _get_figure
from .utils import parse_fontsize
from .utils import shift_ticklabels
from .utils import show
from .utils import temp_fontsize
from scipy.cluster import hierarchy

import numpy as np
import pandas as pd


def _make_scatter_arrays(
    data_clust,
    cluster_names,
    tissues,
    markers,
    gene_names,
    normalize_emd,
    normalize_expression,
):
    cluster_labels = []
    marker_labels = []
    tissue_labels = []
    x = []
    y = []
    c = []
    s = []
    # build points coordinate, color and size arrays
    for j, marker in enumerate(markers):
        s_row = []
        c_row = []
        for i, cluster in enumerate(cluster_names):
            in_cluster_expr, out_cluster_expr = data_clust[cluster]
            x.append(i)
            y.append(j)
            marker_labels.append(marker)
            cluster_labels.append(cluster)
            if tissues is not None:
                tissue_labels.append(tissues[j])
            gidx = np.where(gene_names == marker)
            marker_expr = in_cluster_expr[:, gidx]
            s_row.append(stats.EMD(marker_expr, out_cluster_expr[:, gidx]))
            c_row.append(np.mean(marker_expr))
        # row normalize
        s_row = np.array(s_row)
        if normalize_emd and np.max(s_row) != 0:
            s_row = 150 * s_row / np.max(s_row)
        c_row = np.array(c_row)
        if normalize_expression and np.max(c_row) != 0:
            c_row = c_row / np.max(c_row)
        s.append(s_row)
        c.append(c_row)

    s = np.concatenate(s)
    if not normalize_emd:
        s = 150 * s / np.max(s)
    c = np.concatenate(c)

    return x, y, c, s, cluster_labels, tissue_labels, marker_labels


def _cluster_tissues(tissue_names, cluster_names, tissue_labels, cluster_labels, s, c):
    # cluster tissues hierarchically using mean size and color
    tissue_features = []
    for tissue in tissue_names:
        tissue_data = []
        for cluster in cluster_names:
            tissue_cluster_idx = np.where(
                (np.array(tissue_labels) == tissue)
                & (np.array(cluster_labels) == cluster)
            )
            tissue_data.append(
                np.vstack([s[tissue_cluster_idx], c[tissue_cluster_idx]]).mean(axis=1)
            )
        tissue_features.append(np.concatenate(tissue_data))
    tissue_features = np.array(tissue_features)
    # normalize
    tissue_features = tissue_features / np.sqrt(np.sum(tissue_features**2))
    tissues_order = hierarchy.leaves_list(hierarchy.linkage(tissue_features))
    return tissues_order


def _cluster_markers(
    markers, tissues, marker_labels, tissue_labels, marker_groups_order, s, c
):
    # cluster markers hierarchically using mean size and color
    markers_order = []
    for marker_group in marker_groups_order:
        if len(marker_group) > 1:
            marker_names = markers[marker_group]
            marker_features = []
            for marker in marker_names:
                marker_idx = np.array(marker_labels) == marker
                if tissues is not None:
                    # check for markers that appear in multiple tissues
                    marker_idx = marker_idx & (
                        tissue_labels == tissues[marker_group[0]]
                    )
                marker_features.append(np.concatenate([s[marker_idx], c[marker_idx]]))
            marker_features = np.array(marker_features)
            # normalize
            marker_features = marker_features / np.sqrt(np.sum(marker_features**2))
            marker_group_order = hierarchy.leaves_list(
                hierarchy.linkage(marker_features)
            )
            markers_order.append(marker_group[marker_group_order])
        else:
            markers_order.append(marker_group)
    markers_order = np.concatenate(markers_order)
    return markers_order


[docs]@utils._with_pkg(pkg="matplotlib", min_version=3)
def marker_plot(
    data,
    clusters,
    markers,
    gene_names=None,
    normalize_expression=True,
    normalize_emd=True,
    reorder_tissues=True,
    reorder_markers=True,
    cmap="magma",
    title=None,
    figsize=None,
    ax=None,
    fontsize=None,
):
    """Plot marker gene enrichment.

    Generate a plot indicating the expression level and enrichment of
    a set of marker genes for each cluster.

    Color of each point indicates the expression of each gene in each cluster.
    The size of each point indicates how differentially expressed each gene is
    in each cluster.

    Parameters
    ----------
    data : array-like, shape=[n_cells, n_genes]
        Gene expression data for calculating expression statistics.
    clusters : list-like, shape=[n_cells]
        Cluster assignments for each cell. Should be ints
        like the output of most sklearn.cluster methods.
    markers : dict or list-like
        If a dictionary, keys represent tissues and
        values being a list of marker genes in each tissue.
        If a list, a list of marker genes.
    gene_names : list-like, shape=[n_genes]
        List of gene names.
    normalize_{expression,emd} : bool, optional (default: True)
        Normalize the expression and EMD of each row.
    reorder_{tissues,markers} : bool, optional (default: True)
        Reorder tissues and markers according to hierarchical clustering=
    cmap : str or matplotlib colormap, optional (default: 'inferno')
        Colormap with which to color points.
    title : str or None, optional (default: None)
        Title for the plot
    figsize : tuple or None, optional (default: None)
        If not None, sets the figure size (width, height)
    ax : `matplotlib.Axes` or None, optional (default: None)
        Axis to plot on. If None, a new axis will be created.
    fontsize : int or None, optional (default: None)
        Base fontsize.

    Returns
    -------
    ax : `matplotlib.Axes`
        axis on which plot was drawn

    Example
    -------
    >>> markers = {'Adaxial - Immature': ['myl10', 'myod1'],
                   'Adaxial - Mature': ['myog'],
                   'Presomitic mesoderm': ['tbx6', 'msgn1', 'tbx16'],
                   'Forming somites': ['mespba', 'ripply2'],
                   'Somites': ['meox1', 'ripply1', 'aldh1a2']}
    >>> cluster_marker_plot(data, clusters, gene_names, markers,
                            title="Tailbud - PSM")
    """
    with temp_fontsize(fontsize):
        if gene_names is None:
            if not isinstance(data, pd.DataFrame):
                raise ValueError(
                    "Either `data` must be a pd.DataFrame, or gene_names must "
                    "be provided. "
                    "Got gene_names=None, data as a {}".format(type(data))
                )
            gene_names = data.columns

        if isinstance(markers, dict):
            tissues, markers = tuple(
                zip(*[([k] * len(v), v) for k, v in markers.items()])
            )
            tissues, markers = np.concatenate(tissues), np.concatenate(markers)
        else:
            markers = utils.toarray(markers)
            tissues = None

        for gene in markers:
            if gene not in gene_names:
                raise ValueError(
                    "All genes in `markers` must appear "
                    "in gene_names. Did not find: {}".format(gene)
                )

        data = utils.to_array_or_spmatrix(data)

        cluster_names = np.unique(clusters)

        keep_genes = np.isin(gene_names, markers)
        data, gene_names = select.select_cols(data, gene_names, idx=keep_genes)

        fig, ax, show_fig = _get_figure(ax, figsize=figsize)

        # Do boolean indexing only once per cluster
        data_clust = {}
        for i, cluster in enumerate(cluster_names):
            in_cluster = clusters == cluster
            in_cluster_expr = data[in_cluster]
            out_cluster_expr = data[~in_cluster]
            data_clust[cluster] = (in_cluster_expr, out_cluster_expr)

        (
            x,
            y,
            c,
            s,
            cluster_labels,
            tissue_labels,
            marker_labels,
        ) = _make_scatter_arrays(
            data_clust,
            cluster_names,
            tissues,
            markers,
            gene_names,
            normalize_emd,
            normalize_expression,
        )

        # reorder y axis
        if tissues is not None and len(tissues) > 1:
            tissue_names = np.unique(tissues)
            if reorder_tissues:
                tissues_order = _cluster_tissues(
                    tissue_names, cluster_names, tissue_labels, cluster_labels, s, c
                )
            else:
                # keep tissues in order
                tissues_order = np.arange(len(tissue_names))
            marker_groups_order = [
                np.arange(len(markers))[tissues == tissue_names[i]]
                for i in tissues_order
            ]
        else:
            # only one tissue
            marker_groups_order = [np.arange(len(markers))]

        if reorder_markers and len(markers) > 1:
            markers_order = _cluster_markers(
                markers,
                tissues,
                marker_labels,
                tissue_labels,
                marker_groups_order,
                s,
                c,
            )
        else:
            # keep markers in order
            markers_order = np.concatenate(marker_groups_order)

        # reposition y coordinates
        y = np.array(y)
        y_new = np.zeros_like(y)
        for i in range(len(markers)):
            y_new[y == markers_order[i]] = i
        y = y_new

        ax.scatter(x, y, s=s, c=c, cmap=cmap, vmax=max(c) * 1.3)

        # Vertical and Horizontal Grid Lines
        for h in np.unique(y):
            ax.axhline(h, c="k", linewidth=0.1, zorder=0)
        for v in np.unique(x):
            ax.axvline(v, c="k", linewidth=0.1, zorder=0)

        ax.set_ylim(-0.5, len(markers) - 0.5)

        # Title
        title_fontsize = parse_fontsize(None, "xx-large")
        ax.set_title(title, fontsize=title_fontsize, fontweight="bold")

        # X axis decorators
        x_unique, x_unique_idx = np.unique(x, return_index=True)
        label_axis(
            ax.xaxis,
            label="Cluster",
            ticks=x_unique,
            ticklabels=np.array(cluster_labels)[x_unique_idx],
            ticklabel_rotation=45,
            ticklabel_horizontal_alignment="right",
        )
        shift_ticklabels(ax.xaxis, dx=0.1)

        # Y axis decorators
        label_axis(
            ax.yaxis, ticks=np.arange(len(markers)), ticklabels=markers[markers_order]
        )

        if tissues is not None:
            # Right Y axis decorators
            ax2 = ax.twinx()
            ax2.set_ylim(ax.get_ylim())
            label_axis(
                ax2.yaxis,
                ticks=np.arange(len(tissues)),
                ticklabels=tissues[markers_order],
            )

        if show_fig:
            show(fig)

    return ax