Source code for scprep.io.tenx

from . import hdf5
from .utils import _matrix_to_data_frame

import numpy as np
import os
import pandas as pd
import scipy.io as sio
import scipy.sparse as sp
import shutil
import tempfile
import urllib
import warnings
import zipfile


def _combine_gene_id(symbols, ids):
    """Create gene labels of the form SYMBOL (ID).

    Parameters
    ----------
    genes: pandas.DataFrame with columns['symbol', 'id']

    Returns
    -------
    pandas.Index with combined gene symbols and ids
    """
    columns = np.core.defchararray.add(np.array(symbols, dtype=str), " (")
    columns = np.core.defchararray.add(columns, np.array(ids, dtype=str))
    columns = np.core.defchararray.add(columns, ")")
    return columns


def _parse_10x_genes(symbols, ids, gene_labels="symbol", allow_duplicates=True):
    assert gene_labels in ["symbol", "id", "both"]
    if gene_labels == "symbol":
        columns = symbols
        if not allow_duplicates and len(np.unique(columns)) < len(columns):
            warnings.warn(
                "Duplicate gene names detected! Forcing `gene_labels='both'`. "
                "Alternatively, try `gene_labels='id'`, "
                "`allow_duplicates=True`, or load the matrix"
                " with `sparse=False`",
                RuntimeWarning,
            )
            gene_labels = "both"
    if gene_labels == "both":
        columns = _combine_gene_id(symbols, ids)
    elif gene_labels == "id":
        columns = ids
    return columns


def _find_gz_file(*path):
    """Find a file that could be gzipped."""
    path = os.path.join(*path)
    if os.path.isfile(path):
        return path
    else:
        return path + ".gz"


[docs]def load_10X(data_dir, sparse=True, gene_labels="symbol", allow_duplicates=None):
    """Load data produced from the 10X Cellranger pipeline.

    A default run of the `cellranger count` command will generate gene-barcode
    matrices for secondary analysis. For both "raw" and "filtered" output,
    directories are created containing three files:
    'matrix.mtx', 'barcodes.tsv', 'genes.tsv'.
    Running `scprep.io.load_10X(data_dir)` will return a Pandas DataFrame with
    genes as columns and cells as rows.

    Parameters
    ----------
    data_dir: string
        path to input data directory
        expects 'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', 'barcodes.tsv(.gz)'
        to be present and will raise an error otherwise
    sparse: boolean
        If True, a sparse Pandas DataFrame is returned.
    gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol'
        Whether the columns of the dataframe should contain gene ids or gene
        symbols. If 'both', returns symbols followed by ids in parentheses.
    allow_duplicates : bool, optional (default: None)
        Whether or not to allow duplicate gene names. If None, duplicates are
        allowed for dense input but not for sparse input.

    Returns
    -------
    data: array-like, shape=[n_samples, n_features]
        If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
        be a pd.DataFrame.
    """
    if gene_labels not in ["id", "symbol", "both"]:
        raise ValueError(
            "gene_labels='{}' not recognized. "
            "Choose from ['symbol', 'id', 'both']".format(gene_labels)
        )

    if not os.path.isdir(data_dir):
        raise FileNotFoundError("{} is not a directory".format(data_dir))

    try:
        m = sio.mmread(_find_gz_file(data_dir, "matrix.mtx"))
        try:
            genes = pd.read_csv(
                _find_gz_file(data_dir, "genes.tsv"), delimiter="\t", header=None
            )
        except FileNotFoundError:
            genes = pd.read_csv(
                _find_gz_file(data_dir, "features.tsv"), delimiter="\t", header=None
            )
        if genes.shape[1] == 2:
            # Cellranger < 3.0
            genes.columns = ["id", "symbol"]
        else:
            # Cellranger >= 3.0
            genes.columns = ["id", "symbol", "measurement"]
        barcodes = pd.read_csv(
            _find_gz_file(data_dir, "barcodes.tsv"), delimiter="\t", header=None
        )

    except (FileNotFoundError, IOError):
        raise FileNotFoundError(
            "'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)' "
            "must be present in {}".format(data_dir)
        )

    cell_names = barcodes[0]
    if allow_duplicates is None:
        allow_duplicates = not sparse
    gene_names = _parse_10x_genes(
        genes["symbol"].values.astype(str),
        genes["id"].values.astype(str),
        gene_labels=gene_labels,
        allow_duplicates=allow_duplicates,
    )

    data = _matrix_to_data_frame(
        m.T, cell_names=cell_names, gene_names=gene_names, sparse=sparse
    )
    return data


[docs]def load_10X_zip(filename, sparse=True, gene_labels="symbol", allow_duplicates=None):
    """Load zipped 10X data produced from the 10X Cellranger pipeline.

    Runs `load_10X` after unzipping the data contained in `filename`.

    Parameters
    ----------
    filename: string
        path to zipped input data directory
        expects 'matrix.mtx', 'genes.tsv', 'barcodes.tsv' to be present and
        will raise an error otherwise
    sparse: boolean
        If True, a sparse Pandas DataFrame is returned.
    gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol'
        Whether the columns of the dataframe should contain gene ids or gene
        symbols. If 'both', returns symbols followed by ids in parentheses.
    allow_duplicates : bool, optional (default: None)
        Whether or not to allow duplicate gene names. If None, duplicates are
        allowed for dense input but not for sparse input.

    Returns
    -------
    data: array-like, shape=[n_samples, n_features]
        If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
        be a pd.DataFrame.
    """
    if gene_labels not in ["id", "symbol", "both"]:
        raise ValueError(
            "gene_labels='{}' not recognized. "
            "Choose from ['symbol', 'id', 'both']".format(gene_labels)
        )

    if not os.path.isfile(filename):
        with tempfile.TemporaryDirectory() as download_dir:
            zip_filename = os.path.join(download_dir, "download.zip")
            try:
                with urllib.request.urlopen(filename) as url:
                    with open(zip_filename, "wb") as handle:
                        handle.write(url.read())
            except ValueError as e:
                if str(e).startswith("unknown url type:"):
                    # not actually a url
                    raise FileNotFoundError("No such file: '{}'".format(filename))
                else:
                    raise
            else:
                return load_10X_zip(
                    zip_filename,
                    sparse=sparse,
                    gene_labels=gene_labels,
                    allow_duplicates=allow_duplicates,
                )

    tmpdir = tempfile.mkdtemp()
    with zipfile.ZipFile(filename) as handle:
        files = handle.namelist()
        if len(files) < 3:
            valid_dirnames = []
        else:
            valid_dirnames = []
            for dirname in set([""] + ["/".join(f.split("/")[:-1]) for f in files]):
                subdir_files = [f for f in files if f.startswith(dirname)]

                # Helper function to parse paths
                def path(fn, dirname):
                    if dirname != "":
                        path = "{}/{}".format(dirname, fn)
                    else:
                        path = fn
                    return path

                if (
                    (
                        path("barcodes.tsv", dirname) in subdir_files
                        or path("barcodes.tsv.gz", dirname) in subdir_files
                    )
                    and (
                        (
                            path("genes.tsv", dirname) in subdir_files
                            or path("genes.tsv.gz", dirname) in subdir_files
                        )
                        or (
                            path("features.tsv", dirname) in subdir_files
                            or path("features.tsv.gz", dirname) in subdir_files
                        )
                    )
                    and (
                        path("matrix.mtx", dirname) in subdir_files
                        or path("matrix.mtx.gz", dirname) in subdir_files
                    )
                ):
                    valid_dirnames.append(dirname)
        if len(valid_dirnames) != 1:
            raise ValueError(
                "Expected a single zipped folder containing 'matrix.mtx(.gz)', "
                "'[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)'. Got {}".format(
                    files
                )
            )
        dirname = valid_dirnames[0]
        handle.extractall(path=tmpdir)
    data = load_10X(os.path.join(tmpdir, dirname))
    shutil.rmtree(tmpdir)
    return data


[docs]@hdf5.with_HDF5
def load_10X_HDF5(
    filename,
    genome=None,
    sparse=True,
    gene_labels="symbol",
    allow_duplicates=None,
    backend=None,
):
    """Load HDF5 10X data produced from the 10X Cellranger pipeline.

    Equivalent to `load_10X` but for HDF5 format.

    Parameters
    ----------
    filename: string
        path to HDF5 input data
    genome : str or None, optional (default: None)
        Name of the genome to which CellRanger ran analysis. If None, selects
        the first available genome, and prints all available genomes if more
        than one is available. Invalid for Cellranger 3.0 HDF5 files.
    sparse: boolean
        If True, a sparse Pandas DataFrame is returned.
    gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol'
        Whether the columns of the dataframe should contain gene ids or gene
        symbols. If 'both', returns symbols followed by ids in parentheses.
    allow_duplicates : bool, optional (default: None)
        Whether or not to allow duplicate gene names. If None, duplicates are
        allowed for dense input but not for sparse input.
    backend : string, {'tables', 'h5py' or None} optional, default: None
        Selects the HDF5 backend. By default, selects whichever is available,
        using tables if both are available.

    Returns
    -------
    data: array-like, shape=[n_samples, n_features]
        If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
        be a pd.DataFrame.
    """
    if gene_labels not in ["id", "symbol", "both"]:
        raise ValueError(
            "gene_labels='{}' not recognized. "
            "Choose from ['symbol', 'id', 'both']".format(gene_labels)
        )

    # default allow_duplicates
    if allow_duplicates is None:
        allow_duplicates = not sparse

    with hdf5.open_file(filename, "r", backend=backend) as f:

        # handle genome
        groups = hdf5.list_nodes(f)
        try:
            # Cellranger 3.0
            group = hdf5.get_node(f, "matrix")
            if genome is not None:
                raise NotImplementedError(
                    "Selecting genomes for Cellranger 3.0 files is not "
                    "currently supported. Please file an issue at "
                    "https://github.com/KrishnaswamyLab/scprep/issues"
                )
        except (AttributeError, KeyError):
            # Cellranger 2.0
            if genome is None:
                print_genomes = ", ".join(groups)
                genome = groups[0]
                if len(groups) > 1:
                    print(
                        "Available genomes: {}. Selecting {} by default".format(
                            print_genomes, genome
                        )
                    )
            try:
                group = hdf5.get_node(f, genome)
            except (AttributeError, KeyError):
                print_genomes = ", ".join(groups)
                raise ValueError(
                    "Genome {} not found in {}. "
                    "Available genomes: {}".format(genome, filename, print_genomes)
                )

        try:
            # Cellranger 3.0
            features = hdf5.get_node(group, "features")
            gene_symbols = hdf5.get_node(features, "name")
            gene_ids = hdf5.get_node(features, "id")
        except (KeyError, IndexError):
            # Cellranger 2.0
            gene_symbols = hdf5.get_node(group, "gene_names")
            gene_ids = hdf5.get_node(group, "genes")

        # convert to string column names
        gene_names = _parse_10x_genes(
            symbols=[g.decode() for g in hdf5.get_values(gene_symbols)],
            ids=[g.decode() for g in hdf5.get_values(gene_ids)],
            gene_labels=gene_labels,
            allow_duplicates=allow_duplicates,
        )

        cell_names = [
            b.decode() for b in hdf5.get_values(hdf5.get_node(group, "barcodes"))
        ]
        data = hdf5.get_values(hdf5.get_node(group, "data"))
        indices = hdf5.get_values(hdf5.get_node(group, "indices"))
        indptr = hdf5.get_values(hdf5.get_node(group, "indptr"))
        shape = hdf5.get_values(hdf5.get_node(group, "shape"))
        data = sp.csc_matrix((data, indices, indptr), shape=shape)
        data = _matrix_to_data_frame(
            data.T, gene_names=gene_names, cell_names=cell_names, sparse=sparse
        )
        return data