from . import hdf5
from .utils import _matrix_to_data_frame
import numpy as np
import os
import pandas as pd
import scipy.io as sio
import scipy.sparse as sp
import shutil
import tempfile
import urllib
import warnings
import zipfile
def _combine_gene_id(symbols, ids):
"""Create gene labels of the form SYMBOL (ID).
Parameters
----------
genes: pandas.DataFrame with columns['symbol', 'id']
Returns
-------
pandas.Index with combined gene symbols and ids
"""
columns = np.core.defchararray.add(np.array(symbols, dtype=str), " (")
columns = np.core.defchararray.add(columns, np.array(ids, dtype=str))
columns = np.core.defchararray.add(columns, ")")
return columns
def _parse_10x_genes(symbols, ids, gene_labels="symbol", allow_duplicates=True):
assert gene_labels in ["symbol", "id", "both"]
if gene_labels == "symbol":
columns = symbols
if not allow_duplicates and len(np.unique(columns)) < len(columns):
warnings.warn(
"Duplicate gene names detected! Forcing `gene_labels='both'`. "
"Alternatively, try `gene_labels='id'`, "
"`allow_duplicates=True`, or load the matrix"
" with `sparse=False`",
RuntimeWarning,
)
gene_labels = "both"
if gene_labels == "both":
columns = _combine_gene_id(symbols, ids)
elif gene_labels == "id":
columns = ids
return columns
def _find_gz_file(*path):
"""Find a file that could be gzipped."""
path = os.path.join(*path)
if os.path.isfile(path):
return path
else:
return path + ".gz"
[docs]def load_10X(data_dir, sparse=True, gene_labels="symbol", allow_duplicates=None):
"""Load data produced from the 10X Cellranger pipeline.
A default run of the `cellranger count` command will generate gene-barcode
matrices for secondary analysis. For both "raw" and "filtered" output,
directories are created containing three files:
'matrix.mtx', 'barcodes.tsv', 'genes.tsv'.
Running `scprep.io.load_10X(data_dir)` will return a Pandas DataFrame with
genes as columns and cells as rows.
Parameters
----------
data_dir: string
path to input data directory
expects 'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', 'barcodes.tsv(.gz)'
to be present and will raise an error otherwise
sparse: boolean
If True, a sparse Pandas DataFrame is returned.
gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol'
Whether the columns of the dataframe should contain gene ids or gene
symbols. If 'both', returns symbols followed by ids in parentheses.
allow_duplicates : bool, optional (default: None)
Whether or not to allow duplicate gene names. If None, duplicates are
allowed for dense input but not for sparse input.
Returns
-------
data: array-like, shape=[n_samples, n_features]
If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
be a pd.DataFrame.
"""
if gene_labels not in ["id", "symbol", "both"]:
raise ValueError(
"gene_labels='{}' not recognized. "
"Choose from ['symbol', 'id', 'both']".format(gene_labels)
)
if not os.path.isdir(data_dir):
raise FileNotFoundError("{} is not a directory".format(data_dir))
try:
m = sio.mmread(_find_gz_file(data_dir, "matrix.mtx"))
try:
genes = pd.read_csv(
_find_gz_file(data_dir, "genes.tsv"), delimiter="\t", header=None
)
except FileNotFoundError:
genes = pd.read_csv(
_find_gz_file(data_dir, "features.tsv"), delimiter="\t", header=None
)
if genes.shape[1] == 2:
# Cellranger < 3.0
genes.columns = ["id", "symbol"]
else:
# Cellranger >= 3.0
genes.columns = ["id", "symbol", "measurement"]
barcodes = pd.read_csv(
_find_gz_file(data_dir, "barcodes.tsv"), delimiter="\t", header=None
)
except (FileNotFoundError, IOError):
raise FileNotFoundError(
"'matrix.mtx(.gz)', '[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)' "
"must be present in {}".format(data_dir)
)
cell_names = barcodes[0]
if allow_duplicates is None:
allow_duplicates = not sparse
gene_names = _parse_10x_genes(
genes["symbol"].values.astype(str),
genes["id"].values.astype(str),
gene_labels=gene_labels,
allow_duplicates=allow_duplicates,
)
data = _matrix_to_data_frame(
m.T, cell_names=cell_names, gene_names=gene_names, sparse=sparse
)
return data
[docs]def load_10X_zip(filename, sparse=True, gene_labels="symbol", allow_duplicates=None):
"""Load zipped 10X data produced from the 10X Cellranger pipeline.
Runs `load_10X` after unzipping the data contained in `filename`.
Parameters
----------
filename: string
path to zipped input data directory
expects 'matrix.mtx', 'genes.tsv', 'barcodes.tsv' to be present and
will raise an error otherwise
sparse: boolean
If True, a sparse Pandas DataFrame is returned.
gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol'
Whether the columns of the dataframe should contain gene ids or gene
symbols. If 'both', returns symbols followed by ids in parentheses.
allow_duplicates : bool, optional (default: None)
Whether or not to allow duplicate gene names. If None, duplicates are
allowed for dense input but not for sparse input.
Returns
-------
data: array-like, shape=[n_samples, n_features]
If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
be a pd.DataFrame.
"""
if gene_labels not in ["id", "symbol", "both"]:
raise ValueError(
"gene_labels='{}' not recognized. "
"Choose from ['symbol', 'id', 'both']".format(gene_labels)
)
if not os.path.isfile(filename):
with tempfile.TemporaryDirectory() as download_dir:
zip_filename = os.path.join(download_dir, "download.zip")
try:
with urllib.request.urlopen(filename) as url:
with open(zip_filename, "wb") as handle:
handle.write(url.read())
except ValueError as e:
if str(e).startswith("unknown url type:"):
# not actually a url
raise FileNotFoundError("No such file: '{}'".format(filename))
else:
raise
else:
return load_10X_zip(
zip_filename,
sparse=sparse,
gene_labels=gene_labels,
allow_duplicates=allow_duplicates,
)
tmpdir = tempfile.mkdtemp()
with zipfile.ZipFile(filename) as handle:
files = handle.namelist()
if len(files) < 3:
valid_dirnames = []
else:
valid_dirnames = []
for dirname in set([""] + ["/".join(f.split("/")[:-1]) for f in files]):
subdir_files = [f for f in files if f.startswith(dirname)]
# Helper function to parse paths
def path(fn, dirname):
if dirname != "":
path = "{}/{}".format(dirname, fn)
else:
path = fn
return path
if (
(
path("barcodes.tsv", dirname) in subdir_files
or path("barcodes.tsv.gz", dirname) in subdir_files
)
and (
(
path("genes.tsv", dirname) in subdir_files
or path("genes.tsv.gz", dirname) in subdir_files
)
or (
path("features.tsv", dirname) in subdir_files
or path("features.tsv.gz", dirname) in subdir_files
)
)
and (
path("matrix.mtx", dirname) in subdir_files
or path("matrix.mtx.gz", dirname) in subdir_files
)
):
valid_dirnames.append(dirname)
if len(valid_dirnames) != 1:
raise ValueError(
"Expected a single zipped folder containing 'matrix.mtx(.gz)', "
"'[genes/features].tsv(.gz)', and 'barcodes.tsv(.gz)'. Got {}".format(
files
)
)
dirname = valid_dirnames[0]
handle.extractall(path=tmpdir)
data = load_10X(os.path.join(tmpdir, dirname))
shutil.rmtree(tmpdir)
return data
[docs]@hdf5.with_HDF5
def load_10X_HDF5(
filename,
genome=None,
sparse=True,
gene_labels="symbol",
allow_duplicates=None,
backend=None,
):
"""Load HDF5 10X data produced from the 10X Cellranger pipeline.
Equivalent to `load_10X` but for HDF5 format.
Parameters
----------
filename: string
path to HDF5 input data
genome : str or None, optional (default: None)
Name of the genome to which CellRanger ran analysis. If None, selects
the first available genome, and prints all available genomes if more
than one is available. Invalid for Cellranger 3.0 HDF5 files.
sparse: boolean
If True, a sparse Pandas DataFrame is returned.
gene_labels: string, {'id', 'symbol', 'both'} optional, default: 'symbol'
Whether the columns of the dataframe should contain gene ids or gene
symbols. If 'both', returns symbols followed by ids in parentheses.
allow_duplicates : bool, optional (default: None)
Whether or not to allow duplicate gene names. If None, duplicates are
allowed for dense input but not for sparse input.
backend : string, {'tables', 'h5py' or None} optional, default: None
Selects the HDF5 backend. By default, selects whichever is available,
using tables if both are available.
Returns
-------
data: array-like, shape=[n_samples, n_features]
If sparse, data will be a pd.DataFrame[pd.SparseArray]. Otherwise, data will
be a pd.DataFrame.
"""
if gene_labels not in ["id", "symbol", "both"]:
raise ValueError(
"gene_labels='{}' not recognized. "
"Choose from ['symbol', 'id', 'both']".format(gene_labels)
)
# default allow_duplicates
if allow_duplicates is None:
allow_duplicates = not sparse
with hdf5.open_file(filename, "r", backend=backend) as f:
# handle genome
groups = hdf5.list_nodes(f)
try:
# Cellranger 3.0
group = hdf5.get_node(f, "matrix")
if genome is not None:
raise NotImplementedError(
"Selecting genomes for Cellranger 3.0 files is not "
"currently supported. Please file an issue at "
"https://github.com/KrishnaswamyLab/scprep/issues"
)
except (AttributeError, KeyError):
# Cellranger 2.0
if genome is None:
print_genomes = ", ".join(groups)
genome = groups[0]
if len(groups) > 1:
print(
"Available genomes: {}. Selecting {} by default".format(
print_genomes, genome
)
)
try:
group = hdf5.get_node(f, genome)
except (AttributeError, KeyError):
print_genomes = ", ".join(groups)
raise ValueError(
"Genome {} not found in {}. "
"Available genomes: {}".format(genome, filename, print_genomes)
)
try:
# Cellranger 3.0
features = hdf5.get_node(group, "features")
gene_symbols = hdf5.get_node(features, "name")
gene_ids = hdf5.get_node(features, "id")
except (KeyError, IndexError):
# Cellranger 2.0
gene_symbols = hdf5.get_node(group, "gene_names")
gene_ids = hdf5.get_node(group, "genes")
# convert to string column names
gene_names = _parse_10x_genes(
symbols=[g.decode() for g in hdf5.get_values(gene_symbols)],
ids=[g.decode() for g in hdf5.get_values(gene_ids)],
gene_labels=gene_labels,
allow_duplicates=allow_duplicates,
)
cell_names = [
b.decode() for b in hdf5.get_values(hdf5.get_node(group, "barcodes"))
]
data = hdf5.get_values(hdf5.get_node(group, "data"))
indices = hdf5.get_values(hdf5.get_node(group, "indices"))
indptr = hdf5.get_values(hdf5.get_node(group, "indptr"))
shape = hdf5.get_values(hdf5.get_node(group, "shape"))
data = sp.csc_matrix((data, indices, indptr), shape=shape)
data = _matrix_to_data_frame(
data.T, gene_names=gene_names, cell_names=cell_names, sparse=sparse
)
return data