Source code for scprep.run.dyngen

from . import r_function

import pandas as pd

_install_dyngen = r_function.RFunction(
    args="""lib=.libPaths()[1], dependencies=NA,
            repos='http://cran.rstudio.com', verbose=TRUE""",
    body="""
        install.packages(
            c("dynwrap", "dyngen"),
            lib=lib,
            repos=repos,
            dependencies=dependencies
        )
    """,
)

_get_backbones = r_function.RFunction(
    setup="""
        library(dyngen)
    """,
    body="""
        names(list_backbones())
    """,
)

_DyngenSimulate = r_function.RFunction(
    args="""
        backbone_name=character(), num_cells=500, num_tfs=100, num_targets=50,
        num_hks=25,simulation_census_interval=10, compute_cellwise_grn=FALSE,
        compute_rna_velocity=FALSE, n_jobs=7, random_state=NA, verbose=TRUE
    """,
    setup="""
        library(dyngen)
    """,
    body="""
        if (!is.na(random_state)) {
            set.seed(random_state)
        }

        backbones <- list('bifurcating'=backbone_bifurcating(),
                  'bifurcating_converging'=backbone_bifurcating_converging(),
                  'bifurcating_cycle'=backbone_bifurcating_cycle(),
                  'bifurcating_loop'=backbone_bifurcating_loop(),
                  'binary_tree'=backbone_binary_tree(),
                  'branching'=backbone_branching(),
                  'consecutive_bifurcating'=backbone_consecutive_bifurcating(),
                  'converging'=backbone_converging(),
                  'cycle'=backbone_cycle(),
                  'cycle_simple'=backbone_cycle_simple(),
                  'disconnected'=backbone_disconnected(),
                  'linear'=backbone_linear(),
                  'linear_simple'=backbone_linear_simple(),
                  'trifurcating'=backbone_trifurcating()
                 )

        backbone <- backbones[[backbone_name]]
        # silent default behavior of dyngen
        if (num_tfs < nrow(backbone$module_info)) {
            if (verbose) {
            cat("If input num_tfs is less than backbone default,",
                "Dyngen uses backbone default.\n")
            }
            num_tfs <- nrow(backbone$module_info)
        }
        if (verbose) {
            cat('Run Parameters:')
            cat('\n\tBackbone:', backbone_name)
            cat('\n\tNumber of Cells:', num_cells)
            cat('\n\tNumber of TFs:', num_tfs)
            cat('\n\tNumber of Targets:', num_targets)
            cat('\n\tNumber of HKs:', num_hks, '\n')
        }

        init <- initialise_model(
          backbone=backbone,
          num_cells=num_cells,
          num_tfs=num_tfs,
          num_targets=num_targets,
          num_hks=num_hks,
          simulation_params=simulation_default(
            census_interval=as.double(simulation_census_interval),
            kinetics_noise_function = kinetics_noise_simple(mean=1, sd=0.005),
            ssa_algorithm = ssa_etl(tau=300/3600),
            compute_cellwise_grn=compute_cellwise_grn,
            compute_rna_velocity=compute_rna_velocity),
          num_cores = n_jobs,
          download_cache_dir=NULL,
          verbose=verbose
        )
        out <- generate_dataset(init)
        data <- list(cell_info = as.data.frame(out$dataset$cell_info),
             expression = as.data.frame(as.matrix(out$dataset$expression)))

        if (compute_cellwise_grn) {
            data[['bulk_grn']] <- as.data.frame(out$dataset$regulatory_network)
            data[['cellwise_grn']] <- as.data.frame(out$dataset$regulatory_network_sc)
        }
        if (compute_rna_velocity) {
            data[['rna_velocity']] <- as.data.frame(as.matrix(out$dataset$rna_velocity))
        }

        data
    """,
)


def install(
    lib=None,
    dependencies=None,
    repos="http://cran.us.r-project.org",
    verbose=True,
):
    """Install Dyngen from CRAN.

    Parameters
    ----------
    lib: string
        Directory to install the package.
        If missing, defaults to the first element of .libPaths().
    dependencies: boolean, optional (default: None/NA)
        When True, installs all packages specified under "Depends", "Imports",
        "LinkingTo" and "Suggests".
        When False, installs no dependencies.
        When None/NA, installs all packages specified under "Depends", "Imports"
        and "LinkingTo".
    repos: string, optional (default: "http://cran.us.r-project.org"):
        R package repository.
    verbose: boolean, optional (default: True)
        Install script verbosity.
    """

    kwargs = {}
    if lib is not None:
        kwargs["lib"] = lib
    if dependencies is not None:
        kwargs["dependencies"] = dependencies

    _install_dyngen(
        repos=repos,
        verbose=verbose,
        **kwargs,
    )


def get_backbones():
    """Output full list of cell trajectory backbones.

    Returns
    -------
    backbones: array of backbone names
    """
    return _get_backbones()


[docs]def DyngenSimulate(
    backbone,
    num_cells=500,
    num_tfs=100,
    num_targets=50,
    num_hks=25,
    simulation_census_interval=10,
    compute_cellwise_grn=False,
    compute_rna_velocity=False,
    n_jobs=7,
    random_state=None,
    verbose=True,
    force_num_cells=False,
):
    """Simulate dataset with cellular backbone.

    The backbone determines the overall dynamic process during a simulation.
    It consists of a set of gene modules, which regulate each other such that
    expression of certain genes change over time in a specific manner.

    DyngenSimulate is a Python wrapper for the R package Dyngen.
    Default values obtained from Github vignettes.
    For more details, read about Dyngen on Github_.

    .. _Github: https://github.com/dynverse/dyngen

    Parameters
    ----------
    backbone: string
           Backbone name from dyngen list of backbones.
           Get list with get_backbones()).
    num_cells: int, optional (default: 500)
           Number of cells.
    num_tfs: int, optional (default: 100)
           Number of transcription factors.
           The TFs are the main drivers of the molecular changes in the simulation.
           A TF can only be regulated by other TFs or itself.

           NOTE: If num_tfs input is less than nrow(backbone$module_info),
           Dyngen will default to nrow(backbone$module_info).
           This quantity varies between backbones and with each run (without seed).
           It is generally less than 75.
           It is recommended to input num_tfs >= 100 to stabilize the output.
    num_targets: int, optional (default: 50)
           Number of target genes.
           Target genes are regulated by a TF or another target gene,
           but are always downstream of at least one TF.
    num_hks: int, optional (default: 25)
           Number of housekeeping genees.
           Housekeeping genes are completely separate from any TFs or target genes.
    simulation_census_interval: int, optional (default: 10)
           Stores the abundance levels only after a specific interval has passed.
           The lower the interval, the higher detail of simulation trajectory retained,
           though many timepoints will contain similar information.
    compute_cellwise_grn: boolean, optional (default: False)
           If True, computes the ground truth cellwise gene regulatory networks.
           Also outputs ground truth bulk (entire dataset) regulatory network.
           NOTE: Increases compute time significantly.
    compute_rna_velocity: boolean, optional (default: False)
           If true, computes the ground truth propensity ratios after simulation.
           NOTE: Increases compute time significantly.
    n_jobs: int, optional (default: 8)
           Number of cores to use.
    random_state: int, optional (default: None)
           Fixes seed for simulation generator.
    verbose: boolean, optional (default: True)
           Data generation verbosity.
    force_num_cells: boolean, optional (default: False)
           Dyngen occassionally produces fewer cells than specified.
           Set this flag to True to rerun Dyngen until correct cell count is reached.

    Returns
    -------
    Dictionary data of pd.DataFrames:
    data['cell_info']: pd.DataFrame, shape (n_cells, 4)
           Columns: cell_id, step_ix, simulation_i, sim_time
           sim_time is the simulated timepoint for a given cell.

    data['expression']: pd.DataFrame, shape (n_cells, n_genes)
           Log-transformed counts with dropout.

    If compute_cellwise_grn is True,
    data['bulk_grn']: pd.DataFrame, shape (n_tf_target_interactions, 4)
           Columns: regulator, target, strength, effect.
           Strength is positive and unbounded.
           Effect is either +1 (for activation) or -1 (for inhibition).

    data['cellwise_grn']: pd.DataFrame, shape (n_tf_target_interactions_per_cell, 4)
           Columns: cell_id, regulator, target, strength.
           The output does not include all edges per cell.
           The regulatory effect lies between [−1, 1], where -1 is complete inhibition
           of target by TF, +1 is maximal activation of target by TF,
           and 0 is inactivity of the regulatory interaction between R and T.

    If compute_rna_velocity is True,
    data['rna_velocity']: pd.DataFrame, shape (n_cells, n_genes)
           Propensity ratios for each cell.

    Example
    --------
    >>> import scprep
    >>> scprep.run.dyngen.install()
    >>> backbones = scprep.run.dyngen.get_backbones()
    >>> data = scprep.run.DyngenSimulate(backbone=backbones[0])
    """
    if backbone not in get_backbones():
        raise ValueError(
            (
                "Input not in default backbone list. "
                "Choose backbone from get_backbones()"
            )
        )

    kwargs = {}
    if random_state is not None:
        kwargs["random_state"] = random_state

    rdata = _DyngenSimulate(
        backbone_name=backbone,
        num_cells=num_cells,
        num_tfs=num_tfs,
        num_targets=num_targets,
        num_hks=num_hks,
        simulation_census_interval=simulation_census_interval,
        compute_cellwise_grn=compute_cellwise_grn,
        compute_rna_velocity=compute_rna_velocity,
        n_jobs=n_jobs,
        verbose=verbose,
        rpy_verbose=verbose,
        **kwargs,
    )

    if force_num_cells:
        if random_state is None:
            random_state = -1

        if pd.DataFrame(rdata["cell_info"]).shape[0] != num_cells:
            random_state += 1
            rdata = DyngenSimulate(
                backbone=backbone,
                num_cells=num_cells,
                num_tfs=num_tfs,
                num_targets=num_targets,
                num_hks=num_hks,
                simulation_census_interval=simulation_census_interval,
                compute_cellwise_grn=compute_cellwise_grn,
                compute_rna_velocity=compute_rna_velocity,
                n_jobs=n_jobs,
                verbose=verbose,
                random_state=random_state,
                force_num_cells=force_num_cells,
            )

    data = {}
    data["cell_info"] = pd.DataFrame(rdata["cell_info"])
    data["expression"] = pd.DataFrame(rdata["expression"])
    if compute_cellwise_grn:
        data["cellwise_grn"] = pd.DataFrame(rdata["cellwise_grn"])
        data["bulk_grn"] = pd.DataFrame(rdata["bulk_grn"])
    if compute_rna_velocity:
        data["rna_velocity"] = pd.DataFrame(rdata["rna_velocity"])

    return data