R/sctkQCUtils.R
In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

Documented in constructSCE exportSCE generateMeta getSceParams qcInputProcess

#' Export data in SingleCellExperiment object
#' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object
#' that contains the data. QC metrics are stored in colData of the
#' singleCellExperiment object.
#' @param samplename Sample name. This will be used as name of subdirectories
#' and the prefix of flat file output. Default is 'sample'.
#' @param type Type of data. The type of data stored in SingleCellExperiment object.
#' It can be 'Droplets'(raw droplets matrix) or 'Cells' (cells matrix).
#' @param directory Output directory. Default is './'.
#' @param format The format of output. It currently supports flat files, rds files
#' and python h5 files. It can output multiple formats. Default: c("SCE", "AnnData", "FlatFile", "HTAN").
#' @return Generates a file containing data from \code{inSCE}, in specified \code{format}.
#' @examples
#' data(scExample)
#' \dontrun{
#' exportSCE(sce, format = "SCE")
#' }
#' @export
exportSCE <- function(inSCE,
                      samplename = "sample",
                      directory = "./",
                      type = "Cells",
                      format = c("SCE", "AnnData", "FlatFile", "HTAN")) {

    if (any(!format %in% c("SCE", "AnnData", "FlatFile", "HTAN"))) {
        warning("Output format must be 'SCE', 'AnnData', 'HTAN' or 'FlatFile'. Format ",
             paste(format[!format %in% c("SCE", "AnnData", "FlatFile", "HTAN")], sep = ","),
             " is not supported now. ") #             "Only output the supported formats in the provided options. "
    }

    format <- format[format %in% c("SCE", "AnnData", "FlatFile", "HTAN")]
    message("The output format is [",
            paste(format, collapse = ","), "]. ")

    if (length(format) == 0) {
        warning("None of the provided format is supported now. Therefore, the output ",
            "will be SCE, AnnData, FlatFile and HTAN. ")
        format <- c("SCE", "AnnData", "FlatFile", "HTAN")
    }

    ## Create directories and save objects
    dir.create(file.path(directory, samplename), showWarnings = TRUE, recursive = TRUE)

    if ("SCE" %in% format) {
        ## Export to R
        fp <- file.path(directory, samplename, "R")
        dir.create(fp, showWarnings = TRUE, recursive = TRUE)
        fn <- file.path(fp, paste0(samplename , paste0("_", type, ".rds")))
        saveRDS(object = inSCE, file = fn)
    }

    if ("FlatFile" %in% format) {
        ## Export to flatfile
        fp <- file.path(directory, samplename, "FlatFile")
        dir.create(fp, showWarnings = TRUE, recursive = TRUE)
        fn <- file.path(fp, type)
        exportSCEtoFlatFile(inSCE, outputDir = fn, sample=samplename)
    }

    if ("AnnData" %in% format) {
        ## Export to Python AnnData
        fp <- file.path(directory, samplename, "Python")
        dir.create(fp, showWarnings = TRUE, recursive = TRUE)
        fn <- file.path(fp, type)
        exportSCEtoAnnData(inSCE, outputDir=fn, compression='gzip', prefix=samplename)
    }
}


#' Generate manifest file for droplet and cell count data
#' @param dropletSCE A \link[SingleCellExperiment]{SingleCellExperiment} object containing
#' droplet count matrix data
#' @param cellSCE A \link[SingleCellExperiment]{SingleCellExperiment} object containing
#' cell count matrix data
#' @param samplename The sample name of the \link[SingleCellExperiment]{SingleCellExperiment} objects
#' @param dir The output directory of the SCTK QC pipeline.
#' @param HTAN Whether generate manifest file with the format required by HTAN. Default is TRUE.
#' @param dataType Type of the input data. It can be one of "Droplet", "Cell" or "Both".
#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object which combines all
#' objects in sceList. The colData is merged.
#' @export
#' @examples
#' data(scExample, package = "singleCellTK")
#' generateMeta(sce, dir = ".", samplename = "Sample",
#'  HTAN = TRUE)
#' @importFrom SummarizedExperiment assay colData
generateMeta <- function(dropletSCE = NULL,
                          cellSCE = NULL,
                          samplename,
                          dir,
                          HTAN=TRUE,
                          dataType = c("Droplet", "Cell", "Both")) {
  level3List <- list()
  level4List <- list()
  dataType = match.arg(dataType)

  directory <- file.path(basename(dir), samplename)
  filterDir <- file.path(directory, 'FlatFile', 'Cells')
  rawDir <- file.path(directory, 'FlatFile', 'Droplets')


  WorkFlowData = c(
    WorkFlow = 'singleCellTK QC pipeline',
    WorkFlowVer = paste('singleCellTK', utils::sessionInfo()$otherPkgs$singleCellTK$Version, sep=':'),
    ParRaw = 'Ran perCellQC, EmptyDrops and barcodeRankDrops using singleCellTK',
    ParFiltered = 'Ran perCellQC, doublet detection and decontX using singleCellTK',
    PardecontX = 'Ran perCellQC, doublet detection and decontX using singleCellTK',
    ColData = 'Ran perCellQC, doublet detection and decontX using singleCellTK',
    DecontXUMAP = 'UMAP dimension reduction generated by decontX',
    ScrubletTSNE = 'tSNE dimension reduction generated by Scrublet',
    ScrubletUMAP = 'UMAP dimension reduction generated by Scrublet'
  )


  ### calculate summary stats
  if (dataType == "Droplet" | dataType == "Both") {
    droplet_stat = c(CellNum = ncol(dropletSCE),
              MedianReads = stats::median(colData(dropletSCE)$sum),
              MedianGenes = stats::median(colData(dropletSCE)$detected),
              DataType = 'Droplet Matrix',
              FileName = file.path(rawDir, 'assays', paste0(samplename,'_counts.mtx.gz')))
  }

  if (dataType == "Cell" | dataType == "Both") {
    decontX_stat = c(CellNum = ncol(cellSCE),
              MedianReads = stats::median(Matrix::colSums(assay(cellSCE, 'decontXcounts'))),
              MedianGenes = stats::median(Matrix::colSums(assay(cellSCE, 'decontXcounts') > 0)),
              DataType = 'Decontaminated cell matrix return returned by runDecontX',
              FileName = file.path(filterDir, 'assays', paste0(samplename,'_decontXcounts.mtx.gz')))

    cell_stat = c(CellNum = ncol(cellSCE),
                   MedianReads = stats::median(colData(cellSCE)$sum),
                   MedianGenes = stats::median(colData(cellSCE)$detected),
                   DataType = 'Cell Matrix',
                   FileName = file.path(filterDir, 'assays', paste0(samplename,'_counts.mtx.gz')),
                   ColData = file.path(filterDir, paste0(samplename,'_colData.txt.gz')),
                   DecontXUMAP = file.path(filterDir, 'reducedDims', paste0(samplename,'_decontX_UMAP.txt.gz')),
                   ScrubletTSNE = file.path(filterDir, 'reducedDims', paste0(samplename,'_scrublet_TSNE.txt.gz')),
                   ScrubletUMAP = file.path(filterDir, 'reducedDims', paste0(samplename,'_scrublet_TSNE.txt.gz')))
  }

  data <- list("Raw" = if (exists("droplet_stat")) {droplet_stat} else {NULL},
               "Filtered" = if (exists("cell_stat")) {cell_stat} else {NULL},
               "decontX" =  if (exists("decontX_stat")) {decontX_stat} else {NULL})


  if (dataType == "Cell") { types <- c("Filtered", "decontX") }
  if (dataType == "Droplet") { types <- c("Raw") }
  if (dataType == "Both") { types <- c("Filtered", "decontX", "Raw")}

  for (type in types) {
    level3List[[type]] <- data.frame(
      SAMPLE = samplename, DATA_TYPE = data[[type]]['DataType'],
      MATRIX_TYPE = 'Raw Counts', DATA_CATEGORY = 'Gene Expression',
      CELL_MEDIAN_NUM_READS = data[[type]]['MedianReads'],
      CELL_MEDIAN_NUM_GENES = data[[type]]['MedianGenes'],
      CELL_TOTAL = data[[type]]['CellNum'],
      FILE_NAME = data[[type]]['FileName'],
      WORKFLOW_TYPE = WorkFlowData['WorkFlow'],
      WORKFLOW_PARAMETERS = WorkFlowData[paste0('Par', type)],
      WORKFLOW_VERSION = WorkFlowData['WorkFlowVer'],
      stringsAsFactors = FALSE)

    if (isTRUE(HTAN)) {
      level3List[[type]] <- cbind(level3List[[type]],HTAN_BIOSPECIMEN_ID = samplename,
                                  HTAN_PARENT_ID = '', HTAN_PARENT_FILE_ID = '')
    }

    if (type == 'Filtered') {
      for (metric in c('ColData', 'DecontXUMAP', 'ScrubletTSNE', 'ScrubletUMAP')) {
        level4List[[metric]] <- data.frame(
          SAMPLE = samplename, FILE_NAME = data[[type]][metric],
          WORKFLOW_TYPE = WorkFlowData[metric],
          WORKFLOW_PARAMETERS = file.path(directory, paste0(samplename, '_QCParameters.yaml')),
          WORKFLOW_VERSION = WorkFlowData['WorkFlowVer'])
        if (isTRUE(HTAN)) {
          level4List[[metric]] <- cbind(level4List[[metric]],  HTAN_BIOSPECIMEN_ID = samplename,
                                        HTAN_PARENT_ID = '',HTAN_PARENT_FILE_ID = data[[type]]['FileName'])
        }
      }
    }
  }

  level3Meta <- do.call(base::rbind, level3List)
  level4Meta <- do.call(base::rbind, level4List)
  return(list(level3Meta, level4Meta))
}

#' Extract QC parameters from the SingleCellExperiment object
#' @param inSCE A \link[SingleCellExperiment]{SingleCellExperiment} object.
#' @param skip Skip extracting the parameters of the provided QC functions.
#' @param ignore Skip extracting the content within QC functions.
#' @param directory The output directory of the SCTK_runQC.R pipeline.
#' @param samplename The sample name of the \link[SingleCellExperiment]{SingleCellExperiment} objects.
#' @param writeYAML Whether output yaml file to store parameters. Default if TRUE. If FALSE,
#' return character object.
#' @return If \code{writeYAML} TRUE, a yaml object will be generated. If FALSE, character object.
#' @export
getSceParams <- function(inSCE,
                         skip = c("scrublet", "runDecontX","runBarcodeRanksMetaOutput"),
                         ignore = c("algorithms", "estimates","contamination",
                                    "z","sample","rank","BPPARAM","batch","geneSetCollection",
                                    "barcodeArgs"),
                         directory = './',
                         samplename = '',
                         writeYAML = TRUE) {

  meta <- S4Vectors::metadata(inSCE)
  algos <- names(meta)[!names(meta) %in% skip]
  outputs <- '---'
  parList <- list()
  dir <- file.path(directory, samplename)

  for (algo in algos) {
    params <- meta[[algo]][[1]]
    if (length(params) == 1) {params <- params[[1]]} ### extract params from sublist
    params <- params[which(!names(params) %in% ignore)]
    parList[[algo]] <- params
  }

  outputs <- paste(outputs, yaml::as.yaml(parList), sep='\n')
  if (isTRUE(writeYAML)) {
    filename <- paste0(samplename, '_QCParameters.yaml')
    cat(outputs, file=file.path(dir, filename))
  } else {
    return(outputs)
  }
}


#' Create SingleCellExperiment object from csv or txt input
#' @param data A \link[data.table]{data.table} object containing the count matrix.
#' @param samplename The sample name of the data.
#' @return A \link[SingleCellExperiment]{SingleCellExperiment} object containing
#' the count matrix.
#' @export
constructSCE <- function(data, samplename) {
    gene <- data[[1]]
    data <- data[, -1]
    barcode <- colnames(data)
    mat <- methods::as(data, "Matrix")
    dimnames(mat) <- list(gene, barcode)
    coln <- paste(samplename, barcode, sep = '_')

    sce <- SingleCellExperiment::SingleCellExperiment(
        assays = list(counts = mat))
    SummarizedExperiment::rowData(sce) <- S4Vectors::DataFrame(feature = gene)
    SummarizedExperiment::colData(sce) <- S4Vectors::DataFrame(barcode,
        column_name = coln,
        sample = samplename,
        row.names = coln)

    return(sce)
}

#' Create SingleCellExperiment object from command line input arguments
#' @param preproc Method used to preprocess the data. It's one of the path provided in --preproc argument.
#' @param path Base path of the dataset. It's one of the path provided in --bash_path argument.
#' @param samplename The sample name of the data. It's one of the path provided in --sample argument.
#' @param raw The directory contains droplet matrix, gene and cell barcodes information. It's one of the path provided in --raw_data_path argument.
#' @param fil The directory contains cell matrix, gene and cell barcodes information. It's one of the path provided in --cell_data_path argument.
#' @param ref The name of reference used by cellranger. Only need for CellrangerV2 data.
#' @param rawFile The full path of the RDS file or Matrix file of the raw gene count matrix. It's one of the path provided in --raw_data argument.
#' @param filFile The full path of the RDS file or Matrix file of the cell count matrix. It's one of the path provided in --cell_data argument.
#' @param dataType Type of the input. It can be "Both", "Droplet" or "Cell". It's one of the path provided in --genome argument.
#' @return A list of \link[SingleCellExperiment]{SingleCellExperiment} object containing
#' the droplet or cell data or both,depending on the dataType that users provided.
#' @export
qcInputProcess <- function(preproc,
    samplename,
    path,
    raw,
    fil,
    ref,
    rawFile,
    filFile,
    dataType) {

    dropletSCE <- NULL
    cellSCE <- NULL

    if (preproc == "BUStools") {
        dropletSCE <- importBUStools(BUStoolsDirs = path, samples = samplename, class = "Matrix", delayedArray=FALSE)
        return(list(dropletSCE, cellSCE))
    }

    if (preproc == "SEQC") {
        dropletSCE <- importSEQC(seqcDirs = path, samples = samplename, prefix = samplename, class = "Matrix", delayedArray=FALSE)
        return(list(dropletSCE, cellSCE))
    }

    if (preproc == "STARSolo") {
        if (dataType == "Both") {
            dropletSCE <- importSTARsolo(STARsoloDirs = path, samples = samplename, STARsoloOuts = "Gene/raw", class = "Matrix", delayedArray=FALSE)
            cellSCE <- importSTARsolo(STARsoloDirs = path, samples = samplename, STARsoloOuts = "Gene/filtered", class = "Matrix", delayedArray=FALSE)
        } else if (dataType == "Cell") {
            cellSCE <- importSTARsolo(STARsoloDirs = path, samples = samplename, STARsoloOuts = "Gene/filtered", class = "Matrix", delayedArray=FALSE)
        } else if (dataType == "Droplet") {
            dropletSCE <- importSTARsolo(STARsoloDirs = path, samples = samplename, STARsoloOuts = "Gene/raw", class = "Matrix", delayedArray=FALSE)
        }
        return(list(dropletSCE, cellSCE))
    }

    if (preproc == "CellRangerV3") {
        if (!is.null(path)) {
            if (dataType == "Both") {
                dropletSCE <- importCellRangerV3(cellRangerDirs = path, sampleDirs = samplename, sampleNames = samplename, dataType="raw", class = "Matrix", delayedArray=FALSE)
                cellSCE <- importCellRangerV3(cellRangerDirs = path, sampleDirs = samplename, sampleNames = samplename, dataType="filtered", class = "Matrix", delayedArray=FALSE)
            } else if (dataType == "Cell") {
                cellSCE <- importCellRangerV3(cellRangerDirs = path, sampleDirs = samplename, sampleNames = samplename, dataType="filtered", class = "Matrix", delayedArray=FALSE)
            } else if (dataType == "Droplet") {
                dropletSCE <- importCellRangerV3(cellRangerDirs = path, sampleDirs = samplename, sampleNames = samplename, dataType="raw", class = "Matrix", delayedArray=FALSE)
            }
        } else {
            if (dataType == "Both") {
                dropletSCE <- importCellRangerV3Sample(dataDir = raw, sampleName = samplename, class = "Matrix", delayedArray=FALSE)
                cellSCE <- importCellRangerV3Sample(dataDir = fil, sampleName = samplename, class = "Matrix", delayedArray=FALSE)
            } else if (dataType == "Cell") {
                cellSCE <- importCellRangerV3Sample(dataDir = fil, sampleName = samplename, class = "Matrix", delayedArray=FALSE)
            } else if (dataType == "Droplet") {
                dropletSCE <- importCellRangerV3Sample(dataDir = raw, sampleName = samplename, class = "Matrix", delayedArray=FALSE)
            }
        }
        return(list(dropletSCE, cellSCE))
    }

    if (preproc == "CellRangerV2") {
        if (!is.null(path)) {
            if (dataType == "Both") {
                dropletSCE <- importCellRangerV2(cellRangerDirs = path, sampleDirs = samplename, sampleNames = samplename, class="Matrix", delayedArray = FALSE, reference = ref, dataTypeV2="raw")
                cellSCE <- importCellRangerV2(cellRangerDirs = path, sampleDirs = samplename, sampleNames = samplename, class="Matrix", delayedArray = FALSE, reference = ref, dataTypeV2="filtered")
            } else if (dataType == "Cell") {
                cellSCE <- importCellRangerV2(cellRangerDirs = path, sampleDirs = samplename, sampleNames = samplename, class="Matrix", delayedArray = FALSE, reference = ref, dataTypeV2="filtered")
            } else if (dataType == "Droplet") {
                dropletSCE <- importCellRangerV2(cellRangerDirs = path, sampleDirs = samplename, sampleNames = samplename, class="Matrix", delayedArray = FALSE, reference = ref, dataTypeV2="raw")
            }
        } else {
            if (dataType == "Both") {
                dropletSCE <- importCellRangerV2Sample(dataDir = raw, sampleName = samplename, class = "Matrix", delayedArray=FALSE)
                cellSCE <- importCellRangerV2Sample(dataDir = fil, sampleName = samplename, class = "Matrix", delayedArray=FALSE)
            } else if (dataType == "Cell") {
                cellSCE <- importCellRangerV2Sample(dataDir = fil, sampleName = samplename, class = "Matrix", delayedArray=FALSE)
            } else if (dataType == "Droplet") {
                dropletSCE <- importCellRangerV2Sample(dataDir = raw, sampleName = samplename, class = "Matrix", delayedArray=FALSE)
            }
        }
        return(list(dropletSCE, cellSCE))
    }

    if (preproc == "Optimus") {
        ## by default has both droplet and cell data.
        dropletSCE <- importOptimus(OptimusDirs = path, samples = samplename, delayedArray = FALSE)
        cellSCE <- dropletSCE[,which(dropletSCE$dropletUtils_emptyDrops_IsCell)]

        if (dataType == "Cell") {
            dropletSCE <- NULL
        } else if (dataType == "Droplet") {
            cellSCE <- NULL
        }
        return(list(dropletSCE, cellSCE))
    }

    if (preproc == "DropEst") {
        if (dataType == "Both") {
            dropletSCE <- importDropEst(sampleDirs=path, dataType="raw", sampleNames=samplename, delayedArray=FALSE)
            cellSCE <- importDropEst(sampleDirs=path, dataType="filtered", sampleNames=samplename, delayedArray=FALSE)
        } else if (dataType == "Cell") {
            cellSCE <- importDropEst(sampleDirs=path, dataType="filtered", sampleNames=samplename, delayedArray=FALSE)
        } else if (dataType == "Droplet") {
            dropletSCE <- importDropEst(sampleDirs=path, dataType="raw", sampleNames=samplename, delayedArray=FALSE)
        }
        return(list(dropletSCE, cellSCE))
    }

    if (preproc == "SceRDS") {
        if (dataType == "Both") {
            dropletSCE <- readRDS(rawFile)
            cellSCE <- readRDS(filFile)
        } else if (dataType == "Cell") {
            cellSCE <- readRDS(filFile)
        } else if (dataType == "Droplet") {
            dropletSCE <- readRDS(rawFile)
        }
        return(list(dropletSCE, cellSCE))
    }

    if (preproc == "CountMatrix") {
        if (dataType == "Both") {
            dropletMM <- data.table::fread(rawFile)
            dropletSCE <- constructSCE(data = dropletMM, samplename = samplename)
            cellMM <- data.table::fread(filFile)
            cellSCE <- constructSCE(data = cellMM, samplename = samplename)
        } else if (dataType == "Cell") {
            cellMM <- data.table::fread(filFile)
            cellSCE <- constructSCE(data = cellMM, samplename = samplename)
        } else if (dataType == "Droplet") {
            dropletMM <- data.table::fread(rawFile)
            dropletSCE <- constructSCE(data = dropletMM, samplename = samplename)
        }
        return(list(dropletSCE, cellSCE))
    }

    ## preproc is not one of the method above. Stop the pipeline.
    stop(paste0("'", preproc, "' not supported."))
}
Any scripts or data that you put into this service are public.
singleCellTK documentation built on Nov. 8, 2020, 5:21 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
singleCellTK
Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

R/sctkQCUtils.R
In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

Defines functions qcInputProcess constructSCE getSceParams generateMeta exportSCE

Documented in constructSCE exportSCE generateMeta getSceParams qcInputProcess

Try the singleCellTK package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

singleCellTK Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

R/sctkQCUtils.R In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

Defines functions qcInputProcess constructSCE getSceParams generateMeta exportSCE

Documented in constructSCE exportSCE generateMeta getSceParams qcInputProcess

Try the singleCellTK package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

singleCellTK
Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data

R/sctkQCUtils.R
In singleCellTK: Comprehensive and Interactive Analysis of Single Cell RNA-Seq Data