R/CTSSs.R

Defines functions gf_wrapper2 gf_reducer2 bw_mapper2 bw_mapper bg_mapper

#### Mappers ####

bg_mapper <- function(file, seqinfo, strand = "*"){
    # Import
    o <- import.bedGraph(file, genome=seqinfo)

    # Instead of conversion, stop if not proper CTSSs-like
    stopifnot(all(width(o) == 1))
    # o <- as(o, "StitchedGPos")
    # score(o) <- as.integer(score(o))

    # Set strand
    strand(o) <- strand

    # Return
    o
}

bw_mapper <- function(range, file, seqinfo, strand = "*"){
  # Prune levels
  pruned <- range
  seqlevels(pruned, pruning.mode = "coarse") <- seqlevels(file)

  # Check if there is nothing to import
  if(length(pruned) == 0){
    o <- GRanges(seqinfo=seqinfo)
    score(o) <- numeric()
  }else{
    # Import
    o <- import.bw(con=file,
                   format="bigwig",
                   which=pruned,
                   as="GRanges")

    # Instead of conversion, stop if not proper CTSSs-like
    stopifnot(all(width(o) == 1))

    # Force seqlevels and strand
    seqlevels(o) <- seqlevels(seqinfo)
    seqinfo(o) <- seqinfo
    strand(o) <- strand
  }

  # Return
  o
}

bw_mapper2 <- function(range, file, seqinfo, strand = "*"){
    # Import
    o <- suppressWarnings(import.bw(con=file,
                                    format="bigwig",
                                    which=range,
                                    as="GRanges"))

    # Instead of conversion, stop if not proper CTSSs-like
    stopifnot(all(width(o) == 1))
    # o <- as(o, "StitchedGPos")
    # score(o) <- as.integer(score(o)) # Not used by dgC anyway

    # Force seqlevels and strand
    seqlevels(o) <- seqlevels(seqinfo)
    seqinfo(o) <- seqinfo
    strand(o) <- strand

    # Return
    o
}

#### Reducers ####

gf_reducer2 <- function(mapped){
    # Make table of j = sample indicies
    o <- unlist(GRangesList(mapped), use.names=FALSE)
    o$j <- rep(seq_along(mapped), elementNROWS(mapped))

    # Find unique sites
    unique_pos <- granges(o)
    unique_pos <- unique(unique_pos)
    unique_pos <- sort(unique_pos)

    # Overlap to get i = position indices
    o$i <- match(o, unique_pos)

    # Determine the shape of the sparse matrix
    mat_dim <- c(length(unique_pos), length(mapped))
    mat_names <- list(NULL, names(mapped))

    # Build sparse matrix
    if (length(o) > 0) {
      o <- Matrix::sparseMatrix(i = o$i,
                                     j = o$j,
                                     x = score(o),
                                     dims = mat_dim,
                                     dimnames = mat_names)
    } else {
      # Simply create an empty matrix
      o <- Matrix::sparseMatrix(i = integer(0),
                                j = integer(0),
                                x = integer(0),
                                dims = c(0, mat_dim[2]),
                                dimnames = mat_names)
    }

    # # Simplify so sparse matrix encoding
    # indices <- as.matrix(mcols(indices)[,c("i", "j", "score")])
    # rle_dim <- c(length(unique_pos), length(mapped))
    # o <- indices[,1:2]
    # o <- SparseArraySeed(dim=rle_dim,
    #                      aind=o,
    #                      nzdata=indices[,3])
    #
    # # To DataFrame of Rles via RleMatrix
    # # o <- as(o, "RleMatrix")
    # o <- DelayedArray(o)
    # o <- as(o, "DataFrame")

    # Build RSE
    o <- SummarizedExperiment(assays=list(counts=o),
                              rowRanges=unique_pos)
    colnames(o) <- names(mapped)

    # Return
    o
}

gf_wrapper2 <- function(files, ranges, seqinfo, strand) {
    # Run GenomicFiles
    o <- GenomicFiles::reduceByRange(ranges = ranges,
                                     files = files,
                                     MAP = pryr::partial(bw_mapper,
                                                         seqinfo = seqinfo,
                                                         strand = strand),
                                     REDUCE = gf_reducer2,
                                     iterate = FALSE)

    # Merge output
    #message("Attempting to bind chunks...")
    o <- do.call(rbind, o)

    # Return
    o
}

#### Other helpers ####

setGeneric("cleanDesign", function(design, SampleIDs) {
    standardGeneric("cleanDesign")
})

setMethod("cleanDesign", signature(design = "NULL"), function(design, SampleIDs) {
    DataFrame(row.names = SampleIDs)
})

setMethod("cleanDesign", signature(design = "DataFrame"), function(design, SampleIDs) {
    assert_that(nrow(design) == length(SampleIDs),
                all(rownames(design) == SampleIDs))
    design
})

setMethod("cleanDesign", signature(design = "data.frame"), function(design, SampleIDs) {
    assert_that(nrow(design) == length(SampleIDs),
                all(rownames(design) == SampleIDs))
    methods::as(design, "DataFrame")
})

setMethod("cleanDesign", signature(design = "ANY"), function(design, SampleIDs) {
    stop("design must either NULL or a DataFrame-/data.frame-object!")
})

# pseudoSparsity <- function(x){
#     stopifnot(methods::is(x, "DataFrame"))
#
#     # Find non-zero
#     o <- vapply(X=x,
#                 FUN=function(x) sum(x == 0),
#                 FUN.VALUE=integer(1))
#
#     # Calc sparsity
#     o <- sum(o) / (ncol(x) * nrow(x))
#
#     # Return
#     o
# }

# clean_design <- function(ids, design=NULL){
#     # Decide what to
#     if (is.null(design)) {
#         design <- DataFrame(row.names = ids)
#     } else if (methods::is(design, "DataFrame")) {
#         assert_that(all(rownames(design) == ids))
#     } else if (is.data.frame(design)) {
#         assert_that(all(rownames(design) == ids))
#         design <- methods::as(design, "DataFrame")
#     } else {
#         stop("design must either NULL or a DataFrame-/data.frame-object!")
#     }
#
#     # Return
# }

#### Main function ####

#' Quantify CAGE Transcriptions Start Sites (CTSSs)
#'
#' This function reads in CTSS count data from a series of BigWig-files (or bedGraph-files) and
#' returns a CTSS-by-library count matrix. For efficient processing, the count
#' matrix is stored as a sparse matrix (dgCMatrix from the Matrix package), and CTSSs are compressed to a GPos object if possible.
#'
#' @param plusStrand BigWigFileList or character: BigWig/bedGraph files with plus-strand CTSS data.
#' @param minusStrand BigWigFileList or character: BigWig/bedGraph files with minus-strand CTSS data.
#' @param design DataFrame or data.frame: Additional information on samples which will be added to the ouput
#' @param genome Seqinfo: Genome information. If NULL the smallest common genome
#'   will be found using bwCommonGenome when BigWig-files are analyzed.
#' @param nTiles integer: Number of genomic tiles to parallelize over.
#' @param ... additional arguments passed to methods.
#'
#' @return RangedSummarizedExperiment, where assay is a sparse matrix
#'   (dgCMatrix) of CTSS counts and design stored in colData.
#' @family Quantification functions
#' @importClassesFrom Matrix dgCMatrix
#' @export
#' @examples
#' \dontrun{
#' # Load the example data
#' data('exampleDesign')
#' # Use the BigWig-files included with the package:
#' bw_plus <- system.file('extdata', exampleDesign$BigWigPlus,
#'                        package = 'CAGEfightR')
#' bw_minus <- system.file('extdata', exampleDesign$BigWigMinus,
#'                         package = 'CAGEfightR')
#'
#' # Create two named BigWigFileList-objects:
#' bw_plus <- BigWigFileList(bw_plus)
#' bw_minus <- BigWigFileList(bw_minus)
#' names(bw_plus) <- exampleDesign$Name
#' names(bw_minus) <- exampleDesign$Name
#'
#' # Quantify CTSSs, by default this will use the smallest common genome:
#' CTSSs <- quantifyCTSSs(plusStrand=bw_plus,
#'                        minusStrand=bw_minus,
#'                        design=exampleDesign)
#'
#' # Alternatively, a genome can be specified:
#' si <- seqinfo(bw_plus[[1]])
#' si <- si['chr18']
#' CTSSs_subset <- quantifyCTSSs(plusStrand=bw_plus,
#'                        minusStrand=bw_minus,
#'                        design=exampleDesign,
#'                        genome=si)
#'
#' # Quantification can be speed up by using multiple cores:
#' library(BiocParallel)
#' register(MulticoreParam(workers=3))
#' CTSSs_subset <- quantifyCTSSs(plusStrand=bw_plus,
#'                        minusStrand=bw_minus,
#'                        design=exampleDesign,
#'                        genome=si)
#'
#' # CAGEfightR also support bedGraph files, first BigWig is converted
#' bg_plus <- replicate(n=length(bw_plus), tempfile(fileext="_plus.bedGraph"))
#' bg_minus <- replicate(n=length(bw_minus), tempfile(fileext="_minus.bedGraph"))
#' names(bg_plus) <- names(bw_plus)
#' names(bg_minus) <- names(bw_minus)
#'
#' convertBigWig2BedGraph(input=sapply(bw_plus, resource), output=bg_plus)
#' convertBigWig2BedGraph(input=sapply(bw_minus, resource), output=bg_minus)
#'
#' # Then analyze: Note a genome MUST be supplied here!
#' si <- bwCommonGenome(bw_plus, bw_minus)
#' CTSSs_via_bg <- quantifyCTSSs(plusStrand=bg_plus,
#'                         minusStrand=bg_minus,
#'                         design=exampleDesign,
#'                         genome=si)
#'
#' # Confirm that the two approaches yield the same results
#' all(assay(CTSSs_via_bg) == assay(CTSSs))
#' }
setGeneric("quantifyCTSSs", function(plusStrand, minusStrand, design=NULL, genome=NULL, ...) {
    standardGeneric("quantifyCTSSs")
})

#' @rdname quantifyCTSSs
#' @export
setMethod("quantifyCTSSs",
          signature(plusStrand = "BigWigFileList",
                    minusStrand = "BigWigFileList"),
  function(plusStrand, minusStrand, design=NULL, genome=NULL, nTiles=1L) {
      # Pre-checks
      assert_that(length(plusStrand) == length(minusStrand),
                  bwValid(plusStrand),
                  bwValid(minusStrand),
                  !is.null(names(plusStrand)),
                  !is.null(names(minusStrand)),
                  all(names(plusStrand) == names(minusStrand)),
                  is.count(nTiles))

      # Set design
      message("Checking design...")
      design <- cleanDesign(design, SampleIDs=names(plusStrand))

      # Aquire seqinfo if missing.
      if (is.null(genome)) {
          message("Finding common genome...")
          genome <- bwCommonGenome(plusStrand, minusStrand, method = "intersect")
      } else if (methods::is(genome, "Seqinfo")) {
          message("Checking supplied genome compatibility...")
          bwGenomeCompatibility(plusStrand, minusStrand, genome)
      } else {
          stop("genome must either NULL or a Seqinfo-object!")
      }

      # Setup tiles and workers
      grl <- tileGenome(genome, ntile=nTiles)
      n_workers <- BiocParallel::bpworkers()
      n_workers <- ifelse(is.numeric(n_workers), n_workers, 1)

      message("Iterating over ", length(grl),
              " genomic tiles in ", length(plusStrand),
              " samples using ", n_workers, " worker(s)...")

      # Load data
      message("Importing CTSSs from plus strand...")
      plus_strand <- gf_wrapper2(files = plusStrand,
                                 ranges = grl,
                                 seqinfo = genome,
                                 strand = "+")

      message("Importing CTSSs from minus strand...")
      minus_strand <- gf_wrapper2(files = minusStrand,
                                  ranges = grl,
                                  seqinfo = genome,
                                  strand = "-")

      # Merge and format
      message("Merging strands...")
      o <- rbind(plus_strand, minus_strand)
      rm(plus_strand, minus_strand)

      # Attach design
      colData(o) <- design

      message("Formatting output...")
      # Calculate sparsity before seed
      zero_frac <- 1 - (Matrix::nnzero(assay(o)) / length(assay(o)))

      # Wrap in DelayedArray
      #assay(o) <- DelayedArray(assay(o))

      # Convert to GPos if possible:
      if(length(o) < .Machine$integer.max){
        rowRanges(o) <- methods::as(rowRanges(o), "StitchedGPos")
      }

      # Attach design

      # Sparsity
      #  Convert to appropriate seed
      # message("Formatting to specified seed...")
      # if(seed == "DataFrame"){
      #   assays(o) <- List(counts=DelayedArray(assay(o)))
      # }else if(seed == "dgCMatrix"){
      #   assays(o) <- List(counts=DelayedArray(assay(o)))
      #   assay(o) <- DelayedArray(as(assay(o), "dgCMatrix"))
      # }else if(seed == "RleMatrix"){
      #   assays(o) <- List(counts=RleArray(assay(o))) # Chunk by column
      # }else if(seed == "RleMatrix2"){
      #   assays(o) <- List(counts=as(assay(o), "RleMatrix")) # High compression
      # }else if(seed == "SparseArraySeed"){
      #   assays(o) <- List(counts=DelayedArray(assay(o)))
      #   assay(o) <- DelayedArray(as(assay(o), "SparseArraySeed"))
      # }else{
      #   warning("Did not wrap seed in DelayedArray!")
      # }

      # Calculate sparsity
      #pseudo_sparsity <- pseudoSparsity(assay(o))

      # Final conversion
      #assays(o) <- List(counts=assayCompression(assay(o, "counts")))

      # # Calculate sparsity
      # sparse_percent <- vapply(X=assay(o),
      #                          FUN=function(x) sum(x == 0),
      #                          FUN.VALUE=integer(1))
      # sparse_percent <- sum(sparse_percent) / (ncol(o) * nrow(o))

      # Final conversion
      # if(assayStorage == "RleMatrix"){
      #     assays(o) <- List(counts=RleArray(assay(o, "counts")))
      # }else if(assayStorage == "DelayedArray") {
      #     assays(o) <- List(counts=DelayedArray(assay(o, "counts")))
      # }else if(assayStorage == "DataFrame") {
      #     warning("Output is a simple DataFrame of Rle's: ",
      #     "This is mainly intended for debugging and will not work with remaining CAGEfightR functions!")
      # }else{
      #     stop("assayStorage must be in: RleMatrix, DelayedArray, DataFrame")
      # }

      # Post warnings
      if(!length(plusStrand) == ncol(o)){
        warning("Output has the wrong dimensions! ",
                "Check input files with checkCTSSs.")
      }

      if(!identical(seqinfo(o), genome)){
        warning("Seqinfo was not properly set on output! ",
                "Check input files with checkCTSSs")
      }

      # Basic Summary
      message("### CTSS summary ###")
      message("Number of samples: ", ncol(o))
      message("Number of CTSSs: ", format(nrow(o) / 1e6L, digits = 4), " millions")
      message("Sparsity: ", format(zero_frac * 100, digits=4), " %")
      #message("Type of DelayedArray seed: dgCMatrix")
      message("Type of rowRanges: ", class(rowRanges(o)))
      message("Final object size: ", utils::capture.output(pryr::object_size(o)))

      # Return
      o
})

#' @rdname quantifyCTSSs
#' @export
setMethod("quantifyCTSSs",
          signature(plusStrand = "character",
                    minusStrand = "character"),
          function(plusStrand, minusStrand, design=NULL, genome=NULL) {
              # Pre-checks
              assert_that(all(checkFileSeries(plusStrand)),
                          all(checkFileSeries(minusStrand)),
                          all(checkExtensions(plusStrand, "bedGraph")),
                          all(checkExtensions(minusStrand, "bedGraph")),
                          length(plusStrand) == length(minusStrand),
                          !is.null(names(plusStrand)),
                          !is.null(names(minusStrand)),
                          all(names(plusStrand) == names(minusStrand)),
                          methods::is(genome, "Seqinfo"))

              # Set design
              message("Checking design...")
              design <- cleanDesign(design, SampleIDs=names(plusStrand))

              # Setup tiles and workers
              n_workers <- BiocParallel::bpworkers()
              n_workers <- ifelse(is.numeric(n_workers), n_workers, 1)

              message("Iterating over ", length(plusStrand),
                      " samples using ", n_workers, " worker(s)...")

              # Load data
              message("Importing CTSSs from plus strand...")
              plus_strand <- BiocParallel::bplapply(plusStrand,
                                                    bg_mapper,
                                                    seqinfo=genome,
                                                    strand="+")
              plus_strand <- gf_reducer2(plus_strand)

              message("Importing CTSSs from minus strand...")
              minus_strand <- BiocParallel::bplapply(minusStrand,
                                                    bg_mapper,
                                                    seqinfo=genome,
                                                    strand="-")
              minus_strand <- gf_reducer2(minus_strand)

              # Merge and format
              message("Merging strands...")
              o <- rbind(plus_strand, minus_strand)
              rm(plus_strand, minus_strand)

              # Attach design
              colData(o) <- design

              message("Formatting output...")
              # Calculate sparsity before seed
              zero_frac <- 1 - (Matrix::nnzero(assay(o)) / length(assay(o)))

              # Wrap in DelayedArray
              #assay(o) <- DelayedArray(assay(o))

              # Convert to GPos if possible:
              if(length(o) < .Machine$integer.max){
                rowRanges(o) <- methods::as(rowRanges(o), "StitchedGPos")
              }

              # Basic Summary
              message("### CTSS summary ###")
              message("Number of samples: ", ncol(o))
              message("Number of CTSSs: ", format(nrow(o) / 1e6L, digits = 4), " millions")
              message("Sparsity: ", format(zero_frac * 100, digits=4), " %")
              #message("Type of DelayedArray seed: dgCMatrix")
              message("Type of rowRanges: ", class(rowRanges(o)))
              message("Final object size: ", utils::capture.output(pryr::object_size(o)))

              # Return
              o
})
MalteThodberg/CAGEfightR documentation built on Sept. 11, 2021, 4:42 a.m.