R/helper.R

Defines functions integrateWithSingleCell normTransform normalizeGeneLength fpm fpkm collapseReplicates unmix

Documented in collapseReplicates fpkm fpm integrateWithSingleCell normalizeGeneLength normTransform unmix

#' Unmix samples using loss in a variance stabilized space
#'
#' Unmixes samples in \code{x} according to \code{pure} components,
#' using numerical optimization. The components in \code{pure}
#' are added on the scale of gene expression (either normalized counts, or TPMs).
#' The loss function when comparing fitted expression to the
#' samples in \code{x} occurs in a variance stabilized space.
#' This task is sometimes referred to as "deconvolution",
#' and can be used, for example, to identify contributions from
#' various tissues.
#' Note: some groups have found that the mixing contributions
#' may be more accurate if very lowly expressed genes across \code{x}
#' and \code{pure} are first removed. We have not explored this fully.
#' Note: if the \code{pbapply} package is installed a progress bar
#' will be displayed while mixing components are fit.
#'
#' @param x normalized counts or TPMs of the samples to be unmixed
#' @param pure normalized counts or TPMs of the "pure" samples
#' @param alpha for normalized counts, the dispersion of the data
#' when a negative binomial model is fit. this can be found by examining
#' the asymptotic value of \code{dispersionFunction(dds)}, when using
#' \code{fitType="parametric"} or the mean value when using
#' \code{fitType="mean"}.
#' @param shift for TPMs, the shift which approximately stabilizes the variance
#' of log shifted TPMs. Can be assessed with \code{vsn::meanSdPlot}.
#' @param power either 1 (for L1) or 2 (for squared) loss function.
#' Default is 1.
#' @param format \code{"matrix"} or \code{"list"}, default is \code{"matrix"}.
#' whether to output just the matrix of mixture components, or a list (see Value).
#' 
#' @param quiet suppress progress bar. default is FALSE, show progress bar
#' if pbapply is installed.
#'
#' @return a matrix, the mixture components for each sample in \code{x} (rows).
#' The "pure" samples make up the columns, and so each row sums to 1.
#' If colnames existed on the input matrices they will be propagated to the output matrix.
#' If \code{format="list"}, then a list, containing as elements:
#' (1) the matrix of mixture components,
#' (2) the correlations in the variance stabilized space of the fitted samples
#' to the samples in \code{x}, and
#' (3) the fitted samples as a matrix with the same dimension as \code{x}.
#'
#' @examples
#'
#' # some artificial data
#' cts <- matrix(c(80,50,1,100,
#'                 1,1,60,100,
#'                 0,50,60,100), ncol=4, byrow=TRUE)
#' # make a DESeqDataSet
#' dds <- DESeqDataSetFromMatrix(cts,
#'   data.frame(row.names=seq_len(ncol(cts))), ~1)
#' colnames(dds) <- paste0("sample",1:4)
#'
#' # note! here you would instead use
#' # estimateSizeFactors() to do actual normalization
#' sizeFactors(dds) <- rep(1, ncol(dds))
#'
#' norm.cts <- counts(dds, normalized=TRUE)
#'
#' # 'pure' should also have normalized counts...
#' pure <- matrix(c(10,0,0,
#'                  0,0,10,
#'                  0,10,0), ncol=3, byrow=TRUE)
#' colnames(pure) <- letters[1:3]
#' 
#' # for real data, you need to find alpha after fitting estimateDispersions()
#' mix <- unmix(norm.cts, pure, alpha=0.01)
#' 
#' @export
unmix <- function(x, pure, alpha, shift, power=1, format="matrix", quiet=FALSE) {

  format <- match.arg(format, c("matrix","list"))
  if (missing(alpha)) stopifnot(!missing(shift))
  if (missing(shift)) stopifnot(!missing(alpha))
  stopifnot(missing(shift) | missing(alpha))
  stopifnot(power %in% 1:2)
  stopifnot(nrow(x) == nrow(pure))
  stopifnot(ncol(pure) > 1)
  
  if (requireNamespace("pbapply", quietly=TRUE) & !quiet) {
    lapply <- pbapply::pblapply
  }

  cor.msg <- "some columns of 'pure' are highly correlated (>.99 after VST),
  may result in instabilty of unmix(). visually inspect correlations of 'pure'"
  
  if (missing(shift)) {
    stopifnot(alpha > 0)
    # variance stabilizing transformation for NB w/ fixed dispersion alpha
    vst <- function(q, alpha) ( 2 * asinh(sqrt(alpha * q)) - log(alpha) - log(4) ) / log(2)
    pure.cor <- cor(vst(pure, alpha)); diag(pure.cor) <- 0
    if (any(pure.cor > .99)) warning(cor.msg)
    sumLossVST <- function(p, i, vst, alpha, power) {
      sum(abs(vst(x[,i], alpha) - vst(pure %*% p, alpha))^power)
    }
    res <- lapply(seq_len(ncol(x)), function(i) {
      optim(par=rep(1, ncol(pure)), fn=sumLossVST, gr=NULL, i, vst, alpha, power,
            method="L-BFGS-B", lower=1e-6, upper=100)$par
    })
  } else {
    stopifnot(shift > 0)
    # VST of shifted log
    vstSL <- function(q, shift) log(q + shift)
    pure.cor <- cor(vstSL(pure, shift)); diag(pure.cor) <- 0
    if (any(pure.cor > .99)) warning(cor.msg)
    sumLossSL <- function(p, i, vst, shift, power) {
      sum(abs(vstSL(x[,i], shift) - vstSL(pure %*% p, shift))^power)
    }
    res <- lapply(seq_len(ncol(x)), function(i) {
      optim(par=rep(1, ncol(pure)), fn=sumLossSL, gr=NULL, i, vstSL, shift, power,
            method="L-BFGS-B", lower=0, upper=100)$par
    })
  }

  mix <- do.call(rbind, res)
  mix <- mix / MatrixGenerics::rowSums(mix)
  colnames(mix) <- colnames(pure)
  rownames(mix) <- colnames(x)

  if (format == "matrix") {
    return(mix)
  } else {
    fitted <- pure %*% t(mix)
    cor <- if (missing(shift)) {
      cor(vst(x, alpha), vst(fitted, alpha))
    } else {
      cor(vstSL(x, shift), vstSL(fitted, shift))
    }
    return(list(mix=mix, cor=diag(cor), fitted=fitted))
  }
  
}

#' Collapse technical replicates in a RangedSummarizedExperiment or DESeqDataSet
#'
#' Collapses the columns in \code{object} by summing within levels
#' of a grouping factor \code{groupby}. The purpose of this function
#' is to sum up read counts from technical replicates to create an object
#' with a single column of read counts for each sample.
#' This function will issue a warning if there are other assays other than
#' \code{"counts"}, see details below in 'Value'.
#'
#' Note: by "technical replicates", we mean multiple sequencing runs of the same
#' library, in constrast to "biological replicates" in which multiple
#' libraries are prepared from separate biological units.
#' Optionally renames the columns of returned object with the levels of the
#' grouping factor.
#' Note: this function is written very simply and
#' can be easily altered to produce other behavior by examining the source code.
#'
#' @param object A \code{RangedSummarizedExperiment} or \code{DESeqDataSet}
#' @param groupby a grouping factor, as long as the columns of object
#' @param run optional, the names of each unique column in object. if provided,
#' a new column \code{runsCollapsed} will be added to the \code{colData}
#' which pastes together the names of \code{run}
#' @param renameCols whether to rename the columns of the returned object
#' using the levels of the grouping factor
#'
#' @return the \code{object} with as many columns as levels in \code{groupby}.
#' This object has \code{"counts"} data which is summed from the various
#' columns which are grouped together, and the \code{colData} is subset using
#' the first column for each group in \code{groupby}.
#' Other assays are dropped, as it is not unambiguous the correct
#' form of combination, and a warning is printed if they are present, so
#' the user is aware they should take care of such assays manually.
#'
#' @examples
#'
#' dds <- makeExampleDESeqDataSet(m=12)
#' 
#' # make data with two technical replicates for three samples
#' dds$sample <- factor(sample(paste0("sample",rep(1:9, c(2,1,1,2,1,1,2,1,1)))))
#' dds$run <- paste0("run",1:12)
#'
#' ddsColl <- collapseReplicates(dds, dds$sample, dds$run)
#'
#' # examine the colData and column names of the collapsed data
#' colData(ddsColl)
#' colnames(ddsColl)
#'
#' # check that the sum of the counts for "sample1" is the same
#' # as the counts in the "sample1" column in ddsColl
#' matchFirstLevel <- dds$sample == levels(dds$sample)[1]
#' stopifnot(all(rowSums(counts(dds[,matchFirstLevel])) == counts(ddsColl[,1])))
#' 
#' @export
collapseReplicates <- function(object, groupby, run, renameCols=TRUE) {
  if (!is.factor(groupby)) groupby <- factor(groupby)
  stopifnot(assayNames(object)[1] == "counts")
  if (length(assayNames(object)) > 1) {
    warning("

  Warning! collapseReplicates only sums columns of the 'counts' assay.
  Other assays are dropped from output, and must be manually combined,
  as it is not unambiguous how to combine non-count assays.\n")
    assays(object) <- assays(object)[1]
  }
  groupby <- droplevels(groupby)
  stopifnot(length(groupby) == ncol(object))
  sp <- split(seq(along=groupby), groupby)
  countdata <- sapply(sp, function(i) MatrixGenerics::rowSums(assay(object)[,i,drop=FALSE]))
  mode(countdata) <- "integer"
  colsToKeep <- sapply(sp, `[`, 1)
  collapsed <- object[,colsToKeep]
  dimnames(countdata) <- dimnames(collapsed)
  assay(collapsed) <- countdata
  if (!missing(run)) {
    stopifnot(length(groupby) == length(run))
    colData(collapsed)$runsCollapsed <- sapply(sp, function(i) paste(run[i],collapse=","))
  }
  if (renameCols) {
    colnames(collapsed) <- levels(groupby)
  }
  stopifnot(sum(as.numeric(assay(object))) == sum(as.numeric(assay(collapsed))))
  collapsed
}

#' FPKM: fragments per kilobase per million mapped fragments
#'
#' The following function returns fragment counts normalized
#' per kilobase of feature length per million mapped fragments
#' (by default using a robust estimate of the library size,
#' as in \code{\link{estimateSizeFactors}}).
#'
#' The length of the features (e.g. genes) is calculated one of two ways:
#' (1) If there is a matrix named "avgTxLength" in \code{assays(dds)},
#' this will take precedence in the length normalization.
#' This occurs when using the tximport-DESeq2 pipeline.
#' (2) Otherwise, feature length is calculated 
#' from the \code{rowRanges} of the dds object,
#' if a column \code{basepairs} is not present in \code{mcols(dds)}.
#' The calculated length is the number of basepairs in the union of all \code{GRanges}
#' assigned to a given row of \code{object}, e.g., 
#' the union of all basepairs of exons of a given gene.
#' Note that the second approach over-estimates the gene length
#' (average transcript length, weighted by abundance is a more appropriate
#' normalization for gene counts), and so the FPKM will be an underestimate of the true value.
#' 
#' Note that, when the read/fragment counting has inter-feature dependencies, a strict
#' normalization would not incorporate the basepairs of a feature which
#' overlap another feature. This inter-feature dependence is not taken into
#' consideration in the internal union basepair calculation.
#'
#' @param object a \code{DESeqDataSet}
#' @param robust whether to use size factors to normalize
#' rather than taking the column sums of the raw counts,
#' using the \code{\link{fpm}} function.
#'
#' @return a matrix which is normalized per kilobase of the
#' union of basepairs in the \code{GRangesList} or \code{GRanges}
#' of the mcols(object), and per million of mapped fragments,
#' either using the robust median ratio method (robust=TRUE, default)
#' or using raw counts (robust=FALSE).
#' Defining a column \code{mcols(object)$basepairs} takes
#' precedence over internal calculation of the kilobases for each row.
#'
#' @examples
#'
#' # create a matrix with 1 million counts for the
#' # 2nd and 3rd column, the 1st and 4th have
#' # half and double the counts, respectively.
#' m <- matrix(1e6 * rep(c(.125, .25, .25, .5), each=4),
#'             ncol=4, dimnames=list(1:4,1:4))
#' mode(m) <- "integer"
#' se <- SummarizedExperiment(list(counts=m), colData=DataFrame(sample=1:4))
#' dds <- DESeqDataSet(se, ~ 1)
#' 
#' # create 4 GRanges with lengths: 1, 1, 2, 2.5 Kb
#' gr1 <- GRanges("chr1",IRanges(1,1000)) # 1kb
#' gr2 <- GRanges("chr1",IRanges(c(1,1001),c( 500,1500))) # 1kb
#' gr3 <- GRanges("chr1",IRanges(c(1,1001),c(1000,2000))) # 2kb
#' gr4 <- GRanges("chr1",IRanges(c(1,1001),c(200,1300))) # 500bp
#' rowRanges(dds) <- GRangesList(gr1,gr2,gr3,gr4)
#' 
#' # the raw counts
#' counts(dds)
#'
#' # the FPM values
#' fpm(dds)
#' 
#' # the FPKM values
#' fpkm(dds)
#' 
#' @seealso \code{\link{fpm}}
#'
#' @docType methods
#' @name fpkm
#' @rdname fpkm
#' 
#' @export
fpkm <- function(object, robust=TRUE) {
  fpm <- fpm(object, robust=robust)
  if ("avgTxLength" %in% assayNames(object)) {
    exprs <- 1e3 * fpm / assays(object)[["avgTxLength"]]
    if (robust) {
      sf <- estimateSizeFactorsForMatrix(exprs)
      exprs <- t(t(exprs)/sf)
      return(exprs)
    } else {
      return(exprs)
    }
  }
  if (is.null(mcols(object)$basepairs)) {
    if (is(rowRanges(object), "GRangesList")) {
      ubp <- DataFrame(basepairs = sum(width(reduce(rowRanges(object)))))
    } else if (is(rowRanges(object), "GRanges")) {
      ubp <- DataFrame(basepairs = width(rowRanges(object)))
    }
    if (all(ubp$basepairs == 0)) {
      stop("rowRanges(object) has all ranges of zero width.
the user should instead supply a column, mcols(object)$basepairs,
which will be used to produce FPKM values")
    }
    if (is.null(mcols(mcols(object)))) {
      mcols(object) <- ubp
    } else {
      mcols(ubp) <- DataFrame(type="intermediate",
                              description="count of basepairs in the union of all ranges")
      mcols(object) <- cbind(mcols(object), ubp)
    }
  }
  1e3 * sweep(fpm, 1, mcols(object)$basepairs, "/")
}

#' FPM: fragments per million mapped fragments
#'
#' Calculates either a robust version (default)
#' or the traditional matrix of fragments/counts per million mapped
#' fragments (FPM/CPM).
#' Note: this function is written very simply and
#' can be easily altered to produce other behavior by examining the source code.
#' 
#' @param object a \code{DESeqDataSet}
#' @param robust whether to use size factors to normalize
#' rather than taking the column sums of the raw counts.
#' If TRUE, the size factors and the geometric mean of
#' column sums are multiplied to create a robust library size estimate.
#' Robust normalization is not used if average transcript lengths are present.
#' 
#' @return a matrix which is normalized per million of mapped fragments,
#' either using the robust median ratio method (robust=TRUE, default)
#' or using raw counts (robust=FALSE).
#'
#' @examples
#'
#' # generate a dataset with size factors: .5, 1, 1, 2
#' dds <- makeExampleDESeqDataSet(m = 4, n = 1000,
#'                                interceptMean=log2(1e3),
#'                                interceptSD=0,
#'                                sizeFactors=c(.5,1,1,2),
#'                                dispMeanRel=function(x) .01)
#'
#' # add a few rows with very high count
#' counts(dds)[4:10,] <- 2e5L
#'
#' # in this robust version, the counts are comparable across samples
#' round(head(fpm(dds), 3))
#'
#' # in this column sum version, the counts are still skewed:
#' # sample1 < sample2 & 3 < sample 4
#' round(head(fpm(dds, robust=FALSE), 3))
#'
#' # the column sums of the robust version
#' # are not equal to 1e6, but the
#' # column sums of the non-robust version
#' # are equal to 1e6 by definition
#' 
#' colSums(fpm(dds))/1e6
#' colSums(fpm(dds, robust=FALSE))/1e6
#'
#' @seealso \code{\link{fpkm}}
#'
#' @docType methods
#' @name fpm
#' @rdname fpm
#' 
#' @export
fpm <- function(object, robust=TRUE) {
  # we do something different if average tx lengths are present
  noAvgTxLen <- !("avgTxLength" %in% assayNames(object))
  if (robust & is.null(sizeFactors(object)) & noAvgTxLen) {
    object <- estimateSizeFactors(object)
  }
  k <- counts(object)
  library.sizes <- if (robust & noAvgTxLen) {
    sizeFactors(object) * exp(mean(log(MatrixGenerics::colSums(k))))
  } else {
    MatrixGenerics::colSums(k)
  }
  1e6 * sweep(k, 2, library.sizes, "/")  
}



#' Normalize for gene length
#'
#' Normalize for gene length using the output of transcript abundance estimators
#'
#' This function is deprecated and moved to a new general purpose package,
#' tximport, which will be added to Bioconductor.
#'
#' @param ... ...
#' 
#' @export
normalizeGeneLength <- function(...) {
  .Deprecated("tximport, a separate package on Bioconductor")
}

#' Normalized counts transformation
#'
#' A simple function for creating a \code{\link{DESeqTransform}}
#' object after applying: \code{f(count(dds,normalized=TRUE) + pc)}.
#' 
#' @param object a DESeqDataSet object
#' @param f a function to apply to normalized counts
#' @param pc a pseudocount to add to normalized counts
#' 
#' @seealso \code{\link{varianceStabilizingTransformation}}, \code{\link{rlog}}
#' 
#' @export
normTransform <- function(object, f=log2, pc=1) {
  if (is.null(colnames(object))) {
    colnames(object) <- seq_len(ncol(object))
  }
  if (is.null(sizeFactors(object)) & is.null(normalizationFactors(object))) {
    object <- estimateSizeFactors(object)
  }
  nt <- f(counts(object, normalized=TRUE) + pc)
  se <- SummarizedExperiment(
    assays = nt,
    colData = colData(object),
    rowRanges = rowRanges(object),
    metadata = metadata(object))
  DESeqTransform(se)
}

#' Integrate bulk DE results with Bioconductor single-cell RNA-seq datasets
#'
#' A function that assists with integration of bulk DE results tables
#' with pre-processed scRNA-seq datasets available on Bioconductor,
#' for downstream visualization tasks. The user is prompted to pick
#' a scRNA-seq dataset from a menu. The output of the function is
#' a list with the original results table, bulk gene counts,
#' and the SingleCellExperiment object selected by the user.
#'
#' This function assists the user in choosing a datset from a menu of options
#' that are selected based on the organism of the current dataset.
#' Currently only human and mouse bulk and single-cell RNA-seq datasets
#' are supported, and it is assumed that the bulk DE dataset has GENCODE
#' or Ensembl gene identifiers. Following the selection of the scRNA-seq
#' dataset, visualization can be performed with a package \code{vizWithSCE},
#' which can be installed with \code{install_github("KwameForbes/vizWithSCE")}.
#' 
#' @param res a results table, as produced via \code{\link{results}}
#' @param dds a DESeqDataSet with the bulk gene expression data
#' (should contain gene-level counts)
#' @param ... additional arguments passed to the dataset-accessing function
#'
#' @return list containing: res, dds, and a SingleCellExperiment as selected
#' by the user
#'
#' @examples
#'
#' \dontrun{
#'   # involves interactive menu selection...
#'   dds <- makeExampleDESeqDataSet()
#'   rownames(dds) <- paste0("ENSG",1:nrow(dds))
#'   dds <- DESeq(dds)
#'   res <- results(dds)
#'   dat <- integrateWithSingleCell(res, dds)
#' }
#' 
#' @author Kwame Forbes
#' 
#' @export
integrateWithSingleCell <- function(res, dds, ...) {

  # function written by Kwame Forbes, to assist with integration of
  # bulk DE results with Bioconductor single-cell RNA-seq datasets.
  # provides a menu of dataset options from pre-processed scRNA-seq
  # datasets. Downstream visualization is performed using the
  # vizWithSCE package:
  
  # https://github.com/KwameForbes/vizWithSCE

  stopifnot(is(res, "DESeqResults"))
  stopifnot(is(dds, "DESeqDataSet"))
  
  # figure out organism from 'dds' or 'res', either using tximeta metadata
  # or guesswork on the gene names in the results table
  tximetaOrg <- metadata(dds)$txomeInfo$organism
  if (!is.null(tximetaOrg)) {
    org <- if (tximetaOrg == "Homo sapiens") {
             "human"
           } else if (tximetaOrg == "Mus musculus") {
             "mouse"
           } else {
             stop("Only human and mouse are currently supported")
           }
  } else {
    test.gene <- rownames(res)[1]
    org <- if (substr(test.gene, 1, 4) == "ENSG") {
             "human"
           } else if (substr(test.gene, 1, 7) == "ENSMUSG") {
             "mouse"
           } else {
             stop("Only human and mouse are currently supported")
           }
  }
  
  message(paste("Your dataset appears to be", org, "\n"))

  # read in table of options from vizWithSCE CSV file
  csv.file <- system.file("extdata/singleCellTab.csv", package="DESeq2")
  tab <- read.csv(csv.file)
  
  message(paste("Choose a",org,"single-cell dataset to integrate with (0 to cancel):\n"))
  
  tab <- tab[tab$org == org,]
  tab2 <- tab[,c("pkg","func","data", "pub","nCells","desc")]
  tab2$data <- ifelse(is.na(tab2$data), "", tab2$data)
  rownames(tab2) <- seq_len(nrow(tab2))

  # print the table in two parts
  print(tab2[,1:3])
  print(tab2[,4:6])

  cat("\n")
  
  # repeat the message below the fold
  message(paste("Choose a",org,"single-cell dataset to integrate with (0 to cancel):"))
  
  menuOpts <- ifelse(is.na(tab$data), tab$func, paste(tab$func, tab$data, sep="-"))
  ans <- menu(menuOpts)

  if (ans == 0) stop("No scRNA-seq dataset selected")
  
  pkg <- tab$pkg[ans]
  if (!requireNamespace(package=pkg, quietly=TRUE)) {
    message(paste0("Package: '",pkg, "' is not installed"))
    ask <- askYesNo("Would you like to install the necessary data package?")
    if (ask) {
      if (!requireNamespace(package="BiocManager", quietly=TRUE)) {
        stop("'BiocManager' required to install packages, install from CRAN")
      }
      BiocManager::install(pkg)
    } else {
      stop("Package would need to be installed")
    }
    if (requireNamespace(package=pkg, quietly=TRUE)) {
      message("Data package was installed successfully")
    } else {
      stop("Data package still needs to be installed for integrateWithSingleCell to work")
    }
  }

  # load package
  require(pkg, character.only=TRUE)

  # if the dataset is in the scRNAseq package...
  if (pkg == "scRNAseq") {
    # if only one dataset within the function...
    if (is.na(tab$data[ans])) {
      sce <- do.call(tab$func[ans], list(ensembl=TRUE, ...))
    } else {
      sce <- do.call(tab$func[ans], list(which=tab$data[ans], ensembl=TRUE, ...))
    }
  } else {
    # if only one dataset within the function...
    if (is.na(tab$data[ans])) {
      sce <- do.call(tab$func[ans], list(...))
    } else {
      sce <- do.call(tab$func[ans], list(dataset=tab$data[ans], ...))
    }
  }

  # return the original two objects and the SingleCellExperiment
  return(list(res=res, dds=dds, sce=sce))
  
}
mikelove/DESeq2 documentation built on July 25, 2024, 11:11 p.m.