ganalyse: Easy Analysis of RNASeq DE

#' @title Gather counts from \code{fpkm} or \code{raw} object
#' 
#' @description \code{gather_counts} takes an object of class \code{fpkm} or 
#' \code{raw} and returns a new object of the class \code{fpkm_counts} or 
#' \code{raw_counts}, which inherits from \code{fpkm/raw} respectively. An 
#' extra \code{counts} column is added to the input object and returned. The 
#' \code{counts} column is a list of \code{data.table}s.
#' 
#' @details \code{gather_counts} is an S3 generic with methods implemented for 
#' both \code{fpkm} and \code{raw} objects. 
#' 
#' In case of \code{fpkm} objects, the fpkm values are assumed to be 
#' generated by \code{cufflinks}. The argument \code{by} provides the 
#' the level at which differential expression has to be computed, 
#' since it contains fpkm counts for all expressed isoforms. See details for 
#' possible values for \code{by}.
#' 
#' In case of \code{raw}, the most common analysis is differential 
#' gene expression. Transcript level read counts are not possible (or makes 
#' very less sense) with raw counts. See \code{get_counts} function from 
#' \code{gcount} package for more.
#' 
#' @param x An object of class \code{fpkm} or \code{raw}. See 
#' \code{?rnaseq}.
#' @param by Level at which reads should be aggregated to (if necessary). 
#' There are three possible values:
#' 
#' \code{"gene-id"} (default): The column \code{"gene_id"} must be available 
#' in the raw or fpkm counts file. For fpkm, values from all transcripts 
#' corresponding to each gene id are summed together.
#' 
#' \code{"gene-name"}: The column \code{gene_short_name} must be available in 
#' the raw or fpkm counts file. In some cases, it might be necessary to use 
#' gene name instead of id. For fpkm, values from all transcripts 
#' corresponding to each gene name are summed together instead.
#' 
#' \code{"transcript-id"}: Only valid for fpkm counts. The values are retained 
#' as such, as they already contain the expression associated with each 
#' transcript. The columns \code{tracking_id} and \code{gene_id} must be 
#' available in the fpkm counts file.
#' 
#' @param threshold In case of \code{fpkm} objects, features whose row means 
#' >= \code{threshold} will alone be retained. In case of \code{raw} objects, 
#' features whose rpkm values >= \code{threshold} will alone be retained. 
#' Default value is \code{0.1} for \code{fpkm} and \code{1} for \code{raw} 
#' objects respectively. 
#' 
#' Note that \code{threshold} is applied on \code{fpkm} or \code{raw} counts, 
#' not their log transformed values.
#' 
#' @param log_base Value to pass to the \code{base} argument of 
#' \code{\link[base]{log}}. If \code{FALSE} (default), the values are not 
#' log transformed. \code{TRUE} defaults to base=2.
#' 
#' In case of \code{raw_counts}, it is recommended that the raw counts are  
#' \emph{not} log transformed. Some plotting functions (e.g., spectral maps) 
#' might be better on log transformed values. The argument is exposed for 
#' those cases.
#' 
#' In case of \code{fpkm_counts}, the recommendation is to log 
#' transform the values before using \code{\link{limma_dge}}.
#' @param verbose Logical. Default is \code{FALSE}. If \code{TRUE}, sends 
#' useful status messages to the console.
#' @param ... Additional arguments to be passed to or from other methods.
#' @return A new object of class \code{fpkm_counts} or \code{raw_counts} 
#' corresponding to \code{fpkm} or \code{raw} objects respectively.
#' @seealso \code{\link{rnaseq}} \code{\link{limma_dge}} 
#' \code{\link{edger_dge}} \code{\link{as.eset}} \code{\link{show_counts}}
#' \code{\link{construct_design}} \code{\link{construct_contrasts}} 
#' \code{\link{write_dge}} \code{\link{volcano_plot}} 
#' \code{\link{density_plot}}
#' @export
#' @examples 
#' path = system.file("tests", package="ganalyse")
#' 
#' # ----- fpkm ----- # 
#' fpkm_path = file.path(path, "fpkm", "annotation.txt")
#' fpkm_obj = rnaseq(fpkm_path, format="fpkm", experiment="sample")
#' (fpkm_counts = gather_counts(fpkm_obj, by="gene-id", log_base=2L))
#' class(fpkm_counts)
#' 
#' # ----- raw ----- # 
#' raw_path = file.path(path, "raw", "annotation.txt")
#' raw_obj = rnaseq(raw_path, format="raw", experiment="sample")
#' (raw_counts = gather_counts(raw_obj, by="gene-id", threshold=1L))
#' class(raw_counts)
gather_counts <- function(x, ...) {
    UseMethod("gather_counts")
}

#' @rdname gather_counts
#' @export
gather_counts.default <- function(x, ...) {
    stop("'gather_counts' expects input to be objects of class 'fpkm ", 
            "or 'raw'. See ?rnaseq for more.")
}

#' @rdname gather_counts
#' @export
gather_counts.fpkm <- function(x, by=c("gene-id", "gene-name", 
        "transcript-id"), threshold=0.1, log_base=FALSE, verbose=FALSE, ...) {

    if (length(invalid <- setdiff(c("path", "sample"), names(x))))
        stop("Column(s) ", paste(invalid, collapse=","), 
                " not present in input x.")
    if (!is.numeric(threshold) || length(threshold) != 1L || 
            !is.finite(threshold) || threshold < 0L)
        stop("threshold must be a non-negative finite 
                numeric value of length=1")
    if (is.logical(log_base)) log_base = as.integer(log_base)
    if (!is.numeric(log_base) || length(log_base) != 1L || 
            !is.finite(log_base) || log_base < 0L)
        stop("log_base must be a non-negative finite numeric value of ", 
                "length=1 or logical TRUE/FALSE")
    if (log_base == 1L) log_base = 2L # minimum value
    pseudocount = 1L
    by = match.arg(by)

    keep=fpkm=FPKM=gene_id=gene_short_name=id=NULL
    if (verbose) cat("Loading fpkm...\n")
    counts <- function(f) {
        switch(by, 
        `gene-id` = {
            cols = c("gene_id", "FPKM")
            dt = suppressWarnings(fread(f, select=cols))[, 
                    list(fpkm=sum(FPKM)), by=list(id=gene_id)]
            # dt[, "gene_id" := id]
        }, 
        `gene-name` = {
            cols=c("gene_short_name", "FPKM")
            dt = suppressWarnings(fread(f, select=cols))[, 
                    list(fpkm=sum(FPKM)), by=list(id=gene_short_name)]
            # dt[, "gene_id" := id]
        },                 
        `transcript-id` = {
            # cols = c("tracking_id", "gene_id", "FPKM")
            cols = c("tracking_id", "FPKM")
            dt = suppressWarnings(fread(f, select=cols))
            setnames(dt, c("id", "fpkm"))
            # setnames(dt, c("id", "gene_id", "fpkm"))
            # setcolorder(dt, c("id", "fpkm", "gene_id"))
        }) # ,
        # exon =, intron = {
        #     dt = suppressWarnings(fread(x, select=c("seqname", 
        #               "start", "end", "gene_id", "FPKM")))
        #     dt[, "gene_id" := paste(gene_id, collapse=","), 
        #               by = list(seqname, start, end)]
        #     dt[, "id" := paste(gene_id, ";", seqname, ":", 
        #               start, "-", end, sep="", colapse="")]
        #     dt[, c("seqname", "start", "end") := NULL]
        #     dt = unique(dt, cols = c("id", "FPKM"))
        #     setnames(dt, "FPKM", "fpkm")
        #     setcolorder(dt, c("id", "fpkm", "gene_id"))
        # }
        # to log or not to log?
        setnames(dt, tolower(names(dt)))
    }
    ans = lapply(x[["path"]], counts)
    # # filter for features
    # if (!missing(select_features)) 
    #     ans = ans[gene_id %in% select_features]
    # ans[, "gene_id" := NULL] # select_features handled above if condition 
    #                          # is TRUE, gene_id no needed anymore.

    # threshold
    if (threshold > 0L) {
        if (verbose) cat("Filtering features with row means < ", 
                threshold, ".\n", sep="")
        ids = rbindlist(ans)[, list(keep=mean(fpkm)>=threshold), by="id"]
        ids = ids[(keep), id]
        ans = lapply(ans, function(x) x[id %in% ids])
    } else {
        if (verbose) cat("Ignoring 'threshold' argument.\n")
    }
    # to log or not to log
    if (log_base) {
        if (verbose) cat("Log transforming FPKM counts using base ", 
                            log_base, ".\n", sep="")
        for (i in seq_along(ans)) {
            t = ans[[i]]
            t[, "fpkm" := log(fpkm+pseudocount, base=log_base)]
            ans[[i]] = t
        }
    } else {
        if (verbose) cat("Skipping logarithm on fpkm.\n")
    }
    # TODO: check if eset objects ignore row names in design matrix to extract 
    # right samples... this seems to have been the source of a nasty bug that 
    # results from dcast reordering columns alphabetically.
    cols = setdiff(names(x), "counts")
    x = copy(x)[, cols, with=FALSE][, "counts" := ans][]
    setattr(x, 'class', unique(c("fpkm_counts", class(x))))
    x[]
}

#' @rdname gather_counts
#' @export
gather_counts.raw <- function(x, by=c("gene-id", "gene-name"), 
        threshold=1L, log_base=FALSE, verbose=FALSE, ...) {

    if (length(invalid <- setdiff(c("path", "sample"), names(x))))
        stop("Column(s) ", paste(invalid, collapse=","), 
                " not present in input x.")
    if (!is.numeric(threshold) || length(threshold) != 1L || 
            !is.finite(threshold) || threshold < 0L)
        stop("threshold must be a non-negative finite numeric 
                    value of length=1")
    if (is.logical(log_base)) log_base = as.integer(log_base)
    if (!is.numeric(log_base) || length(log_base) != 1L || 
            !is.finite(log_base) || log_base < 0L)
        stop("log_base must be a non-negative finite numeric value of ", 
                "length=1 or logical TRUE/FALSE")
    if (log_base == 1L) log_base = 2L
    by = match.arg(by)

    gene_id=gene_name=keep=id=reads=sample_total=NULL
    if (verbose) cat("Loading raw counts...\n")
    counts <- function(f) {
        switch(by, 
        `gene-id` = {
            cols = c("gene_id", "length", "reads")
            dt = suppressWarnings(fread(f, select=cols))[, 
                    list(length=length[1L], raw=sum(reads)), 
                            by=list(id=gene_id)]
            # dt[, "gene_id" := id]
        }, 
        `gene-name` = {
            cols = c("gene_name", "length", "reads")
            dt = suppressWarnings(fread(f, select=cols))[, 
                    list(length=length[1L], raw=sum(reads)), 
                            by=list(id=gene_name)]
            # dt[, "gene_id" := id]
        }) #, 
        # exon =, intron = {
        #     dt = suppressWarnings(fread(x, select=c("seqname", 
        #               "start", "end", "length", "gene_id", "reads")))
        #     dt[, "gene_id" := paste(gene_id, collapse=","), 
        #               by = list(seqname, start, end)]
        #     dt[, id := paste(gene_id, ";", seqname, ":", 
        #               start, "-", end, sep="", colapse="")]
        #     dt[, c("seqname", "start", "end") := NULL]
        #     dt = unique(dt, cols = c("id", "reads"))
        #     setnames(dt, "reads", "raw")
        #     setcolorder(dt, c("id", "length", "raw", "gene_id"))
        # })
        setnames(dt, tolower(names(dt)))
    }
    ans = lapply(x[["path"]], counts)
    # # filter for features
    # if (!missing(select_features)) 
    #     ans = ans[gene_id %in% select_features]
    # ans[, "gene_id" := NULL] # select_features handled above if condition is 
    #                          # TRUE, gene_id no needed anymore.

    # threshold
    if (threshold > 0L) {
        if (verbose) cat("Filtering features with row means < ", 
                threshold, ".\n", sep="")
        setattr(ans, 'names', x[["sample"]])
        tmp = rbindlist(ans, idcol="sample")[, 
                "sample_total" := sum(raw), by="sample"][, 
                "rpkm" := raw/sample_total/length * 1e9L]
        # rpkm > threshold in 50% of the samples
        tmp = tmp[, list(keep=sum(rpkm>threshold) > .N/2L), by="id"]
        ids = tmp[(keep), id]
        ans = lapply(ans, function(x) x[id %in% ids])
    } else {
        if (verbose) cat("Ignoring 'threshold' argument.\n")
    }
    # to log or not to log?
    if (log_base) {
        if (verbose) cat("Log transforming raw counts using base ", 
                            log_base, "\n", sep="")
        for (i in seq_along(ans)) {
            t = ans[[i]]
            t[, "raw" := log(raw+1L, base=log_base)]
            ans[[i]] = t
        }
    } else {
      if (verbose) cat("Skipping logarithm on raw counts.\n")
    }
    cols = setdiff(names(x), "counts")
    x = copy(x)[, cols, with=FALSE][, "counts" := ans][]
    setattr(x, 'class', unique(c("raw_counts", class(x))))
    x[]
}

# ----- internal functions ----- #

rpkm <- function(x, len) {
    x = as.matrix(x)
    tot_reads = matrix(rep(colSums(x), nrow(x)), byrow=TRUE, nrow=nrow(x))
    len_mat = matrix(rep(len, each=ncol(x)), ncol=ncol(x), byrow=TRUE)
    out = x / tot_reads
    out = out/len_mat
    out = out * 10^9
}

filter_by_rpkm <- function(x, len, reps, cndn) {
    x = rpkm(x, len)
    rowSums(x > 1) >= reps
}