#' @title Gather counts from \code{fpkm} or \code{raw} object
#'
#' @description \code{gather_counts} takes an object of class \code{fpkm} or
#' \code{raw} and returns a new object of the class \code{fpkm_counts} or
#' \code{raw_counts}, which inherits from \code{fpkm/raw} respectively. An
#' extra \code{counts} column is added to the input object and returned. The
#' \code{counts} column is a list of \code{data.table}s.
#'
#' @details \code{gather_counts} is an S3 generic with methods implemented for
#' both \code{fpkm} and \code{raw} objects.
#'
#' In case of \code{fpkm} objects, the fpkm values are assumed to be
#' generated by \code{cufflinks}. The argument \code{by} provides the
#' the level at which differential expression has to be computed,
#' since it contains fpkm counts for all expressed isoforms. See details for
#' possible values for \code{by}.
#'
#' In case of \code{raw}, the most common analysis is differential
#' gene expression. Transcript level read counts are not possible (or makes
#' very less sense) with raw counts. See \code{get_counts} function from
#' \code{gcount} package for more.
#'
#' @param x An object of class \code{fpkm} or \code{raw}. See
#' \code{?rnaseq}.
#' @param by Level at which reads should be aggregated to (if necessary).
#' There are three possible values:
#'
#' \code{"gene-id"} (default): The column \code{"gene_id"} must be available
#' in the raw or fpkm counts file. For fpkm, values from all transcripts
#' corresponding to each gene id are summed together.
#'
#' \code{"gene-name"}: The column \code{gene_short_name} must be available in
#' the raw or fpkm counts file. In some cases, it might be necessary to use
#' gene name instead of id. For fpkm, values from all transcripts
#' corresponding to each gene name are summed together instead.
#'
#' \code{"transcript-id"}: Only valid for fpkm counts. The values are retained
#' as such, as they already contain the expression associated with each
#' transcript. The columns \code{tracking_id} and \code{gene_id} must be
#' available in the fpkm counts file.
#'
#' @param threshold In case of \code{fpkm} objects, features whose row means
#' >= \code{threshold} will alone be retained. In case of \code{raw} objects,
#' features whose rpkm values >= \code{threshold} will alone be retained.
#' Default value is \code{0.1} for \code{fpkm} and \code{1} for \code{raw}
#' objects respectively.
#'
#' Note that \code{threshold} is applied on \code{fpkm} or \code{raw} counts,
#' not their log transformed values.
#'
#' @param log_base Value to pass to the \code{base} argument of
#' \code{\link[base]{log}}. If \code{FALSE} (default), the values are not
#' log transformed. \code{TRUE} defaults to base=2.
#'
#' In case of \code{raw_counts}, it is recommended that the raw counts are
#' \emph{not} log transformed. Some plotting functions (e.g., spectral maps)
#' might be better on log transformed values. The argument is exposed for
#' those cases.
#'
#' In case of \code{fpkm_counts}, the recommendation is to log
#' transform the values before using \code{\link{limma_dge}}.
#' @param verbose Logical. Default is \code{FALSE}. If \code{TRUE}, sends
#' useful status messages to the console.
#' @param ... Additional arguments to be passed to or from other methods.
#' @return A new object of class \code{fpkm_counts} or \code{raw_counts}
#' corresponding to \code{fpkm} or \code{raw} objects respectively.
#' @seealso \code{\link{rnaseq}} \code{\link{limma_dge}}
#' \code{\link{edger_dge}} \code{\link{as.eset}} \code{\link{show_counts}}
#' \code{\link{construct_design}} \code{\link{construct_contrasts}}
#' \code{\link{write_dge}} \code{\link{volcano_plot}}
#' \code{\link{density_plot}}
#' @export
#' @examples
#' path = system.file("tests", package="ganalyse")
#'
#' # ----- fpkm ----- #
#' fpkm_path = file.path(path, "fpkm", "annotation.txt")
#' fpkm_obj = rnaseq(fpkm_path, format="fpkm", experiment="sample")
#' (fpkm_counts = gather_counts(fpkm_obj, by="gene-id", log_base=2L))
#' class(fpkm_counts)
#'
#' # ----- raw ----- #
#' raw_path = file.path(path, "raw", "annotation.txt")
#' raw_obj = rnaseq(raw_path, format="raw", experiment="sample")
#' (raw_counts = gather_counts(raw_obj, by="gene-id", threshold=1L))
#' class(raw_counts)
gather_counts <- function(x, ...) {
UseMethod("gather_counts")
}
#' @rdname gather_counts
#' @export
gather_counts.default <- function(x, ...) {
stop("'gather_counts' expects input to be objects of class 'fpkm ",
"or 'raw'. See ?rnaseq for more.")
}
#' @rdname gather_counts
#' @export
gather_counts.fpkm <- function(x, by=c("gene-id", "gene-name",
"transcript-id"), threshold=0.1, log_base=FALSE, verbose=FALSE, ...) {
if (length(invalid <- setdiff(c("path", "sample"), names(x))))
stop("Column(s) ", paste(invalid, collapse=","),
" not present in input x.")
if (!is.numeric(threshold) || length(threshold) != 1L ||
!is.finite(threshold) || threshold < 0L)
stop("threshold must be a non-negative finite
numeric value of length=1")
if (is.logical(log_base)) log_base = as.integer(log_base)
if (!is.numeric(log_base) || length(log_base) != 1L ||
!is.finite(log_base) || log_base < 0L)
stop("log_base must be a non-negative finite numeric value of ",
"length=1 or logical TRUE/FALSE")
if (log_base == 1L) log_base = 2L # minimum value
pseudocount = 1L
by = match.arg(by)
keep=fpkm=FPKM=gene_id=gene_short_name=id=NULL
if (verbose) cat("Loading fpkm...\n")
counts <- function(f) {
switch(by,
`gene-id` = {
cols = c("gene_id", "FPKM")
dt = suppressWarnings(fread(f, select=cols))[,
list(fpkm=sum(FPKM)), by=list(id=gene_id)]
# dt[, "gene_id" := id]
},
`gene-name` = {
cols=c("gene_short_name", "FPKM")
dt = suppressWarnings(fread(f, select=cols))[,
list(fpkm=sum(FPKM)), by=list(id=gene_short_name)]
# dt[, "gene_id" := id]
},
`transcript-id` = {
# cols = c("tracking_id", "gene_id", "FPKM")
cols = c("tracking_id", "FPKM")
dt = suppressWarnings(fread(f, select=cols))
setnames(dt, c("id", "fpkm"))
# setnames(dt, c("id", "gene_id", "fpkm"))
# setcolorder(dt, c("id", "fpkm", "gene_id"))
}) # ,
# exon =, intron = {
# dt = suppressWarnings(fread(x, select=c("seqname",
# "start", "end", "gene_id", "FPKM")))
# dt[, "gene_id" := paste(gene_id, collapse=","),
# by = list(seqname, start, end)]
# dt[, "id" := paste(gene_id, ";", seqname, ":",
# start, "-", end, sep="", colapse="")]
# dt[, c("seqname", "start", "end") := NULL]
# dt = unique(dt, cols = c("id", "FPKM"))
# setnames(dt, "FPKM", "fpkm")
# setcolorder(dt, c("id", "fpkm", "gene_id"))
# }
# to log or not to log?
setnames(dt, tolower(names(dt)))
}
ans = lapply(x[["path"]], counts)
# # filter for features
# if (!missing(select_features))
# ans = ans[gene_id %in% select_features]
# ans[, "gene_id" := NULL] # select_features handled above if condition
# # is TRUE, gene_id no needed anymore.
# threshold
if (threshold > 0L) {
if (verbose) cat("Filtering features with row means < ",
threshold, ".\n", sep="")
ids = rbindlist(ans)[, list(keep=mean(fpkm)>=threshold), by="id"]
ids = ids[(keep), id]
ans = lapply(ans, function(x) x[id %in% ids])
} else {
if (verbose) cat("Ignoring 'threshold' argument.\n")
}
# to log or not to log
if (log_base) {
if (verbose) cat("Log transforming FPKM counts using base ",
log_base, ".\n", sep="")
for (i in seq_along(ans)) {
t = ans[[i]]
t[, "fpkm" := log(fpkm+pseudocount, base=log_base)]
ans[[i]] = t
}
} else {
if (verbose) cat("Skipping logarithm on fpkm.\n")
}
# TODO: check if eset objects ignore row names in design matrix to extract
# right samples... this seems to have been the source of a nasty bug that
# results from dcast reordering columns alphabetically.
cols = setdiff(names(x), "counts")
x = copy(x)[, cols, with=FALSE][, "counts" := ans][]
setattr(x, 'class', unique(c("fpkm_counts", class(x))))
x[]
}
#' @rdname gather_counts
#' @export
gather_counts.raw <- function(x, by=c("gene-id", "gene-name"),
threshold=1L, log_base=FALSE, verbose=FALSE, ...) {
if (length(invalid <- setdiff(c("path", "sample"), names(x))))
stop("Column(s) ", paste(invalid, collapse=","),
" not present in input x.")
if (!is.numeric(threshold) || length(threshold) != 1L ||
!is.finite(threshold) || threshold < 0L)
stop("threshold must be a non-negative finite numeric
value of length=1")
if (is.logical(log_base)) log_base = as.integer(log_base)
if (!is.numeric(log_base) || length(log_base) != 1L ||
!is.finite(log_base) || log_base < 0L)
stop("log_base must be a non-negative finite numeric value of ",
"length=1 or logical TRUE/FALSE")
if (log_base == 1L) log_base = 2L
by = match.arg(by)
gene_id=gene_name=keep=id=reads=sample_total=NULL
if (verbose) cat("Loading raw counts...\n")
counts <- function(f) {
switch(by,
`gene-id` = {
cols = c("gene_id", "length", "reads")
dt = suppressWarnings(fread(f, select=cols))[,
list(length=length[1L], raw=sum(reads)),
by=list(id=gene_id)]
# dt[, "gene_id" := id]
},
`gene-name` = {
cols = c("gene_name", "length", "reads")
dt = suppressWarnings(fread(f, select=cols))[,
list(length=length[1L], raw=sum(reads)),
by=list(id=gene_name)]
# dt[, "gene_id" := id]
}) #,
# exon =, intron = {
# dt = suppressWarnings(fread(x, select=c("seqname",
# "start", "end", "length", "gene_id", "reads")))
# dt[, "gene_id" := paste(gene_id, collapse=","),
# by = list(seqname, start, end)]
# dt[, id := paste(gene_id, ";", seqname, ":",
# start, "-", end, sep="", colapse="")]
# dt[, c("seqname", "start", "end") := NULL]
# dt = unique(dt, cols = c("id", "reads"))
# setnames(dt, "reads", "raw")
# setcolorder(dt, c("id", "length", "raw", "gene_id"))
# })
setnames(dt, tolower(names(dt)))
}
ans = lapply(x[["path"]], counts)
# # filter for features
# if (!missing(select_features))
# ans = ans[gene_id %in% select_features]
# ans[, "gene_id" := NULL] # select_features handled above if condition is
# # TRUE, gene_id no needed anymore.
# threshold
if (threshold > 0L) {
if (verbose) cat("Filtering features with row means < ",
threshold, ".\n", sep="")
setattr(ans, 'names', x[["sample"]])
tmp = rbindlist(ans, idcol="sample")[,
"sample_total" := sum(raw), by="sample"][,
"rpkm" := raw/sample_total/length * 1e9L]
# rpkm > threshold in 50% of the samples
tmp = tmp[, list(keep=sum(rpkm>threshold) > .N/2L), by="id"]
ids = tmp[(keep), id]
ans = lapply(ans, function(x) x[id %in% ids])
} else {
if (verbose) cat("Ignoring 'threshold' argument.\n")
}
# to log or not to log?
if (log_base) {
if (verbose) cat("Log transforming raw counts using base ",
log_base, "\n", sep="")
for (i in seq_along(ans)) {
t = ans[[i]]
t[, "raw" := log(raw+1L, base=log_base)]
ans[[i]] = t
}
} else {
if (verbose) cat("Skipping logarithm on raw counts.\n")
}
cols = setdiff(names(x), "counts")
x = copy(x)[, cols, with=FALSE][, "counts" := ans][]
setattr(x, 'class', unique(c("raw_counts", class(x))))
x[]
}
# ----- internal functions ----- #
rpkm <- function(x, len) {
x = as.matrix(x)
tot_reads = matrix(rep(colSums(x), nrow(x)), byrow=TRUE, nrow=nrow(x))
len_mat = matrix(rep(len, each=ncol(x)), ncol=ncol(x), byrow=TRUE)
out = x / tot_reads
out = out/len_mat
out = out * 10^9
}
filter_by_rpkm <- function(x, len, reps, cndn) {
x = rpkm(x, len)
rowSums(x > 1) >= reps
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.