### constructor function for ballgown objects
### helper functions:
.readIntron <- function(file, meas){
cc = c('integer', 'character', 'factor', 'integer', 'integer', 'integer',
'integer', 'numeric')
if(!identical(meas, 'all')){
dummy = c(rep(meas[1], 5), 'rcount', 'ucount', 'mrcount')
cc[which(!(dummy %in% meas))] = 'NULL'
}
intron = read.table(file, header=TRUE, sep="\t", colClasses=cc, quote="")
intron = intron[order(intron$i_id), ]
rownames(intron) = 1:nrow(intron)
return(intron)
}
.readExon <- function(file, meas) {
cc = c('integer', 'character', 'factor', 'integer', 'integer',
'integer', 'integer', rep('numeric', 5))
if(!identical(meas, 'all')){
dummy = c(rep(meas[1], 5), 'rcount', 'ucount', 'mrcount', 'cov',
'cov_sd', 'mcov', 'mcov_sd')
cc[which(!(dummy %in% meas))] = 'NULL'
}
exon = read.table(file, header=TRUE, sep="\t", colClasses=cc, quote="")
exon = exon[order(exon$e_id), ]
rownames(exon) = 1:nrow(exon)
return(exon)
}
.readTranscript <- function(file, meas) {
cc = c('integer', 'character', 'factor', 'integer', 'integer',
'character', 'integer', 'integer', 'character', 'character',
'numeric', 'numeric')
if(!identical(meas, 'all')){
if(!('cov' %in% meas)) cc[11] = 'NULL'
if(!('FPKM' %in% meas)) cc[12] = 'NULL'
}
transcript = read.table(file, header=TRUE, sep="\t",
colClasses=cc, quote="")
transcript = transcript[order(transcript$t_id), ]
rownames(transcript) = 1:nrow(transcript)
return(transcript)
}
# make it so I can write warning/messages and whatnot that respect the 80char
# limit in my script, but still look pretty when displayed.
.makepretty = function(x){
msg = gsub('\n', ' ', x)
msg = gsub(' ', '', msg)
msg
}
#' constructor function for ballgown objects
#'
#' @name ballgown-constructor
#'
#' @param samples vector of file paths to folders containing sample-specific
#' ballgown data (generated by \code{tablemaker}). If \code{samples} is
#' provided, \code{dataDir} and \code{samplePattern} are not used.
#' @param dataDir file path to top-level directory containing sample-specific
#' folders with ballgown data in them. Only used if \code{samples} is NULL.
#' @param samplePattern regular expression identifying the subdirectories of\
#' \code{dataDir} containing data to be loaded into the ballgown object (and
#' only those subdirectories). Only used if \code{samples} is NULL.
#' @param bamfiles optional vector of file paths to read alignment files for
#' each sample. If provided, make sure to sort properly (e.g., in the same
#' order as \code{samples}). Default NULL.
#' @param pData optional \code{data.frame} with rows corresponding to samples and
#' columns corresponding to phenotypic variables.
#' @param verbose if \code{TRUE}, print status messages and timing information
#' as the object is constructed.
#' @param meas character vector containing either "all" or one or more of:
#' "rcount", "ucount", "mrcount", "cov", "cov_sd", "mcov", "mcov_sd", or
#' "FPKM". The resulting ballgown object will only contain the specified
#' expression measurements, for the appropriate features. See vignette for
#' which expression measurements are available for which features. "all"
#' creates the full object.
#'
#' @details Because experimental data is recorded so variably, it is the user's
#' responsibility to format \code{pData} correctly. In particular, it's
#' really important that the rows of \code{pData} (corresponding to samples)
#' are ordered the same way as \code{samples} or the
#' \code{dataDir}/\code{samplePattern} combo. You can run
#' \code{list.files(path = dataDir, pattern = samplePattern)} to see the sample
#' order if \code{samples} was not used.
#'
#' If you are creating a ballgown object for a large experiment, this function
#' may run slowly and use a large amount of RAM. We recommend running this
#' constructor as a batch job and saving the resulting ballgown object as an rda
#' file. The rda file usually has reasonable size on disk, and the object in it
#' shouldn't take up too much RAM when loaded, so the time and memory use in
#' creating the object is a one-time cost.
#'
#' @return an object of class \code{ballgown}
#'
#' @author Leonardo Collado-Torres, Alyssa Frazee
#'
#' @rdname ballgown-constructor
#'
#' @seealso \code{\link{ballgownrsem}}, for loading RSEM output into a ballgown
#' object
#' @export
#' @examples
#' bg = ballgown(dataDir=system.file('extdata', package='ballgown'),
#' samplePattern='sample')
#' pData(bg) = data.frame(id=sampleNames(bg), group=rep(c(1,0), each=10))
ballgown = function(samples=NULL, dataDir=NULL, samplePattern=NULL,
bamfiles = NULL, pData = NULL, verbose=TRUE, meas="all"){
if(verbose) message(date())
if(all(c(is.null(samples),is.null(dataDir),is.null(samplePattern)))){
stop(.makepretty('must provide either "samples" or both "dataDir" and
"samplePattern"'))
}
if(length(setdiff(meas, c("rcount", "ucount", "mrcount", "cov",
"cov_sd", "mcov", "mcov_sd", "FPKM", "all"))) != 0){
msg = 'meas can be either "all" or one or more of "rcount",
"ucount", "cov", "cov_sd", "mcov", "mcov_sd", or "FPKM". See
vignette for details.'
stop(.makepretty(msg))
}
if('all' %in% meas & length(meas) > 1){
stop(.makepretty('when meas is "all", all types of measurements are
included by default.'))
}
## Determine where data is located
if(is.null(samples)){
if(is.null(samplePattern)|is.null(dataDir)){
stop(.makepretty('must provide both "dataDir" and "samplePattern"
if "samples" is NULL.'))
}
samples = list.files(path=dataDir,
pattern=samplePattern, full.names=TRUE)
names(samples) = list.files(path=dataDir, pattern=samplePattern)
}else{
names(samples) = sapply(samples, function(x){
tail(strsplit(x, split="/")[[1]],n=1)
}, USE.NAMES=FALSE)
}
## check to see if you actually have ballgown data:
n = length(samples)
subfiles = list.files(samples)
ctabs = subfiles[subfiles %in%
c('e_data.ctab', 'i_data.ctab', 't_data.ctab', 'e2t.ctab', 'i2t.ctab')]
if(length(ctabs) != 5*n){
msg = 'Something is wrong: are you missing .ctab files? Do extra
files/folders (other than tablemaker output folders) match your
samples/dataDir/samplePattern argument(s)?'
stop(.makepretty(msg))
}
## Read tables linking exons/introns to transcripts
if(verbose) message(paste0(date(), ": Reading linking tables"))
e2t = read.table(list.files(samples[1], "e2t.ctab", full.names=TRUE),
header=TRUE, sep="\t", colClasses=c("integer", "integer"))
i2t = read.table(list.files(samples[1], "i2t.ctab", full.names=TRUE),
header=TRUE, sep="\t", colClasses=c("integer", "integer"))
## Order by transcript id
e2t = e2t[order(e2t$t_id), ]
i2t = i2t[order(i2t$t_id), ]
rownames(e2t) = 1:nrow(e2t)
rownames(i2t) = 1:nrow(i2t)
## Read counts for all introns
if(verbose) message(paste0(date(), ": Reading intron data files"))
# if intron data is requested:
if( any(c('all', 'rcount', 'ucount', 'mrcount') %in% meas)){
intronFiles = sapply(samples, list.files, pattern="i_data.ctab",
full.names=TRUE)
intronAll = NULL
for(f in intronFiles){
ind = which(intronFiles == f)
intronAll[[ind]] = .readIntron(f, meas)
}
## Merge the intron results
if(verbose) message(paste0(date(), ": Merging intron data"))
#[ensure ctab files all contain same introns]
sumdiff = sapply(intronAll, function(x){
sum(x$i_id != intronAll[[1]]$i_id)
})
if(!(all(sumdiff==0))){
msg = 'intron ids were either not the same or not in the same order
across samples. double check i_data.ctab for each sample.'
stop(.makepretty(msg))
}
idataOnly = lapply(intronAll, function(x) x[,-c(1:5)] )
intron = data.frame(intronAll[[1]][,1:5], as.data.frame(idataOnly))
if(identical(meas, 'all')){
colnames(intron) = c('i_id', 'chr', 'strand', 'start', 'end',
paste(c('rcount', 'ucount', 'mrcount'),
rep(names(samples), each=3), sep="."))
}else{
meas_avail = intersect(c('rcount', 'ucount', 'mrcount'), meas)
#^order important! meas has to go after rcount/ucount/mrcount
#^because of file order. intersect() keeps in order of first thing.
colnames(intron) = c('i_id', 'chr', 'strand', 'start', 'end',
paste(meas_avail, rep(names(samples), each=length(meas_avail)),
sep="."))
}
} else { # no intron data requested:
f1 = list.files(samples[1], pattern='i_data.ctab', full.names=TRUE)
intron = .readIntron(f1, meas)
stopifnot(ncol(intron) == 5)
}
## Make intron data into GRanges object
#A. fix strand information for compatibility w/ GRanges
intron$strand = as.character(intron$strand)
intron$strand[intron$strand=="."] <- "*"
#B. get names of transcripts each intron belongs to
tnamesin = split(i2t$t_id, i2t$i_id)
tnamesin.ord = as.character(tnamesin)[match(intron$i_id, names(tnamesin))]
#C. make the GRanges object
introngr = GRanges(seqnames=Rle(intron$chr),
ranges=IRanges(start=intron$start, end=intron$end),
strand = Rle(intron$strand),
id=intron$i_id,
transcripts = tnamesin.ord)
## Read exon data
if(verbose) message(paste0(date(), ": Reading exon data files"))
# if any exon data is requested:
emeas = c('all', 'rcount', 'ucount', 'mrcount', 'cov', 'cov_sd', 'mcov',
'mcov_sd')
if( any(emeas %in% meas)){
exonFiles = sapply(samples, list.files, pattern="e_data.ctab",
full.names=TRUE)
exonAll = NULL
for(f in exonFiles){
ind = which(exonFiles == f)
exonAll[[ind]] = .readExon(f, meas)
}
## Merge the exon results
if(verbose) message(paste0(date(), ": Merging exon data"))
#[ensure ctab files all contain same exons]
sumdiffex = sapply(exonAll, function(x){
sum(x$e_id != exonAll[[1]]$e_id)
})
if(!(all(sumdiffex==0))){
stop(.makepretty('exon ids were either not the same or not in the
same order across samples. double check e_data.ctab for each
sample.'))
}
edataOnly = lapply(exonAll, function(x) x[,-c(1:5)])
exon = data.frame(exonAll[[1]][,1:5], as.data.frame(edataOnly))
if(identical(meas, 'all')){
colnames(exon) = c('e_id', 'chr', 'strand', 'start', 'end',
paste(emeas[-1], rep(names(samples), each=7), sep="."))
}else{
meas_avail = intersect(emeas[-1], meas)
#^order important!
colnames(exon) = c('e_id', 'chr', 'strand', 'start', 'end',
paste(meas_avail, rep(names(samples), each=length(meas_avail)),
sep="."))
}
} else { # no exon data requested:
f1 = list.files(samples[1], pattern='e_data.ctab', full.names=TRUE)
exon = .readExon(f1, meas)
stopifnot(ncol(exon) == 5)
}
## Make exon data into GRanges object
#A. fix strand information for compatibility w/ GRanges
exon$strand = as.character(exon$strand)
exon$strand[exon$strand=="."] <- "*"
#B. get names of transcripts each exon belongs to
tnamesex = split(e2t$t_id, e2t$e_id)
#C. make the GRanges object
tnamesex.ord = as.character(tnamesex)[match(exon$e_id, names(tnamesex))]
exongr = GRanges(seqnames=Rle(exon$chr),
ranges=IRanges(start=exon$start, end=exon$end),
strand=Rle(exon$strand),
id=exon$e_id,
transcripts=tnamesex.ord)
## Read transcript data
if(verbose) message(paste0(date(), ": Reading transcript data files"))
# if any transcript data requested:
if( any(c('all', 'cov', 'FPKM') %in% meas)){
tFiles = sapply(samples, list.files, pattern="t_data.ctab",
full.names=TRUE)
tAll = NULL
for(f in tFiles){
ind = which(tFiles == f)
tAll[[ind]] = .readTranscript(f, meas)
}
## Merge the transcript results
if(verbose) message(paste0(date(), ": Merging transcript data"))
#[ensure ctab files all contain same transcripts]
sumdifft = sapply(tAll, function(x) sum(x$t_id != tAll[[1]]$t_id))
if(!(all(sumdifft==0))){
msg = 'Transcript ids were either not the same or not in the same
order across samples. Double check t_data.ctab for each sample.'
stop(.makepretty(msg))
}
tdataOnly = lapply(tAll, function(x) x[,-c(1:10)])
transcript = data.frame(tAll[[1]][,1:10], as.data.frame(tdataOnly))
if(identical(meas, 'all')){
colnames(transcript) = c('t_id', 'chr', 'strand', 'start', 'end',
't_name', 'num_exons', 'length', 'gene_id', 'gene_name',
paste(c('cov', 'FPKM'), rep(names(samples), each=2), sep="."))
}else{
meas_avail = intersect(c('cov', 'FPKM'), meas)
#^order important!
colnames(transcript) = c('t_id', 'chr', 'strand', 'start', 'end',
't_name','num_exons', 'length', 'gene_id', 'gene_name',
paste(meas_avail, rep(names(samples), each=length(meas_avail)),
sep="."))
}
} else { # no transcript data requested:
f1 = list.files(samples[1], pattern='t_data.ctab', full.names=TRUE)
transcript = .readTranscript(f1, meas)
stopifnot(ncol(transcript) == 10)
}
## Make transcripts into a GRanges list object
mm = match(e2t$e_id, mcols(exongr)$id)
if(any(is.na(mm))){
warning(paste('the following exon(s) did not appear in e_data.ctab:',
paste(e2t$e_id[which(is.na(mm))], collapse=", ")))
}
tgrl = split(exongr[mm[!is.na(mm)]], e2t$t_id[!is.na(mm)])
## Connect transcripts to genes:
t2g = data.frame(t_id = transcript$t_id, g_id = transcript$gene_id)
## Read phenotype table, if given:
stopifnot(is.null(pData) | class(pData) == 'data.frame')
if(!is.null(pData)){
if(!all(pData[,1] == names(samples))){
msg = 'Rows of pData did not seem to be in the same order as the
columns of the expression data. Attempting to rearrange
pData...'
warning(.makepretty(msg))
tmp = try(pData <- pData[,match(names(samples), pData[,1])],
silent=TRUE)
if(class(tmp) == "try-error"){
msg = 'first column of pData does not match the names of the
folders containing the ballgown data.'
stop(.makepretty(msg))
}else{
message('successfully rearranged!')
}
}
}
if(verbose) message('Wrapping up the results')
result = new('ballgown',
expr=list(intron=intron, exon=exon, trans=transcript),
indexes=list(e2t=e2t, i2t=i2t, t2g=t2g, bamfiles=bamfiles, pData=pData),
structure=list(intron=introngr, exon=exongr, trans=tgrl),
dirs=samples, mergedDate=date(), meas=meas, RSEM=FALSE)
if(verbose) message(date())
return(result)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.