#' Load count files
#'
#' Load count files
#'
#' @param target target \code{data.frame} of the project
#' @param rawDir path to the directory containing the count files
#' @param skip number of lines of the data file to skip before beginning to read data
#' @param sep column separator when reading the count files (tabulation by default)
#' @param versionName versionName of the project
#' @param featuresToRemove vector of features Id to remove from the counts
#' @return The \code{matrix} of raw counts
#' @author Marie-Agnes Dillies and Hugo Varet
# created Feb 6th, 2012
# modified May 2nd, 2012 (colnames -> target$label)
# modified Sept 19th, 2013 (DESeq not required as using DESeq2)
# modified Sept 20th, 2013 (keep only the 2 first columns)
# modified Oct 8th, 2013 (keep NAs in the data)
# modified Feb 14th, 2014 (use the first column for the labels & the second column for the files)
# modified Mar 05th, 2014 (replace NAs by 0, cancel the modification of Oct 8th, 2013)
# modified Mar 05th, 2014 (added info argument to add features with only null counts to the data)
# modified May 13th, 2014 (added skip argument)
# modified July 30th, 2014 (combined loadCountData, HTSeqClean, removeRRNA and raw2counts)
# modified Oct 27th, 2014 (removed merge(info,counts))
# modified Feb 9th, 2015 (order counts according to row names)
# modified Feb 18th, 2015 (remove HTSeq-count lines with featuresToRemove)
# modified March 17th, 2015 (print features removed and top/bottom of the matrix)
# modified March 23rd, 2015 (fixed a bug when printing features removed)
# modified April 10th, 2015 (now exports the names of the features removed from the data)
# modified June 15th, 2015 (check if there are any duplicated features)
# modified Sept 3rd, 2015 (removed info argument)
# modified April 11th, 2016 (added sep parameter)
# modified February 21st, 2017 (detect featureCounts or HTSeq-count input, remove header argument)
# modified February 23rd, 2017 (fixed a bug when determining if counts come from featureCounts or HTSeq-count)
# modified September 5th, 2018 (fixed a bug when determining if counts come from featureCounts or HTSeq-count)
# modified October 4th, 2018 (fixed a bug when determining if counts come from featureCounts or HTSeq-count)
# modified June 1th, 2019 (print duplicated ids if any)
loadCountData <- function(target, rawDir="raw", skip=0, sep="\t", versionName="",
featuresToRemove=c("alignment_not_unique", "ambiguous", "no_feature", "not_aligned", "too_low_aQual")){
labels <- as.character(target[,1])
files <- as.character(target[,2])
# detect if input count files are from featureCounts or HTSeq-count
f1 <- read.table(paste(rawDir,files[1],sep="/"), sep=sep, quote="\"", header=FALSE, skip=5, nrows=5, stringsAsFactors=FALSE)
if (ncol(f1) >= 7 && is.numeric(f1[,7])){
# counter featurecounts
idCol <- 1
countsCol <- 7
header <- TRUE
} else{
if (ncol(f1) >= 2 && is.numeric(f1[,2])){
# counter htseq-count
idCol <- 1
countsCol <- 2
header <- FALSE
} else{
stop("Can't determine if count files come from HTSeq-count or featureCounts")
}
}
rawCounts <- read.table(paste(rawDir,files[1],sep="/"), sep=sep, quote="\"", header=header, skip=skip, stringsAsFactors=FALSE)
rawCounts <- rawCounts[,c(idCol, countsCol)]
colnames(rawCounts) <- c("Id", labels[1])
if (any(duplicated(rawCounts$Id))){
stop("Duplicated feature names in ", files[1], ": ",
paste(unique(rawCounts$Id[duplicated(rawCounts$Id)]), collapse=", "))
}
cat(files[1],": ",length(rawCounts[,labels[1]])," rows and ",sum(rawCounts[,labels[1]]==0)," null count(s)\n",sep="")
for (i in 2:length(files)){
tmp <- read.table(paste(rawDir,files[i],sep="/"), sep=sep, quote="\"", header=header, skip=skip, stringsAsFactors=FALSE)
tmp <- tmp[,c(idCol, countsCol)]
colnames(tmp) <- c("Id", labels[i])
if (any(duplicated(tmp$Id))){
stop("Duplicated feature names in ", files[i], ": ",
paste(unique(tmp$Id[duplicated(tmp$Id)]), collapse=", "))
}
rawCounts <- merge(rawCounts, tmp, by="Id", all=TRUE)
cat(files[i],": ",length(tmp[,labels[i]])," rows and ",sum(tmp[,labels[i]]==0)," null count(s)\n",sep="")
}
rawCounts[is.na(rawCounts)] <- 0
# transformation de la data.frame en matrice
counts <- as.matrix(rawCounts[,-1])
rownames(counts) <- rawCounts[,1]
# ordering according to row names
counts <- counts[order(rownames(counts)),]
# remove some features
cat("\nFeatures removed:\n")
featuresRemoved <- NULL
for (f in featuresToRemove){
match <- grep(f, rownames(counts))
if (length(match)>0){
cat(rownames(counts)[match], sep = "\n")
featuresRemoved <- unique(c(featuresRemoved,rownames(counts)[match]))
counts <- counts[-match,]
}
}
if (is.null(featuresRemoved)) featuresRemoved <- "No feature has been removed"
write.table(data.frame(featuresRemoved), file=paste0("tables/",versionName,".featuresRemoved.xls"),
sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
cat("\nTop of the counts matrix:\n")
print(head(counts))
cat("\nBottom of the counts matrix:\n")
print(tail(counts))
return(counts)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.