# bowtie alignments: --best --strata -m 1 (return one alignment, but only if it is a unique mapper)
# plus -q/-f [--phred33/64-quals] [-C] -S --quiet
# paired:
# valid values are NULL, "no", "fr", "rf", "ff"
# in case of NULL, QuasR determines the paired status automatically from the given sampleFile (2 columns:unpaired, 3 columns:paired).
# The orientation for paired end is set to "fr" by default. In case of bam input files, the user needs to specify manually the
# status ("no","fr","rf","ff")
# snpFile need four columns (chrom, pos, ref, alt) with no header
# geneAnnotation should be the path to either a gtf file or a TxDb sqlite file
#' Align reads
#'
#' Create read alignments against reference genome and optional auxiliary
#' targets if not yet existing. If necessary, also build target indices for
#' the aligner.
#'
#' Before generating new alignments, \code{qAlign} looks for previously
#' generated alignments as well as for an aligner index. If no aligner
#' index exists, it will be automatically created and stored in the same
#' directory as the provided fasta file, or as an R package in the case
#' of a BSgenome reference. The name of this R package will be the same
#' as the BSgenome package name, with an additional suffix from the
#' aligner (e.g. \code{BSgenome.Hsapiens.UCSC.hg19.Rbowtie}). The
#' generated bam files contain both aligned und unaligned reads. For
#' paired-end samples, by default no alignments will be reported for
#' read pairs where only one of the reads could be aligned.
#'
#' \code{sampleFile} is a tab-delimited text file listing all the input
#' sequences to be included in a given analysis. The file has either two
#' (single-end) or three columns (paired-end). The first row contains the
#' column names, and additional rows contain relative or absolute path
#' and name of input sequence file(s), as well as the according sample
#' name. Three input file formats are supported (fastq, fasta and bam).
#' All input files in one \code{sampleFile} need to be in the same
#' format, and are recognized by their extension (.fq, .fastq, .fa,
#' .fasta, .fna, .bam), in raw or compressed form (e.g. .fastq.gz). If
#' bam files are provided, then no alignments are generated by
#' \code{qAlign}, and the alignments contained in the bam files will be
#' used instead.
#'
#' The column names in \code{sampleFile} have to match to the ones in the
#' examples below, for a single-read experiment:
#' \tabular{ll}{
#' FileName \tab SampleName \cr
#' chip_1_1.fq.bz2 \tab Sample1 \cr
#' chip_2_1.fq.bz2 \tab Sample2
#' }
#' and for a paired-end experiment:
#' \tabular{lll}{
#' FileName1 \tab FileName2 \tab SampleName \cr
#' rna_1_1.fq.bz2 \tab rna_1_2.fq.bz2 \tab Sample1 \cr
#' rna_2_1.fq.bz2 \tab rna_2_2.fq.bz2 \tab Sample2
#' }
#'
#' The \dQuote{SampleName} column is the human-readable name for each
#' sample that will be used as sample labels. Multiple sequence files may
#' be associated to the same sample name, which instructs \code{QuasR} to
#' combine those files.
#'
#' \code{auxiliaryFile} is a tab-delimited text file listing one or
#' several additional target sequence files in fasta format. Reads that
#' do not map against the reference genome will be aligned against each
#' of these target sequence files. The first row contains the column
#' names which have to match to the ones in the example below:
#' \tabular{ll}{
#' FileName \tab AuxName \cr
#' NC_001422.1.fa \tab phiX174
#' }
#'
#' \code{snpFile} is a tab-delimited text file without a header and
#' contains four columns with chromosome name, position, reference allele
#' and alternative allele, as in the example below:
#' \tabular{llll}{
#' chr1 \tab 8596 \tab G \tab A \cr
#' chr1 \tab 18443 \tab G \tab A \cr
#' chr1 \tab 18981 \tab C \tab T \cr
#' chr1 \tab 19341 \tab G \tab A
#' }
#'
#' The reference and alternative alleles will be injected into the
#' reference genome, resulting in two separate genomes. All reads will be
#' aligned separately to both of these genomes, and the alignments will
#' be combined, only retaining the best alignment for each read. In the
#' final alignment, each read will be marked with a tag that classifies
#' it into reference (\code{R}), alternative (\code{A}) or unknown
#' (\code{U}), if the reads maps equally well to both genomes.
#'
#' If \code{bisulfite} is set to \dQuote{dir} or \dQuote{undir}, reads
#' will be C-to-T converted and aligned to a similarly converted genome.
#'
#' If \code{alignmentParameter} is \code{NULL} (recommended),
#' \code{qAlign} will select default parameters that are suitable for the
#' experiment type. Please note that for bisulfite or allele-specific
#' experiments, each read is aligned multiple times, and resulting
#' alignments need to be combined. This requires special settings for the
#' alignment parameters that are not recommended to be changed. For
#' \sQuote{simple} experiments (neither bisulfite, allele-specific, nor
#' spliced), alignments are generated using the parameters \code{-m
#' maxHits --best --strata}. This will align reads with up to
#' \dQuote{maxHits} best hits in the genome and selects one of them randomly.
#'
#' @param sampleFile The name of a text file listing input sequence files
#' and sample names (see \sQuote{Details}).
#' @param genome The reference genome for primary alignments, one of:
#' \itemize{
#' \item a string referring to a \dQuote{BSgenome} package
#' (e.g. \dQuote{"BSgenome.Hsapiens.UCSC.hg19"}), which will be
#' downloaded automatically from Bioconductor if not present
#' \item the name of a fasta sequence file containing one or
#' several sequences (chromosomes) to be used as a reference. The aligner
#' index will be created when necessary and stored in a default
#' location (see \sQuote{Details}).
#' }
#' @param auxiliaryFile The name of a text file listing sequences to be
#' used as additional targets for alignment of reads not mapping to the
#' reference genome (see \sQuote{Details}).
#' @param aligner selects the aligner program to be used for aligning the
#' reads. Currently, only \dQuote{Rbowtie} and \dQuote{Rhisat2} are supported,
#' which are R wrapper packages for \sQuote{bowtie} / \sQuote{SpliceMap} and
#' \sQuote{hisat2}, respectively (see \code{\link[Rbowtie]{Rbowtie-package}} and
#' \code{\link[Rhisat2]{Rhisat2-package}} packages).
#' @param maxHits sets the maximal number of allowed mapping positions
#' per read (default: 1). If a read produces more than \code{maxHits}
#' alignments, no alignments will be reported for it. In case of a
#' multi-mapping read, a single alignment is randomly selected.
#' @param paired defines the type of paired-end library and can be set to
#' one of \code{no} (single read experiment, default), \code{fr} (fw/rev),
#' \code{ff} (fw/fw) or \code{rf} (rev/fw).
#' @param splicedAlignment If \code{TRUE}, reads will be aligned using a
#' spliced aligner, depending on the value of \code{aligner} described above:
#' \describe{
#' \item{\code{aligner="Rhisat2"}}{: This is the recommended setting
#' for spliced alignments and will use hisat2 from the
#' \code{\link[Rhisat2]{Rhisat2-package}}. See also the \code{geneAnnotation}
#' argument below for providing known exon-exon junctions.}
#' \item{\code{aligner="Rbowtie"}}{: This is not recommended and only
#' available for legacy reasons. It will use SpliceMap to produce spliced
#' alignments (without using a database of known exon-exon junctions).
#' Compared to the alternative alignment modes (non-spliced or spliced using
#' \code{Rhisat2} as aligner), this alignment mode is about ten-fold slower
#' and also less sensitive. Furthermore, SpliceMap can only be used for
#' reads with a minimal length of 50nt; SpliceMap ignores reads that are
#' shorter, and these reads will not be contained in the BAM file,
#' neither as mapped or unmapped reads.}
#' }
#' @param snpFile The name of a text file listing single nucleotide
#' polymorphisms to be used for allele-specific alignment and
#' quantification (see \sQuote{Details}).
#' @param bisulfite For bisulfite-converted samples (Bis-seq), the type
#' of bisulfite library (\dQuote{dir} for directional libraries,
#' \dQuote{undir} for undirectional libraries).
#' @param alignmentParameter An optional string containing command line
#' parameters to be used for the aligner, to overrule the default
#' alignment parameters used by \code{QuasR}. Please use with caution;
#' some alignment parameters may break assumptions made by
#' \code{QuasR}. Default parameters are listed in \sQuote{Details}.
#' @param projectName An optional name for the alignment project.
#' @param alignmentsDir The directory to be used for storing alignments
#' (bam files). If set to \code{NULL} (default), bam files will be
#' generated at the location of the input sequence files.
#' @param lib.loc can be used to change the default library path of
#' R. The library path is used by \code{QuasR} to store aligner index
#' packages created from \code{BSgenome} reference genomes.
#' @param cacheDir specifies the location to store (potentially huge)
#' temporary files. If set to \code{NULL} (default), the temporary
#' directory of the current R session as returned by \code{tempdir()}
#' will be used.
#' @param clObj A cluster object, created by the package \pkg{parallel}, to
#' enable parallel processing and speed up the alignment process.
#' @param checkOnly If \code{TRUE}, prevents the automatic creation of
#' alignments or aligner indices. This allows to quickly check for missing
#' alignment files without starting the potentially long process of
#' their creation. In the case of missing alignments or indices, an
#' exception is thrown.
#' @param geneAnnotation Only used if \code{aligner} is \code{"Rhisat2"}.
#' The path to either a gtf file or a sqlite database generated by exporting
#' a \code{TxDb} object. This file is used to generate a splice site file
#' for \code{Rhisat2}, that will be used to guide the spliced alignment.
#' Please note that if using an sqlite database file, do not use the one
#' contained in the installed package folder of a \code{TxDb} package.
#' \code{QuasR} (through \code{Rhisat2}) creates additional files in the
#' folder which would interfere with the loading of the TxDb package.
#'
#' @export
#'
#' @return A \code{\link[=qProject-class]{qProject}} object.
#'
#' @author Anita Lerch, Dimos Gaidatzis, Charlotte Soneson and Michael Stadler
#' @keywords utilities misc
#'
#' @seealso
#' \code{\link[=qProject-class]{qProject}},
#' \code{\link[parallel]{makeCluster}} from package \pkg{parallel},
#' \code{\link[Rbowtie]{Rbowtie-package}} package,
#' \code{\link[Rhisat2]{Rhisat2-package}} package
#'
#' @name qAlign
#' @aliases qAlign
#'
#' @examples
#' \dontrun{
#' # see qCount, qMeth and qProfile manual pages for examples
#' example(qCount)
#' example(qMeth)
#' example(qProfile)
#' }
#'
qAlign <- function(sampleFile, genome, auxiliaryFile = NULL, aligner = "Rbowtie",
maxHits = 1, paired = NULL, splicedAlignment = FALSE,
snpFile = NULL, bisulfite = "no", alignmentParameter = NULL,
projectName = "qProject", alignmentsDir = NULL, lib.loc = NULL,
cacheDir = NULL, clObj = NULL, checkOnly = FALSE,
geneAnnotation = NULL) {
# check if the user provided a samplefile and a genome
if (missing(sampleFile)) {
stop("Missing 'sampleFile' parameter.")
}
if (missing(genome)) {
stop("Missing 'genome' parameter.")
}
# create a qProject, perform various tests, check for installed
# BSgenome and load aligner packages
# create fasta indices (.fai) files for all the reference sequences
# (genome & aux) as well as all md5 sum files
proj <- createQProject(sampleFile, genome, auxiliaryFile, aligner,
maxHits, paired, splicedAlignment, snpFile,
bisulfite, alignmentParameter, projectName,
alignmentsDir, lib.loc, cacheDir, geneAnnotation)
# display the status of the project. in case of checkOnly=TRUE,
# throw an exception if there are alignment files missing
missingFilesMessage(proj, checkOnly)
# check if there are any genomic alignment files missing. if so,
# create index and align
if (any(is.na(proj@alignments$FileName))) {
# build genome index
if (is.na(proj@snpFile)) {
if (proj@genomeFormat == "file") {
buildIndex(proj@genome, paste(proj@genome, proj@alnModeID, sep = "."),
proj@alnModeID, resolveCacheDir(proj@cacheDir))
} else if (proj@genomeFormat == "BSgenome") {
buildIndexPackage(proj@genome, proj@aligner, proj@alnModeID,
resolveCacheDir(proj@cacheDir), proj@lib.loc)
} else {
stop("Fatal error 45906")
}
} else {
# create indices for the two secondary genomes specified by snps
buildIndexSNP(proj@snpFile, paste(proj@snpFile, proj@alnModeID, sep = "."),
proj@genome, proj@genomeFormat, proj@alnModeID,
resolveCacheDir(proj@cacheDir))
}
# Rhisat2, generate splice site file
if (proj@aligner == "Rhisat2" && !is.na(proj@geneAnnotation)) {
buildSpliceSiteFile(proj@geneAnnotation, proj@geneAnnotationFormat)
}
# align to the genome (qProject gets updated with the alignment file names)
proj <- createGenomicAlignments(proj, clObj)
}
# create indices for the auxiliary files if necessary
# (if there are any alignment files missing)
if (any(is.na(proj@auxAlignments))) {
for (i in seq_len(nrow(proj@aux))) {
buildIndex(proj@aux$FileName[i],
paste(proj@aux$FileName[i], proj@alnModeID, sep = "."),
proj@alnModeID, resolveCacheDir(proj@cacheDir),
checkMD5 = TRUE)
}
proj <- createAuxAlignments(proj, clObj)
}
return(proj)
}
# inspect the qProject and give an overview of all the files that need to be computed
#' @keywords internal
#' @importFrom GenomeInfoDb seqlengths
#' @importFrom Rsamtools scanBamHeader
#' @importFrom utils flush.console installed.packages
#' @importFrom stats na.omit
missingFilesMessage <- function(proj, checkOnly) {
genomeIndexNeedsToBeCreated <- FALSE
spliceSiteFileNeedsToBeCreated <- FALSE
genomicAlignmentsNeedToBeCreated <- sum(is.na(proj@alignments$FileName))
auxAlignmentsNeedToBeCreated <- sum(is.na(proj@auxAlignments))
if (any(is.na(proj@alignments$FileName))) {
if (is.na(proj@snpFile)) {
if (proj@genomeFormat == "file") {
indexPath <- paste(proj@genome, proj@alnModeID, sep = ".")
if (file.exists(indexPath)) {
if (!("ref_md5Sum.txt" %in% dir(indexPath))) {
genomeIndexNeedsToBeCreated <- TRUE
}
} else {
genomeIndexNeedsToBeCreated <- TRUE
}
} else if (proj@genomeFormat == "BSgenome") {
indexPackageName <- paste(proj@genome, proj@alnModeID, sep = ".")
if (!(indexPackageName %in% utils::installed.packages()[, 'Package'])) {
genomeIndexNeedsToBeCreated <- TRUE
}
} else {
stop("'genomeFormat' must be 'file' or 'BSgenome'")
}
} else {
indexPathR <- paste(proj@snpFile, basename(proj@genome),
"R", "fa", proj@alnModeID, sep = ".")
indexPathA <- paste(proj@snpFile, basename(proj@genome),
"A", "fa", proj@alnModeID, sep = ".")
if (file.exists(indexPathR) & file.exists(indexPathA)) {
if (!("ref_md5Sum.txt" %in% dir(indexPathR)) |
!("ref_md5Sum.txt" %in% dir(indexPathA))) {
genomeIndexNeedsToBeCreated <- TRUE
}
} else {
genomeIndexNeedsToBeCreated <- TRUE
}
}
## Splice site file
if (!is.null(proj@geneAnnotation) && !is.na(proj@geneAnnotation)) {
if (!file.exists(paste0(proj@geneAnnotation, ".SpliceSites.txt"))) {
spliceSiteFileNeedsToBeCreated <- TRUE
}
}
}
## for non-bam projects, check if sequence names and lengths in pre-existing
## bam files are consistent with BSgenome reference
if (proj@samplesFormat != "bam" && any(!is.na(proj@alignments$FileName)) &&
proj@genomeFormat == "BSgenome") {
refchrs <- GenomeInfoDb::seqlengths(get(proj@genome))
refchrs <- refchrs[order(names(refchrs))]
bamchrsL <- lapply(Rsamtools::scanBamHeader(
stats::na.omit(proj@alignments$FileName)),
function(bh) {
x <- bh$targets
x[order(names(x))]
})
ok <- unlist(lapply(bamchrsL, function(x) identical(x, refchrs)))
if (!all(ok)) {
warnfiles <- stats::na.omit(proj@alignments$FileName)[!ok]
warning("The genome in ", proj@genome,
" has changed since some bam files\n",
"in this project were generated.\n",
"The following pre-existing bam files were generated for a\n",
"genome that is inconsistent with the current verion of ",
proj@genome, ":\n ",
paste(warnfiles, collapse = "\n "))
}
}
if (!genomeIndexNeedsToBeCreated & !spliceSiteFileNeedsToBeCreated &
genomicAlignmentsNeedToBeCreated == 0 & auxAlignmentsNeedToBeCreated == 0) {
message("all necessary alignment files found")
} else {
message("alignment files missing - need to:")
if (genomeIndexNeedsToBeCreated) {
message(" create alignment index for the genome")
}
if (spliceSiteFileNeedsToBeCreated) {
message(" create splice site file for gene annotation")
}
if (genomicAlignmentsNeedToBeCreated > 0) {
message(" create ", genomicAlignmentsNeedToBeCreated,
" genomic alignment(s)")
}
if (auxAlignmentsNeedToBeCreated > 0) {
message(" create ", auxAlignmentsNeedToBeCreated,
" auxiliary alignment(s)")
}
if (checkOnly) {
stop("alignment or index files are missing and checkOnly=TRUE. ",
"Aborting execution", call. = FALSE)
}
if (interactive()) {
message("will start in ", appendLF = FALSE); utils::flush.console()
for (i in 9:1) {
message("..", i, "s", appendLF = FALSE); utils::flush.console()
Sys.sleep(1)
}
message("")
}
}
}
# read all the input information, perform various checks and compile the data into a qProject object
#' @keywords internal
#' @importFrom tools file_path_as_absolute file_ext file_path_sans_ext
#' @importFrom BSgenome available.genomes
#' @importFrom ShortRead FastqStreamer yield
#' @importFrom Biostrings quality
#' @importFrom Biobase testBioCConnection
#' @importFrom methods new is
#' @importFrom utils installed.packages read.delim
createQProject <- function(sampleFile, genome, auxiliaryFile, aligner,
maxHits, paired, splicedAlignment,
snpFile, bisulfite, alignmentParameter,
projectName, alignmentsDir,
lib.loc, cacheDir, geneAnnotation) {
# instantiate a qProject
proj <- methods::new("qProject")
# -----------DETERMINE THE FORMAT OF THE GENOME AND DOWNLOAD FROM BIOCONDUCTOR IF NEEDED -------------
if (length(genome) != 1) {
stop("The genome parameter should contain only a single entry", call. = FALSE)
}
# if there exists a / or \\ at the end of the genome string, remove it.
# this is required for the consistent behavior of file.exists on windows
# systems. file.exists("dir/") != file.exists("dir").
genome <- sub("(\\\\|/)$", "", genome)
if (file.exists(genome)) {
# genome is a directory or a file
if (!(file.info(genome)$isdir)) {
# genome is a file, check if it is a fasta file
if (consolidateFileExtensions(genome) == "fasta") {
proj@genome <- tools::file_path_as_absolute(genome)
proj@genomeFormat <- "file"
} else {
stop("The specified genome ", genome,
" does not have the extension of a fasta file (fa,fasta,fna)",
call. = FALSE)
}
} else {
# genome is a directory
stop("The specified genome has to be a file and not a directory: ",
genome, call. = FALSE)
}
} else {
# check if the genome is a BSgenome.
if (genome %in% utils::installed.packages(lib.loc = lib.loc)[, 'Package']) {
# BSgenome is already installed, load it
if (!require(genome, character.only = TRUE, quietly = TRUE,
lib.loc = c(lib.loc, .libPaths()))) {
stop("The BSgenome ", genome,
" is installed but cannot be loaded. ",
"The version of the BSgenome might be outdated ",
"(verify using BiocManager::valid()).",
call. = FALSE)
}
proj@genome <- genome
proj@genomeFormat <- "BSgenome"
} else {
message("The specified genome is not a fasta file or an installed BSgenome.")
message("Connecting to Bioconductor to check for available genomes ",
"(internet connection required)...", appendLF = FALSE)
if (Biobase::testBioCConnection()) {
# Connection to Bioconductor OK
message("OK")
if (genome %in% BSgenome::available.genomes()) {
# The genome is available in Bioconductor,
# request installation
message(genome, " is available from Bioconductor.")
stop("Please install ", genome, " using:\n",
" BiocManager::install(\"", genome, "\")\n",
"before calling qAlign.", call. = FALSE)
} else {
# The genome is not available in Bioconductor
stop(genome,
" is not available from Bioconductor. ",
"Use BSgenome::available.genomes() for a complete list.",
call. = FALSE)
}
} else {
# No connection to Bioconductor
stop("Could not check if ", genome,
" is available from Bioconductor.\n",
"Use BSgenome::available.genomes() for a complete list,\n",
"and install ", genome, " using:\n",
" BiocManager::install(\"", genome, "\")\n",
"before calling qAlign.", call. = FALSE)
}
}
}
# --------------------------------- PROCESS THE GENOME ANNOTATION ---------------------------------
if (!is.null(geneAnnotation)) {
if (methods::is(geneAnnotation, "character") && length(geneAnnotation) == 1 &&
file.exists(geneAnnotation) &&
tools::file_ext(geneAnnotation) %in% c("gtf", "gff", "gff3", "sqlite")) {
if (tools::file_ext(geneAnnotation) == "sqlite") {
proj@geneAnnotationFormat <- "TxDb"
} else {
proj@geneAnnotationFormat <- "gtf"
}
} else {
stop("'geneAnnotation' must be a path to an existing sqlite, gtf or gff file.")
}
proj@geneAnnotation <- tools::file_path_as_absolute(geneAnnotation)
} else {
proj@geneAnnotation <- NA_character_
proj@geneAnnotationFormat <- NA_character_
}
# ---------------------------------------- PARSE THE PAIRED PARAMETER ---------------------------------
if (!is.null(paired)) {
if (!(paired %in% c("no", "fr", "rf", "ff"))) {
stop("The parameter paired supports only NULL, 'no', 'fr', 'rf' and 'ff'",
call. = FALSE)
}
}
# ---------------------------------------- PARSE THE SAMPLE FILE -------------------------------------
if (!file.exists(sampleFile)) {
stop("Cannot open ", sampleFile, call. = FALSE)
}
samples <- utils::read.delim(sampleFile, header = TRUE, colClasses = "character")
if (nrow(samples) == 0) {
stop(sampleFile," is either empty or there is a header missing", call. = FALSE)
}
# get absolute path of the sample.txt file
sampleFileAbsolutePath <- dirname(tools::file_path_as_absolute(sampleFile))
# Check if the reads are single end
if (ncol(samples) == 2) {
if (all(colnames(samples) == c("FileName", "SampleName"))) {
# this is a valid single end samples file. check if listed files exist
for (i in seq_len(nrow(samples))) {
pathRet <- pathAsAbsoluteRedirected(samples$FileName[i],
sampleFileAbsolutePath)
if (!is.null(pathRet)) {
if (samples$SampleName[i] == "") {
stop(samples$FileName[i], " listed in ", sampleFile,
" has no sample name", call. = FALSE)
}
samples$FileName[i] <- pathRet
} else {
stop(samples$FileName[i], " listed in ", sampleFile,
" does not exist", call. = FALSE)
}
}
# determine format of the files (fa,fasta,fna)
proj@samplesFormat <- determineSamplesFormat(samples$FileName)
if (proj@samplesFormat == "bam") {
# samples are provided as bam files
if (is.null(paired)) {
stop("Paired status of the provided bam files cannot be ",
"determined automatically, please set the paired parameter",
call. = FALSE)
}
if (length(unique(samples$FileName)) != nrow(samples)) {
stop("There are duplicate files in sampleFile. ",
"This is not allowed.", call. = FALSE)
}
if (paired == "no") {
proj@reads <- samples
proj@reads$FileName <- NA_character_
} else {
proj@reads <- data.frame(FileName1 = NA_character_,
FileName2 = NA_character_,
SampleName = samples$SampleName,
stringsAsFactors = FALSE)
}
proj@paired <- paired
proj@alignments <- samples
} else {
# samples are provided as reads
if (!is.null(paired)) {
if (paired != "no") {
stop(sampleFile,
" contains 2 columns which represents unpaired end data.",
" Please reset the paired parameter",
call. = FALSE)
}
}
if (length(unique(samples$FileName)) != nrow(samples)) {
stop("There are duplicate files in sampleFile. ",
"This is not allowed as it would result in ",
"non-unique alignment file names",
call. = FALSE)
}
proj@paired <- "no"
proj@reads <- samples
proj@alignments <- samples
proj@alignments$FileName <- NA_character_
}
} else {
# incorrect format
stop(sampleFile,
" should contain column names FileName,SampleName (single end data)",
call. = FALSE)
}
# Check if the reads are paired data
} else if (ncol(samples) == 3) {
if (all(colnames(samples) == c("FileName1", "FileName2", "SampleName"))) {
# this is a valid paired end samples file. check if listed files exist
for (i in seq_len(nrow(samples))) {
pathRet <- pathAsAbsoluteRedirected(samples$FileName1[i],
sampleFileAbsolutePath)
if (!is.null(pathRet)) {
if (samples$SampleName[i] == "") {
stop(samples$FileName1[i], " listed in ",
sampleFile, " has no sample name", call. = FALSE)
}
samples$FileName1[i] <- pathRet
} else {
stop(samples$FileName1[i], " listed in ", sampleFile,
" does not exist", call. = FALSE)
}
}
for (i in seq_len(nrow(samples))) {
pathRet <- pathAsAbsoluteRedirected(samples$FileName2[i],
sampleFileAbsolutePath)
if (!is.null(pathRet)) {
if (samples$SampleName[i] == "") {
stop(samples$FileName2[i], " listed in ",
sampleFile, " has no sample name", call. = FALSE)
}
samples$FileName2[i] <- pathRet
} else {
stop(samples$FileName2[i], " listed in ",
sampleFile, " does not exist", call. = FALSE)
}
}
# determine format of the files (fa,fasta,fna)
proj@samplesFormat <- determineSamplesFormat(c(samples$FileName1,
samples$FileName2))
if (proj@samplesFormat == "bam") {
stop("Bam files need to be listed in a two column file: ",
sampleFile, call. = FALSE)
}
if (!is.null(paired)) {
if (paired == "no") {
stop(sampleFile,
" contains 3 columns which represents paired end data. ",
"Please reset the paired parameter",
call. = FALSE)
}
}
if (length(unique(c(samples$FileName1, samples$FileName2))) !=
(2 * nrow(samples))) {
stop("There are duplicate files in sampleFile. ",
"This is not allowed as it would result in ",
"non-unique alignment file names",
call. = FALSE)
}
if (is.null(paired)) {
proj@paired <- "fr"
} else {
proj@paired <- paired
}
proj@reads <- samples
proj@alignments <- data.frame(FileName = NA_character_,
SampleName = samples$SampleName,
stringsAsFactors = FALSE)
} else {
# incorrect format
stop(sampleFile,
" should contain the column names FileName1,FileName2,SampleName (paired end data)",
call. = FALSE)
}
# The format of the file is not supported
} else {
stop(sampleFile,
" should be a tab delimited file with either 2 or 3 columns",
call. = FALSE)
}
# --------------- FOR FASTQ FILES, READ A SMALL CHUNK TO GUESS THE QUALITY FORMAT (phred33 or phred64) -------
if (proj@samplesFormat == "fastq") {
# add an additional quality column to the reads table
proj@reads <- data.frame(proj@reads, phred = NA_character_,
stringsAsFactors = FALSE)
# switch off omp parallelization in ShortRead::FastqStreamer
nthreads <- .Call(ShortRead:::.set_omp_threads, 1L)
for (i in seq_len(nrow(proj@reads))) {
# take only first column even for paired end data
fastq_fs <- ShortRead::FastqStreamer(proj@reads[i, 1], n = 2000,
readerBlockSize = 5e5)
tryCatch({
fastq_sq <- ShortRead::yield(fastq_fs)
}, error = function(ex) {
close(fastq_fs)
stop("Incorrect format for the fastq file: ",
proj@reads[i, 1], call. = FALSE)
})
close(fastq_fs)
# determine the format of the quality values
if (inherits(Biostrings::quality(fastq_sq), "FastqQuality")) {
proj@reads$phred[i] <- "33"
} else if (inherits(Biostrings::quality(fastq_sq), "SFastqQuality")) {
proj@reads$phred[i] <- "64"
} else {
stop("The quality values of the provided sequences files are not interpretable: ",
proj@reads[i, 1], call. = FALSE)
}
}
# switch back on omp parallelization in ShortRead::FastqStreamer
.Call(ShortRead:::.set_omp_threads, nthreads)
}
# -------------------- CALCULATE MD5 SUB SUMS FOR ALL READ FILES ----------------------------------------
if (proj@samplesFormat != "bam") {
if (proj@paired == "no") {
proj@reads_md5subsum <- data.frame(
md5subsum = md5subsum(proj@reads$FileName),
stringsAsFactors = FALSE
)
} else {
proj@reads_md5subsum <- data.frame(
md5subsum1 = md5subsum(proj@reads$FileName1),
md5subsum2 = md5subsum(proj@reads$FileName2),
stringsAsFactors = FALSE)
}
} else {
if (proj@paired == "no") {
proj@reads_md5subsum <- as.data.frame(
matrix(NA_character_, nrow = nrow(proj@reads), ncol = 1),
stringsAsFactors = FALSE
)
colnames(proj@reads_md5subsum) <- "md5subsum"
} else {
proj@reads_md5subsum <- as.data.frame(
matrix(NA_character_, nrow = nrow(proj@reads), ncol = 2),
stringsAsFactors = FALSE
)
colnames(proj@reads_md5subsum) <- c("md5subsum1", "md5subsum2")
}
}
# ----------------------------------- PARSE THE SNP FILE ------------------------------------
if (!is.null(snpFile)) {
if (!file.exists(snpFile)) {
stop("Cannot open ", snpFile, call. = FALSE)
}
proj@snpFile <- tools::file_path_as_absolute(snpFile)
} else {
proj@snpFile <- NA_character_
}
# ---------------------------------------- PARSE THE AUXILIARY FILE -------------------------------------
if (!is.null(auxiliaryFile)) {
if (proj@samplesFormat == "bam") {
stop("The option 'auxiliaryFile' is not supported for bam input files",
call. = FALSE)
}
if (!is.na(proj@snpFile)) {
stop("The option 'auxiliaryFile' is not supported in allelic mode",
call. = FALSE)
}
if (!file.exists(auxiliaryFile)) {
stop("Cannot open ", auxiliaryFile, call. = FALSE)
}
auxiliaries <- utils::read.delim(auxiliaryFile, header = TRUE,
colClasses = "character")
if (nrow(auxiliaries) == 0) {
stop(auxiliaryFile, " is either empty or there is a header missing",
call. = FALSE)
}
# get absolute path of the auxiliary.txt file
auxFileAbsolutePath <- dirname(tools::file_path_as_absolute(auxiliaryFile))
if (ncol(auxiliaries) == 2) {
if (all(colnames(auxiliaries) == c("FileName", "AuxName"))) {
# this is a valid auxiliaries file. check if listed files exist
for (i in seq_len(nrow(auxiliaries))) {
pathRet <- pathAsAbsoluteRedirected(auxiliaries$FileName[i],
auxFileAbsolutePath)
if (!is.null(pathRet)) {
if (auxiliaries$AuxName[i] == "") {
stop(auxiliaries$FileName[i], " listed in ",
auxiliaryFile, " has no name", call. = FALSE)
}
auxiliaries$FileName[i] <- pathRet
} else {
stop(auxiliaries$FileName[i], " listed in ",
auxiliaryFile, " does not exist", call. = FALSE)
}
}
# test the file extensions
allAuxFileExts <- unique(consolidateFileExtensions(auxiliaries$FileName))
if (!all(consolidateFileExtensions(auxiliaries$FileName) == "fasta")) {
stop("All auxiliary files need to have a fasta extension (fa,fasta,fna)", call. = FALSE)
}
if (length(unique(auxiliaries$AuxName)) != nrow(auxiliaries)) {
stop("All auxiliary files need to have a unique AuxName",
call. = FALSE)
}
# store the auxiliary files
proj@aux <- auxiliaries
} else {
# incorrect format
stop(auxiliaryFile,
" should contain column names FileName,AuxName",
call. = FALSE)
}
} else {
stop(auxiliaryFile,
" should be a tab delimited file with 2 columns", call. = FALSE)
}
# Fill data into auxAlignments
auxAlignments <- data.frame(
matrix(NA_character_, nrow = nrow(auxiliaries), ncol = nrow(proj@reads)),
stringsAsFactors = FALSE
)
rownames(auxAlignments) <- auxiliaries$AuxName
colnames(auxAlignments) <- proj@reads$SampleName
proj@auxAlignments <- auxAlignments
}
# ----------------------------------- PARSE BISULFITE PARAMETER -----------------------------------
if (bisulfite %in% c("no", "dir", "undir")) {
proj@bisulfite <- bisulfite
if (bisulfite != "no" && !(proj@paired %in% c("no", "fr"))) {
stop("Bisulfite paired-end mode only supports pair orientation 'fr'",
call. = FALSE)
}
} else {
stop("Bisulfite mode only supports 'no', 'dir' and 'undir'", call. = FALSE)
}
# ----------------------------------- PARSE MAXHITS PARAMETER --------------------------------------
proj@maxHits <- maxHits
#------------------------------------ PARSE THE PROJECT NAME ----------------------------------
if (is.null(projectName)) {
proj@projectName <- NA_character_
} else {
if (length(projectName) == 1) {
proj@projectName <- projectName
} else {
stop("The projectName should contain only a single entry",
call. = FALSE)
}
}
# ---------------------------------- PARSE THE BAMFILE DIRECTORY --------------------------
if (is.null(alignmentsDir)) {
proj@alignmentsDir <- NA_character_
} else {
alignmentsDir <- sub("(\\\\|/)$", "", alignmentsDir) # for windows systems
if (file.exists(alignmentsDir)) {
# it's a directory or a file
if (file.info(alignmentsDir)$isdir) {
proj@alignmentsDir <- tools::file_path_as_absolute(alignmentsDir)
} else {
stop("alignmentsDir ", alignmentsDir,
" is not a directory", call. = FALSE)
}
} else {
stop("alignmentsDir ", alignmentsDir, " does not exist", call. = FALSE)
}
}
# ---------------------------------- PARSE THE LIB.LOC DIRECTORY --------------------------
if (is.null(lib.loc)) {
proj@lib.loc <- NA_character_
} else {
lib.loc <- sub("(\\\\|/)$", "", lib.loc) # for windows systems
if (file.exists(lib.loc)) {
# it's a directory or a file
if (file.info(lib.loc)$isdir) {
proj@lib.loc <- tools::file_path_as_absolute(lib.loc)
} else {
stop("lib.loc ", lib.loc, " is not a directory", call. = FALSE)
}
} else {
stop("lib.loc ", lib.loc, " does not exist", call. = FALSE)
}
}
# ---------------------------------- PARSE THE CACHE DIRECTORY --------------------------
if (is.null(cacheDir)) {
proj@cacheDir <- NA_character_
} else {
cacheDir <- sub("(\\\\|/)$", "", cacheDir) # for windows systems
if (file.exists(cacheDir)) {
# it's a directory or a file
if (file.info(cacheDir)$isdir) {
# if not already present, create a subdirectory in the cacheDir,
# this prevent collisions
# when the the same cacheDir is being used by multiple users
proj@cacheDir <- file.path(tools::file_path_as_absolute(cacheDir),
basename(tempdir()))
if (!file.exists(proj@cacheDir)) {
if (!dir.create(path = proj@cacheDir, showWarnings = FALSE)) {
stop("No permissions to write in the cacheDir: ",
proj@cacheDir, call. = FALSE)
}
}
} else {
stop("cacheDir ", cacheDir, " is not a directory", call. = FALSE)
}
} else {
stop("cacheDir ", cacheDir, " does not exist", call. = FALSE)
}
}
# --------------- SET THE ALIGNERMODE ID AND LOAD THE ALIGNER PACKAGE IF REQUIRED --------------------
supportedAligners <- c("Rbowtie", "Rhisat2")
if (aligner %in% supportedAligners) {
proj@aligner <- aligner
} else {
stop("The specified aligner is unknown, please select one of the following: ",
paste(supportedAligners, collapse = ","), call. = FALSE)
}
if ((proj@samplesFormat %in% c("fasta", "fastq")) & (proj@bisulfite == "no")) {
alnModeID <- proj@aligner
} else if ((proj@samplesFormat %in% c("fasta", "fastq")) &
(proj@bisulfite != "no")) {
if (aligner == "Rhisat2") {
stop("Bisulfite alignment mode is not available for Rhisat2")
}
alnModeID <- paste(proj@aligner, "CtoT", sep = "")
} else if ((proj@samplesFormat %in% c("csfasta", "csfastq")) &
proj@bisulfite == "no") {
if (aligner == "Rhisat2") {
stop("Color space alignment mode is not available for Rhisat2")
}
alnModeID <- paste(proj@aligner, "Cs", sep = "")
} else if ((proj@samplesFormat %in% c("csfasta","csfastq")) &
proj@bisulfite != "no") {
stop("Bisulfite alignment mode is not available for color space reads")
} else if (proj@samplesFormat == "bam") {
alnModeID <- NA_character_
proj@aligner <- NA_character_
} else {
stop("Fatal error 2340923")
}
proj@alnModeID <- alnModeID
# load the aligner package in the case where there are missing
# alignments (genomic or aux)
if (any(is.na(proj@alignments$FileName)) | any(is.na(proj@auxAlignments))) {
pkgname <- aligner
if (!requireNamespace(pkgname, quietly = TRUE)) {
stop("The ", pkgname, " package is required for alignments, but not ",
"installed.\nInstall it using BiocManager::install(\"", pkgname, "\")",
call. = FALSE)
}
# these test are not needed while Rbowtie is in "Depends"
#if(!(pkgname %in% installed.packages()[,"Package"])){stop(pkgname, " package is required for the alignments but not installed on this system",call.=FALSE)}
#if(!require(pkgname, character.only=TRUE, quietly=TRUE)){stop("Fatal error 340954")}
#pkg_version <- as.character(packageVersion(pkgname))
#QuasR_suggests <- unlist(strsplit(installed.packages()['QuasR','Suggests'], ","))
#QuasR_suggests_pkg <- grep(pkgname, QuasR_suggests, value=T)
}
#------------------------------------ PARSE SPLICED ALIGNMENT PARAMETER ---------------------------
proj@splicedAlignment <- splicedAlignment
if (proj@splicedAlignment & (proj@bisulfite != "no")) {
stop("The spliced alignment mode is not supported for bisulfite samples")
}
if (!is.na(proj@aligner) && proj@aligner == "Rbowtie" && proj@splicedAlignment &&
!(proj@paired %in% c("no", "fr"))) {
stop("The spliced alignment mode with Rbowtie only supports the pair orientation 'fr'")
}
#------------------------------------ PARSE THE ALIGNMENT PARAMETERS ----------------------------------
if (is.na(proj@aligner)) {
proj@alignmentParameter <- ""
} else {
if (is.null(alignmentParameter)) {
if (!proj@splicedAlignment) {
if (proj@aligner == "Rbowtie") { # bowtie
# Test for the case where no merge reorder is going to be
# executed later on. In that case maxhits needs to
# to be reinforced by bowtie.
if ((proj@bisulfite == "no") && (is.na(proj@snpFile))) {
proj@alignmentParameter <-
paste("-m", proj@maxHits, "--best --strata")
} else {
proj@alignmentParameter <-
paste("-k", proj@maxHits + 1, "--best --strata")
}
# For the allelic case, ignore qualities. Anyways the
# assignment to the R or A genome is based on sequence.
if ((proj@samplesFormat == "fasta") || !is.null(snpFile)) {
proj@alignmentParameter <-
paste(proj@alignmentParameter, "-v 2")
}
if (proj@paired != "no") {
proj@alignmentParameter <-
paste(proj@alignmentParameter, "--maxins 500")
}
} else { # hisat2
if (is.na(proj@snpFile)) {
proj@alignmentParameter <-
paste("-k", proj@maxHits + 1)
} else {
# XV tag is not added to singly aligned reads and
# will make qCount fail
proj@alignmentParameter <-
paste("-k", proj@maxHits + 1, "--no-mixed --no-discordant")
}
}
} else {
if (proj@aligner == "Rbowtie") { # spliced, bowtie
if (is.na(proj@snpFile)) {
proj@alignmentParameter <-
"-max_intron 400000 -min_intron 20000 -max_multi_hit 10 -selectSingleHit TRUE -seed_mismatch 1 -read_mismatch 2 -try_hard yes"
} else {
proj@alignmentParameter <-
"-max_intron 400000 -min_intron 20000 -max_multi_hit 10 -selectSingleHit FALSE -seed_mismatch 1 -read_mismatch 2 -try_hard yes"
}
} else { # spliced, hisat2
if (!is.na(proj@geneAnnotation)) {
proj@alignmentParameter <-
paste("-k", proj@maxHits + 1,
"--known-splicesite-infile",
paste0(proj@geneAnnotation, ".SpliceSites.txt"))
} else {
proj@alignmentParameter <-
paste("-k", proj@maxHits + 1)
}
if (!is.na(proj@snpFile)) {
# XV tag is not added to singly aligned reads and
# will make qCount fail
proj@alignmentParameter <-
paste(proj@alignmentParameter, "--no-mixed --no-discordant")
}
}
}
} else {
if (length(alignmentParameter) == 1) {
proj@alignmentParameter <- alignmentParameter
} else {
stop("The alignmentParameter should contain only a ",
"single character string",
call. = FALSE)
}
}
}
# ---------------------------------- CREATE .fai AND .md5 FILES --------------------------
# create fasta indices (.fai) files for all the reference sequences (genome & aux)
# create all the .md5 files necessary to identify preexisting bam files
createReferenceSequenceIndices(proj)
# --------------------- SEARCH FOR BAM FILES THAT HAVE BEEN CREATED PREVIOUSLY ----------------------
# search for genomic alignments
for (i in seq_len(nrow(proj@reads))) {
if (is.na(proj@alignments$FileName[i])) {
projBamInfo <- bamInfoOnlyBaseName(qProjectBamInfo(proj, i))
if (is.na(proj@alignmentsDir)) {
bamDir <- dirname(proj@reads[i, 1])
} else {
bamDir <- proj@alignmentsDir
}
samplePrefix <- basename(tools::file_path_sans_ext(proj@reads[i, 1],
compression = TRUE))
filesInBamDir <- list.files(bamDir, pattern = ".bam$")
bamFilesToInspect <- filesInBamDir[sub("\\_[^\\_]+.bam$", "",
filesInBamDir) == samplePrefix]
bamTxtFilesToInspect <- paste(file.path(bamDir, bamFilesToInspect),
"txt", sep = ".")
bamTxtFilesToInspectExist <-
bamTxtFilesToInspect[file.exists(bamTxtFilesToInspect)]
bamFilesToInspectWithTxt <-
file.path(bamDir, bamFilesToInspect[file.exists(bamTxtFilesToInspect)])
compatibleBamFileInd <- NULL
if (length(bamTxtFilesToInspectExist) > 0) {
for (m in seq_len(length(bamTxtFilesToInspectExist))) {
bamInfoT_DF <- utils::read.delim(bamTxtFilesToInspectExist[m],
header = FALSE, row.names = 1,
stringsAsFactors = FALSE)
bamInfoT <- bamInfoT_DF[, 1]
names(bamInfoT) <- rownames(bamInfoT_DF)
bamInfoT <- bamInfoOnlyBaseName(bamInfoT)
# compare the actual parameters to the one from the bam file on disk
## Exclude slots related to the geneAnnotation if the aligner is Rbowtie
if (proj@aligner == "Rbowtie") {
projBamInfo <-
projBamInfo[!(names(projBamInfo) %in%
c("geneAnnotation",
"geneAnnotationFormat",
"geneAnnotation.md5"))]
bamInfoT <-
bamInfoT[!(names(bamInfoT) %in%
c("geneAnnotation",
"geneAnnotationFormat",
"geneAnnotation.md5"))]
}
if (identical(projBamInfo, bamInfoT)) {
compatibleBamFileInd[length(compatibleBamFileInd) + 1] <- m
}
}
if (length(compatibleBamFileInd) > 1) {
for (k in seq_len(length(compatibleBamFileInd))) {
message(bamFilesToInspectWithTxt[k])
}
stop("Multiple bam files exist with same alignment parameters (see above list). QuasR is unable to decide which one to use. Please delete manually the respective bam files", call. = FALSE)
}
if (length(compatibleBamFileInd) == 1) {
proj@alignments$FileName[i] <-
bamFilesToInspectWithTxt[compatibleBamFileInd]
}
}
}
}
# search for aux alignments
if (nrow(proj@auxAlignments) > 0) {
for (i in seq_len(ncol(proj@auxAlignments))) {
for (j in seq_len(nrow(proj@auxAlignments))) {
projBamInfo <- bamInfoOnlyBaseName(qProjectBamInfo(proj, i, j))
if (is.na(proj@auxAlignments[j, i])) {
if (is.na(proj@alignmentsDir)) {
bamDir <- dirname(proj@reads[i, 1])
} else {
bamDir <- proj@alignmentsDir
}
samplePrefix <- basename(
tools::file_path_sans_ext(proj@reads[i, 1], compression = TRUE))
filesInBamDir <- list.files(bamDir, pattern = ".bam$")
bamFilesToInspect <- filesInBamDir[sub("\\_[^\\_]+.bam$", "",
filesInBamDir) == samplePrefix]
bamTxtFilesToInspect <- paste(file.path(bamDir, bamFilesToInspect),
"txt", sep = ".")
bamTxtFilesToInspectExist <-
bamTxtFilesToInspect[file.exists(bamTxtFilesToInspect)]
bamFilesToInspectWithTxt <-
file.path(bamDir,
bamFilesToInspect[file.exists(bamTxtFilesToInspect)])
compatibleBamFileInd <- NULL
if (length(bamTxtFilesToInspectExist) > 0) {
for (m in seq_len(length(bamTxtFilesToInspectExist))) {
bamInfoT_DF <- utils::read.delim(bamTxtFilesToInspectExist[m],
header = FALSE, row.names = 1,
stringsAsFactors = FALSE)
bamInfoT <- bamInfoT_DF[, 1]
names(bamInfoT) <- rownames(bamInfoT_DF)
bamInfoT <- bamInfoOnlyBaseName(bamInfoT)
# compare the actual parameters to the one from the bam file on disk
if (proj@aligner == "Rbowtie") {
projBamInfo <-
projBamInfo[!(names(projBamInfo) %in%
c("geneAnnotation",
"geneAnnotationFormat",
"geneAnnotation.md5"))]
bamInfoT <-
bamInfoT[!(names(bamInfoT) %in%
c("geneAnnotation",
"geneAnnotationFormat",
"geneAnnotation.md5"))]
}
if (identical(projBamInfo, bamInfoT)) {
compatibleBamFileInd[length(compatibleBamFileInd) + 1] <- m
}
}
if (length(compatibleBamFileInd) > 1) {
for (k in seq_len(length(compatibleBamFileInd))) {
message(bamFilesToInspectWithTxt[k])
}
stop("Multiple bam files exist with same alignment parameters (see above list). QuasR is unable to decide which one to use. Please delete manually the respective bam files", call. = FALSE)
}
if (length(compatibleBamFileInd) == 1) {
proj@auxAlignments[j, i] <-
bamFilesToInspectWithTxt[compatibleBamFileInd]
}
}
}
}
}
}
return(proj)
}
# create search index (.fai) files for all the reference sequences that are
# going to be used for the alignments (genome & aux)
# do nothing if the fai files exist already
# create an .md5 file that contains the md5sum of the referece sequences
# if snp file exists, calculate md5 and store it in a (.md5) file
#' @keywords internal
#' @importFrom Biostrings fasta.seqlengths
#' @importFrom Rsamtools indexFa scanFaIndex
#' @importFrom GenomeInfoDb seqnames
#' @importFrom tools md5sum
#' @importFrom utils write.table
createReferenceSequenceIndices <- function(proj) {
# create fasta index for the genome if it is not a BSgenome and if the .fai file is not present
if (proj@genomeFormat == "file") {
if (!file.exists(paste(proj@genome, "fai", sep = "."))) {
# test if the sequence file is a valid fasta file using the
# fasta.seqlengths() function from Biostrings
# this is necessary because indexFa() from Rsamtools would
# crash R if it is passed an invalid fasta file
message(paste("Creating .fai file for:", proj@genome))
result <- try(Biostrings::fasta.seqlengths(proj@genome))
if (is(result, "try-error")) {
stop("The fasta file ", proj@genome,
" is not a valid fasta file", call. = FALSE)
}
# remove everything after the white space after the id
# (not done by fasta.seqlengths)
faInfoNames <- sub("\\s+.+", "", names(result), perl = TRUE)
if (length(unique(faInfoNames)) != length(faInfoNames)) {
stop("Sequence names in the file: ", proj@genome,
" are not unique", call. = FALSE)
}
if (is(try(Rsamtools::indexFa(proj@genome)), "try-error")) {
stop("Cannot write into the directory where ", proj@genome,
" is located. Make sure you have the right permissions",
call. = FALSE)
}
} else {
faiSeqNames <- as.character(GenomeInfoDb::seqnames(
Rsamtools::scanFaIndex(proj@genome)))
if (length(unique(faiSeqNames)) != length(faiSeqNames)) {
stop("Sequence names in the file: ", proj@genome,
" are not unique (information extracted from the .fai file",
call. = FALSE)
}
}
# create md5 sum file
if (!file.exists(paste(proj@genome, "md5", sep = "."))) {
utils::write.table(tools::md5sum(proj@genome),
paste(proj@genome, "md5", sep = "."),
sep = "\t", quote = FALSE, col.names = FALSE,
row.names = FALSE)
}
}
# create fasta index for the auxilliaries if not present already
if (nrow(proj@aux) > 0) {
for (i in seq_len(nrow(proj@aux))) {
if (!file.exists(paste(proj@aux$FileName[i], "fai", sep = "."))) {
# test if the sequence file is a valid fasta file using the
# fasta.seqlengths() function from Biostrings
# this is necessary because indexFa() from Rsamtools would
# crash R if it is passed an invalid fasta file (R version 2.14.1)
result <- try(Biostrings::fasta.seqlengths(proj@aux$FileName[i]))
if (is(result, "try-error")) {
stop("The fasta file ", proj@aux$FileName[i],
" is not a valid fasta file", call. = FALSE)
}
# remove everything after the white space after the id
# (not done by fasta.seqlengths)
faInfoNames <- sub("\\s+.+", "", names(result), perl = TRUE)
if (length(unique(faInfoNames)) != length(faInfoNames)) {
stop("Sequence names in the file: ", proj@aux$FileName[i],
" are not unique", call. = FALSE)
}
if (is(try(Rsamtools::indexFa(proj@aux$FileName[i])),
"try-error")) {
stop("Cannot write into the directory where ",
proj@aux$FileName[i],
" is located. Make sure you have the right permissions",
call. = FALSE)
}
}
if (!file.exists(paste(proj@aux$FileName[i], "md5", sep = "."))) {
# create md5 sum file
utils::write.table(tools::md5sum(proj@aux$FileName[i]),
paste(proj@aux$FileName[i], "md5", sep = "."),
sep = "\t", quote = FALSE,
col.names = FALSE, row.names = FALSE)
}
}
}
#create md5 sum file for the snp file
if (!is.na(proj@snpFile)) {
if (!file.exists(paste(proj@snpFile, "md5", sep = "."))) {
utils::write.table(tools::md5sum(proj@snpFile),
paste(proj@snpFile, "md5", sep = "."),
sep = "\t", quote = FALSE,
col.names = FALSE, row.names = FALSE)
}
}
# create md5 sum file for the geneAnnotation
if (!is.na(proj@geneAnnotation)) {
if (!file.exists(paste0(proj@geneAnnotation, ".md5"))) {
utils::write.table(tools::md5sum(proj@geneAnnotation),
paste0(proj@geneAnnotation, ".md5"),
sep = "\t", quote = FALSE,
col.names = FALSE, row.names = FALSE)
}
}
}
# returns information for one bam file in qProject
# sampleNr specifies the sample for which the information is compiled
# if auxNr is not NULL, it returns information about aux bam file
#' @keywords internal
#' @importFrom utils packageVersion read.delim
qProjectBamInfo <- function(proj, sampleNr, auxNr = NULL) {
if (sampleNr > nrow(proj@reads)) {
stop("project has no sample ", sampleNr)
}
if (!is.null(auxNr)) {
if (auxNr > nrow(proj@aux)) {
stop("project has no aux ",auxNr)
}
}
alnInfo <- NULL
if (proj@paired == "no") {
alnInfo["reads.FileName1"] <- proj@reads$FileName[sampleNr]
alnInfo["reads.FileName2"] <- NA
alnInfo["reads.md5subsum1"] <- proj@reads_md5subsum$md5subsum[sampleNr]
alnInfo["reads.md5subsum2"] <- NA
} else {
alnInfo["reads.FileName1"] <- proj@reads$FileName1[sampleNr]
alnInfo["reads.FileName2"] <- proj@reads$FileName2[sampleNr]
alnInfo["reads.md5subsum1"] <- proj@reads_md5subsum$md5subsum1[sampleNr]
alnInfo["reads.md5subsum2"] <- proj@reads_md5subsum$md5subsum2[sampleNr]
}
alnInfo["samplesFormat"] <- proj@samplesFormat
alnInfo["genome"] <- proj@genome
if (proj@genomeFormat == "file") {
alnInfo["genome.md5"] <- as.character(
utils::read.delim(paste(proj@genome, "md5",
sep = "."),
header = FALSE,
colClasses = "character")[1, 1])
} else {
alnInfo["genome.md5"] <- NA
}
alnInfo["genomeFormat"] <- proj@genomeFormat
alnInfo["aux"] <- NA
alnInfo["aux.md5"] <- NA
alnInfo["aligner"] <- proj@aligner
if (!is.na(proj@aligner)) {
alnInfo["aligner.version"] <- as.character(utils::packageVersion(proj@aligner))
} else {
alnInfo["aligner.version"] <- NA
}
alnInfo["maxHits"] <- proj@maxHits
alnInfo["paired"] <- proj@paired
alnInfo["splicedAlignment"] <- proj@splicedAlignment
alnInfo["snpFile"] <- proj@snpFile
alnInfo["snpFile.md5"] <- NA
alnInfo["bisulfite"] <- proj@bisulfite
alnInfo["alignmentParameter"] <- proj@alignmentParameter
alnInfo["geneAnnotation"] <- proj@geneAnnotation
alnInfo["geneAnnotationFormat"] <- proj@geneAnnotationFormat
alnInfo["geneAnnotation.md5"] <- NA
alnInfo["QuasR.version"] <- as.character(utils::packageVersion('QuasR'))
if (!is.null(auxNr)) {
alnInfo["aux"] <- proj@aux$FileName[auxNr]
alnInfo["aux.md5"] <- as.character(
utils::read.delim(paste(proj@aux$FileName[auxNr], "md5", sep = "."),
header = FALSE, colClasses = "character")[1, 1])
}
if (!is.na(proj@snpFile)) {
alnInfo["snpFile.md5"] <- as.character(
utils::read.delim(paste(proj@snpFile, "md5", sep = "."),
header = FALSE, colClasses = "character")[1, 1])
}
if (!is.na(proj@geneAnnotation)) {
alnInfo["geneAnnotation.md5"] <- as.character(
utils::read.delim(paste0(proj@geneAnnotation, ".md5"),
header = FALSE, colClasses = "character")[1, 1])
}
return(alnInfo)
}
# replace full file paths in bamInfo by base name
# remove the aligner and QuasR version. this allows using bam files generated with an older QuasR version
#' @keywords internal
bamInfoOnlyBaseName <- function(bamInfo) {
bamInfo["reads.FileName1"] <- basename(bamInfo["reads.FileName1"])
bamInfo["reads.FileName2"] <- basename(bamInfo["reads.FileName2"])
bamInfo["genome"] <- basename(bamInfo["genome"])
bamInfo["aux"] <- basename(bamInfo["aux"])
bamInfo["snpFile"] <- basename(bamInfo["snpFile"])
bamInfo["geneAnnotation"] <- basename(bamInfo["geneAnnotation"])
if ("aligner.version" %in% names(bamInfo)) {
bamInfo <- bamInfo[!(names(bamInfo) %in% "aligner.version")]
}
if ("QuasR.version" %in% names(bamInfo)) {
bamInfo <- bamInfo[!(names(bamInfo) %in% "QuasR.version")]
}
return(bamInfo)
}
# helper function that converts filepaths to absolute filepaths in the case where the working
# directory needs to be temporarily redirected. This is needed if e.g. the samples.txt file
# is not located in the working directory.
# returns the absolute path if the file exists
# returns NULL if the file does not exist
#' @keywords internal
#' @importFrom tools file_path_as_absolute
pathAsAbsoluteRedirected <- function(fileName, redirectPath) {
curwd <- getwd() # store the original working directory required to jump back
on.exit(setwd(curwd)) # make sure that it will jump back in a case of error or not
setwd(redirectPath) # jump to the redirected position
if (file.exists(fileName)) {
return(tools::file_path_as_absolute(fileName))
} else {
return(NULL)
}
}
# helper function that returns the temporary directory given the cacheDir (from qProject)
# if the user specified cacheDir, then it returns it. but if the user did not specify a cacheDir
# it returns the R temporary directory (which can be different for different machines)
#' @keywords internal
resolveCacheDir <- function(cacheDir) {
if (is.na(cacheDir))
return(tempdir())
else
return(cacheDir)
}
# given a vector of filenames, the function determines the final format
# taking into account all the different types of extensions for
# sequence files. Throughs an error if one ore more samples do not contain
# a valid file extension
#' @keywords internal
determineSamplesFormat <- function(filename) {
fileExtension <- consolidateFileExtensions(filename, compressed = TRUE)
validExtsSel <- fileExtension %in%
c("fasta", "fastq", "bam", "csfasta", "csfastq")
if (all(validExtsSel)) {
if (length(unique(fileExtension)) == 1) {
return(unique(fileExtension))
} else {
stop("Mixed sample file formats are not allowed: ",
paste(unique(fileExtension), collapse = ","),
call. = FALSE)
}
} else {
stop(filename[!validExtsSel][1],
" does not have a valid file extension ",
"(fa,fna,fasta,fq,fastq,bam,csfasta,csfastq)",
call. = FALSE)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.