ShortRead: FASTQ input and manipulation

.read_csv_portion <- function(dirPath, pattern, colClasses, ...) {
    ## visit several files, then collapse
    files <- .file_names(dirPath, pattern)
    lsts <- lapply(files, function(fl, ...) {
        tryCatch({
            read.csv(fl, ...)
        }, error=function(err) {
            read.csv(gzfile(fl), ...)
        })
    }, ..., colClasses=unname(colClasses), stringsAsFactors=FALSE)
    cclasses <- colClasses[!sapply(colClasses, is.null)]
    lst <- lapply(seq_along(names(cclasses)),
                  function(idx) unlist(lapply(lsts, "[[", idx)))
    names(lst) <- names(cclasses)
    lst
}

.readAligned_SolexaAlign <-
    function(dirPath, pattern=character(0), ...,
             quote="", sep="", comment.char="#", header=FALSE,
             Lpattern="", Rpattern="")
{
    csvClasses <- xstringClasses <-
        list(sequence="DNAString", alignQuality="integer",
             nMatch="integer", position="character", strand="factor",
             refSequence=NULL, nextBestAlignQuality="integer")
    xstringNames <- "sequence"
    csvClasses[xstringNames] <- list(NULL)
    xstringClasses[!names(xstringClasses) %in% xstringNames] <-
        list(NULL)

    ## CSV portion
    lst <- .read_csv_portion(dirPath, pattern, csvClasses, ...,
                             col.names=names(csvClasses),
                             quote=quote, sep=sep,
                             comment.char=comment.char, header=header)
    idx <- regexpr(":", lst[["position"]], fixed=TRUE)
    chromosome <- substr(lst[["position"]], 1, idx-1)
    chromosome[idx==-1] <- NA
    chromosome <- factor(chromosome)
    position <- as.integer(substr(lst[["position"]], idx+1,
                                  nchar(lst[["position"]])))
    df <- data.frame(nMatch=lst$nMatch,
                     nextBestAlignQuality=lst$nextBestAlignQuality)
    meta <- data.frame(labelDescription=c(
                         "Number of matches",
                         "Next-best alignment quality score"))
    alignData <- AlignedDataFrame(df, meta)

    ## XStringSet classes
    sets <- readXStringColumns(dirPath, pattern, xstringClasses,
                               ..., sep=" \t")
    len <- length(sets[["sequence"]])
    wd <- width(sets[["sequence"]])
    q <- paste(rep(" ", max(wd)), collapse="")
    quality <- BStringSet(Views(BString(q), start=rep(1, len), end=wd))
    AlignedRead(sread=sets[["sequence"]],
                id=BStringSet(character(len)),
                quality=SFastqQuality(quality),
                chromosome=chromosome,
                position=position,
                strand=.toStrand_Solexa(lst[["strand"]]),
                alignQuality=NumericQuality(lst[["alignQuality"]]),
                alignData=alignData)
}

.readAligned_SolexaResult <-
    function(dirPath, pattern=character(0), ...,
             sep="\t", comment.char="#", quote="", header=FALSE)
{
    csvClasses <- xstringClasses <-
        list(id=NULL, sequence="DNAString", matchCode="factor",
             nExactMatch="integer", nOneMismatch="integer",
             nTwoMismatch="integer", chromosome="factor",
             position="integer", strand="factor",
             NCharacterTreatment="factor", mismatchDetailOne="character",
             mismatchDetailTwo="character")
    xstringNames <- "sequence"
    csvClasses[xstringNames] <- list(NULL)
    xstringClasses[!names(xstringClasses) %in% xstringNames] <-
        list(NULL)

    ## CSV portion
    lst <- .read_csv_portion(dirPath, pattern, csvClasses, ...,
                             col.names=names(csvClasses),
                             quote=quote, sep=sep,
                             comment.char=comment.char, header=header)
    df <- data.frame(matchCode=lst[["matchCode"]],
                     nExactMatch=lst[["nExactMatch"]],
                     nOneMismatch=lst[["nOneMismatch"]],
                     nTwoMismatch=lst[["nTwoMismatch"]],
                     NCharacterTreatment=lst[["NCharacterTreatment"]],
                     mismatchDetailOne=lst[["mismatchDetailOne"]],
                     mismatchDetailTwo=lst[["mismatchDetailTwo"]])
    meta <- data.frame(labelDescription=c(
                         "Type of match; see ?'readAligned,character-method'",
                         "Number of exact matches",
                         "Number of 1-error mismatches",
                         "Number of 2-error mismatches",
                         "Treatment of 'N'; .: NA; D: deletion; |: insertion",
                         "Mismatch error 1 detail; see ?'readAligned,character-method",
                         "Mismatch error 2 detail; see ?'readAligned,character-method"))
    alignData <- AlignedDataFrame(df, meta)
    ## XStringSet classes
    sets <- readXStringColumns(dirPath, pattern, xstringClasses,
                               ..., sep=sep)
    len <- length(sets[["sequence"]])
    wd <- width(sets[["sequence"]])
    q <- paste(rep(" ", max(wd)), collapse="")
    sfq <- BStringSet(Views(BString(q), start=rep(1, len), end=wd))
    AlignedRead(sread=sets[["sequence"]],
                quality=SFastqQuality(sfq),
                chromosome=lst[["chromosome"]],
                position=lst[["position"]],
                strand=.toStrand_Solexa(lst[["strand"]]),
                alignQuality=NumericQuality(rep(NA_integer_,
                  length(sfq))),
                alignData=alignData)
}

.SolexaExport_AlignedDataFrame <- function(data)
{
    lbls <- c(run="Analysis pipeline run",
              lane="Flow cell lane",
              tile="Flow cell tile",
              x="Cluster x-coordinate",
              y="Cluster y-coordinate",
              filtering="Read successfully passed filtering?",
              contig="Contig",
              multiplexIndex="Multiplex index",
              pairedReadNumber="Paired read number")[names(data)]
    AlignedDataFrame(data=data, metadata=data.frame(labelDescription=lbls))
}

.readAligned_SolexaExport <-
  function(dirPath, pattern=character(0), ..., withAll=FALSE,
           withId=withAll, withMultiplexIndex=withAll,
           withPairedReadNumber=withAll,
           sep="\t", commentChar="#")
{
    files <- .file_names(dirPath, pattern)
    .Call(.read_solexa_export, files, sep, commentChar,
          c(withId, withMultiplexIndex, withPairedReadNumber))
}

.readAligned_Maq_ADF <- function(lst) {
    df <- data.frame(nMismatchBestHit=lst$nMismatchBestHit,
                     mismatchQuality=lst$mismatchQuality,
                     nExactMatch24=lst$nExactMatch24,
                     nOneMismatch24=lst$nOneMismatch24)
    meta <- data.frame(labelDescription=c(
                         "Number of mismatches of the best hit",
                         "Sum of mismatched base qualities of the best hit",
                         "Number of 0-mismatch hits of the first 24 bases",
                         "Number of 1-mismatch hits of the first 24 bases"))
    AlignedDataFrame(df, meta)
}


.maqmap_warning_seen <- local({
    seen <- FALSE
    function() {
        if (!seen) {
            seen <<- TRUE
            FALSE
        } else seen
    }
})

.maqmap_file_list_error <-
    function(files, type)
{
    .throw(SRError("UserArgumentMismatch",
                   "%s for '%s' must match 1 file, got\n  %s",
                   "'dirPath', 'pattern'", type,
                   paste(files, collapse="\n  ")))
}

.readAligned_MaqMapOld <-
    function(dirPath, pattern=character(0), records=-1L, ...)
{
    files <- .file_names(dirPath, pattern)
    if (length(files) > 1)
        .maqmap_file_list_error(files, "MAQMapShort")
    lst <- .Call(.read_maq_map, files, as.integer(records), FALSE)
    AlignedRead(sread=lst[["readSequence"]],
                id=lst[["readId"]],
                quality=FastqQuality(lst[["fastqScores"]]),
                chromosome=lst[["chromosome"]],
                position=lst[["position"]],
                strand=lst[["strand"]],
                alignQuality=IntegerQuality(lst[["alignQuality"]]),
                alignData=.readAligned_Maq_ADF(lst))
}

.readAligned_MaqMap <-
    function(dirPath, pattern=character(0), records=-1L, ...)
{
    files <- .file_names(dirPath, pattern)
    if (length(files) > 1)
        .maqmap_file_list_error(files, "MAQMap")
    lst <- .Call(.read_maq_map, files, as.integer(records), TRUE)
    AlignedRead(sread=lst[["readSequence"]],
                id=lst[["readId"]],
                quality=FastqQuality(lst[["fastqScores"]]),
                chromosome=lst[["chromosome"]],
                position=lst[["position"]],
                strand=lst[["strand"]],
                alignQuality=IntegerQuality(lst[["alignQuality"]]),
                alignData=.readAligned_Maq_ADF(lst))
}

.readAligned_MaqMapview <-
    function(dirPath, pattern=character(0), ..., sep="\t", header=FALSE,
             quote="")
{
    colClasses <-
        list(NULL, chromosome="factor", position="integer",
             strand="factor", NULL, NULL, alignQuality="integer", NULL,
             NULL, nMismatchBestHit="integer",
             mismatchQuality="integer", nExactMatch24="integer",
             nOneMismatch24="integer", NULL, NULL, NULL)
    ## CSV portion
    csv <- .read_csv_portion(dirPath, pattern, colClasses, sep=sep,
                             header=header, quote=quote, ...)
    ## XStringSet components
    colClasses <- list("BString", NULL, NULL, NULL, NULL,
                       NULL, NULL, NULL, NULL, NULL, NULL, NULL,
                       NULL, NULL, "DNAString", "BString")
    sets <- readXStringColumns(dirPath, pattern,
                               colClasses, sep=sep, header=header)
    AlignedRead(sread=sets[[2]], id=sets[[1]],
                quality=FastqQuality(sets[[3]]),
                chromosome=csv[["chromosome"]],
                position=csv[["position"]],
                strand=factor(csv[["strand"]], levels=.STRAND_LEVELS),
                alignQuality=IntegerQuality(csv[["alignQuality"]]),
                alignData=.readAligned_Maq_ADF(csv))
}

.Bowtie_AlignedDataFrame <- function(similar, mismatch)
{
    df <- data.frame(similar=similar, mismatch=mismatch,
                     stringsAsFactors=FALSE)
    meta <- data.frame(labelDescription=c(
                         "if Bowtie >= 0.9.9.3 (May 12, 2009)?: number of alignments aligning to the same reference characters; else 'Reserved'",
                         "Comma-separated mismatch positions"))
    AlignedDataFrame(df, meta)
}

.readAligned_Bowtie <-
  function(dirPath, pattern=character(0), ...,
           qualityType=c("FastqQuality", "SFastqQuality"),
           sep="\t", commentChar="#")
{
    tryCatch(qualityType <- match.arg(qualityType),
             error=function(err) {
                 .throw(SRError("UserArgumentMismatch",
                                conditionMessage(err)))
             })
    files <- .file_names(dirPath, pattern)
    .Call(.read_bowtie, files, qualityType, sep, commentChar)
}

.SOAP_AlignedDataFrame <-
    function(nEquallyBestHits, pairedEnd, alignedLength,
             typeOfHit, hitDetail)
{
    df <- data.frame(nEquallyBestHits=nEquallyBestHits,
                     pairedEnd=factor(pairedEnd),
                     alignedLength=alignedLength,
                     typeOfHit=typeOfHit,
                     hitDetail=hitDetail,
                     stringsAsFactors=FALSE)
    meta <- data.frame(labelDescription=c(
                         "Number of equally-best hits",
                         "Paired end, a or b",
                         "Length of read used in alignment",
                         "Integer indicator of match type; 0: exact; 1-100: mismatch; 100+n: n-base insertion; 200+n: n-base deletion",
                         "Detailed description of match"))
    AlignedDataFrame(df, meta)
}

.readAligned_SOAP <-
    function(dirPath, pattern=character(0), ...,
             qualityType="SFastqQuality", sep="\t", commentChar="#")
{
    files <- .file_names(dirPath, pattern)
    .Call(.read_soap, files, qualityType, sep, commentChar)
}

.readAligned_character <-
    function(dirPath, pattern=character(0),
             type=c(
               "SolexaExport", "SolexaAlign",
               "SolexaPrealign", "SolexaRealign",
               "SolexaResult",
               "MAQMap", "MAQMapShort", "MAQMapview",
               "Bowtie", "SOAP"),
             ..., filter=srFilter())
{
    if (missing(type))
        .arg_missing_err("type", "readAligned,character-method",
                       "help(\"readAligned,character-method\")")
    if (!is.character(type) || length(type) != 1)
        .arg_mismatch_type_err("type", "character(1)")
    if (!missing(filter))
        .check_type_and_length(filter, "SRFilter", NA)
    vals <- eval(formals(.readAligned_character)$type)
    if (!type %in% vals)
        .arg_mismatch_value_err("type", type, vals)
    aln <-
        tryCatch({
            switch(type,
                   SolexaExport=.readAligned_SolexaExport(dirPath,
                     pattern=pattern, ...),
                   SolexaPrealign=,
                   SolexaAlign=,
                   SolexaRealign=.readAligned_SolexaAlign(dirPath,
                     pattern=pattern, ...),
                   SolexaResult=.readAligned_SolexaResult(dirPath,
                     pattern=pattern, ...),
                   MAQMap=.readAligned_MaqMap(dirPath, pattern, ...),
                   MAQMapShort=.readAligned_MaqMapOld(dirPath, pattern, ...),
                   MAQMapview=.readAligned_MaqMapview(
                     dirPath, pattern=pattern, ...),
                   Bowtie=.readAligned_Bowtie(dirPath, pattern, ...),
                   SOAP=.readAligned_SOAP(dirPath, pattern, ...))
        }, error=function(err) {
            if (is(err, "SRError")) stop(err)
            else {
                pat <- paste(pattern, collapse=" ")
                txt <- paste("'%s' failed to parse files",
                             "dirPath: '%s'",
                             "pattern: '%s'",
                             "type: '%s'",
                             "error: %s", sep="\n  ")
                msg <- sprintf(txt, "readAligned",
                               paste(dirPath, collapse="'\n    '"),
                               paste(pat, collapse="'\n    '"),
                               type, conditionMessage(err))
                .throw(SRError("Input/Output", msg))
            }
        })
    if (!missing(filter))
        aln <- aln[filter(aln)]
    aln
}

setMethod(readAligned, "character", .readAligned_character)