R/HTSeqCounting.R

###########################################################################/**
# @RdocClass HTSeqCounting
#
# @title "The HTSeqCounting class"
#
# \description{
#  @classhierarchy
#
#  ...
# }
#
# @synopsis
#
# \arguments{
#  \item{dataSet}{A @see "BamDataSet".}
#  \item{transcripts}{A @see "GtfDataFile" specifying a gene model.}
#  \item{...}{Arguments passed to @see "AbstractAlignment".}
# }
#
# \section{Fields and Methods}{
#  @allmethods "public"
# }
#
# \section{Supported operating systems}{
#   ...
# }
#
# @author "HB"
#
# \references{
#  [1] Simon Anders, \emph{HTSeq: Analysing high-throughput sequencing
#      data with Python}, EMBL, Jan 2014.
#      \url{http://www-huber.embl.de/users/anders/HTSeq/} \cr
# }
#*/###########################################################################
setConstructorS3("HTSeqCounting", function(dataSet=NULL, transcripts=NULL, ...) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  if (!is.null(dataSet)) {
    # Argument 'dataSet':
    dataSet <- Arguments$getInstanceOf(dataSet, "BamDataSet")

    # Argument 'transcripts':
    transcripts <- Arguments$getInstanceOf(transcripts, "GtfDataFile")
  } # if (!is.null(dataSet))


  # Arguments '...':
  args <- list(...)

  extend(AromaSeqTransform(dataSet, ...), "HTSeqCounting",
    .transcripts = transcripts
  )
})


setMethodS3("getRootPath", "HTSeqCounting", function(this, ...) {
  "htseqCountData"
}, protected=TRUE)


setMethodS3("getParameters", "HTSeqCounting", function(this, ...) {
  params <- NextMethod("getAsteriskTags")
  params$transcripts <- this$.transcripts
  params
}, protected=TRUE)


setMethodS3("getSampleNames", "HTSeqCounting", function(this, ...) {
  ds <- getInputDataSet(this)
  getFullNames(ds, ...)
}, protected=TRUE)


setMethodS3("process", "HTSeqCounting", function(this, ..., skip=TRUE, force=FALSE, verbose=FALSE) {
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Assert external software versions
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  .stop_if_not(isCapableOf(aroma.seq, "htseq"))


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validate arguments
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Argument 'force':
  force <- Arguments$getLogical(force)

  # Argument 'verbose':
  verbose <- Arguments$getVerbose(verbose)
  if (verbose) {
    pushState(verbose)
    on.exit(popState(verbose))
  }

  verbose && enter(verbose, "HTSeq counting")
  ds <- getInputDataSet(this)
  verbose && cat(verbose, "Input data set:")
  verbose && print(verbose, ds)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Identify groups to be processed
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  if (force) {
    todo <- seq_along(ds)
  } else {
    counts <- getOutputDataSet(this, onMissing="NA", verbose=less(verbose, 1))
    todo <- which(!sapply(counts, FUN=isFile))
  }
  verbose && cat(verbose, "Number of samples to process: ", length(todo))

  # Already done?
  if (!force && length(todo) == 0L) {
    verbose && cat(verbose, "Already processed.")
    verbose && print(verbose, counts)
    verbose && exit(verbose)
    return(counts)
  }

  outPath <- getPath(this)
  verbose && cat(verbose, "Output directory: ", outPath)

  params <- getParameters(this)
  transcripts <- params$transcripts
  verbose && cat(verbose, "Using transcripts:")
  verbose && print(verbose, transcripts)
  # Workaround for *gzipped* GTF files (not supported by HTSeq binaries)
  if (isGzipped(transcripts)) {
    verbose && enter(verbose, "Temporary uncompressing file")
    pathnameZ <- getPathname(transcripts)
    pathname <- gunzip(pathnameZ, temporary=TRUE, remove=FALSE)
    done <- FALSE
    on.exit({
      if (done) file.remove(pathname)
    }, add=TRUE)
    transcripts <- newInstance(transcripts, pathname)
    verbose && cat(verbose, "Using (temporary) transcripts:")
    verbose && print(verbose, transcripts)
    verbose && exit(verbose)
  }
  # Sanity check
  .stop_if_not(!isGzipped(transcripts))

  verbose && cat(verbose, "Number of samples: ", length(ds))


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Validating compatibility between GTF and BAM data set
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  verbose && enter(verbose, "Validating compatibility between GTF and BAM data set")
  # Get one BAM file.
  bam <- ds[[1L]]
  bamNames <- getTargetNames(bam)
  verbose && printf(verbose, "Target/chromosome names in BAM: %s [%d]\n", hpaste(bamNames), length(bamNames))

  # Slightly faster to parse unzipped file.
  gtf <- transcripts
  gtfNames <- getSeqNames(gtf, unique=TRUE)
  gtf <- params$transcripts
  verbose && printf(verbose, "Sequence/chromosome names in GTF: %s [%d]\n", hpaste(gtfNames), length(gtfNames))

  inBoth <- intersect(bamNames, gtfNames)
  if (length(inBoth) == 0L) {
    msg <- sprintf("Incompatible GTF file: None of the %d sequence/chromosome names in the GTF (%s) are found in the BAM file (%s): [%s] not in [%s]", length(gtfNames), getPathname(gtf), getPathname(bam), hpaste(gtfNames), hpaste(bamNames))
    verbose && cat(verbose, msg)
    throw(msg)
  }

  inBAMnotGTF <- setdiff(bamNames, gtfNames)
  if (length(inBAMnotGTF) > 0L) {
    msg <- sprintf("Possibly an incompatible GTF file: Found %d target/chromosome names in the BAM file (%s) that are not in the GTF (%s): [%s] not in [%s]", length(inBAMnotGTF), getPathname(bam), getPathname(gtf), hpaste(inBAMnotGTF), hpaste(gtfNames))
    verbose && cat(verbose, msg)
    # FIXME: Should we use throw() here instead?  Set some thresholding? /HB 2014-03-10
    warning(msg)
  }

  inGTFnotBAM <- setdiff(gtfNames, bamNames)
  if (length(inGTFnotBAM) > 0L) {
    msg <- sprintf("Not counting all sequences/chromosomes in GTF file: Found %d sequence/chromosome names in the GTF (%s) that are not in the BAM file (%s): [%s] not in [%s]", length(inGTFnotBAM), getPathname(gtf), getPathname(bam), hpaste(inGTFnotBAM), hpaste(gtfNames))
    verbose && cat(verbose, msg)
    warning(msg)
  }
  verbose && exit(verbose)


  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Apply aligner to each of the FASTQ files
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  outPath <- getPath(this)

  res <- listenv()
  for (kk in seq_along(todo)) {
    ii <- todo[kk]
    df <- ds[[ii]]
    name <- getFullName(df)

    verbose && enter(verbose, sprintf("htseq-count #%d ('%s') of %d", kk, name, length(todo)))

    verbose && cat(verbose, "BAM file:")
    verbose && print(verbose, df)
    pathnameBAM <- getPathname(df)

    filenameD <- sprintf("%s.count", getFullName(df))
    pathnameD <- Arguments$getWritablePathname(filenameD, path=outPath, mustNotExist=FALSE)

    # Nothing to do?
    if (skip && isFile(pathnameD)) {
      verbose && cat(verbose, "Already processed. Skipping.")
      res[[kk]] <- pathnameD
      verbose && exit(verbose)
      next
    }

    res[[kk]] %<-% {
      pathnameGTF <- getPathname(transcripts)
      verbose && cat(verbose, "GTF pathname: ", pathnameGTF)

      # Final sample-specific output directory
      args <- list(
        pathnameS=pathnameBAM,
        gff=pathnameGTF,
        pathnameD=pathnameD
      )

      # Is BAM file sorted?
      isSorted <- isSorted(df)
      if (isSorted) {
        # ...then assume it is sorted by position (aroma.seq policy)
        # FIXME: Check how BAM file is sorted
        args$orderedBy <- "position"
      }

      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # BEGIN: ATOMIC OUTPUT
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  ##    # Write to temporary output directory
  ##    args$outPath <- sprintf("%s.tmp", args$outPath)
  ##    verbose && cat(verbose, "Temporary output directory: ", args$outPath)

      verbose && cat(verbose, "Arguments passed to htseqCount():")
      verbose && str(verbose, args)
      args$verbose <- less(verbose, 1)
      res <- do.call(htseqCount, args=args)

      verbose && cat(verbose, "Results:")
      verbose && print(verbose, res)

      # Was there a non-zero exit status?
      status <- attr(res, "status")
      if (!is.null(status)) {
        verbose && cat(verbose, "Status: ", status)
      }

  ##   # Rename from temporary to final directory
  ##    file.rename(args$outPath, outPathS)
  ##    verbose && cat(verbose, "Final output directory: ", outPathS)
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
      # END: ATOMIC OUTPUT
      # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

      ## Sanity check
      pathnameD <- Arguments$getReadablePathname(pathnameD)

      verbose && exit(verbose)

      pathnameD
    } ## %<-%
  } ## for (kk ...)

  ## Resolve futures
  res <- resolve(res)

  ## All files are processed; can clean up temporary files now
  done <- FALSE

  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  # Get results
  # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  counts <- getOutputDataSet(this, onMissing="error", verbose=less(verbose, 1))
  verbose && print(verbose, counts)

  # Sanity check
  .stop_if_not(all(sapply(counts, FUN=isFile)))

  verbose && exit(verbose)

  counts
})
HenrikBengtsson/aroma.seq documentation built on Feb. 15, 2021, 2:21 a.m.