R/initialisers.R

Defines functions collapsePairs rcAlns readTargetBam alnsToCrisprSet separateChimeras

Documented in collapsePairs rcAlns readTargetBam

# readsToTarget generic -----
#'@title Trims reads to a target region.
#'@description Trims aligned reads to one or several target regions,
#'optionally reverse complementing the alignments.
#'@param reads A GAlignments object, or a character vector of the filenames
#'@param target A GRanges object specifying the range to narrow alignments to
#'@author Helen Lindsay
#'@rdname readsToTarget
#'
#'@import methods
#'@import BiocParallel
#'@import Biostrings
#'@import ggplot2
#'@import GenomicAlignments
#'@import GenomicRanges
#'@import IRanges
#'@import Rsamtools
#'@import S4Vectors
#'@importFrom grDevices colorRampPalette
#'@importFrom grid gpar grid.rect
#'@importFrom gridExtra grid.arrange arrangeGrob
#'@importFrom reshape2 melt
#'@importFrom AnnotationDbi select
#'@importFrom GenomeInfoDb seqlengths
#'@importFrom utils modifyList
#'@export
setGeneric("readsToTarget", function(reads, target, ...) {
  standardGeneric("readsToTarget")})
# -----

# readsToTarget GAlignments, GRanges -----
#'@param name An experiment name for the reads.  (Default: NULL)
#'@param reverse.complement (Default: TRUE)  Should the alignments be
#' oriented to match the strand of the target? If TRUE, targets located 
#  on the postive strand are displayed with respect to the postive 
#' strand and targets on the negative strand with respect to the 
#' negative strand. If FALSE, the parameter 'orientation' must be set
#' to determine the orientation.  
#' 'reverse.complement' will be replaced by 'orientation' in a later release. 
#'@param collapse.pairs  If reads are paired, should pairs be collapsed? 
#' (Default: FALSE) Note: only collapses primary alignments, 
#' and assumes that there is only one primary alignment per read.
# May fail with blat alignments converted to bam.
#'@param use.consensus Take the consensus sequence for non-matching pairs? If FALSE,
#'the sequence of the first read is used.  Can be very slow. (Default: FALSE)
#'@param store.chimeras Should chimeric reads be stored?  (Default: FALSE)
#'@param orientation One of "target" (reads are displayed on the same
#' strand as the target) "opposite" (reads are displayed on the opposite)
#' strand from the target or "positive" (reads are displayed on the forward
#' strand regardless of the strand of the target)   (Default:"target") 
#'@param verbose Print progress and statistics (Default: TRUE)
#'@param minoverlap Minimum number of bases the aligned read must 
#' share with the target site.  If not specified, the aligned
#' read must completely span the target region.  (Default: NULL)
#'@return (signature("GAlignments", "GRanges")) A \code{\link{CrisprRun}} object
#'@examples
#'# Load the metadata table
#'md_fname <- system.file("extdata", "gol_F1_metadata_small.txt", package = "CrispRVariants")
#'md <- read.table(md_fname, sep = "\t", stringsAsFactors = FALSE)
#'
#'# Get bam filenames and their full paths
#'bam_fnames <- sapply(md$bam.filename, function(fn){
#'  system.file("extdata", fn, package = "CrispRVariants")})
#'
#'reference <- Biostrings::DNAString("GGTCTCTCGCAGGATGTTGCTGG")
#'gd <- GenomicRanges::GRanges("18", IRanges::IRanges(4647377, 4647399),
#'        strand = "+")
#'
#'crispr_set <- readsToTarget(bam_fnames, target = gd, reference = reference,
#'                            names = md$experiment.name, target.loc = 17)
#'
#'@rdname readsToTarget
setMethod("readsToTarget", signature("GAlignments", "GRanges"),
          function(reads, target, ..., reverse.complement = TRUE,
                   chimeras = NULL, collapse.pairs = FALSE,
                   use.consensus = FALSE, store.chimeras = FALSE, 
                   verbose = TRUE, name = NULL, minoverlap = NULL,
                   orientation = c("target","opposite","positive")){

    orientation <- match.arg(orientation)
    dots <- list(...)
    
    # If calling directly rather than internally, run checks
    if (! "checked" %in% names(dots) & isTRUE(dots$checked)){
      .checkReadsToTarget(target, reference = NULL, target.loc = NULL,
                          reverse.complement, orientation,
                          chimeras)
      .checkForPaired(reads)
    }

    keep.unpaired <- TRUE
    if ("keep.unpaired" %in% names(dots)){
        keep.unpaired <- dots["keep.unpaired"]
    }
    
    # Choose which strand to orient reads to
    if (isTRUE(reverse.complement) & as.character(strand(target)) == "*"){
      message(paste0("Target does not have a strand, ",
                     "but reverse.complement is TRUE.\n",
                     "Orienting reads to reference strand."))
      rc = FALSE
    } else {
      rc <- rcAlns(as.character(strand(target)), orientation)
    }
    
    # If there are no non-chimeric reads, chimeras can still be stored
    if (length(reads) == 0){
      if (length(chimeras) == 0) {return(NULL)}
      crun <- CrisprRun(reads, target, rc = rc,
                        name = name, chimeras = chimeras, 
                        verbose = verbose)
      return(crun)
    }

    # Check if alignments are paired and should be collapsed
    if (isTRUE(collapse.pairs)){
      if (is.null(names(reads)) | ! ("flag" %in% names(mcols(reads))) ){
        stop("Reads must include names and bam flags for collapsing pairs")
      }
    }

    if (is.null(chimeras)) {
      chimeras <- GenomicAlignments::GAlignments()
      if (isTRUE(store.chimeras)) {
        # There is no way to specify tolerance here, or to use target.loc
        temp <- separateChimeras(reads, target, by.flag = collapse.pairs,
                                 verbose = verbose)
        reads <- temp$bam
        chimeras <- temp$chimeras[[1]]
      }
    }

    # To do: check whether this is redundant with narrowAlignments
    # Filter out reads that don't span the target region
    # Not using findOverlaps because reads may be paired, i.e. names nonunique
    if (is.null(minoverlap)){
      bam <- reads[start(reads) <= start(target) & end(reads) >= end(target) &
                   seqnames(reads) == as.character(seqnames(target))]
    } else { bam <- reads }

    if (isTRUE(verbose)){
      message(sprintf("%s of %s nonchimeric reads span the target range\n",
                  length(bam), length(reads)))
    }

    # If bam and chimeras are empty, no further calculation needed
    if (length(bam) == 0 & length(chimeras) == 0) return(NULL)

    if (length(bam) == 0){
      crun <- CrisprRun(bam, target, rc = rc,
                  name = name, chimeras = chimeras, verbose = verbose)
      return(crun)
    }

    # If bam is non-empty, orient and narrow the reads to the target

    # narrow aligned reads
    result <- narrowAlignments(bam, target, reverse.complement = rc,
                            verbose = verbose, minoverlap = minoverlap)
    
    # Collapse pairs of narrowed reads

    if (isTRUE(collapse.pairs)){
      gen_ranges <- GenomicAlignments::cigarRangesAlongReferenceSpace(
                       cigar(result), pos = start(result))
      
      result <- collapsePairs(result, genome.ranges = gen_ranges,
                              use.consensus = use.consensus,
                              keep.unpaired = keep.unpaired, verbose = verbose)

      result <- result$alignments
    }

    if (length(result) == 0){
      if (length(chimeras) == 0){
        return(NULL)
      }
      crun <- CrisprRun(result, target, rc = rc, name = name,
                        chimeras = chimeras, verbose = verbose)
      return(crun)
    }

    crun <- CrisprRun(result, target, rc = rc, name = name,
                      chimeras = chimeras, verbose = verbose)
    crun
}) # -----


# readsToTarget GAlignmentsList, GRanges -----
#'@rdname readsToTarget
setMethod("readsToTarget", signature("GAlignmentsList", "GRanges"),
          function(reads, target, ..., reference = reference,
                   names = NULL, reverse.complement = TRUE, target.loc = 17,
                   chimeras = NULL, collapse.pairs = FALSE, use.consensus = FALSE,
                   orientation = c("target","opposite","positive"),
                   minoverlap = NULL, verbose = TRUE){

    # To do: Deal with potentially empty chimeras

    orientation <- match.arg(orientation)
    
    # Always run checks as this function is not called by others
    .checkReadsToTarget(target, reference = NULL, target.loc = NULL,
                        reverse.complement, orientation,
                        chimeras)
    
    # Check: if chimeras and reads are supplied, they should have equal length
    nch <- length(chimeras)
    nreads <- length(reads)
    
    if (nreads > 0 & nch > 0 & ! nreads == nch){
      stop("Chimeras must be either NULL or a GAlignmentsList ",
           "of length equal to reads")
    }

    # Collapse pairs if required and initialise CrisprSet object
    cset <- alnsToCrisprSet(reads, reference, target, reverse.complement,
                            collapse.pairs, names, use.consensus, target.loc,
                            verbose, chimeras = chimeras, minoverlap = minoverlap,
                            orientation = orientation, checked = TRUE, ...)
    
    cset
}) # -----


# readsToTarget character, GRanges -----
#'@param names Experiment names for each bam file.  If not supplied, filenames are used.
#'@param chimeras Flag to determine how chimeric reads are treated.  One of
#'"ignore", "exclude", and "merge".  Default "count", "merge" not implemented yet
#'@param reference The reference sequence
#'@param exclude.ranges Ranges to exclude from consideration, e.g. homologous to a pcr primer.
#'@param exclude.names Alignment names to exclude
#'@param ... Extra arguments for initialising CrisprSet
#'@return (signature("character", "GRanges")) A \code{\link{CrisprSet}} object
#'@rdname readsToTarget
setMethod("readsToTarget", signature("character", "GRanges"),
          function(reads, target, ..., reference, reverse.complement = TRUE,
                   target.loc = 17, exclude.ranges = GRanges(), exclude.names = NA,
                   chimeras = c("count","exclude","ignore", "merge"),
                   collapse.pairs = FALSE, use.consensus = FALSE,
                   orientation = c("target","opposite","positive"),
                   names = NULL, minoverlap = NULL, verbose = TRUE){

    # Prepare input parameters
    args <- list(...)
    chimeras <- match.arg(chimeras)
    orientation <- match.arg(orientation)
    
    # If reading in alignments, always run checks
    .checkFnamesExist(reads)
    .checkReadsToTarget(target, reference, target.loc,
                        reverse.complement, orientation, chimeras)
            
    if (! class(reference) == "DNAString"){
      reference <- Biostrings::DNAString(reference[[1]])
    }
    
    c_to_t <- 5
    if ("chimera.to.target" %in% names(args)){
       c_to_t <- args[["chimera.to.target"]]
    }
       
    # Read in the bam files, separate chimeric and non-chimeric reads
    temp <- lapply(reads, readTargetBam, target = target,
                   exclude.ranges = exclude.ranges,
                   exclude.names = exclude.names,
                   chimeras = chimeras, by.flag = collapse.pairs,
                   chimera.to.target = c_to_t,
                   verbose = verbose)

    alns <- lapply(temp, "[[", "bam")
    chimeras <- lapply(temp, "[[", "chimeras")

    # If names are not specified, set them to the filenames
    if (is.null(names)){
       names <- basename(reads)
    }
    names <- as.character(names)

    # Collapse pairs, count insertions, create CrisprSet objects
    cset <- alnsToCrisprSet(alns, reference, target, reverse.complement,
                            collapse.pairs, names = names,
                            use.consensus = use.consensus,
                            target.loc = target.loc, verbose = verbose,
                            chimeras = chimeras, minoverlap = minoverlap,
                            orientation = orientation, checked = TRUE, ...)
     cset
}) # -----

#__________________________________________________________________
# readsToTargets (for alignments to multiple guides)
#__________________________________________________________________
# readsToTargets -----
#'@export
#'@rdname readsToTarget
setGeneric("readsToTargets", function(reads, targets, ...) {
  standardGeneric("readsToTargets")})

#'@param targets A set of targets to narrow reads to
#'@param references A set of reference sequences matching the targets.
#'References for negative strand targets should be on the negative strand.
#'@param primer.ranges A set of GRanges, corresponding to the targets.
#'Read lengths are typically greater than target regions, and it can
#'be that reads span multiple targets.  If primer.ranges are available,
#'they can be used to assign such reads to the correct target.
#'@param target.loc The zero point for renumbering (Default: 17)
#'@param ignore.strand Should strand be considered when finding overlaps?
#'(See \code{\link[GenomicAlignments]{findOverlaps}} )
#'@param bpparam A BiocParallel parameter for parallelising across reads.
#'Default: no parallelisation.  (See \code{\link[BiocParallel]{bpparam}})
#'@param chimera.to.target Number of bases that may separate a chimeric read
#'set from the target.loc for it to be assigned to the target. (Default: 5)
#'@rdname readsToTarget
setMethod("readsToTargets", signature("character", "GRanges"),
          function(reads, targets, ..., references, primer.ranges = NULL,
                   target.loc = 17, reverse.complement = TRUE,
                   collapse.pairs = FALSE, use.consensus = FALSE,
                   ignore.strand = TRUE, names = NULL,
                   bpparam = BiocParallel::SerialParam(),
                   orientation = c("target","opposite","positive"),
                   chimera.to.target = 5, verbose = TRUE){

            dummy <- .checkReadsToTargets(targets, primer.ranges, references)
            .checkFnamesExist(reads)
            
            if (is.null(names)){
              names <- reads
            }

            param <- Rsamtools::ScanBamParam(what = c("seq", "flag"))
            args <- list(...)
            ntargets <- length(targets)

            bams <- BiocParallel::bplapply(seq_along(reads), function(i){
              if (verbose) message(sprintf("Loading alignments for %s\n\n",
                                       names[i]))
              bam <- GenomicAlignments::readGAlignments(reads[i],
                                              param = param, use.names = TRUE)
              if (length(bam) == 0){
                if (verbose) message("No reads in alignment\n")
                return(NULL)
              }
              return(bam)
           }, BPPARAM = bpparam)

           bams <- GAlignmentsList(bams)
           if (collapse.pairs == FALSE) dummy <- .checkForPaired(bams)

           orientation <- match.arg(orientation)
           result <- readsToTargets(bams, targets, references = references,
                       target.loc = target.loc, verbose = verbose,
                       reverse.complement = reverse.complement,
                       ignore.strand = ignore.strand,
                       collapse.pairs = collapse.pairs, names = names,
                       bpparam = bpparam, use.consensus = use.consensus,
                       chimera.to.target = chimera.to.target,
                       orientation = orientation)
          result

          })


#'@rdname readsToTarget
setMethod("readsToTargets", signature("GAlignmentsList", "GRanges"),
          function(reads, targets, ..., references, primer.ranges = NULL,
                   target.loc = 17, reverse.complement = TRUE, 
                   collapse.pairs = FALSE, use.consensus = FALSE,
                   ignore.strand = TRUE, names = NULL, 
                   bpparam = BiocParallel::SerialParam(),
                   chimera.to.target = 5, 
                   orientation = c("target", "opposite", "positive"),
                   verbose = TRUE){

    # To do:
    # Currently this returns a list with the non-empty CrisprSets
    # consider making the list the length of the supplied targets

    dummy <- .checkReadsToTargets(targets, primer.ranges, references)
    if (collapse.pairs == FALSE) dummy <- .checkForPaired(reads)

    if (is.null(names)){
      if (! is.null(names(reads))){
        names <- names(reads)
      }else{
        names <- sprintf("Sample %s",seq_along(reads))
      }
    }

    orientation <- match.arg(orientation)

    byPCR <- BiocParallel::bplapply(reads, function(bam){

      if (verbose) message(sprintf("Assigning chimeric reads to targets \n"))

      # Change to default: just supply the cut site (target.loc)
      ch_tgts <- resize(resize(targets, target.loc + 1, fix="start"), 2, fix = "end")

      temp <- separateChimeras(bam, ch_tgts, chimera.to.target,
                              by.flag = collapse.pairs, verbose = verbose)
      bam <- temp$bam
      chimerasByPCR <- temp$chimeras

      # If primer.ranges are provided, match reads to primers
      # If not, match reads to targets
      if (! is.null(primer.ranges)){
        hits <- readsByPCRPrimer(bam, primer.ranges, verbose = verbose)
        splits <- split(queryHits(hits), subjectHits(hits))
      } else{
        hits <- findOverlaps(targets, bam, type = "within", ignore.strand = TRUE)
        duplicates <- (duplicated(subjectHits(hits)) |
                       duplicated(subjectHits(hits), fromLast = TRUE))
        if (verbose){
          msg <- paste0("%s (%.2f%%) reads of %s overlap a target\n",
                        "  %s (%.2f%%) of these overlapping multiple targets removed\n",
                        "  %s (%.2f%%) reads mapped to a single target\n\n")
          rhits <- length(unique(subjectHits(hits)))
          bl <- length(bam)
          ndups <- sum(duplicated(subjectHits(hits)))
          nndups <- sum(!duplicates)
          message(sprintf(msg, rhits, rhits/bl*100, bl, ndups, ndups/rhits*100,
                      nndups, nndups/bl*100))
        }
        hits <- hits[!duplicates]
        splits <- split(subjectHits(hits), queryHits(hits))
      }
      bamByPCR <- as.list(rep(NA, length(targets)))
      names(bamByPCR) <- seq_along(targets)
      for (nm in names(splits)){
        bamByPCR[[nm]] <- bam[splits[[nm]]]
      }
      byPCR <- list(bamByPCR = bamByPCR, chimerasByPCR = chimerasByPCR)
      byPCR
    }, BPPARAM = bpparam)

    # Remove any empty alignments
    to_keep <- which(lapply(byPCR, length) > 0)
    byPCR <- byPCR[to_keep]
    names <- names[to_keep]
    
    if (length(names) == 0){
      stop("No files contain on target reads")
    }

    # Reformat to list by guides instead of samples
    tlist <- function(i) {
      lapply(temp, "[[", i)
    }
    temp <- lapply(byPCR, "[[", "chimerasByPCR")
    chimerasByPCR <- lapply(seq_along(temp[[1]]), tlist)
    temp <- lapply(byPCR, "[[", "bamByPCR")
    bamByPCR <- lapply(seq_along(temp[[1]]), tlist)
    tg_gr <- as(targets, "GRangesList")
    

    result <- BiocParallel::bplapply(seq_along(bamByPCR), function(i){
      bams <- bamByPCR[[i]]
      tgt <- tg_gr[[i]]
      mcols(tgt) <- mcols(targets[i])
      chs <- chimerasByPCR[[i]]
      ref <- references[[i]]
      if (isTRUE(verbose)){
        message(sprintf("\n\nWorking on target %s\n", names(tgt)))
      }
      
      cset <- alnsToCrisprSet(bams, ref, tgt, reverse.complement,
                              collapse.pairs, names, use.consensus, target.loc,
                              verbose, chimeras = chs,
                              orientation = orientation, ...)
    }, BPPARAM = bpparam)

    if (length(result) == 0){
      warning("No reads span a target")
      return(result)
    }

    if (!is.null(names(targets))) {
      names(result) <- names(targets)
    }

    result <- result[!sapply(result, is.null)]
    result
  }) # -----


#__________________________________________________________________
# Data import and processing
#__________________________________________________________________

# separateChimeras -----
separateChimeras <- function(bam, targets, tolerance = 5,
                             by.flag = TRUE, verbose = FALSE){

  # The supplementary alignment flag must be set to distinguish paired from
  # chimeric reads. Tolerance is added on both sides
  # A better approach might be to explicitly consider where chimeras
  # join w.r.t read
  # Worth warning if there are chimeras independent of the guide?

  # Find chimeras
  ch_idxs <- findChimeras(bam, by.flag)
  chimeras <- bam[ch_idxs]
  original_ln <- length(chimeras)

  # Setup return data
  chimerasByPCR <- vector("list", length(targets))
  names(chimerasByPCR) <- as.character(seq_along(targets))

  # If the target is completely contained within one member of the
  # chimeric set (two members if paired), do not count it as a chimera
  guide_within <- subjectHits(findOverlaps(targets, chimeras,
                              ignore.strand = TRUE, type = "within"))
  ordered <- unique(guide_within[order(guide_within)])

  # If any targets contained within a chimeric read, remove all members of set
  if (length(ordered) > 0){
    lns_rle <- rle(names(chimeras)[ordered])$lengths
    grps <- rep(1:length(lns_rle), lns_rle)
    is_first <- paste(grps, bitwAnd(mcols(chimeras)$flag[ordered], 64),
                      sep = ".")
    not_dup <- !(duplicated(is_first) | duplicated(is_first, fromLast = TRUE))

    # Remove all chimeras with guides included from the chimeric sets
    non_ch <- names(chimeras) %in% names(chimeras)[ordered][not_dup]
    ch_idxs <- ch_idxs[!non_ch]
    chimeras <- bam[ch_idxs]
  }

  # Assign chimeras to targets
  tgt_plus_tol <- targets + tolerance
  hits <- findOverlaps(chimeras, tgt_plus_tol, ignore.strand = TRUE)

  # Exclude members that match multiple targets
  is_dup <- duplicated(queryHits(hits)) | duplicated(queryHits(hits), fromLast = TRUE)
  hits <- hits[!is_dup]
  splits <- split(queryHits(hits), subjectHits(hits))

  #For each hit, collect all alignments with the same name
  idx_by_primer <- lapply(splits, function(idxs){
    ch_idxs[names(chimeras) %in% names(chimeras)[idxs]]
  })

  ibp <- unlist(idx_by_primer)

  alnsByPCR <- lapply(idx_by_primer, function(ids){ bam[ids]})
  chimerasByPCR[names(splits)] <- alnsByPCR

  if (isTRUE(verbose)){
    # How many chimeric sets were not assigned?
    n_inc <- length(unique(ibp))
    n_total <- length(ch_idxs)
    n_dup <- sum(duplicated(ibp) | duplicated(ibp, fromLast = TRUE))
    pct_inc <- n_inc/n_total * 100
    pct_multi <- n_dup/n_total * 100
    removed <- original_ln - length(chimeras)

    rm_pct <- removed/original_ln * 100
    message(sprintf(paste0("%s from %s (%.2f%%) chimeras did not involve guide\n",
                    "%s from %s (%.2f%%) remaining chimeric reads included\n",
                    "%s (%.2f%%) assigned to more than one target\n"),
                removed, original_ln, rm_pct, n_inc, n_total,
                pct_inc, n_dup, pct_multi))
  }

  # Remove chimeras from the bam
  if (length(ch_idxs) >= 2){
    bam <- bam[-ch_idxs]
  }
  # Return list of chimerasByPCR and bam
  result <- list(bam = bam, chimerasByPCR = chimerasByPCR)
  result
} # -----


# alnsToCrisprSet -----
#@param label.alleles  (logical(1)) Calculate allele labels using the
#default counting method.  If FALSE, allele labels are not set at 
#initialisation (Default: TRUE).
alnsToCrisprSet <- function(alns, reference, target, reverse.complement,
                            collapse.pairs, names, use.consensus, target.loc,
                            verbose, orientation, chimeras = NULL,
                            store.chimeras = FALSE, minoverlap = NULL,
                            allele.labels = TRUE, ...){

    # Flag for whether variants should be counted w.r.t the negative strand
    rc <- rcAlns(as.character(strand(target)), orientation)
  
    # The reference is with respect to the guide.  If the opposite
    # strand is being displayed, reverse the reference.
    # For display wrt target, rc is TRUE for -ve, FALSE for +v
    is_neg <- as.character(strand(target)) == "-"
  
    #if (! (is_neg == rc)){
    #  reference <- Biostrings::reverseComplement(reference)
    #}
  
    # Narrow alignments for each sample
    crispr.runs <- lapply(seq_along(alns), function(i){
      aln <- alns[[i]]

      if (!class(aln) == "GAlignments") {
        aln <- GenomicAlignments::GAlignments()
      }

      chim <- chimeras[[i]]
      if (is.null(chim)) {chim <- GenomicAlignments::GAlignments()}
      checked <- "checked" %in% names(list(...))
      crun <- readsToTarget(aln, target = target,
                reverse.complement = reverse.complement, chimeras = chim,
                collapse.pairs = collapse.pairs, use.consensus = use.consensus,
                verbose = verbose, name = names[i], orientation = orientation,
                minoverlap = minoverlap, target.loc = target.loc,
                checked = checked)
      crun
    })

    # Remove empty samples
    to_rm <- sapply(crispr.runs, is.null)
    if (any(to_rm)){
      if (verbose){
        rm_nms <- paste0(names[to_rm], collapse = ",", sep = "\n")
        message(sprintf("Excluding samples that have no on target reads:\n%s",
                    rm_nms))
      }
      crispr.runs <- crispr.runs[!to_rm]
      names <- names[!to_rm]
      if (length(crispr.runs) == 0) {
        warning("Could not narrow reads to target, ",
                "no samples have on-target alignments")
        return()
      }
    }
    
    # Combine samples into a single object
    cset <- CrisprSet(crispr.runs, reference, target, rc = rc,
                      target.loc = target.loc, 
                      verbose = verbose, names = names, ...)
    
    # Set the allele labels
    if (isTRUE(allele.labels) & ! all(lengths(alns(cset)) == 0)){
        cig_labs_defaults = list(renumbered = TRUE,
                                 match_label = "no variant",
                                 mismatch_label = "SNV",
                                 split.snv = TRUE,
                                 upstream.snv = 8,
                                 downstream.snv = 6,
                                 bpparam = BiocParallel::SerialParam())
        dots <- list(...)
        cig_labs_defaults <- modifyList(cig_labs_defaults, 
                dots[names(dots) %in% names(cig_labs_defaults)])
        do.call(cset$setCigarLabels, cig_labs_defaults)
    }
    cset
} # -----


# readTargetBam -----
#'@title Internal CrispRVariants function for reading and filtering a bam file
#'@description Includes options for excluding reads either by name or range.
#'The latter is useful if chimeras are excluded.  Reads are excluded before
#'chimeras are detected, thus a chimeric read consisting of two sections, one of
#'which overlaps an excluded region, will not be considered chimeric.
#'Chimeric reads can be ignored, excluded, which means that all sections of a
#'chimeric read will be removed, or merged, which means that chimeras will be
#'collapsed into a single read where possible. (Not implemented yet)
#'If chimeras = "merge", chimeric reads are merged if all segments
# are from the same chromosome, do not overlap, and are aligned to the same strand.
# It is assumed that sequences with two alignments are chimeras, not alternate mappings
#'@param file The name of a bam file to read in
#'@param target A GRanges object containing a single target range
#'@param exclude.ranges A GRanges object of regions that should not be counted,
#'e.g. primer or cloning vector sequences that have a match in the genome
#'@param exclude.names A vector of read names to exclude.
#'@param chimera.to.target Maximum distance between endpoints of chimeras and
#'target.loc for assigning chimeras to targets (default: 5)
#'@param chimeras Flag to determine how chimeric reads are treated.  One of
#'"ignore", "exclude", "count" and "merge".  Default "ignore".
#'@param max.read.overlap Maximum number of bases mapped to two positions
#'for chimeras to be merged (Default: 10)
#'@param max.unmapped Maximum number of bases that are unmapped for chimeras
#'to be merged (Default: 4)
#'@param verbose Print stats about number of alignments read and filtered.  (Default: TRUE)
#'@param by.flag Is the supplementary alignment flag set?  Used for identifying chimeric
#'alignments, function is much faster if TRUE.  Not all aligners set this flag.  If FALSE,
#'chimeric alignments are identified using read names (Default: TRUE)
#'@return A GenomicAlignments::GAlignment obj
readTargetBam <- function(file, target, exclude.ranges = GRanges(),
                          exclude.names = NA, chimera.to.target = 5,
                          chimeras = c("count", "ignore","exclude","merge"),
                          max.read.overlap = 10, max.unmapped = 4,
                          by.flag = TRUE, verbose = TRUE){

    ch.action <- match.arg(chimeras)
    if (ch.action == "ignore"){
      # If chimeras are not to be excluded or merged,
      # we only need to read in reads overlapping the target region
      if (! file.exists(paste0(file, ".bam"))){
        Rsamtools::indexBam(file)
      }
      param <- Rsamtools::ScanBamParam(what = c("seq", "flag"), which = target)
    } else {
      # In this case, must read in the entire bam to be sure of finding chimeric reads
      param <- Rsamtools::ScanBamParam(what = c("seq", "flag"))
    }
    bam <- GenomicAlignments::readGAlignments(file, param = param, use.names = TRUE)
    if (length(bam) == 0){
      return(list(bam = GenomicAlignments::GAlignments(),
                  chimeras = GenomicAlignments::GAlignments()))
    }

    # Check that "seq" is not empty
    unq_wdths <- unique(width(mcols(bam)$seq))
    if (length(unq_wdths) == 0){
      if (unq_wdths == 0) stop("No sequence found in bam file")
    }

    #Exclude reads by name or range
    temp <- excludeFromBam(bam, exclude.ranges, exclude.names)

    if (isTRUE(verbose)){
      original <- length(bam)
      message(sprintf("Read %s alignments, excluded %s\n", original,
                      original - length(temp)))
    }
    bam <- temp

    if (length(bam) == 0 | ch.action == "ignore"){
      return(list(bam = bam, chimeras = GenomicAlignments::GAlignments()))
    }

    chimera_idxs <- findChimeras(bam, by.flag = by.flag)

    if (chimeras == "exclude"){
      if( length(chimera_idxs) >= 2){
        bam <- bam[-chimera_idxs]
      }
      if (isTRUE(verbose)){
        message(sprintf("%s reads after filtering chimeras\n", length(bam)))
      }
      return(list(bam = bam, chimeras = GenomicAlignments::GAlignments()))
    }
    if (chimeras == "count"){
      temp <- separateChimeras(bam, target, tolerance = chimera.to.target,
                               by.flag = by.flag, verbose = verbose)
      return(list(bam = temp$bam, chimeras = temp$chimeras[[1]]))
    }
    if (chimeras == "merge"){
      result <- mergeChimeras(bam, chimera_idxs, 
                              max_read_overlap = max.read.overlap,
                              max_unmapped = max.unmapped,
                              verbose = verbose)
      return(list(bam = c(bam[-chimera_idxs], result$merged),
                  chimeras = result$unmerged))
    }
} # -----


# rcAlns -----
#'@title Internal CrispRVariants function for determining read orientation
#'@description Function for determining whether reads should be oriented to the
#'target strand, always displayed on the positive strand, or oriented to 
# the strand opposite the target.
#'@param target.strand  The target strand (one of "+","-","*") 
#'@param orientation One of "target", "opposite" and "positive" (Default: "target")
#'@return A logical value indicating whether reads should be reverse complemented
#'@author Helen Lindsay
rcAlns <- function(target.strand, orientation){

    if (orientation %in% c("opposite","target") & target.strand == "*"){
        warning(paste0("Target does not have a strand\n",
                       "Orienting reads to reference strand."))
      return(FALSE)
    }
    if (orientation == "positive") return(FALSE)
    if (orientation == "target"){
      if (target.strand == "-") return(TRUE)
    } 
    if (orientation == "opposite"){
      if (target.strand == "+") return(TRUE)
    } 
    return(FALSE)
              
} # -----


# collapsePairs -----
#'@title Internal CrispRVariants function for collapsing pairs with concordant indels
#'@description Given a set of alignments to a target region, finds read pairs.
#'Compares insertion/deletion locations within pairs using the cigar string.
#'Pairs with non-identical indels are excluded.  Pairs with identical indels are
#'collapsed to a single read, taking the consensus sequence of the pairs.
#'@param alns A GAlignments object.  We do not use GAlignmentPairs because amplicon-seq
#'can result in pairs in non-standard pairing orientation.
#'Must include BAM flag, must not include unmapped reads.
#'@param use.consensus Should the consensus sequence be used if pairs have a mismatch?
#'Setting this to be TRUE makes this function much slower (Default: TRUE)
#'@param keep.unpaired Should unpaired and chimeric reads be included?  (Default: TRUE)
#'@param verbose Report statistics on reads kept and excluded
#'@param ... Additional items with the same length as alns,
#'that should be filtered to match alns.
#'@return The alignments, with non-concordant pairs removed and concordant pairs
#'represented by a single read.
#'@author Helen Lindsay
collapsePairs <- function(alns, use.consensus = TRUE, keep.unpaired = TRUE,
                          verbose = TRUE, ...){
    dots <- list(...)
 
    if (length(dots) > 0){
      if (! unique(sapply(dots, length)) == length(alns)){
        stop("Each ... argument supplied must have the ",
             "same length as the alignments")
      }
    }
  
    # 1 = 2^0 = paired flag
    # 2048 = 2^11 = supplementary alignment flag
    is_primary <- !(bitwAnd(mcols(alns)$flag, 2048) & bitwAnd(mcols(alns)$flag, 1))
    pairs <- findChimeras(alns[is_primary], by.flag = FALSE) 
    # Above just matches read names

    # If there are no pairs, no need to do anything further
    if (length(pairs) == 0){
      if (isTRUE(keep.unpaired)){
        return(c(list("alignments" = alns), dots))
      } else {
        return(NULL)
      }
    }
    # Pairs are primary alignments with the same name
    nms <- rle(names(alns)[is_primary][pairs])
    nms_codes <- rep(1:length(nms$lengths), nms$lengths)

    # If reads have the same insertions and deletions, they have identical cigar strings
    cig_runs <- rle(paste(cigar(alns)[is_primary][pairs], nms_codes, sep = "."))$lengths
    concordant <- rep(cig_runs, cig_runs) == rep(nms$lengths,nms$lengths)

    # Keep first alignment from all concordant pairs
    # Flag 64 = 2^6 = first alignment in pair
    is_pair <- which(is_primary)[pairs]
    is_first <- as.logical(bitwAnd(mcols(alns)$flag[is_pair], 64))
    keep <- is_pair[concordant & is_first]

    if (verbose){
      nunpaired <- length(alns) - length(is_pair)
      cc_true <- sum(concordant)/2
      cc_false <- sum(!concordant)/2
      stats <- paste0("\nCollapsing paired alignments:\n",
                "%s original alignments\n",
                "  %s are not part of a primary alignment pair\n",
                "     (singletons and chimeras)\n",
                "  %s reads are paired \n",
                "    %s pairs have the same insertions/deletions\n",
                "    %s pairs have different insertions/deletions\n",
                "Keeping the first member of %s concordant read pairs\n")
      message(sprintf(stats, length(alns), nunpaired, length(is_pair),
                  cc_true, cc_false, cc_true))
    }
    if (keep.unpaired){
      # Keep non-pairs, including non-primary and singletons
      keep <- c(keep, setdiff(c(1:length(alns)),is_pair))
      if (verbose) message(sprintf("Keeping %s unpaired reads\n", nunpaired))
    }
    keep_alns <- alns[keep]

    if (use.consensus){
      seq_runs <- rle(paste0(nms_codes, mcols(alns[is_primary][pairs])$seq))$lengths
      same_seq <- rep(seq_runs, seq_runs) == rep(nms$lengths, nms$lengths)
  
      ncc_seqs <- mcols(alns[is_primary][pairs][concordant & !same_seq])$seq
      if (verbose){
        message(sprintf("Finding consensus for %s pairs with mismatches\n",
                    length(ncc_seqs)/2))
      }
      if (length(ncc_seqs) >= 2){
        consensus <- sapply(seq(1,length(ncc_seqs), by = 2), function(i){
        Biostrings::consensusString(ncc_seqs[i:(i+1)])
        })
        # Overwrite the sequence of the non-concordant pairs.
        # The concordant alignments are at the start of keep
        ncc_idxs <- cumsum(concordant & is_first)[concordant & is_first & !same_seq]
        mcols(keep_alns[ncc_idxs])$seq <- Biostrings::DNAStringSet(consensus)
      }
    }
    if (length(keep) == 0) return(NULL)
    filtered.dots <- lapply(dots, function(x) x[keep])

    result <- c(list("alignments" = keep_alns), filtered.dots)
    result
} # -----

Try the CrispRVariants package in your browser

Any scripts or data that you put into this service are public.

CrispRVariants documentation built on Nov. 8, 2020, 11:09 p.m.