ORFik: Open Reading Frames in Genomics

Documented in asTX extendLeaders extendTrailers makeExonRanks makeORFNames pmapFromTranscriptF pmapToTranscriptF reduceKeepAttr subsetToFrame tile1 txSeqsFromFa windowPerGroup

#' Make grouping by exons ranks
#'
#' There are two ways to make vector of exon ranking:
#' 1. Iterate per exon in ORF, byTranscript = FALSE
#' 2. Iterate per ORF in transcript, byTranscript = TRUE.
#'
#' Either by transcript or by original groupings.
#' Must be ordered, so that same transcripts are ordered together.
#' @param grl a \code{\link{GRangesList}}
#' @param byTranscript logical (default: FALSE), groups orfs by transcript
#' name or ORF name, if ORfs are by transcript, check duplicates.
#' @return an integer vector of indices for exon ranks
#' @importFrom S4Vectors nrun Rle
#' @keywords internal
makeExonRanks <- function(grl, byTranscript = FALSE) {
  validGRL(class(grl))
  g <- groupings(grl)

  if (byTranscript & length(g) > 1) {
    inds <- rep.int(1L, length(g))
    oldNames <- names(grl)
    oldNames <- data.table::chmatch(oldNames, oldNames)
    for (x in seq.int(2L, length(g))) {
      if (g[x] != g[x - 1L]) {
        if (oldNames[g[x]] == oldNames[g[x] - 1L]) {
          inds[x] <- inds[x - 1L] + 1L
        }
      } else{
        inds[x] <- inds[x - 1L]
      }
    }
    return(inds)
  }
  return(g)
}


#' Make ORF names per orf
#'
#' grl must be grouped by transcript
#' If a list of orfs are grouped by transcripts, but does not have
#' ORF names, then create them and return the new GRangesList
#' @param grl a \code{\link{GRangesList}}
#' @param groupByTx logical (T), should output GRangesList be grouped by
#' transcripts (T) or by ORFs (F)?
#' @return (GRangesList) with ORF names, grouped by transcripts, sorted.
#' @importFrom S4Vectors DataFrame
#' @export
#' @examples
#' gr_plus <- GRanges(seqnames = c("chr1", "chr1"),
#'                    ranges = IRanges(c(7, 14), width = 3),
#'                    strand = c("+", "+"))
#' gr_minus <- GRanges(seqnames = c("chr2", "chr2"),
#'                     ranges = IRanges(c(4, 1), c(9, 3)),
#'                     strand = c("-", "-"))
#' grl <- GRangesList(tx1 = gr_plus, tx2 = gr_minus)
#' makeORFNames(grl)
#'
makeORFNames <- function(grl, groupByTx = TRUE) {
  ranks <- makeExonRanks(grl, byTranscript = TRUE)
  asGR <- unlistGrl(grl)
  mcols(asGR) <- DataFrame(row.names = names(asGR),
                           names = paste0(names(asGR), "_", ranks))
  if (groupByTx) {
    return(groupGRangesBy(asGR))
  } else {
    return(groupGRangesBy(asGR, asGR$names))
  }
}

#' Tile each GRangesList group to 1-base resolution.
#'
#' Will tile a GRangesList into single bp resolution, each group of the list
#' will be splited by positions of 1. Returned values are sorted as the same
#' groups as the original GRangesList, except they are in bp resolutions.
#' This is not supported originally by GenomicRanges for GRangesList.
#' @param grl a \code{\link{GRangesList}} object with names.
#' @param sort.on.return logical (TRUE), should the groups be
#'  sorted before return (Negative ranges should be in decreasing order).
#'  Makes it a bit slower, but much safer for downstream analysis.
#' @param matchNaming logical (TRUE), should groups keep unlisted names
#'  and meta data.(This make the list very big, for > 100K groups)
#' @param is.sorted logical (TRUE), grl is presorted
#' (negative coordinates are decreasing). Set to FALSE if they are not,
#' else output will most likely be wrong!
#' @return a GRangesList grouped by original group, tiled to 1. Groups with identical names
#' will be merged.
#' @importFrom S4Vectors DataFrame
#' @export
#' @family ExtendGenomicRanges
#' @examples
#' gr1 <- GRanges("1", ranges = IRanges(start = c(1, 10, 20),
#'                                      end = c(5, 15, 25)),
#'                strand = "+")
#' gr2 <- GRanges("1", ranges = IRanges(start = c(20, 30, 40),
#'                                      end = c(25, 35, 45)),
#'                strand = "+")
#' names(gr1) = rep("tx1_1", 3)
#' names(gr2) = rep("tx1_2", 3)
#' grl <- GRangesList(tx1_1 = gr1, tx1_2 = gr2)
#' tile1(grl)
#'
tile1 <- function(grl, sort.on.return = TRUE, matchNaming = TRUE,
                  is.sorted = TRUE) {
  if (!is.grl(grl)) stop("grl must be a GRangesList")
  if (!is.sorted) grl <- sortPerGroup(grl)

  # optimize by removing all unecessary data
  ORFs <- unlist(grl, use.names = FALSE)
  mcols(ORFs) <- NULL
  names(ORFs) <- NULL
  tilex <- tile(ORFs, width =  1L)
  grouping <- groupings(grl)
  names(tilex) <- grouping

  # only negative with > 1 exons must be sorted
  if (sort.on.return) {
    negIndices <- !strandBool(tilex) & numExonsPerGroup(tilex) > 1
    tilex[negIndices] <- sortPerGroup(tilex[negIndices], quick.rev = TRUE)
  }

  if (matchNaming) {
    if (!is.null(names(grl))) {
      names(tilex) <- names(grl)[as.integer(grouping)]
    }
    tilex <- matchNaming(tilex, grl[1L])
  } else {
    tilex <- unlistGrl(tilex)
    grouping <- names(tilex)
    if (!is.null(names(grl))) {
      grouping <- names(grl)[as.integer(grouping)]
    }
    names(tilex) <- NULL
    tilex <- groupGRangesBy(tilex, grouping)
  }
  return(tilex)
}

#' Map genomic to transcript coordinates by reference
#'
#' Map range coordinates between features in the genome and
#' transcriptome (reference) space.
#'
#' Similar to GenomicFeatures' pmapToTranscripts, but in this version the
#' grl ranges are compared to reference ranges with same name, not
#' by index. This gives a large speedup, but also requires
#' all objects must be named.
#' @param grl a \code{\link{GRangesList}} of ranges within
#'  the reference, grl must either have names matching, or a meta column called
#' 'names' that gives grouping names. i.e. grl named uORF_1_ENST00001, must then
#'  have a names meta column with ENST00001.
#' @param reference a GRangesList of ranges that include grl as a subset
#' of ranges. Example: cds is grl and mrna can be reference
#' @inheritParams pmapToTranscriptF
#' @return a GRangesList in transcript coordinates
#' @family ExtendGenomicRanges
#' @export
#' @examples
#' seqname <- c("tx1", "tx2", "tx3")
#' seqs <- c("ATGGGTATTTATA", "AAAAA", "ATGGGTAATA")
#' grIn1 <- GRanges(seqnames = "1",
#'                  ranges = IRanges(start = c(21, 10), end = c(23, 19)),
#'                  strand = "-")
#' grIn2 <- GRanges(seqnames = "1",
#'                  ranges = IRanges(start = c(1), end = c(5)),
#'                  strand = "-")
#' grIn3 <- GRanges(seqnames = "1",
#'                  ranges = IRanges(start = c(1010), end = c(1019)),
#'                  strand = "-")
#' grl <- GRangesList(grIn1, grIn2, grIn3)
#' names(grl) <- seqname
#' # Find ORFs
#' test_ranges <- findMapORFs(grl, seqs,
#'                  "ATG|TGG|GGG",
#'                  "TAA|AAT|ATA",
#'                  longestORF = FALSE,
#'                  minimumLength = 0)
#' # Genomic coordinates ORFs
#' test_ranges
#' # Transcript coordinate ORFs
#' asTX(test_ranges, reference = grl)
#' # seqnames will here be index of transcript it came from
#'
asTX <- function(grl, reference,
                 ignore.strand = FALSE,
                 x.is.sorted = TRUE,
                 tx.is.sorted = TRUE) {
  orfNames <- txNames(grl, reference) # Find tx they came from
  if (sum(orfNames %in% names(reference)) != length(orfNames)) {
    orfNames <- names(grl)
    if (is.null(orfNames) || sum(orfNames %in% names(reference)) != length(orfNames)) {
      stop("not all references are present, so can not map to transcripts.",
           " Does your annotation contains  '_'+number naming?",
           "ORFik uses this for a special purpose.")
    }
  }
  reference <- reference[orfNames] # Duplicate needed references
  names(reference) <- NULL
  return(pmapToTranscriptF(grl, reference,
                           ignore.strand = ignore.strand,
                           x.is.sorted = x.is.sorted,
                           tx.is.sorted = tx.is.sorted))
}

#' Faster pmapToTranscript
#'
#' Map range coordinates between features in the transcriptome and
#' genome (reference) space.
#' The length of x must be the same as length of transcripts. Only exception is
#' if x have integer names like (1, 3, 3, 5), so that x[1] maps to 1, x[2] maps
#' to transcript 3 etc.
#'
#' This version tries to fix the shortcommings of GenomicFeature's version.
#' Much faster and uses less memory.
#' Implemented as dynamic program optimized c++ code.
#' @param x GRangesList/GRanges/IRangesList/IRanges to map
#' to transcriptomic coordinates
#' @param transcripts a GRangesList/GRanges/IRangesList/IRanges to
#'  map against (the genomic coordinates). Must be of lower abstraction level
#'  than x. So if x is GRanges, transcripts can not be IRanges etc.
#' @param ignore.strand When ignore.strand is TRUE, strand is ignored in
#'  overlaps operations (i.e., all strands are considered "+") and the
#'  strand in the output is '*'. \cr
#'  When ignore.strand is FALSE (default) strand in the output is taken from the
#'  transcripts argument. When transcripts is a GRangesList, all inner list
#'  elements of a common list element must have the same strand
#'  or an error is thrown. \cr
#'  Mapped position is computed by counting from the transcription start site
#'   (TSS) and is not affected by the value of ignore.strand.
#' @param x.is.sorted if x is a GRangesList object, are "-" strand groups pre-sorted
#' in decreasing order within group, default: TRUE
#' @param tx.is.sorted if transcripts is a GRangesList object,
#' are "-" strand groups pre-sorted in decreasing order within group,
#' default: TRUE
#' @return object of same class as input x, names from ranges are kept.
#' @export
#' @examples
#' library(GenomicFeatures)
#' # Need 2 ranges object, the target region and whole transcript
#' # x is target region
#' x <- GRanges("chr1", IRanges(start = c(26, 29), end = c(27, 29)), "+")
#' names(x) <- rep("tx1_ORF1", length(x))
#' x <- groupGRangesBy(x)
#' # tx is the whole region
#' tx_gr <- GRanges("chr1", IRanges(c(5, 29), c(27, 30)), "+")
#' names(tx_gr) <- rep("tx1", length(tx_gr))
#' tx <- groupGRangesBy(tx_gr)
#' pmapToTranscriptF(x, tx)
#' pmapToTranscripts(x, tx)
#'
#' # Reuse names for matching
#' x <- GRanges("chr1", IRanges(start = c(26, 29, 5), end = c(27, 29, 18)), "+")
#' names(x) <- c(rep("tx1_1", 2), "tx1_2")
#' x <- groupGRangesBy(x)
#' tx1_2 <- GRanges("chr1", IRanges(c(4, 28), c(26, 31)), "+")
#' names(tx1_2) <- rep("tx1", 2)
#' tx <- c(tx, groupGRangesBy(tx1_2))
#'
#' a <- pmapToTranscriptF(x, tx[txNames(x)])
#' b <- pmapToTranscripts(x, tx[txNames(x)])
#' identical(a, b)
#' seqinfo(a)
#' # A note here, a & b only have 1 seqlength, even though the 2 "tx1"
#' # are different in size. This is an artifact of using duplicated names.
#'
#' ## Also look at the asTx for a similar useful function.
pmapToTranscriptF <- function(x, transcripts, ignore.strand = FALSE,
                              x.is.sorted = TRUE, tx.is.sorted = TRUE) {
  if ((length(x) == 0)) return(x)

  if (length(x) != length(transcripts)) { # Recycling
    if (length(x) == 1) {
      x <- rep(x, length(transcripts))
    } else if(length(transcripts) == 1) {
      transcripts <- rep(transcripts, length(x))
    } else stop("recycling is supported when length(x) == 1 or,",
                "length(transcripts) == 1; otherwise the lengths must match")
  }

  # Store original values we need
  xOriginal <- x; oldNames <- names(x); xClass <- class(x)
  xWidths <- width(xOriginal)

  oldTxNames <- names(transcripts)
  txWidths <- if (is.grl(transcripts)) {
    widthPerGroup(transcripts, FALSE)
  } else width(transcripts)

  xStrandOriginal <- if(is.grl(xOriginal)) {
    as.character(unlist(strand(xOriginal), use.names = FALSE))
  } else if (is(xOriginal, "GRanges")) {
    as.character(strand(xOriginal))
  } else NULL

  # subset to ranges and get indices for x
  if (is.grl(x) & !x.is.sorted) x <- sortPerGroup(x, ignore.strand)
  if (is.gr_or_grl(x)) x <- ranges(x)
  if (is(x, "IRangesList")) {
    indices <- groupings(x)
    xWidths <- as.integer(sum(xWidths))
    x <- unlist(x, use.names = FALSE)
    names(x) <- NULL
  } else if (is(x, "IRanges")) {
    indices <- seq.int(1, length(x))
    names(x) <- NULL
  } else stop("x must either be IRanges, IRangesList, GRanges or GRangesList")

  # Sanity tests
  if (!is.logical(ignore.strand)) stop("ignore.strand must be logical")
  ignore.strand <- as.logical(ignore.strand)
  if (max(indices) > length(transcripts)) stop("invalid names of IRanges")
  if (length(x) != length(indices)) stop("length of ranges != indices")
  notEqualSeqnames <- is.gr_or_grl(xOriginal) & is.gr_or_grl(transcripts) &
                         !all(seqlevels(xOriginal) %in% seqlevels(transcripts))
  if (notEqualSeqnames) stop("subscript contains out-of-bounds indices")
  # TODO: add propper test for per row seqnames, not just seqlevels
  if (is.grl(transcripts) & !tx.is.sorted)
    transcripts <- sortPerGroup(transcripts, ignore.strand)
  if (!all(xWidths <= txWidths)) {
    stop("Invalid ranges to map, check them.",
         " One has width bigger than its reference")
  }

  # Unlist tx, if list structure
  tx <- ranges(transcripts)
  names <- names(tx)
  names(tx) <- NULL
  if (is.grl(transcripts) | is(transcripts, "IRangesList")) {
    tx <- unlist(tx, use.names = FALSE)
    groupings <- groupings(transcripts)
    exonN <- lengths(transcripts)
  } else { # not list
    groupings <- seq.int(1, length(transcripts))
    exonN <- seq.int(1, length(transcripts))
  }

  txStrand <- if (ignore.strand) { # If ignore strand, set to '+'
    rep(TRUE, length(transcripts))
  } else strandBool(transcripts)
  xStrand <- if (ignore.strand) { # If ignore strand, set all to '+'
    rep(TRUE, length(indices))
  } else txStrand[indices]

  # Split indices for x into pos / neg-strand
  if (is.grl(xOriginal) | is(xOriginal, "IRangesList")) {
    indicesPos <- groupings(xOriginal[txStrand])
    indicesNeg <- groupings(xOriginal[!txStrand])
  } else {
    indicesPos <- seq_along(xOriginal[txStrand])
    indicesNeg <- seq_along(xOriginal[!txStrand])
  }
  # Make algorithm dynamic, by skipping if you know you can go to next transcript
  exonCumSumPos <- c(0, cumsum(exonN[txStrand]))[indicesPos]
  exonCumSumNeg <- c(0, cumsum(exonN[!txStrand]))[indicesNeg]
  txStrand <- txStrand[groupings]

  # Here is pos and neg direction of the algorithm ->
  # forward strand (c++ code)
  if (any(txStrand)) {
    pos <- pmapToTranscriptsCPP(start(x)[xStrand], end(x)[xStrand],
                                start(tx)[txStrand], end(tx)[txStrand],
                                groupings[txStrand], '+', exonCumSumPos)
  } else {
    pos <- list(ranges = list(vector("integer"), vector("integer")),
                index = vector("integer"))
  }

  # reverse strand (c++ code)
  if (any(!txStrand)) {
    neg <- pmapToTranscriptsCPP(start(x)[!xStrand], end(x)[!xStrand],
                                start(tx)[!txStrand], end(tx)[!txStrand],
                                groupings[!txStrand], '-',
                                exonCumSumNeg)
  } else {
    neg <- list(ranges = list(vector("integer"), vector("integer")),
                index = vector("integer"))
  }
  xStart <- vector("integer", length(indices))
  xStart[xStrand] <- unlist(pos$ranges[1], use.names = FALSE)
  xStart[!xStrand] <- unlist(neg$ranges[1], use.names = FALSE)

  xEnd <- vector("integer", length(indices))
  xEnd[xStrand] <- unlist(pos$ranges[2], use.names = FALSE)
  xEnd[!xStrand] <- unlist(neg$ranges[2], use.names = FALSE)

  result <- IRanges(xStart, xEnd)


  if (is.gr_or_grl(xClass)) { # If it was GR object, recreate
    newSeqnames <- if (!is.null(oldTxNames)) {
      oldTxNames
      # TODO: check that bellow seq.int is valid
    } else seq.int(length(result))
    newStrand <- if (ignore.strand) {
      "*"
    } else if (is.grl(transcripts)) {
      strandPerGroup(transcripts, FALSE)[indices]
    } else as.character(strand(transcripts))[indices]
    result <- GRanges(seqnames = newSeqnames[indices],
                      ranges = result, strand = newStrand)
    # TODO: remove this when not needed
    unmapped <- start(result) == 0
    if (!ignore.strand) # Check strand correction only if not ignore
      unmapped <- unmapped | (strand(result) != xStrandOriginal)
    if (any(unmapped)) {
      strand(result)[unmapped] <- "*"
    }

    seqlevels.used <- seqlevels(result)
    if(is.null(names(transcripts))) {
      seqlevels.used <- as.integer(seqlevels.used)
    }
    seqlengths(result) <- if (is.grl(transcripts)) {
      widthPerGroup(transcripts[seqlevels.used])
    } else as.integer(width(transcripts[seqlevels.used]))
  }

  if (is.grl(xClass) | is(xOriginal, "IRangesList")) {
    result <- split(result, indices)
    names(result) <- oldNames
    result <- reduce(result, drop.empty.ranges = FALSE)
  } else names(result) <- oldNames
  return(result)
}

#' Faster pmapFromTranscript
#'
#' Map range coordinates between features in the transcriptome and
#' genome (reference) space.
#' The length of x must be the same as length of transcripts. Only exception is
#' if x have integer names like (1, 3, 3, 5), so that x[1] maps to 1, x[2] maps
#' to transcript 3 etc.
#'
#' This version tries to fix the short commings of GenomicFeature's version.
#' Much faster and uses less memory.
#' Implemented as dynamic program optimized c++ code.
#' @param x IRangesList/IRanges/GRanges to map to genomic coordinates
#' @param transcripts a GRangesList to map against (the genomic coordinates)
#' @param removeEmpty a logical, remove non hit exons, else they are set
#'  to 0. That is all exons in the reference that the transcript coordinates
#'  do not span.
#' @return a GRangesList of mapped reads, names from ranges are kept.
#' @export
#' @examples
#' ranges <- IRanges(start = c( 5, 6), end = c(10, 10))
#' seqnames = rep("chr1", 2)
#' strands = rep("-", 2)
#' grl <- split(GRanges(seqnames, IRanges(c(85, 70), c(89, 82)), strands),
#'              c(1, 1))
#' ranges <- split(ranges, c(1,1)) # both should be mapped to transcript 1
#' pmapFromTranscriptF(ranges, grl, TRUE)
#'
pmapFromTranscriptF <- function(x, transcripts, removeEmpty = FALSE) {
  if (is.null(names(x))) {
    if (length(x) != length(transcripts))
      stop("invalid names of ranges, when length(x) != length(transcripts)")
  }
  if (is(x, "GRanges")) x <- ranges(x)
  if (is(x, "IRangesList")) {
    # Create Ranges object from orf scanner result
    x = unlist(x, use.names = TRUE)
    indices <- strtoi(names(x))
    names(x) <- NULL
  } else if (is(x, "IRanges")) {
    if (is.null(names(x))) {
      indices <- seq.int(1, length(x))
    } else {
      indices <- strtoi(names(x))
    }
    names(x) <- NULL
  } else stop("x must either be IRanges, IRangesList or GRanges")

  if (!is.logical(removeEmpty)) stop("removeEmpty must be logical")
  removeEmpty <- as.logical(removeEmpty)
  if (max(indices) > length(transcripts)) stop("invalid names of IRanges")
  if (length(x) != length(indices)) stop("length of ranges != indices")

  tx <- ranges(transcripts)
  names <- names(tx)
  names(tx) <- NULL
  tx <- tx[indices]
  if (!all(width(x) <= as.integer(sum(width(tx))))) {
    stop("Invalid ranges to map, check them. One is bigger than its reference")
  }
  groupings <- groupings(tx)
  tx <- unlist(tx, use.names = FALSE)
  xStrand <- strandBool(transcripts)[indices]
  txStrand <- xStrand[groupings]

  # forward strand
  if (any(txStrand)) {
    pos <- pmapFromTranscriptsCPP(start(x)[xStrand], end(x)[xStrand],
                                  start(tx)[txStrand], end(tx)[txStrand],
                                  groupings[txStrand], '+', removeEmpty)
  } else {
    pos <- list(ranges = list(vector("integer"), vector("integer")),
                index = vector("integer"))
  }

  # reverse strand
  if (any(!txStrand)) {
    neg <- pmapFromTranscriptsCPP(start(x)[!xStrand], end(x)[!xStrand],
                                  start(tx)[!txStrand], end(tx)[!txStrand],
                                  groupings[!txStrand], '-', removeEmpty)
  } else {
    neg <- list(ranges = list(vector("integer"), vector("integer")),
                index = vector("integer"))
  }

  if (removeEmpty) {
    txStrand <- order(c(pos$index, neg$index)) <=  length(pos$index)
  }
  temp <- indices
  nIndices <- vector("integer", length(txStrand))
  nIndices[txStrand] <- pos$index
  nIndices[!txStrand] <- neg$index
  indices <- indices[nIndices]

  xStart <- vector("integer", length(txStrand))
  xStart[txStrand] <- unlist(pos$ranges[1], use.names = FALSE)
  xStart[!txStrand] <- unlist(neg$ranges[1], use.names = FALSE)

  xEnd <- vector("integer", length(txStrand))
  xEnd[txStrand] <- unlist(pos$ranges[2], use.names = FALSE)
  xEnd[!txStrand] <- unlist(neg$ranges[2], use.names = FALSE)

  result <- GRanges(seqnames = seqnamesPerGroup(transcripts, FALSE)[indices],
                    ranges = IRanges(xStart, xEnd),
                    strand = strandPerGroup(transcripts, FALSE)[indices])

  names(result) <- names[indices]
  result <- split(result, nIndices)
  names(result) <- names(transcripts)[temp]
  seqlevels(result) <- seqlevels(transcripts)
  seqinfo(result) <- seqinfo(transcripts)
  return(result)
}

#' Get transcript sequence from a GRangesList and a faFile or BSgenome
#'
#' For each GRanges object, find the sequence of it from faFile or BSgenome.
#'
#' A wrapper around \code{\link{extractTranscriptSeqs}} that works for
#' DNAStringSet and ORFik \code{\link{experiment}} input.
#' For debug of errors do:
#' which(!(unique(seqnamesPerGroup(grl, FALSE)) %in% seqlevels(faFile)))
#' This happens usually when the grl contains chromsomes that the fasta
#' file does not have. A normal error is that mitocondrial chromosome is
#' called MT vs chrM even though they have same seqlevelsStyle. The
#' above line will give you which chromosome it is missing.
#' @param grl a \code{\link{GRangesList}} object
#' @inheritParams findFa
#' @param is.sorted a speedup, if you know the grl ranges are sorted
#' @param keep.names a logical, default (TRUE), if FALSE: return as
#' character vector without names.
#' @export
#' @return a \code{\link{DNAStringSet}} of the transcript sequences
#' @importFrom GenomicFeatures extractTranscriptSeqs
#' @family ExtendGenomicRanges
#'
txSeqsFromFa <- function(grl, faFile, is.sorted = FALSE,
                         keep.names = TRUE) {
  faFile <- findFa(faFile)
  if (!any(seqlevels(grl) %in% seqlevels(faFile)))
    stop("FaFile had no matching seqlevels to ranges object")
  if (!is.sorted) grl <- sortPerGroup(grl)

  # Check for optimization, if conditions are met
  path <- ifelse(is.character(faFile), faFile, ifelse(is(faFile, "FaFile"), faFile$path, ""))
  if (path != "") {
    if (file.size(path) < 1e7 & length(grl) > 50) {
      # TODO: make failsafe for more special cases
      faFile <- readDNAStringSet(path)
      seqnames_grl <- unique(seqnamesPerGroup(grl, FALSE))
      bad_name_format <- !all(seqnames_grl %in% names(faFile))
      if (bad_name_format) {
        new_names <- try(names(seqinfo(FaFile(path))))
        if (all(seqnames_grl %in% new_names)) names(faFile) <- new_names
      }
    }
  }
  if (is(faFile, "DNAStringSet")) {
    seqs <- faFile[grl]
  } else {
    seqs <- extractTranscriptSeqs(faFile, transcripts = grl)
  }
  if (!keep.names) return(as.character(seqs, use.names = FALSE))
  return(seqs)
}

#' Get window region of GRanges object
#'
#' Per GRanges input (gr) of single position inputs (center point),
#' create a GRangesList window output of specified
#' upstream, downstream region relative to some transcript "tx". \cr
#' If downstream is 20, it means the window will start 20 downstream of
#' gr start site (-20 in relative transcript coordinates.)
#' If upstream is 20, it means the window will start 20 upstream of
#' gr start site (+20 in relative transcript coordinates.)
#' It will keep exon structure of tx, so if -20 is on next exon, it jumps to
#' next exon.
#'
#' If a region has a part that goes out of bounds, E.g if you try to get window
#' around the CDS start site, goes longer than the 5' leader start site,
#' it will set start to the edge boundary
#' (the TSS of the transcript in this case).
#' If region has no hit in bound, a width 0 GRanges object is returned.
#' This is useful for things like countOverlaps, since 0 hits will then always
#' be returned for the correct object index. If you don't want the 0 width
#' windows, use \code{reduce()} to remove 0-width windows.
#' @param gr a GRanges/IRanges object (startSites or others,
#'  must be single point per in genomic coordinates)
#' @param tx a \code{\link{GRangesList}} of transcripts or (container region),
#' names of tx must contain all gr names. The names of gr can also be the
#' ORFik orf names. that is "txName_id".
#' @param upstream an integer, default (0), relative region to get
#'  upstream end from. (0 means start site, +1 is one upstream, -1 is one downstream)
#' @param downstream an integer, default (0), relative region to get
#'  downstream end from (0 means start site, +1 is one downstream, -1 is one upstream)
#' @return a GRanges, or GRangesList object if any group had > 1 exon.
#' @export
#' @family ExtendGenomicRanges
#' @importFrom data.table chmatch
#' @examples
#' # find 2nd codon of an ORF on a spliced transcript
#' ORF <- GRanges("1", c(3), "+") # start site
#' names(ORF) <- "tx1_1" # ORF 1 on tx1
#' tx <- GRangesList(tx1 = GRanges("1", c(1,3,5,7,9,11,13), "+"))
#' windowPerGroup(ORF, tx, upstream = 0, downstream = 0) # <- TIS
#' windowPerGroup(ORF, tx, upstream = 0, downstream = 1) # <- first and second base
#' windowPerGroup(ORF, tx, upstream = -1, downstream = 1) # <- second base
#' # find 2nd codon of an ORF on a spliced transcript
#' windowPerGroup(ORF, tx, upstream = -3, downstream = 5) # <- 2nd codon
#'
#' # With multiple extensions downstream
#' ORF <- rep(ORF, 2)
#' names(ORF)[2] <- "tx1_2"
#' windowPerGroup(ORF, tx, upstream = 0, downstream = c(2, 5))
#' # The last one gives 2nd for first ORF and (1st and 2nd) codon for
#' # second ORF, returned as two groups of class GRanges/GRangsList
#'
windowPerGroup <- function(gr, tx, upstream = 0L, downstream = 0L) {
  g <- asTX(gr, tx, tx.is.sorted = TRUE)
  indices <- chmatch(txNames(gr, tx), names(tx))
  if (anyNA(indices)) {
    indices <- chmatch(names(gr), names(tx))
    if (anyNA(indices)) {
      stop("not all references are present, so can not map to transcripts.",
           " Does your annotation contains  '_'+number naming?",
           "ORFik uses this for a special purpose.")
    }
  }
  txEnds <- widthPerGroup(tx[indices], FALSE)
  starts <- pmin(pmax(start(g) - upstream, 1L), txEnds)

  g <- ranges(g)
  if (any(downstream != 0L)) {
    ends <- pmin(pmax(end(g) + downstream, starts - 1), txEnds)
    if (is(g, "IRanges")) # Remove this ?
    g <- IRanges(starts, ends)
  } else {
    start(g) <- starts
  }
  names(g) <- indices
  region <- pmapFromTranscriptF(g, tx, TRUE)
  names(region) <- names(gr)
  return(region)
}

#' Extend the leaders transcription start sites.
#'
#' Will extend the leaders or transcripts upstream (5' end) by extension.
#' The extension is general not relative, that means splicing
#' will not be taken into account.
#' Requires the \code{grl} to be sorted beforehand,
#' use \code{\link{sortPerGroup}} to get sorted grl.
#' @param grl usually a \code{\link{GRangesList}} of 5' utrs or transcripts.
#' Can be used for any extension of groups.
#' @param extension an integer, how much to extend upstream (5' end).
#' Eiter single value that will apply for all, or same as length of grl
#' which will give 1 update value per grl object.
#' Or a GRangesList where start / stops by strand are the positions
#' to use as new starts.
#' @param cds a \code{\link{GRangesList}} of coding sequences,
#' If you want to extend 5' leaders downstream, to catch
#' upstream ORFs going into cds, include it. It will add first
#' cds exon to grl matched by names.
#' Do not add for transcripts, as they are already included.
#' @param is.circular logical, default FALSE if not any is: all(isCircular(grl) %in% TRUE).
#' Where grl is the ranges checked. If TRUE, allow ranges to extend
#' below position 1 on chromosome. Since circular genomes can have negative coordinates.
#' @return an extended GRangeslist
#' @export
#' @family ExtendGenomicRanges
#' @examples
#' library(GenomicFeatures)
#' samplefile <- system.file("extdata", "hg19_knownGene_sample.sqlite",
#'                           package = "GenomicFeatures")
#' txdb <- loadDb(samplefile)
#' fiveUTRs <- fiveUTRsByTranscript(txdb, use.names = TRUE) # <- extract only 5' leaders
#' tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
#' cds <- cdsBy(txdb,"tx",use.names = TRUE)
#' ## extend leaders upstream 1000
#' extendLeaders(fiveUTRs, extension = 1000)
#' ## now try(extend upstream 1000, add all cds exons):
#' extendLeaders(fiveUTRs, extension = 1000, cds)
#'
#' ## when extending transcripts, don't include cds' of course,
#' ## since they are already there
#' extendLeaders(tx, extension = 1000)
#' ## Circular genome (allow negative coordinates)
#' circular_fives <- fiveUTRs
#' isCircular(circular_fives) <- rep(TRUE, length(isCircular(circular_fives)))
#' extendLeaders(circular_fives, extension = 32672841L)
#'
extendLeaders <- function(grl, extension = 1000L, cds = NULL,
                          is.circular = all(isCircular(grl) %in% TRUE)) {
  if (is(extension, "numeric") && length(extension) %in% c(1L, length(grl))) {
    posIndices <- strandBool(grl)
    promo <- promoters(unlist(firstExonPerGroup(grl), use.names = FALSE),
                       upstream = extension)
    newStarts <- rep(NA, length(grl))
    newStarts[posIndices] <- as.integer(start(promo[posIndices]))
    newStarts[!posIndices] <- as.integer(end(promo[!posIndices]))
  } else if (is.grl(class(grl))) {
    starts <- startSites(extension)
    changedGRL <- downstreamFromPerGroup(grl[names(extension)], starts, is.circular)
    return(changedGRL)
  } else {
    stop("extension must either be an integer, or a GRangesList")
  }

  extendedLeaders <- assignFirstExonsStartSite(grl, newStarts, is.circular)
  if(is.null(cds)) return (extendedLeaders)
  return(addCdsOnLeaderEnds(extendedLeaders, cds))
}

#' Extend the Trailers transcription stop sites
#'
#' Will extend the trailers or transcripts downstream (3' end) by extension.
#' The extension is general not relative, that means splicing
#' will not be taken into account.
#' Requires the \code{grl} to be sorted beforehand,
#' use \code{\link{sortPerGroup}} to get sorted grl.
#' @param grl usually a \code{\link{GRangesList}} of 3' utrs or transcripts.
#' Can be used for any extension of groups.
#' @param extension an integer, how much to extend downstream (3' end).
#' Eiter single value that will apply for all, or same as length of grl
#' which will give 1 update value per grl object.
#' Or a GRangesList where start / stops sites by strand are the positions
#' to use as new starts.
#' @inheritParams extendLeaders
#' @return an extended GRangeslist
#' @export
#' @family ExtendGenomicRanges
#' @examples
#' library(GenomicFeatures)
#' samplefile <- system.file("extdata", "hg19_knownGene_sample.sqlite",
#'                           package = "GenomicFeatures")
#' txdb <- loadDb(samplefile)
#' threeUTRs <- threeUTRsByTranscript(txdb) # <- extract only 5' leaders
#' tx <- exonsBy(txdb, by = "tx", use.names = TRUE)
#' ## now try(extend downstream 1000):
#' extendTrailers(threeUTRs, extension = 1000)
#' ## Or on transcripts
#' extendTrailers(tx, extension = 1000)
#' ## Circular genome (allow negative coordinates)
#' circular_three <- threeUTRs
#' isCircular(circular_three) <- rep(TRUE, length(isCircular(circular_three)))
#' extendTrailers(circular_three, extension = 126200008L)[41] # <- negative stop coordinate
#'
extendTrailers <- function(grl, extension = 1000L, is.circular =
                             all(isCircular(grl) %in% TRUE)) {
  if (is(extension, "numeric") && length(extension) %in% c(1L, length(grl))) {
    posIndices <- strandBool(grl)
    promo <- flank(unlist(lastExonPerGroup(grl), use.names = FALSE),
                   width = extension, start = FALSE)
    newEnds <- rep(NA, length(grl))
    newEnds[posIndices] <- as.integer(end(promo[posIndices]))
    newEnds[!posIndices] <- as.integer(start(promo[!posIndices]))
  } else if (is.grl(class(extension))) {
    starts <- startSites(extension)
    changedGRL <-upstreamOfPerGroup(grl[names(extension)], starts,
                                    allowOutside = TRUE, is.circular)
    return(changedGRL)
  } else {
    stop("extension must either be an integer, or a GRangesList")
  }
  return(assignLastExonsStopSite(grl, newEnds, is.circular = is.circular))
}

#' Subset GRanges to get desired frame.
#'
#' Usually used for ORFs to get specific frame (0-2):
#' frame 0, frame 1, frame 2
#'
#' GRanges object should be beforehand
#' tiled to size of 1. This subsetting takes account for strand.
#' @param x A tiled to size of 1 GRanges object
#' @param frame A numeric indicating which frame to extract
#' @return GRanges object reduced to only first frame
#' @export
#' @examples
#' subsetToFrame(GRanges("1", IRanges(1:10, width = 1), "+"), 2)
subsetToFrame <- function(x, frame) {
  if (as.vector(strand(x) == "+")[1]) {
    x[seq.int(frame, length(x), 3)]
  } else {
    x[seq.int(length(x) + 1 - frame, 1, -3)]
  }
}


#' Reduce GRanges / GRangesList
#'
#' Reduce away all GRanges elements with 0-width.
#'
#' Extends function \code{\link{reduce}}
#' by trying to keep names and meta columns, if it is a
#' GRangesList. It also does not lose sorting for GRangesList,
#' since original reduce sorts all by ascending position.
#' If keep.names == FALSE, it's just the normal GenomicRanges::reduce
#' with sorting negative strands descending for GRangesList.
#' @param grl a \code{\link{GRangesList}} or GRanges object
#' @param drop.empty.ranges (FALSE) if a group is empty (width 0), delete it.
#' @param min.gapwidth (1L) how long gap can it be between two ranges,
#' to merge them.
#' @param with.revmap (FALSE) return info on which mapped to which
#' @param with.inframe.attrib (FALSE) For internal use.
#' @param ignore.strand (FALSE), can different strands be reduced together.
#' @param keep.names (FALSE) keep the names and meta columns of the GRangesList
#' @param min.strand.decreasing (TRUE), if GRangesList, return minus strand group
#' ranges in decreasing order (1-5, 30-50) -> (30-50, 1-5)
#' @return A reduced GRangesList
#' @export
#' @family ExtendGenomicRanges
#' @examples
#' ORF <- GRanges(seqnames = "1",
#'                ranges = IRanges(start = c(1, 2, 3), end = c(1, 2, 3)),
#'                strand = "+")
#' # For GRanges
#' reduceKeepAttr(ORF, keep.names = TRUE)
#' # For GRangesList
#' grl <- GRangesList(tx1_1 = ORF)
#' reduceKeepAttr(grl, keep.names = TRUE)
#'
reduceKeepAttr <- function(grl, keep.names = FALSE,
                           drop.empty.ranges = FALSE, min.gapwidth = 1L,
                           with.revmap = FALSE, with.inframe.attrib = FALSE,
                           ignore.strand = FALSE, min.strand.decreasing = TRUE) {
  if (!is.gr_or_grl(class(grl))) stop("grl must be GRanges or GRangesList")

  reduced <- GenomicRanges::reduce(grl, drop.empty.ranges, min.gapwidth,
                                   with.revmap, with.inframe.attrib,
                                   ignore.strand)
  if (is.grl(class(grl)) & min.strand.decreasing) {
    reduced <- reverseMinusStrandPerGroup(reduced)
  }
  if (keep.names) { # return with names
    if (!is.grl(class(grl))) {
      return(reduced)
    }
    gr <- unlist(reduced, use.names = TRUE)
    if (length(gr) == 0) return(GRangesList())

    return(matchNaming(gr, grl))
  } else { # return original
    return(reduced)
  }
}
Roleren/ORFik documentation built on Oct. 19, 2024, 7:37 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Roleren/ORFik
Open Reading Frames in Genomics

R/ranges_helpers.R
In Roleren/ORFik: Open Reading Frames in Genomics

Defines functions reduceKeepAttr subsetToFrame extendTrailers extendLeaders windowPerGroup txSeqsFromFa pmapFromTranscriptF pmapToTranscriptF asTX tile1 makeORFNames makeExonRanks

Documented in asTX extendLeaders extendTrailers makeExonRanks makeORFNames pmapFromTranscriptF pmapToTranscriptF reduceKeepAttr subsetToFrame tile1 txSeqsFromFa windowPerGroup

R Package Documentation

Browse R Packages

We want your feedback!

Roleren/ORFik Open Reading Frames in Genomics

R/ranges_helpers.R In Roleren/ORFik: Open Reading Frames in Genomics

Defines functions reduceKeepAttr subsetToFrame extendTrailers extendLeaders windowPerGroup txSeqsFromFa pmapFromTranscriptF pmapToTranscriptF asTX tile1 makeORFNames makeExonRanks

Documented in asTX extendLeaders extendTrailers makeExonRanks makeORFNames pmapFromTranscriptF pmapToTranscriptF reduceKeepAttr subsetToFrame tile1 txSeqsFromFa windowPerGroup

R Package Documentation

Browse R Packages

We want your feedback!

Roleren/ORFik
Open Reading Frames in Genomics

R/ranges_helpers.R
In Roleren/ORFik: Open Reading Frames in Genomics