R/gr_positions.R

Defines functions upstreamFromPerGroup upstreamOfPerGroup downstreamFromPerGroup downstreamOfPerGroup assignLastExonsStopSite assignFirstExonsStartSite

Documented in assignFirstExonsStartSite assignLastExonsStopSite downstreamFromPerGroup downstreamOfPerGroup upstreamFromPerGroup upstreamOfPerGroup

#' Reassign the start positions of the first exons per group in grl
#'
#' Per group in GRangesList, assign the most upstream site.
#'
#' make sure your grl is sorted, since start of "-" strand
#' objects should be the
#' max end in group, use ORFik:::sortPerGroup(grl) to get sorted grl.
#' @param grl a \code{\link{GRangesList}} object
#' @param newStarts an integer vector of same length as grl, with new start
#' values (absolute coordinates, not relative)
#' @inheritParams extendLeaders
#' @return the same GRangesList with new start sites
#' @family GRanges
#'
assignFirstExonsStartSite <- function(grl, newStarts, is.circular =
                                        all(isCircular(grl) %in% TRUE)) {
  if (length(grl) != length(newStarts)) stop("length of grl and newStarts ",
                                             "are not equal!")
  posIndices <- strandBool(grl)
  if (!is.circular) {
    if (any(newStarts < 1)) {
      message("Transcript found that would be extended below coordinate position 0, setting to 1.")
      newStarts <- pmax(newStarts, 1)
    }
  }

  dt <- as.data.table(grl)
  dt[!duplicated(dt$group),]$start[posIndices] <- newStarts[posIndices]
  dt[!duplicated(dt$group),]$end[!posIndices] <- newStarts[!posIndices]

  ngrl <-
    GenomicRanges::makeGRangesListFromDataFrame(dt,
                                                split.field = "group",
                                                names.field = "group_name",
                                                keep.extra.columns = TRUE)
  names(ngrl) <- names(grl)

  return(ngrl)
}


#' Reassign the stop positions of the last exons per group
#'
#' Per group in GRangesList, assign the most upstream site.
#'
#' make sure your grl is sorted, since stop of "-" strand objects
#' should be the min start in group, use ORFik:::sortPerGroup(grl) to get
#' sorted grl.
#' @param grl a \code{\link{GRangesList}} object
#' @param newStops an integer vector of same length as grl,
#'  with new start values (absolute coordinates, not relative)
#' @inheritParams extendLeaders
#' @return the same GRangesList with new stop sites
#' @importFrom data.table .N .I
#' @family GRanges
#'
assignLastExonsStopSite <- function(grl, newStops, is.circular =
                                      all(isCircular(grl) %in% TRUE)) {
  if (length(grl) != length(newStops)) stop("length of grl and newStops ",
                                            "are not equal!")
  posIndices <- strandBool(grl)
  if (!is.circular) {
    if (any(newStops < 1)) {
      message("Transcript found that would be extended below coordinate position 0, setting to 1.")
      newStops <- pmax(newStops, 1)
    }
  }
  dt <- as.data.table(grl)
  group <- NULL # avoid check warning
  idx = dt[, .I[.N], by = group]
  dt[idx$V1]$end[posIndices] <- newStops[posIndices]
  dt[idx$V1]$start[!posIndices] <- newStops[!posIndices]
  ngrl <-
    GenomicRanges::makeGRangesListFromDataFrame(dt,
                                                split.field = "group",
                                                names.field = "group_name",
                                                keep.extra.columns = TRUE)
  names(ngrl) <- names(grl)

  return(ngrl)
}


#' Get rest of objects downstream (exclusive)
#'
#' Per group get the part downstream of position.
#' downstreamOfPerGroup(tx, stopSites(cds, asGR = TRUE))
#' will return the 3' utrs per transcript as GRangesList,
#' usually used for interesting
#' parts of the transcripts.
#'
#' If you want to include the points given in the region,
#' use downstreamFromPerGroup
#' @param tx a \code{\link{GRangesList}},
#'  usually of Transcripts to be changed
#' @param downstreamOf a vector of integers, for each group in tx, where
#' is the new start point of first valid exon. Can also be a GRangesList,
#' then stopsites will be used.
#' @return a GRangesList of downstream part
#' @family GRanges
#'
downstreamOfPerGroup <- function(tx, downstreamOf) {
  if (is.grl(downstreamOf)) {
    downstreamOf <- stopSites(downstreamOf, asGR = TRUE, is.sorted = TRUE,
                              keep.names = TRUE)
  } else if (is.numeric(downstreamOf) & length(downstreamOf) == length(tx)) {
    downstreamOf <- IRanges(downstreamOf, width  = 1)
    if (is.null(names(tx))) names(tx) <- seq_along(tx)
    names(downstreamOf) <- names(tx)
  } else stop("downstreamOf must be GRangesList, or numeric of equal",
              "size to tx")
  return(windowPerGroup(downstreamOf, tx, upstream = -1,
                      downstream = max(widthPerGroup(tx, FALSE))))
}

#' Get rest of objects downstream (inclusive)
#'
#' Per group get the part downstream of position.
#' downstreamFromPerGroup(tx, startSites(threeUTRs, asGR = TRUE))
#' will return the  3' utrs per transcript as GRangesList,
#' usually used for interesting
#' parts of the transcripts.
#'
#' If you don't want to include the points given in the region,
#' use \code{\link{downstreamOfPerGroup}}
#' @param tx a \code{\link{GRangesList}},
#'  usually of Transcripts to be changed
#' @param downstreamFrom a vector of integers, for each group in tx, where
#' is the new start point of first valid exon.
#' @inheritParams extendLeaders
#' @return a GRangesList of downstream part
#' @family GRanges
#'
downstreamFromPerGroup <- function(tx, downstreamFrom, is.circular =
                                     all(isCircular(tx) %in% TRUE)) {
  # Needs speed update!
  posIndices <- strandBool(tx)
  posEnds <- end(tx[posIndices])
  negEnds <- start(tx[!posIndices])
  posDown <- downstreamFrom[posIndices]
  negDown <- downstreamFrom[!posIndices]
  pos <- posEnds >= posDown
  neg <- negEnds <= negDown
  posTx <- tx[posIndices][pos]
  negTx <- tx[!posIndices][neg]
  downTx <- tx
  downTx[posIndices] <- posTx
  downTx[!posIndices] <- negTx
  #check if anyone hits boundary, set those to boundary
  if (anyNA(strandPerGroup(downTx, FALSE))) {
    boundaryHits <- which(is.na(strandPerGroup(downTx, FALSE)))
    downTx[boundaryHits] <- firstExonPerGroup(tx[boundaryHits])
    ir <- IRanges(start = downstreamFrom[boundaryHits],
                  end = downstreamFrom[boundaryHits])
    irl <- split(ir, seq_along(ir))
    names(irl) <- names(tx[boundaryHits])
    ranges(downTx[boundaryHits]) <- irl
  }

  return(assignFirstExonsStartSite(downTx, downstreamFrom, is.circular))
}


#' Get rest of objects upstream (exclusive)
#'
#' Per group get the part upstream of position
#' upstreamOfPerGroup(tx, startSites(cds, asGR = TRUE))
#' will return the 5' utrs per transcript, usually used for interesting
#' parts of the transcripts.
#'
#' @param tx a \code{\link{GRangesList}},
#'  usually of Transcripts to be changed
#' @param upstreamOf a vector of integers, for each group in tx, where
#'  is the the base after the new stop point of last valid exon.
#' @param allowOutside a logical (T), can upstreamOf extend outside
#'  range of tx, can set boundary as a false hit, so beware.
#' @inheritParams extendLeaders
#' @return a GRangesList of upstream part
#' @family GRanges
#'
upstreamOfPerGroup <- function(tx, upstreamOf, allowOutside = TRUE,
                               is.circular = all(isCircular(tx) %in% TRUE)) {
  posIndices <- strandBool(tx)
  posStarts <- start(tx[posIndices])
  negStarts <- end(tx[!posIndices])
  posGrlStarts <- upstreamOf[posIndices]
  negGrlStarts <- upstreamOf[!posIndices]
  pos <- posStarts < posGrlStarts
  neg <- negStarts > negGrlStarts
  posTx <- tx[posIndices]
  negTx <- tx[!posIndices]

  # Usually from pos/neg with possible cage extensions
  if (allowOutside) {
    outside <- which(sum(pos) == 0)
    pos[outside] = TRUE
    posTx[outside] <- firstExonPerGroup(posTx[outside])
    outside <- which(sum(neg) == 0)
    neg[outside] = TRUE
    negTx[outside] <- firstExonPerGroup(negTx[outside])
  }
  upTx <- tx
  upTx[posIndices] <- posTx[pos]
  upTx[!posIndices] <- negTx[neg]

  nonZero <- widthPerGroup(upTx) > 0
  if (all(!nonZero)) { # if no ranges exists
    return(upTx)
  }
  upstreamOf <- upstreamOf[nonZero]
  oldPosIndices <- posIndices
  posIndices <- posIndices[nonZero]

  stopSites <- stopSites(upTx[nonZero], FALSE, FALSE, TRUE)
  # check boundaries within group exons
  if (any(posIndices)){
    posChecks <- stopSites[posIndices] < upstreamOf[posIndices] &
      any(!pos[nonZero[oldPosIndices]])
  } else {
    posChecks <- FALSE
  }
  if(any(!posIndices)){
    negChecks <- stopSites[!posIndices] > upstreamOf[!posIndices] &
      any(!neg[nonZero[!oldPosIndices]])
  } else {
    negChecks <- FALSE
  }

  if (any(posChecks)) {
    upstreamOf[posIndices][posChecks] <- stopSites[posIndices][posChecks]
  }
  if (any(negChecks)) {
    upstreamOf[!posIndices][negChecks] <- stopSites[!posIndices][negChecks]
  }

  upTx[nonZero] <- assignLastExonsStopSite(upTx[nonZero], upstreamOf,
                                           is.circular = is.circular)
  return(upTx)
}

#' Get rest of objects upstream (inclusive)
#'
#' Per group get the part upstream of position.
#' upstreamFromPerGroup(tx, stopSites(fiveUTRs, asGR = TRUE))
#' will return the  5' utrs per transcript as GRangesList,
#' usually used for interesting
#' parts of the transcripts.
#'
#' If you don't want to include the points given in the region,
#' use \code{\link{upstreamOfPerGroup}}
#' @param tx a \code{\link{GRangesList}},
#'  usually of Transcripts to be changed
#' @param upstreamFrom a vector of integers, for each group in tx, where
#' is the new start point of first valid exon.
#' @return a GRangesList of upstream part
#' @family GRanges
#'
upstreamFromPerGroup <- function(tx, upstreamFrom) {
  posIndices <- strandBool(tx)
  posStarts <- start(tx[posIndices])
  negStarts <- end(tx[!posIndices])
  posGrlStarts <- upstreamFrom[posIndices]
  negGrlStarts <- upstreamFrom[!posIndices]
  pos <- posStarts <= posGrlStarts
  neg <- negStarts >= negGrlStarts
  upTx <- tx
  upTx[posIndices] <- upTx[posIndices][pos]
  upTx[!posIndices] <- upTx[!posIndices][neg]
  # check if any hits boundary, set those to boundary
  if (anyNA(strandPerGroup(upTx, FALSE))) {
    boundaryHits <- which(is.na(strandPerGroup(upTx, FALSE)))
    upTx[boundaryHits] <- firstExonPerGroup(tx[boundaryHits])
    ir <- IRanges(start = upstreamFrom[boundaryHits],
                  end = upstreamFrom[boundaryHits])
    irl <- split(ir, seq_along(ir))
    names(irl) <- names(tx[boundaryHits])
    ranges(upTx[boundaryHits]) <- irl
  }

  return(assignLastExonsStopSite(upTx, upstreamFrom))
}

Try the ORFik package in your browser

Any scripts or data that you put into this service are public.

ORFik documentation built on March 27, 2021, 6 p.m.