R/NucleotideOverlap.R

Defines functions NucleotideOverlap

Documented in NucleotideOverlap

# Author: Nicholas Cooley
# Maintainer: Nicholas Cooley
# Contact: npc19@pitt.edu

NucleotideOverlap <- function(SyntenyObject,
                              GeneCalls,
                              LimitIndex = FALSE,
                              OutputFormat = "Normal",
                              Verbose = FALSE) {
  ######
  # Error Checking
  # Require names for synteny object, DECIPHER be loaded, object sizes, some index checking
  # Parameters Default Filter, FilterOverlap, and Filter Coverage have been depreciated and
  # a filtering function is under construction
  ######
  
  L <- nrow(SyntenyObject)
  if (!("DECIPHER" %in% .packages())) {
    stop ("Required package DECIPHER is not loaded.")
  }
  if (!is(SyntenyObject, "Synteny")) {
    stop ("Object is not a synteny object.")
  }
  if (L != length(GeneCalls)) {
    stop ("Synteny Object and Gene Calls have differing number of genomes.")
  }
  if (is.null(names(GeneCalls))) {
    stop ("Gene Predictions must be named.")
  }
  if (is.null(rownames(SyntenyObject))) {
    stop ("Synteny Object must have named identifiers.")
  }
  if (any(names(GeneCalls) != rownames(SyntenyObject))) {
    stop ("Names between Synteny Object and Gene Predictions do not match.")
  }
  if (L <= 1L) {
    stop ("SyntenyObject is too small.")
  }
  if (!(OutputFormat %in% c("Sparse",
                            "Normal",
                            "Comprehensive"))) {
    stop("OutputFormat must be either 'Sparse', 'Normal', or 'Comprehensive'.")
  }
  IndexMatching <- vector("integer",
                          length = length(GeneCalls))
  for (i in seq_along(GeneCalls)) {
    IndexMatching[i] <- length(unique(GeneCalls[[i]]$Index))
  }
  SyntenyIndices <- unname(lengths(diag(SyntenyObject)))
  if (any(IndexMatching > SyntenyIndices)) {
    stop ("Indices do not match. Object orders may be not be matched.")
  }
  GCallClasses <- sapply(GeneCalls,
                         function(x) class(x),
                         USE.NAMES = FALSE,
                         simplify = TRUE)
  if (any(GCallClasses == "GRanges")) {
    LimitIndex <- TRUE
    warning("GRanges objects currently only support single contig inputs.")
  }

  if (Verbose) {
    TotalTimeStart <- Sys.time()
  }
  ResultMatrix <- matrix(data = list(),
                         nrow = L,
                         ncol = L)
  TotalLength <- L^2 - L
  TotalCounter <- 0L
  ######
  # scroll through every hit table in the synteny object
  ######
  
  ###### -- Deal with GRanges -------------------------------------------------
  
  FeatureRepresentations <- vector(mode = "list",
                                   length = L)
  
  for (m1 in seq_along(GeneCalls)) {
    if (is(GeneCalls[[m1]],
           "GRanges")) {
      
      if (length(levels(GeneCalls[[m1]]@seqnames)) > 1L) {
        
        warning(paste("GRange object ",
                      m1,
                      " contains more than 1 index and has been truncated.",
                      sep = ""))
        IndexPlaceHolder <- as.character(GeneCalls[[m1]]@seqnames)[1L]
        GeneCalls[[m1]] <- GeneCalls[[m1]][as.character(GeneCalls[[m1]]@seqnames == IndexPlaceHolder), ]
        
      }
      
      TypePlaceHolder <- as.character(GeneCalls[[m1]]$type)
      GeneCalls[[m1]] <- GeneCalls[[m1]][TypePlaceHolder %in% c("gene",
                                                                "pseudogene"), ]
      StrandConversion <- ifelse(test = as.character(GeneCalls[[m1]]@strand == "+"),
                                 yes = 0L,
                                 no = 1L)
      StartConversion <- GeneCalls[[m1]]@ranges@start
      StopConversion <- GeneCalls[[m1]]@ranges@start + GeneCalls[[m1]]@ranges@width - 1L
      IndexConversion <- rep(1L,
                             length(StartConversion))
      LengthsConversion <- StopConversion - StartConversion + 1L
      
      o <- order(StartConversion)
      StrandConversion <- StrandConversion[o]
      IndexConversion <- IndexConversion[o]
      StartConversion <- StartConversion[o]
      StopConversion <- StopConversion[o]
      LengthsConversion <- LengthsConversion[o]
      
      FeatureRepresentations[[m1]] <- matrix(data = c(IndexConversion,
                                                      StrandConversion,
                                                      StartConversion,
                                                      StopConversion,
                                                      LengthsConversion),
                                             nrow = length(o),
                                             ncol = 5L)
      
      if (any(StopConversion > SyntenyObject[[m1, m1]][1])) {
        FeatureRepresentations[[m1]] <- FeatureRepresentations[[m1]][-which(StopConversion > SyntenyObject[[m1, m1]][1]), ]
      }
      
      rm(list = c("IndexConversion",
                  "StrandConversion",
                  "StartConversion",
                  "StopConversion",
                  "LengthsConversion"))
      
    } else if (is(GeneCalls[[m1]],
                  "DFrame")) {
      
      o <- order(GeneCalls[[m1]][, "Index"], GeneCalls[[m1]][, "Start"])
      CurrentStarts <- as.integer(GeneCalls[[m1]]$Start)[o]
      CurrentStops <- as.integer(GeneCalls[[m1]]$Stop)[o]
      CurrentStrands <- as.integer(GeneCalls[[m1]]$Strand)[o]
      CurrentIndices <- as.integer(GeneCalls[[m1]]$Index)[o]
      CurrentLengths <- CurrentStops - CurrentStarts + 1L
      
      FeatureRepresentations[[m1]] <- matrix(data = c(CurrentIndices,
                                                      CurrentStrands,
                                                      CurrentStarts,
                                                      CurrentStops,
                                                      CurrentLengths),
                                             nrow = length(o),
                                             ncol = 5L)
      rm(list = c("CurrentIndices",
                  "CurrentStrands",
                  "CurrentStarts",
                  "CurrentStops",
                  "CurrentLengths"))
    } else if (is(GeneCalls[[m1]],
                  "Genes")) {
      # convert Erik's gene calls to a temporary DataFrame
      # the column "Gene" assigns whether or not that particular line is the gene
      # that the caller actually picked, calls must be subset to where Gene == 1
      ans <- GeneCalls[[m1]]
      R <- mapply(function(x, y) IRanges(start = x,
                                         end = y),
                  x = ans[, "Begin"],
                  y = ans[, "End"],
                  SIMPLIFY = FALSE)
      D <- DataFrame("Index" = as.integer(ans[, "Index"]),
                     "Strand" = as.integer(ans[, "Strand"]),
                     "Start" = as.integer(ans[, "Begin"]),
                     "Stop" = as.integer(ans[, "End"]),
                     "Type" = rep("gene",
                                  nrow(ans)),
                     "Range" = IRangesList(R),
                     "Coding" = rep(TRUE,
                                    nrow(ans)))
      D <- D[ans[, "Gene"] == 1L, ]
      rownames(D) <- NULL
      GeneCalls[[m1]] <- D
      rm(c("D", "ans", "R"))
    }
  }
  
  
  pBar <- txtProgressBar(style = 1L)
  for (m1 in seq_len(L - 1L)) {
    for (m2 in (m1 +1L):L) {
      ######
      # Collect Index Start Stop and Strand from current subject and query gene calls
      # Collect Index Start Stop Strand from hit table
      ######

      Q.Start <- FeatureRepresentations[[m1]][, 3L]
      Q.Stop <- FeatureRepresentations[[m1]][, 4L]
      Q.Index <- FeatureRepresentations[[m1]][, 1L]
      QG.Strand <- FeatureRepresentations[[m1]][, 2L]
      
      S.Start <- FeatureRepresentations[[m2]][, 3L]
      S.Stop <- FeatureRepresentations[[m2]][, 4L]
      S.Index <- FeatureRepresentations[[m2]][, 1L]
      SG.Strand <- FeatureRepresentations[[m2]][, 2L]
      
      CurrentHitTable <- SyntenyObject[m1, m2, drop = FALSE][[1]]
      if (LimitIndex) {
        ######
        # Select current hit table, subset to only single index
        ######
        CurrentHitTable <- CurrentHitTable[which(CurrentHitTable[, "index1"] == 1L), ]
        CurrentHitTable <- CurrentHitTable[which(CurrentHitTable[, "index2"] == 1L), ]
        CurrentHitTable <- CurrentHitTable[order(CurrentHitTable[,
                                                                 "start1",
                                                                 drop = FALSE]),
                                           ,
                                           drop = FALSE]
      } else {
        ######
        # If not limiting by index, order hit table by the index1 first, then the query starts
        ######
        CurrentHitTable <- CurrentHitTable[order(CurrentHitTable[,
                                                                 "index1",
                                                                 drop = FALSE],
                                                 CurrentHitTable[,
                                                                 "start1",
                                                                 drop = FALSE]),
                                           ,
                                           drop = FALSE]
      }
      ######
      # Collect the starts and stops for all hits correct for strandedness
      # Collect indices as well
      ######
      HitWidths <- CurrentHitTable[, "width"]
      Q.HitStarts <- CurrentHitTable[, "start1"]
      S.HitStarts <- CurrentHitTable[, "start2"]
      Q.HitEnds <- Q.HitStarts + HitWidths - 1L
      S.HitEnds <- S.HitStarts + HitWidths - 1L
      Strand <- CurrentHitTable[, "strand"]
      QHI <- CurrentHitTable[, "index1"]
      SHI <- CurrentHitTable[, "index2"]
      for (i in seq_along(HitWidths)) {
        if (Strand[i] == 0L) {
          S.HitStarts[i] <- CurrentHitTable[i, "start2"]
          S.HitEnds[i] <- CurrentHitTable[i, "start2"] + CurrentHitTable[i, "width"] - 1L
        } else if (Strand[i] == 1L) {
          S.HitStarts[i] <- CurrentHitTable[i, "start2"] - CurrentHitTable[i, "width"] + 1L
          S.HitEnds[i] <- CurrentHitTable[i, "start2"]
        }
      }
      ######
      # Record a query matrix for every hit that lands inside a gene in our query geneset/genome
      # contains:
      # the gene index and hit index in question
      # modifies the hit boundaries IN THE SUBJECT
      # records the distances between the hit boundaries and the gene boundaries
      # the strand for the gene in the query that is being recorded
      ######
      QueryMatrix <- matrix(NA_integer_,
                            nrow = 8L,
                            ncol = nrow(CurrentHitTable))
      HitCounter <- 1L
      AddCounter <- 1L
      DimLimit <- nrow(CurrentHitTable) / 2
      DimAdjust <- 2L
      for (z1 in seq_along(Q.Start)) {
        ######
        # loop through the starts of the query genes as they have been ordered
        ######
        CurrentGene <- HitCounter
        Q.NucOverLapL <- NA_integer_
        Q.NucOverLapR <- NA_integer_
        S.NucPositionL <- NA_integer_
        S.NucPositionR <- NA_integer_
        S.Strand <- NA_integer_
        while (HitCounter <= length(HitWidths)) {
          ######
          # interrogation loop, depending upon where the current hit is
          # and where the current gene is
          # either go to the next hit (HitCounter + 1L)
          # or
          # go to the next gene (break)
          # AddCounter tells the loop which row to add information to in
          # the initialized QueryMatrix, when there is information to add
          ######
          if (Q.HitEnds[HitCounter] < Q.Start[z1] &
              QHI[HitCounter] == Q.Index[z1]) {
            # Hit ends before current query gene begins
            # AND the indices are the same
            # record the hit counter every time you iterate to the next hit
            HitCounter <- HitCounter + 1L
          } else if (Q.HitStarts[HitCounter] < Q.Start[z1] &
                     Q.HitEnds[HitCounter] >= Q.Start[z1] &
                     Q.HitEnds[HitCounter] <= Q.Stop[z1] &
                     QHI[HitCounter] == Q.Index[z1]) {
            # Hit overlaps left bound of current query gene
            # Stay on the current gene
            # And go to the next hit
            CurrentGene <- z1
            Q.NucOverLapL <- Q.Start[z1]
            Q.NucOverLapR <- Q.HitEnds[HitCounter]
            S.Strand <- Strand[HitCounter]
            NewWidth <- Q.NucOverLapR - Q.NucOverLapL + 1L
            Current.QHI <- QHI[HitCounter]
            Current.SHI <- SHI[HitCounter]
            ######
            # Trim hit in subject, overlaps that are in the reverse strand require different trims
            ######
            if (S.Strand == 0L) {
              S.NucPositionR <- S.HitEnds[HitCounter]
              S.NucPositionL <- S.HitEnds[HitCounter] - NewWidth + 1L
            } else {
              S.NucPositionL <- S.HitStarts[HitCounter]
              S.NucPositionR <- S.HitStarts[HitCounter] + NewWidth - 1L
            }
            
            # Add To Vector!
            QueryMatrix[, AddCounter] <- c(CurrentGene,
                                           Q.NucOverLapL,
                                           Q.NucOverLapR,
                                           S.NucPositionL,
                                           S.NucPositionR,
                                           S.Strand,
                                           Current.QHI,
                                           Current.SHI)
            if (AddCounter >= DimLimit) {
              QueryMatrix <- cbind(QueryMatrix,
                                   matrix(data = NA_integer_,
                                          nrow = nrow(QueryMatrix),
                                          ncol = ncol(QueryMatrix) * DimAdjust))
              DimLimit <- ncol(QueryMatrix) / 2
              DimAdjust <- DimAdjust * 2L
            }
            # record the add counter every time a new row is added to the query matrix
            AddCounter <- AddCounter + 1L
            # record the hit counter every time you iterate to the next hit
            HitCounter <- HitCounter + 1L
          } else if (Q.HitStarts[HitCounter] >= Q.Start[z1] &
                     Q.HitEnds[HitCounter] <= Q.Stop[z1] &
                     QHI[HitCounter] == Q.Index[z1]) {
            # Hit occurs entirely within current query gene
            # Stay on the current gene
            # And go to the next hit
            CurrentGene <- z1
            Q.NucOverLapL <- Q.HitStarts[HitCounter]
            Q.NucOverLapR <- Q.HitEnds[HitCounter]
            S.Strand <- Strand[HitCounter]
            S.NucPositionL <- S.HitStarts[HitCounter]
            S.NucPositionR <- S.HitEnds[HitCounter]
            Current.QHI <- QHI[HitCounter]
            Current.SHI <- SHI[HitCounter]
            # Add To Vector!
            QueryMatrix[, AddCounter] <- c(CurrentGene,
                                           Q.NucOverLapL,
                                           Q.NucOverLapR,
                                           S.NucPositionL,
                                           S.NucPositionR,
                                           S.Strand,
                                           Current.QHI,
                                           Current.SHI)
            if (AddCounter >= DimLimit) {
              QueryMatrix <- cbind(QueryMatrix,
                                   matrix(data = NA_integer_,
                                          nrow = nrow(QueryMatrix),
                                          ncol = ncol(QueryMatrix) * DimAdjust))
              DimLimit <- ncol(QueryMatrix) / 2
              DimAdjust <- DimAdjust * 2L
            }
            # record the add counter every time a new row is added to the query matrix
            AddCounter <- AddCounter + 1L
            # record the hit counter every time you iterate to the next hit
            HitCounter <- HitCounter + 1L
          } else if (Q.HitStarts[HitCounter] >= Q.Start[z1] &
                     Q.HitStarts[HitCounter] <= Q.Stop[z1] &
                     Q.HitEnds[HitCounter] > Q.Stop[z1] &
                     QHI[HitCounter] == Q.Index[z1]) {
            # Hit overlaps right bound of current query gene
            # Stay on the current hit and go to the next gene
            CurrentGene <- z1
            Q.NucOverLapL <- Q.HitStarts[HitCounter]
            Q.NucOverLapR <- Q.Stop[z1]
            S.Strand <- Strand[HitCounter]
            NewWidth <- Q.NucOverLapR - Q.NucOverLapL + 1L
            Current.QHI <- QHI[HitCounter]
            Current.SHI <- SHI[HitCounter]
            if (S.Strand == 0L) {
              S.NucPositionL <- S.HitStarts[HitCounter]
              S.NucPositionR <- S.HitStarts[HitCounter] + NewWidth - 1L
            } else {
              S.NucPositionR <- S.HitEnds[HitCounter]
              S.NucPositionL <- S.HitEnds[HitCounter] - NewWidth + 1L
            }
            # Add To Vector!
            QueryMatrix[, AddCounter] <- c(CurrentGene,
                                           Q.NucOverLapL,
                                           Q.NucOverLapR,
                                           S.NucPositionL,
                                           S.NucPositionR,
                                           S.Strand,
                                           Current.QHI,
                                           Current.SHI)
            if (AddCounter >= DimLimit) {
              QueryMatrix <- cbind(QueryMatrix,
                                   matrix(data = NA_integer_,
                                          nrow = nrow(QueryMatrix),
                                          ncol = ncol(QueryMatrix) * DimAdjust))
              DimLimit <- ncol(QueryMatrix) / 2
              DimAdjust <- DimAdjust * 2L
            }
            # record the add counter every time a new row is added to the query matrix
            AddCounter <- AddCounter + 1L
            break
          } else if (Q.HitStarts[HitCounter] < Q.Start[z1] &
                     Q.HitEnds[HitCounter] > Q.Stop[z1] &
                     QHI[HitCounter] == Q.Index[z1]) {
            # Hit eclipses current query gene
            # Stay on the current hit, and go to the next gene
            CurrentGene <- z1
            Q.NucOverLapL <- Q.Start[z1]
            Q.NucOverLapR <- Q.Stop[z1]
            TrimLeft <- Q.Start[z1] - Q.HitStarts[HitCounter]
            TrimRight <- Q.HitEnds[HitCounter] - Q.Stop[z1]
            S.Strand <- Strand[HitCounter]
            NewWidth <- Q.NucOverLapR - Q.NucOverLapL + 1L
            Current.QHI <- QHI[HitCounter]
            Current.SHI <- SHI[HitCounter]
            if (S.Strand == 0L) {
              S.NucPositionL <- S.HitStarts[HitCounter] + TrimLeft
              S.NucPositionR <- S.NucPositionL + NewWidth - 1L
            } else {
              S.NucPositionR <- S.HitEnds[HitCounter] - TrimLeft
              S.NucPositionL <- S.NucPositionR - NewWidth + 1L
            }
            # Add To Vector!
            QueryMatrix[, AddCounter] <- c(CurrentGene,
                                           Q.NucOverLapL,
                                           Q.NucOverLapR,
                                           S.NucPositionL,
                                           S.NucPositionR,
                                           S.Strand,
                                           Current.QHI,
                                           Current.SHI)
            if (AddCounter >= DimLimit) {
              QueryMatrix <- cbind(QueryMatrix,
                                   matrix(data = NA_integer_,
                                          nrow = nrow(QueryMatrix),
                                          ncol = ncol(QueryMatrix) * DimAdjust))
              DimLimit <- ncol(QueryMatrix) / 2
              DimAdjust <- DimAdjust * 2L
            }
            # record the add counter every time a new row is added to the query matrix
            AddCounter <- AddCounter + 1L
            break
          } else if (Q.HitStarts[HitCounter] > Q.Stop[z1] &
                     QHI[HitCounter] == Q.Index[z1]) {
            # Hit occurs after current query gene
            # And the indices are matched
            # Go to next gene
            break
          } else if (QHI[HitCounter] > Q.Index[z1]) {
            # If the index of the hits in the query has outpaced the index of the genes
            break
          } else if (QHI[HitCounter] < Q.Index[z1]) {
            # If the index of the genes has outpaced the index of the hits
            # record the hit counter every time you iterate to the next hit
            HitCounter <- HitCounter + 1L
          } # end of else if conditionals
        } # end while loop through hits
      } # end for loop through genes
      if (Verbose) {
        TotalCounter <- TotalCounter + 1L
        setTxtProgressBar(pb = pBar,
                          value = TotalCounter/TotalLength)
      }
      QueryMatrix <- t(QueryMatrix)
      colnames(QueryMatrix) <- c("CurrentGene", # The gene that was recorded with a hit present
                                 "QueryNucleotideOverLapLeft", # leftmost nucleotide position of a hit that is within CurrentGene
                                 "QueryNucleotideOverLapRight", # rightmost nucleotide position of a hit that is within CurrentGene
                                 "SubjectNucleotidePositionLeft", # corresponding leftmost position in the subject
                                 "SubjectNucleotidePositionRight", # corresponding rightmost position in the subject
                                 "SubjectStrand", # strand of the hit in the subject
                                 "QueryIndex", # index that was matched between the hit and the gene in the query
                                 "SubjectIndex" # index (chromosome, plasmid, whatever) of the hit in the subject
                                 )
      ######
      # Remove unfilled extra rows
      ######
      QueryMatrix <- QueryMatrix[apply(QueryMatrix,
                                       1L,
                                       function(x) !all(is.na(x))),
                                 ,
                                 drop = FALSE]
      if (dim(QueryMatrix)[1] == 0L) {
        OutPutMatrix <- matrix(NA_integer_,
                               nrow = 1L,
                               ncol = 7L)
        OverLapMatrix <- matrix(NA_integer_,
                                nrow = 1L,
                                ncol = 7L)
      } else {
        HitCounter <- 1L
        AddCounter <- 1L
        OverLapMatrix <- matrix(NA_integer_,
                                ncol = nrow(QueryMatrix),
                                nrow = 9L)
        QueryMatrix <- QueryMatrix[order(QueryMatrix[,
                                                     "SubjectIndex",
                                                     drop = FALSE],
                                         QueryMatrix[,
                                                     "SubjectNucleotidePositionLeft",
                                                     drop = FALSE]),
                                   ,
                                   drop = FALSE]
        DimAdjust <- 2L
        DimLimit <- ncol(OverLapMatrix) / 2
        ######
        # Part 2!
        # Hits that were recorded as being within a gene in the query
        # are now tested again the genes in the subject
        ######
        QueryMap <- QueryMatrix[, "CurrentGene"]
        S.HitStarts <- QueryMatrix[, "SubjectNucleotidePositionLeft"]
        S.HitEnds <- QueryMatrix[, "SubjectNucleotidePositionRight"]
        Q.HitStarts <- QueryMatrix[, "QueryNucleotideOverLapLeft"]
        Q.HitEnds <- QueryMatrix[, "QueryNucleotideOverLapRight"]
        QueryIndices <- QueryMatrix[, "QueryIndex"]
        SubjectHitIndex <- QueryMatrix[, "SubjectIndex"]
        for (z2 in seq_along(S.Start)) {
          ######
          # Loop through the subject genes
          ######
          while (HitCounter <= length(QueryMap)) {
            if (S.HitEnds[HitCounter] < S.Start[z2] &
                SubjectHitIndex[HitCounter] == S.Index[z2]) {
              # Hit ends before current subject gene begins
              HitCounter <- HitCounter + 1L
            } else if (S.HitStarts[HitCounter] < S.Start[z2] &
                       S.HitEnds[HitCounter] >= S.Start[z2] &
                       S.HitEnds[HitCounter] <= S.Stop[z2] &
                       SubjectHitIndex[HitCounter] == S.Index[z2]) {
              # Hit overlaps left bound of current subject gene
              CurrentGene <- z2
              QueryGenePosition <- QueryMap[HitCounter]
              CurrentQueryIndex <- QueryIndices[HitCounter]
              CurrentSubjectIndex <- SubjectHitIndex[HitCounter]
              TrimLeft <- S.Start[z2] - S.HitStarts[HitCounter]
              ExactOverLap <- S.HitEnds[HitCounter] - S.Start[z2] + 1L
              SubjectHitLeft <- S.HitStarts[HitCounter] + TrimLeft
              SubjectHitRight <- S.HitEnds[HitCounter]
              QueryHitLeft <- Q.HitStarts[HitCounter] + TrimLeft
              QueryHitRight <- Q.HitEnds[HitCounter]
              # Add to vector !
              OverLapMatrix[, AddCounter] <- c(QueryGenePosition,
                                               CurrentGene,
                                               ExactOverLap,
                                               CurrentQueryIndex,
                                               CurrentSubjectIndex,
                                               QueryHitLeft,
                                               QueryHitRight,
                                               SubjectHitLeft,
                                               SubjectHitRight)
              if (AddCounter >= DimLimit) {
                OverLapMatrix <- cbind(OverLapMatrix,
                                       matrix(data = NA_integer_,
                                              nrow = nrow(OverLapMatrix),
                                              ncol = ncol(OverLapMatrix) * DimAdjust))
                DimLimit <- ncol(OverLapMatrix) / 2
                DimAdjust <- DimAdjust * 2L
              }
              AddCounter <- AddCounter + 1L
              HitCounter <- HitCounter + 1L
            } else if (S.HitStarts[HitCounter] >= S.Start[z2] &
                       S.HitEnds[HitCounter] <= S.Stop[z2] &
                       SubjectHitIndex[HitCounter] == S.Index[z2]) {
              # Hit occurs entirely within current subject gene
              CurrentGene <- z2
              QueryGenePosition <- QueryMap[HitCounter]
              ExactOverLap <- S.HitEnds[HitCounter] - S.HitStarts[HitCounter] + 1L
              CurrentQueryIndex <- QueryIndices[HitCounter]
              CurrentSubjectIndex <- SubjectHitIndex[HitCounter]
              SubjectHitLeft <- S.HitStarts[HitCounter]
              SubjectHitRight <- S.HitEnds[HitCounter]
              QueryHitLeft <- Q.HitStarts[HitCounter]
              QueryHitRight <- Q.HitEnds[HitCounter]
              # Add to vector !
              OverLapMatrix[, AddCounter] <- c(QueryGenePosition,
                                               CurrentGene,
                                               ExactOverLap,
                                               CurrentQueryIndex,
                                               CurrentSubjectIndex,
                                               QueryHitLeft,
                                               QueryHitRight,
                                               SubjectHitLeft,
                                               SubjectHitRight)
              if (AddCounter >= DimLimit) {
                OverLapMatrix <- cbind(OverLapMatrix,
                                       matrix(data = NA_integer_,
                                              nrow = nrow(OverLapMatrix),
                                              ncol = ncol(OverLapMatrix) * DimAdjust))
                DimLimit <- ncol(OverLapMatrix) / 2
                DimAdjust <- DimAdjust * 2L
              }
              AddCounter <- AddCounter + 1L
              HitCounter <- HitCounter + 1L
            } else if (S.HitStarts[HitCounter] >= S.Start[z2] &
                       S.HitStarts[HitCounter] <= S.Stop[z2] &
                       S.HitEnds[HitCounter] > S.Stop[z2] &
                       SubjectHitIndex[HitCounter] == S.Index[z2]) {
              # Hit overlaps right bound of current subject gene
              # Stay on the current hit and go to the next gene
              CurrentGene <- z2
              QueryGenePosition <- QueryMap[HitCounter]
              ExactOverLap <- S.Stop[z2] - S.HitStarts[HitCounter] + 1L
              CurrentQueryIndex <- QueryIndices[HitCounter]
              CurrentSubjectIndex <- SubjectHitIndex[HitCounter]
              TrimRight <- S.HitEnds[HitCounter] - S.Stop[z2]
              SubjectHitLeft <- S.HitStarts[HitCounter]
              SubjectHitRight <- S.HitEnds[HitCounter] - TrimRight
              QueryHitLeft <- Q.HitStarts[HitCounter]
              QueryHitRight <- Q.HitEnds[HitCounter] - TrimRight
              # Add to vector !
              OverLapMatrix[, AddCounter] <- c(QueryGenePosition,
                                               CurrentGene,
                                               ExactOverLap,
                                               CurrentQueryIndex,
                                               CurrentSubjectIndex,
                                               QueryHitLeft,
                                               QueryHitRight,
                                               SubjectHitLeft,
                                               SubjectHitRight)
              if (AddCounter >= DimLimit) {
                OverLapMatrix <- cbind(OverLapMatrix,
                                       matrix(data = NA_integer_,
                                              nrow = nrow(OverLapMatrix),
                                              ncol = ncol(OverLapMatrix) * DimAdjust))
                DimLimit <- ncol(OverLapMatrix) / 2
                DimAdjust <- DimAdjust * 2L
              }
              AddCounter <- AddCounter + 1L
              break
            } else if (S.HitStarts[HitCounter] <= S.Start[z2] &
                       S.HitEnds[HitCounter] >= S.Stop[z2] &
                       SubjectHitIndex[HitCounter] == S.Index[z2]) {
              # Hit eclipses current subject gene
              CurrentGene <- z2
              QueryGenePosition <- QueryMap[HitCounter]
              ExactOverLap <- S.Stop[z2] - S.Start[z2] + 1L
              CurrentQueryIndex <- QueryIndices[HitCounter]
              CurrentSubjectIndex <- SubjectHitIndex[HitCounter]
              TrimLeft <- S.Start[z2] - S.HitStarts[HitCounter]
              TrimRight <- S.HitEnds[HitCounter] - S.Stop[z2]
              SubjectHitLeft <- S.HitStarts[HitCounter] + TrimLeft
              SubjectHitRight <- S.HitEnds[HitCounter] - TrimRight
              QueryHitLeft <- Q.HitStarts[HitCounter] + TrimLeft
              QueryHitRight <- Q.HitEnds[HitCounter] - TrimRight
              # Add to vector !
              OverLapMatrix[, AddCounter] <- c(QueryGenePosition,
                                               CurrentGene,
                                               ExactOverLap,
                                               CurrentQueryIndex,
                                               CurrentSubjectIndex,
                                               QueryHitLeft,
                                               QueryHitRight,
                                               SubjectHitLeft,
                                               SubjectHitRight)
              if (AddCounter >= DimLimit) {
                OverLapMatrix <- cbind(OverLapMatrix,
                                       matrix(data = NA_integer_,
                                              nrow = nrow(OverLapMatrix),
                                              ncol = ncol(OverLapMatrix) * DimAdjust))
                DimLimit <- ncol(OverLapMatrix) / 2
                DimAdjust <- DimAdjust * 2L
              }
              AddCounter <- AddCounter + 1L
              break
            } else if (S.HitStarts[HitCounter] > S.Stop[z2]) {
              # Hit occurs after current subject gene
              break
            } else if (SubjectHitIndex[HitCounter] < S.Index[z2]) {
              # The indices do not match the only direction to go is forward
              HitCounter <- HitCounter + 1L
            } else if (SubjectHitIndex[HitCounter] > S.Index[z2]) {
              # if you have outpaced the gene index with the hits go to the next gene
              break
            }
          }
        }
      }
      OverLapMatrix <- t(OverLapMatrix)
      ######
      # Remove empty extra rows
      ######
      OverLapMatrix <- OverLapMatrix[apply(OverLapMatrix,
                                           1L,
                                           function(x) !all(is.na(x))),
                                     ,
                                     drop = FALSE]
      OverLapMatrix <- OverLapMatrix[order(OverLapMatrix[, 1L, drop = FALSE],
                                           OverLapMatrix[, 2L, drop = FALSE],
                                           OverLapMatrix[, 4L, drop = FALSE],
                                           OverLapMatrix[, 5L, drop = FALSE]),
                                     ,
                                     drop = FALSE]
      ######
      # If the overlap matrix is empty, assign an empty single row matrix
      # If it is not, sum up the overlap in nucleotide space and condense to a single row
      # Select the smallest distances for all four displacements
      ######
      if (dim(OverLapMatrix)[1] == 0L) {
        OverLapMatrix <- matrix(NA_integer_,
                                nrow = 1L,
                                ncol = 9L)
        OutPutMatrix <- matrix(NA_integer_,
                               nrow = 1L,
                               ncol = 9L)
      } else if (dim(OverLapMatrix)[1] >= 1L) {
        OutPutMatrix <- matrix(NA_integer_,
                               ncol = 9L,
                               nrow = nrow(OverLapMatrix))
        ######
        # All recordings so far are by hit
        # condense hits as appropriate, by the genes that they link
        ######
        RowCount <- 1L
        CondenseCount <- 1L
        Row <- 2L
        while (CondenseCount <= nrow(OverLapMatrix)) {
          while (Row <= nrow(OverLapMatrix)) {
            if (OverLapMatrix[Row, 5L] != OverLapMatrix[CondenseCount, 5L]) {
              break
            }
            if (OverLapMatrix[Row, 4L] != OverLapMatrix[CondenseCount, 4L]) {
              break
            }
            if (OverLapMatrix[Row, 2L] != OverLapMatrix[CondenseCount, 2L]) {
              break
            }
            if (OverLapMatrix[Row, 1L] != OverLapMatrix[CondenseCount, 1L]) {
              break
            }
            Row <- Row + 1L
          }
          
          z5 <- CondenseCount:(Row - 1L)
          OutPutMatrix[RowCount, ] <- c(OverLapMatrix[CondenseCount, 1L],
                                        OverLapMatrix[CondenseCount, 2L],
                                        sum(OverLapMatrix[z5, 3L]),
                                        OverLapMatrix[CondenseCount, 4L],
                                        OverLapMatrix[CondenseCount, 5L],
                                        min(OverLapMatrix[z5, 6L]),
                                        max(OverLapMatrix[z5, 7L]),
                                        min(OverLapMatrix[z5, 8L]),
                                        max(OverLapMatrix[z5, 9L]))
          RowCount <- RowCount + 1L
          CondenseCount <- CondenseCount + length(z5)
        }
        OutPutMatrix <- OutPutMatrix[apply(OutPutMatrix,
                                           1L,
                                           function(x) !all(is.na(x))),
                                     ,
                                     drop = FALSE]
      }
      OutPutMatrix <- OutPutMatrix[order(OutPutMatrix[,
                                                      1L,
                                                      drop = FALSE]),
                                   ,
                                   drop = FALSE]
      
      colnames(OutPutMatrix) <- c("QueryGene",
                                  "SubjectGene",
                                  "ExactOverlap",
                                  "QueryIndex",
                                  "SubjectIndex",
                                  "QLeftPos",
                                  "QRightPos",
                                  "SLeftPos",
                                  "SRightPos")
      QueryStartDisplacement <- ifelse(test = QG.Strand[OutPutMatrix[, "QueryGene"]] == 1L,
                                       yes = abs(OutPutMatrix[, 7L] - Q.Stop[OutPutMatrix[, 1L]]),
                                       no = abs(OutPutMatrix[, 6L] - Q.Start[OutPutMatrix[, 1L]]))
      QueryStopDisplacement <- ifelse(test = QG.Strand[OutPutMatrix[, "QueryGene"]] == 1L,
                                      yes = abs(OutPutMatrix[, 6L] - Q.Start[OutPutMatrix[, 1L]]),
                                      no = abs(OutPutMatrix[, 7L] - Q.Stop[OutPutMatrix[, 1L]]))
      SubjectStartDisplacement <- ifelse(test = SG.Strand[OutPutMatrix[, "SubjectGene"]] == 1L,
                                         yes = abs(OutPutMatrix[, 9L] - S.Stop[OutPutMatrix[, 2L]]),
                                         no = abs(OutPutMatrix[, 8L] - S.Start[OutPutMatrix[, 2L]]))
      SubjectStopDisplacement <- ifelse(test = SG.Strand[OutPutMatrix[, "SubjectGene"]] == 1L,
                                        yes = abs(OutPutMatrix[, 8L] - S.Start[OutPutMatrix[, 2L]]),
                                        no = abs(OutPutMatrix[, 9L] - S.Stop[OutPutMatrix[, 2L]]))
      DisplacementMatrix <- cbind(QueryStartDisplacement,
                                  QueryStopDisplacement,
                                  SubjectStartDisplacement,
                                  SubjectStopDisplacement)
      
      if (Verbose) {
        TotalCounter <- TotalCounter + 1L
        setTxtProgressBar(pb = pBar,
                          value = TotalCounter/TotalLength)
      }
      if (OutputFormat == "Normal") {
        ResultMatrix[m1, m2] <- list(OutPutMatrix)
        ResultMatrix[m2, m1] <- list(DisplacementMatrix)
      } else if (OutputFormat == "Sparse") {
        ResultMatrix[m1, m2] <- list(cbind(OutPutMatrix[, "QueryGene"],
                                           OutPutMatrix[, "SubjectGene"],
                                           OutPutMatrix[, "ExactOverlap"],
                                           QueryStartDisplacement,
                                           QueryStopDisplacement,
                                           SubjectStartDisplacement,
                                           SubjectStopDisplacement))
        colnames(ResultMatrix[m1, m2][[1]])[1:3] <- c("QueryGene",
                                                      "SubjectGene",
                                                      "ExactOverlap")
      } else if (OutputFormat == "Comprehensive") {
        ResultMatrix[m1, m2] <- list(OutPutMatrix)
        ResultMatrix[m2, m1] <- list(DisplacementMatrix)
        diag(ResultMatrix) <- GeneCalls
      }
    } # end of columns loop
  } # end of rows loop
  if (Verbose) {
    TotalTimeStop <- Sys.time()
    
    cat("\n")
    print(TotalTimeStop - TotalTimeStart)
  }
  dimnames(ResultMatrix) <- dimnames(SyntenyObject)
  class(ResultMatrix) <- "LinkedPairs"
  return(ResultMatrix)
}

Try the SynExtend package in your browser

Any scripts or data that you put into this service are public.

SynExtend documentation built on Nov. 8, 2020, 7:50 p.m.