# Author: Nicholas Cooley
# Maintainer: Nicholas Cooley
# Contact: npc19@pitt.edu
NucleotideOverlap <- function(SyntenyObject,
GeneCalls,
LimitIndex = FALSE,
AcceptContigNames = TRUE,
Verbose = FALSE) {
######
# Error Checking
# Require names for synteny object, DECIPHER be loaded, object sizes, some index checking
# Parameters Default Filter, FilterOverlap, and Filter Coverage have been depreciated and
# a filtering function is under construction
######
L <- nrow(SyntenyObject)
if (!("DECIPHER" %in% .packages())) {
stop ("Required package DECIPHER is not loaded.")
}
if (!is(SyntenyObject, "Synteny")) {
stop ("Object is not a synteny object.")
}
# this needs to be dropped if i'm going to work with self-synteny objects
# if (L != length(GeneCalls)) {
# stop ("Synteny Object and Gene Calls have differing number of genomes.")
# }
if (is.null(names(GeneCalls))) {
stop ("Gene Predictions must be named.")
}
if (is.null(rownames(SyntenyObject))) {
stop ("Synteny Object must have named identifiers.")
}
if (any(names(GeneCalls) != rownames(SyntenyObject))) {
stop ("Names between Synteny Object and Gene Predictions do not match.")
}
if (!all(rownames(SyntenyObject) %in% names(GeneCalls))) {
stop ("All distinct identifiers in the Synteny object require matching gene calls.")
}
if (L <= 1L) {
stop ("SyntenyObject is too small.")
}
if (Verbose) {
TotalTimeStart <- Sys.time()
pBar <- txtProgressBar(style = 1L)
}
ResultMatrix <- matrix(data = list(),
nrow = L,
ncol = L)
TotalLength <- L^2 - L
TotalCounter <- 0L
######
# scroll through every hit table in the synteny object
######
###### -- Deal with Different GeneCall types --------------------------------
SynNames <- unname(sapply(diag(SyntenyObject),
function(x) gsub(x = names(x),
pattern = " .*",
replacement = ""),
simplify = FALSE,
USE.NAMES = FALSE))
# this can no longer be length L and needs to just be the length of GeneCalls
FeatureRepresentations <- ContigNames <- vector(mode = "list",
length = length(GeneCalls))
names(FeatureRepresentations) <- names(GeneCalls)
GCallClasses <- sapply(GeneCalls,
function(x) class(x),
USE.NAMES = FALSE,
simplify = TRUE)
if (any(GCallClasses == "GRanges")) {
warning("GRanges support is tenuous, exercise caution.")
}
IndexMatching <- vector("integer",
length = length(GeneCalls))
if (Verbose) {
cat("\nReconciling genecalls.\n")
}
for (m1 in seq_along(GeneCalls)) {
if (!is(GeneCalls[[m1]],
"GRanges")) {
IndexMatching[m1] <- length(unique(GeneCalls[[m1]][, "Index"]))
} else {
IndexMatching[m1] <- 1L
}
}
SyntenyIndices <- unname(lengths(diag(SyntenyObject)))
if (any(IndexMatching > SyntenyIndices)) {
warning ("Indices do not match. Setting LimitIndex to TRUE.")
LimitIndex <- TRUE
}
for (m1 in seq_along(GeneCalls)) {
if (is(GeneCalls[[m1]],
"GRanges")) {
# if GRanges, force contig name matching
# stop if a contig name exists in the synteny object that
# does not exist in the GRanges object
ContigNames[[m1]] <- seq(length(GeneCalls[[m1]]@seqnames@lengths))
names(ContigNames[[m1]]) <- unique(GeneCalls[[m1]]@seqnames@values)[ContigNames[[m1]]]
if (any(is.na(match(x = names(ContigNames[[m1]]),
table = SynNames[[m1]])))) {
stop ("Contig names imply inorrectly matched objects at diag position ",
names(GeneCalls)[m1])
}
TypePlaceHolder <- as.character(GeneCalls[[m1]]$type)
GeneCalls[[m1]] <- GeneCalls[[m1]][TypePlaceHolder %in% c("gene",
"pseudogene"), ]
StrandConversion <- ifelse(test = as.character(GeneCalls[[m1]]@strand == "+"),
yes = 0L,
no = 1L)
StartConversion <- GeneCalls[[m1]]@ranges@start
StopConversion <- GeneCalls[[m1]]@ranges@start + GeneCalls[[m1]]@ranges@width - 1L
# assign integer positions to contings
IndexConversion <- rep(unname(ContigNames[[m1]]),
GeneCalls[[m1]]@seqnames@lengths)
LengthsConversion <- StopConversion - StartConversion + 1L
# reorder and reindex lines
C.Index <- IndexConversion
ph1 <- unname(ContigNames[[m1]])
ph2 <- match(x = names(ContigNames[[m1]]),
table = SynNames[[m1]])
ph3 <- vector(mode = "list",
length = length(ph1))
for (m3 in seq_along(ph1)) {
ph3[[m3]] <- which(C.Index == ph1[m3])
}
for (m3 in seq_along(ph1)) {
if (length(ph3) > 0L) {
C.Index <- replace(x = C.Index,
list = ph3[[m3]],
values = ph2[m3])
}
}
rm(list = c("ph1",
"ph2",
"ph3"))
o <- order(C.Index,
StartConversion)
StrandConversion <- StrandConversion[o]
IndexConversion <- C.Index[o]
StartConversion <- StartConversion[o]
StopConversion <- StopConversion[o]
LengthsConversion <- LengthsConversion[o]
# FeatureRepresentations[[m1]] <- matrix(data = c(IndexConversion,
# StrandConversion,
# StartConversion,
# StopConversion,
# LengthsConversion),
# nrow = length(o),
# ncol = 5L)
StrandMax <- rep(unname(SyntenyObject[[m1, m1]]),
table(IndexConversion))
R <- mapply(function(x, y) IRanges(start = x,
end = y),
x = StartConversion,
y = StopConversion,
SIMPLIFY = FALSE)
FeatureRepresentations[[m1]] <- DataFrame("Index" = IndexConversion,
"Strand" = StrandConversion,
"Start" = StartConversion,
"Stop" = StopConversion,
# "Lengths" = LengthsConversion,
"Type" = rep("gene",
length(o)),
"Translation_Table" = rep(NA_character_,
length(o)),
"Coding" = rep(FALSE,
length(o)),
"Range" = IRangesList(R))
FeatureRepresentations[[m1]] <- FeatureRepresentations[[m1]][StopConversion <= StrandMax, ]
rm(list = c("IndexConversion",
"StrandConversion",
"StartConversion",
"StopConversion",
"LengthsConversion"))
ph <- seq(length(ContigNames[[m1]]))
# ph <- unname(SyntenyObject[[m1, m1]])
names(ph) <- SynNames[[m1]]
ContigNames[[m1]] <- ph
} else if (is(GeneCalls[[m1]],
"DFrame")) {
ph <- unique(GeneCalls[[m1]][, c("Index", "Contig")])
ContigNames[[m1]] <- ph$Index
names(ContigNames[[m1]]) <- ph$Contig
rm(list = c("ph"))
CurrentIndices <- as.integer(GeneCalls[[m1]]$Index)
CurrentStarts <- as.integer(GeneCalls[[m1]]$Start)
if (AcceptContigNames) {
C.Index <- CurrentIndices
ph1 <- unname(ContigNames[[m1]])
ph2 <- match(x = names(ContigNames[[m1]]),
table = SynNames[[m1]])
ph3 <- vector(mode = "list",
length = length(ph1))
for (m3 in seq_along(ph1)) {
ph3[[m3]] <- which(C.Index == ph1[m3])
}
for (m3 in seq_along(ph1)) {
if (length(ph3) > 0L) {
C.Index <- replace(x = C.Index,
list = ph3[[m3]],
values = ph2[m3])
}
}
rm(list = c("ph1",
"ph2",
"ph3"))
o <- order(C.Index,
CurrentStarts)
} else {
o <- order(CurrentIndices,
CurrentStarts)
}
# CurrentStarts <- CurrentStarts[o]
# CurrentStops <- as.integer(GeneCalls[[m1]]$Stop)[o]
# CurrentStrands <- as.integer(GeneCalls[[m1]]$Strand)[o]
if (AcceptContigNames) {
# CurrentIndices <- C.Index[o]
GeneCalls[[m1]][, "Index"] <- C.Index
} else {
# CurrentIndices <- CurrentIndices[o]
GeneCalls[[m1]][, "Index"] <- CurrentIndices
}
# CurrentLengths <- CurrentStops - CurrentStarts + 1L
# FeatureRepresentations[[m1]] <- matrix(data = c(CurrentIndices,
# CurrentStrands,
# CurrentStarts,
# CurrentStops,
# CurrentLengths),
# nrow = length(o),
# ncol = 5L)
FeatureRepresentations[[m1]] <- GeneCalls[[m1]][o, ]
# rm(list = c("CurrentIndices",
# "CurrentStrands",
# "CurrentStarts",
# "CurrentStops",
# "CurrentLengths"))
} else if (is(GeneCalls[[m1]],
"Genes")) {
# convert Erik's gene calls to a temporary DataFrame
# the column "Gene" assigns whether or not that particular line is the gene
# that the caller actually picked, calls must be subset to where Gene == 1
ans <- GeneCalls[[m1]]
CurrentIndices <- as.integer(ans[, "Index"])
CurrentStarts <- as.integer(ans[, "Begin"])
CurrentStrand <- as.integer(ans[, "Strand"])
CurrentStops <- as.integer(ans[, "End"])
CurrentType <- rep("gene",
nrow(ans))
CurrentGene <- as.integer(ans[, "Gene"])
CurrentCoding <- ifelse(test = CurrentGene > 0L,
yes = TRUE,
no = FALSE)
R <- mapply(function(x, y) IRanges(start = x,
end = y),
x = ans[, "Begin"],
y = ans[, "End"],
SIMPLIFY = FALSE)
ContigNames[[m1]] <- unique(CurrentIndices)
names(ContigNames[[m1]]) <- gsub(x = names(attr(ans, "widths")),
pattern = " .*",
replacement = "")
if (AcceptContigNames) {
C.Index <- CurrentIndices
ph1 <- unname(ContigNames[[m1]])
ph2 <- match(x = names(ContigNames[[m1]]),
table = SynNames[[m1]])
ph3 <- vector(mode = "list",
length = length(ph1))
for (m3 in seq_along(ph1)) {
ph3[[m3]] <- which(C.Index == ph1[m3])
}
for (m3 in seq_along(ph1)) {
if (length(ph3) > 0L) {
C.Index <- replace(x = C.Index,
list = ph3[[m3]],
values = ph2[m3])
}
}
ContigNames[[m1]] <- ContigNames[[m1]][ph2]
rm(list = c("ph1",
"ph2",
"ph3"))
o <- order(C.Index,
CurrentStarts)
} else {
# there should be no need to generically re-order a GeneCalls object
# unless it has been fiddled with by the user, but we'll include this anyway
o <- order(CurrentIndices,
CurrentStarts)
}
D <- DataFrame("Index" = CurrentIndices,
"Strand" = CurrentStrand,
"Start" = CurrentStarts,
"Stop" = CurrentStops,
"Type" = CurrentType,
"Range" = IRangesList(R),
"Gene" = CurrentGene,
"Coding" = CurrentCoding,
"Translation_Table" = rep(NA_character_,
length(o)),
"Contig" = rep(names(ContigNames[[m1]]),
times = table(CurrentIndices)))
D <- D[o, ]
D <- D[as.vector(ans[, "Gene"]) != 0L, ]
rownames(D) <- NULL
FeatureRepresentations[[m1]] <- D
# ContigNames[[m1]] <- unique(D$Index)
# names(ContigNames[[m1]]) <- ContigNames[[m1]]
rm(list = c("D",
"ans",
"R",
"CurrentIndices",
"CurrentStarts",
"CurrentStrand",
"CurrentStops",
"CurrentType",
"CurrentGene",
"CurrentCoding"))
}
if (Verbose) {
setTxtProgressBar(pb = pBar,
value = m1 / length(GeneCalls))
}
}
if (Verbose) {
cat("\nFinding connected features.\n")
}
if (AcceptContigNames) {
for (m1 in seq_along(ContigNames)) {
if (any(is.na(match(x = names(ContigNames[[m1]]),
table = SynNames[[m1]])))) {
stop ("Contig names imply incorrectly matched objects at diag position ",
names(GeneCalls)[m1])
} else {
ph <- unname(SyntenyObject[[m1, m1]])
# ph <- seq(length(ContigNames[[m1]]))
names(ph) <- SynNames[[m1]]
ContigNames[[m1]] <- ph
}
}
} else {
for (m1 in seq_along(ContigNames)) {
ph <- unname(SyntenyObject[[m1, m1]])
names(ph) <- SynNames[[m1]]
ContigNames[[m1]] <- ph
}
}
diag(ResultMatrix) <- ContigNames
feature_match <- match(x = rownames(SyntenyObject),
table = names(FeatureRepresentations))
###### -- End Gene call stuff -----------------------------------------------
pBar <- txtProgressBar(style = 1L)
for (m1 in seq_len(L - 1L)) {
for (m2 in (m1 +1L):L) {
######
# Collect Index Start Stop and Strand from current subject and query gene calls
# Collect Index Start Stop Strand from hit table
######
Q.Index <- FeatureRepresentations[[feature_match[m1]]][, 1L]
Q.Start <- FeatureRepresentations[[feature_match[m1]]][, 3L]
Q.Stop <- FeatureRepresentations[[feature_match[m1]]][, 4L]
QG.Strand <- FeatureRepresentations[[feature_match[m1]]][, 2L]
S.Index <- FeatureRepresentations[[feature_match[m2]]][, 1L]
S.Start <- FeatureRepresentations[[feature_match[m2]]][, 3L]
S.Stop <- FeatureRepresentations[[feature_match[m2]]][, 4L]
SG.Strand <- FeatureRepresentations[[feature_match[m2]]][, 2L]
# if (AcceptContigNames) {
# Q.Index <- FeatureRepresentations[[m1]][, 1L]
# ph1 <- unname(ContigNames[[m1]])
# ph2 <- match(x = names(ContigNames[[m1]]),
# table = SynNames[[m1]])
# ph3 <- vector(mode = "list",
# length = length(ph1))
# for (m3 in seq_along(ph1)) {
# ph3[[m3]] <- which(Q.Index == ph1[m3])
# }
# for (m3 in seq_along(ph1)) {
# if (length(ph3) > 0L) {
# Q.Index <- replace(x = Q.Index,
# list = ph3[[m3]],
# values = ph2[m3])
# }
# }
# rm(list = c("ph1",
# "ph2",
# "ph3"))
# S.Index <- FeatureRepresentations[[m2]][, 1L]
# ph1 <- unname(ContigNames[[m2]])
# ph2 <- match(x = names(ContigNames[[m2]]),
# table = SynNames[[m2]])
# ph3 <- vector(mode = "list",
# length = length(ph1))
# for (m3 in seq_along(ph1)) {
# ph3[[m3]] <- which(S.Index == ph1[m3])
# }
# for (m3 in seq_along(ph2)) {
# if (length(ph3[[m3]]) > 0L) {
# S.Index <- replace(x = S.Index,
# list = ph3[[m3]],
# values = ph2[m3])
# }
# }
# rm(list = c("ph1",
# "ph2",
# "ph3"))
# o <- order(S.Index,
# S.Start)
# S.Start <- S.Start[o]
# S.Stop <- S.Stop[o]
# SG.Strand <- SG.Strand[o]
# S.Index <- S.Index[o]
# o <- order(Q.Index,
# Q.Start)
# Q.Start <- Q.Start[o]
# Q.Stop <- Q.Stop[o]
# QG.Strand <- QG.Strand[o]
# Q.Index <- Q.Index[o]
# rm(list = c("o"))
# } else {
# Q.Index <- FeatureRepresentations[[m1]][, 1L]
# S.Index <- FeatureRepresentations[[m2]][, 1L]
# }
CurrentHitTable <- SyntenyObject[m1, m2, drop = FALSE][[1]]
# return(list(unique(CurrentHitTable[, c(1,2)]),
# FeatureRepresentations[[m1]],
# FeatureRepresentations[[m2]],
# LimitIndex,
# diag(ResultMatrix),
# SynNames,
# FeatureRepresentations,
# Q.Index,
# FeatureRepresentations[[m1]][, 1L],
# S.Index,
# FeatureRepresentations[[m2]][, 1L]))
if (LimitIndex) {
######
# Select current hit table, subset to only single index
######
CurrentHitTable <- CurrentHitTable[which(CurrentHitTable[, "index1"] == 1L), ]
CurrentHitTable <- CurrentHitTable[which(CurrentHitTable[, "index2"] == 1L), ]
CurrentHitTable <- CurrentHitTable[order(CurrentHitTable[,
"start1",
drop = FALSE]),
,
drop = FALSE]
} else {
######
# If not limiting by index, order hit table by the index1 first, then the query starts
######
CurrentHitTable <- CurrentHitTable[order(CurrentHitTable[,
"index1",
drop = FALSE],
CurrentHitTable[,
"start1",
drop = FALSE]),
,
drop = FALSE]
}
######
# Collect the starts and stops for all hits correct for strandedness
# Collect indices as well
######
HitWidths <- CurrentHitTable[, "width"]
Q.HitStarts <- CurrentHitTable[, "start1"]
S.HitStarts <- CurrentHitTable[, "start2"]
Q.HitEnds <- Q.HitStarts + HitWidths - 1L
S.HitEnds <- S.HitStarts + HitWidths - 1L
Strand <- CurrentHitTable[, "strand"]
QHI <- CurrentHitTable[, "index1"]
SHI <- CurrentHitTable[, "index2"]
for (i in seq_along(HitWidths)) {
if (Strand[i] == 0L) {
S.HitStarts[i] <- CurrentHitTable[i, "start2"]
S.HitEnds[i] <- CurrentHitTable[i, "start2"] + CurrentHitTable[i, "width"] - 1L
} else if (Strand[i] == 1L) {
S.HitStarts[i] <- CurrentHitTable[i, "start2"] - CurrentHitTable[i, "width"] + 1L
S.HitEnds[i] <- CurrentHitTable[i, "start2"]
}
}
######
# Record a query matrix for every hit that lands inside a gene in our query geneset/genome
# contains:
# the gene index and hit index in question
# modifies the hit boundaries IN THE SUBJECT
# records the distances between the hit boundaries and the gene boundaries
# the strand for the gene in the query that is being recorded
######
QueryMatrix <- matrix(NA_integer_,
nrow = 8L,
ncol = nrow(CurrentHitTable))
# return(list(CurrentHitTable,
# FeatureRepresentations,
# Q.Index,
# S.Index))
HitCounter <- 1L
AddCounter <- 1L
DimLimit <- nrow(CurrentHitTable) / 2
DimAdjust <- 2L
# return(list(Q.Start,
# Q.Stop,
# QG.Strand,
# Q.Index,
# S.Start,
# S.Stop,
# SG.Strand,
# S.Index,
# SHI))
for (z1 in seq_along(Q.Start)) {
######
# loop through the starts of the query genes as they have been ordered
######
CurrentGene <- HitCounter
Q.NucOverLapL <- NA_integer_
Q.NucOverLapR <- NA_integer_
S.NucPositionL <- NA_integer_
S.NucPositionR <- NA_integer_
S.Strand <- NA_integer_
######
# re-register the hit search if a new gene has a start position that
# occurs before the end of the previous gene
# re-registers can only occur when
######
if (z1 > 1L &
HitCounter > 1L &
HitCounter <= length(HitWidths)) {
# not the first gene
# not the first hit
if (Q.Start[z1] < Q.Stop[z1 - 1L]) {
# start of the current gene occurs before end of previous gene
# print(paste(QHI[HitCounter],
# Q.Index[z1],
# HitCounter,
# z1))
if (QHI[HitCounter] == Q.Index[z1] &
Q.Index[z1] == Q.Index[z1 - 1L]) {
# gene and hit are in the same index
# gene and previous gene are in the same index
AvailableReRegisters <- QHI == Q.Index[z1] & Q.HitStarts < Q.Start[z1]
if (sum(AvailableReRegisters) > 0L) {
# more than zero re-register positions available
TempHit <- max(which(AvailableReRegisters))
if (TempHit < HitCounter) {
HitCounter <- TempHit
}
} # else do nothing, no hits that start before the current start and occupy the correct index
} # else do nothing - indices are not matched
} # else do nothing - start of current gene is after stop of previous
} # else do nothing - either first gene or hit
while (HitCounter <= length(HitWidths)) {
######
# interrogation loop, depending upon where the current hit is
# and where the current gene is
# either go to the next hit (HitCounter + 1L)
# or
# go to the next gene (break)
# AddCounter tells the loop which row to add information to in
# the initialized QueryMatrix, when there is information to add
######
if (Q.HitEnds[HitCounter] < Q.Start[z1] &
QHI[HitCounter] == Q.Index[z1]) {
# Hit ends before current query gene begins
# AND the indices are the same
# record the hit counter every time you iterate to the next hit
HitCounter <- HitCounter + 1L
} else if (Q.HitStarts[HitCounter] < Q.Start[z1] &
Q.HitEnds[HitCounter] >= Q.Start[z1] &
Q.HitEnds[HitCounter] <= Q.Stop[z1] &
QHI[HitCounter] == Q.Index[z1]) {
# Hit overlaps left bound of current query gene
# Stay on the current gene
# And go to the next hit
CurrentGene <- z1
Q.NucOverLapL <- Q.Start[z1]
Q.NucOverLapR <- Q.HitEnds[HitCounter]
S.Strand <- Strand[HitCounter]
NewWidth <- Q.NucOverLapR - Q.NucOverLapL + 1L
Current.QHI <- QHI[HitCounter]
Current.SHI <- SHI[HitCounter]
######
# Trim hit in subject, overlaps that are in the reverse strand require different trims
######
if (S.Strand == 0L) {
S.NucPositionR <- S.HitEnds[HitCounter]
S.NucPositionL <- S.HitEnds[HitCounter] - NewWidth + 1L
} else {
S.NucPositionL <- S.HitStarts[HitCounter]
S.NucPositionR <- S.HitStarts[HitCounter] + NewWidth - 1L
}
# Add To Vector!
QueryMatrix[, AddCounter] <- c(CurrentGene,
Q.NucOverLapL,
Q.NucOverLapR,
S.NucPositionL,
S.NucPositionR,
S.Strand,
Current.QHI,
Current.SHI)
if (AddCounter >= DimLimit) {
QueryMatrix <- cbind(QueryMatrix,
matrix(data = NA_integer_,
nrow = nrow(QueryMatrix),
ncol = ncol(QueryMatrix) * DimAdjust))
DimLimit <- ncol(QueryMatrix) / 2
DimAdjust <- DimAdjust * 2L
}
# record the add counter every time a new row is added to the query matrix
AddCounter <- AddCounter + 1L
# record the hit counter every time you iterate to the next hit
HitCounter <- HitCounter + 1L
} else if (Q.HitStarts[HitCounter] >= Q.Start[z1] &
Q.HitEnds[HitCounter] <= Q.Stop[z1] &
QHI[HitCounter] == Q.Index[z1]) {
# Hit occurs entirely within current query gene
# Stay on the current gene
# And go to the next hit
CurrentGene <- z1
Q.NucOverLapL <- Q.HitStarts[HitCounter]
Q.NucOverLapR <- Q.HitEnds[HitCounter]
S.Strand <- Strand[HitCounter]
S.NucPositionL <- S.HitStarts[HitCounter]
S.NucPositionR <- S.HitEnds[HitCounter]
Current.QHI <- QHI[HitCounter]
Current.SHI <- SHI[HitCounter]
# Add To Vector!
QueryMatrix[, AddCounter] <- c(CurrentGene,
Q.NucOverLapL,
Q.NucOverLapR,
S.NucPositionL,
S.NucPositionR,
S.Strand,
Current.QHI,
Current.SHI)
if (AddCounter >= DimLimit) {
QueryMatrix <- cbind(QueryMatrix,
matrix(data = NA_integer_,
nrow = nrow(QueryMatrix),
ncol = ncol(QueryMatrix) * DimAdjust))
DimLimit <- ncol(QueryMatrix) / 2
DimAdjust <- DimAdjust * 2L
}
# record the add counter every time a new row is added to the query matrix
AddCounter <- AddCounter + 1L
# record the hit counter every time you iterate to the next hit
HitCounter <- HitCounter + 1L
} else if (Q.HitStarts[HitCounter] >= Q.Start[z1] &
Q.HitStarts[HitCounter] <= Q.Stop[z1] &
Q.HitEnds[HitCounter] > Q.Stop[z1] &
QHI[HitCounter] == Q.Index[z1]) {
# Hit overlaps right bound of current query gene
# Stay on the current hit and go to the next gene
CurrentGene <- z1
Q.NucOverLapL <- Q.HitStarts[HitCounter]
Q.NucOverLapR <- Q.Stop[z1]
S.Strand <- Strand[HitCounter]
NewWidth <- Q.NucOverLapR - Q.NucOverLapL + 1L
Current.QHI <- QHI[HitCounter]
Current.SHI <- SHI[HitCounter]
if (S.Strand == 0L) {
S.NucPositionL <- S.HitStarts[HitCounter]
S.NucPositionR <- S.HitStarts[HitCounter] + NewWidth - 1L
} else {
S.NucPositionR <- S.HitEnds[HitCounter]
S.NucPositionL <- S.HitEnds[HitCounter] - NewWidth + 1L
}
# Add To Vector!
QueryMatrix[, AddCounter] <- c(CurrentGene,
Q.NucOverLapL,
Q.NucOverLapR,
S.NucPositionL,
S.NucPositionR,
S.Strand,
Current.QHI,
Current.SHI)
if (AddCounter >= DimLimit) {
QueryMatrix <- cbind(QueryMatrix,
matrix(data = NA_integer_,
nrow = nrow(QueryMatrix),
ncol = ncol(QueryMatrix) * DimAdjust))
DimLimit <- ncol(QueryMatrix) / 2
DimAdjust <- DimAdjust * 2L
}
# record the add counter every time a new row is added to the query matrix
AddCounter <- AddCounter + 1L
break
} else if (Q.HitStarts[HitCounter] < Q.Start[z1] &
Q.HitEnds[HitCounter] > Q.Stop[z1] &
QHI[HitCounter] == Q.Index[z1]) {
# Hit eclipses current query gene
# Stay on the current hit, and go to the next gene
CurrentGene <- z1
Q.NucOverLapL <- Q.Start[z1]
Q.NucOverLapR <- Q.Stop[z1]
TrimLeft <- Q.Start[z1] - Q.HitStarts[HitCounter]
TrimRight <- Q.HitEnds[HitCounter] - Q.Stop[z1]
S.Strand <- Strand[HitCounter]
NewWidth <- Q.NucOverLapR - Q.NucOverLapL + 1L
Current.QHI <- QHI[HitCounter]
Current.SHI <- SHI[HitCounter]
if (S.Strand == 0L) {
S.NucPositionL <- S.HitStarts[HitCounter] + TrimLeft
S.NucPositionR <- S.NucPositionL + NewWidth - 1L
} else {
S.NucPositionR <- S.HitEnds[HitCounter] - TrimLeft
S.NucPositionL <- S.NucPositionR - NewWidth + 1L
}
# Add To Vector!
QueryMatrix[, AddCounter] <- c(CurrentGene,
Q.NucOverLapL,
Q.NucOverLapR,
S.NucPositionL,
S.NucPositionR,
S.Strand,
Current.QHI,
Current.SHI)
if (AddCounter >= DimLimit) {
QueryMatrix <- cbind(QueryMatrix,
matrix(data = NA_integer_,
nrow = nrow(QueryMatrix),
ncol = ncol(QueryMatrix) * DimAdjust))
DimLimit <- ncol(QueryMatrix) / 2
DimAdjust <- DimAdjust * 2L
}
# record the add counter every time a new row is added to the query matrix
AddCounter <- AddCounter + 1L
break
} else if (Q.HitStarts[HitCounter] > Q.Stop[z1] &
QHI[HitCounter] == Q.Index[z1]) {
# Hit occurs after current query gene
# And the indices are matched
# Go to next gene
break
} else if (QHI[HitCounter] > Q.Index[z1]) {
# If the index of the hits in the query has outpaced the index of the genes
break
} else if (QHI[HitCounter] < Q.Index[z1]) {
# If the index of the genes has outpaced the index of the hits
# record the hit counter every time you iterate to the next hit
HitCounter <- HitCounter + 1L
} # end of else if conditionals
} # end while loop through hits
} # end for loop through genes
if (Verbose) {
TotalCounter <- TotalCounter + 1L
setTxtProgressBar(pb = pBar,
value = TotalCounter/TotalLength)
}
QueryMatrix <- t(QueryMatrix)
colnames(QueryMatrix) <- c("CurrentGene", # The gene that was recorded with a hit present
"QueryNucleotideOverLapLeft", # leftmost nucleotide position of a hit that is within CurrentGene
"QueryNucleotideOverLapRight", # rightmost nucleotide position of a hit that is within CurrentGene
"SubjectNucleotidePositionLeft", # corresponding leftmost position in the subject
"SubjectNucleotidePositionRight", # corresponding rightmost position in the subject
"SubjectStrand", # strand of the hit in the subject
"QueryIndex", # index that was matched between the hit and the gene in the query
"SubjectIndex" # index (chromosome, plasmid, whatever) of the hit in the subject
)
# return(QueryMatrix)
######
# Remove unfilled extra rows
######
QueryMatrix <- QueryMatrix[apply(QueryMatrix,
1L,
function(x) !all(is.na(x))),
,
drop = FALSE]
if (dim(QueryMatrix)[1] == 0L) {
OutPutMatrix <- matrix(NA_integer_,
nrow = 1L,
ncol = 7L)
OverLapMatrix <- matrix(NA_integer_,
nrow = 1L,
ncol = 7L)
} else {
HitCounter <- 1L
AddCounter <- 1L
OverLapMatrix <- matrix(NA_integer_,
ncol = nrow(QueryMatrix),
nrow = 9L)
QueryMatrix <- QueryMatrix[order(QueryMatrix[,
"SubjectIndex",
drop = FALSE],
QueryMatrix[,
"SubjectNucleotidePositionLeft",
drop = FALSE]),
,
drop = FALSE]
DimAdjust <- 2L
DimLimit <- ncol(OverLapMatrix) / 2
######
# Part 2!
# Hits that were recorded as being within a gene in the query
# are now tested again the genes in the subject
######
QueryMap <- QueryMatrix[, "CurrentGene"]
S.HitStarts <- QueryMatrix[, "SubjectNucleotidePositionLeft"]
S.HitEnds <- QueryMatrix[, "SubjectNucleotidePositionRight"]
Q.HitStarts <- QueryMatrix[, "QueryNucleotideOverLapLeft"]
Q.HitEnds <- QueryMatrix[, "QueryNucleotideOverLapRight"]
QueryIndices <- QueryMatrix[, "QueryIndex"]
SubjectHitIndex <- QueryMatrix[, "SubjectIndex"]
# return(list(QueryMatrix,
# S.Start,
# S.Stop,
# S.Index,
# FeatureRepresentations[[m1]],
# FeatureRepresentations[[m2]]))
for (z2 in seq_along(S.Start)) {
######
# Loop through the subject genes
######
while (HitCounter <= length(QueryMap)) {
if (S.HitEnds[HitCounter] < S.Start[z2] &
SubjectHitIndex[HitCounter] == S.Index[z2]) {
# Hit ends before current subject gene begins
HitCounter <- HitCounter + 1L
} else if (S.HitStarts[HitCounter] < S.Start[z2] &
S.HitEnds[HitCounter] >= S.Start[z2] &
S.HitEnds[HitCounter] <= S.Stop[z2] &
SubjectHitIndex[HitCounter] == S.Index[z2]) {
# Hit overlaps left bound of current subject gene
CurrentGene <- z2
QueryGenePosition <- QueryMap[HitCounter]
CurrentQueryIndex <- QueryIndices[HitCounter]
CurrentSubjectIndex <- SubjectHitIndex[HitCounter]
TrimLeft <- S.Start[z2] - S.HitStarts[HitCounter]
ExactOverLap <- S.HitEnds[HitCounter] - S.Start[z2] + 1L
SubjectHitLeft <- S.HitStarts[HitCounter] + TrimLeft
SubjectHitRight <- S.HitEnds[HitCounter]
QueryHitLeft <- Q.HitStarts[HitCounter] + TrimLeft
QueryHitRight <- Q.HitEnds[HitCounter]
# Add to vector !
OverLapMatrix[, AddCounter] <- c(QueryGenePosition,
CurrentGene,
ExactOverLap,
CurrentQueryIndex,
CurrentSubjectIndex,
QueryHitLeft,
QueryHitRight,
SubjectHitLeft,
SubjectHitRight)
if (AddCounter >= DimLimit) {
OverLapMatrix <- cbind(OverLapMatrix,
matrix(data = NA_integer_,
nrow = nrow(OverLapMatrix),
ncol = ncol(OverLapMatrix) * DimAdjust))
DimLimit <- ncol(OverLapMatrix) / 2
DimAdjust <- DimAdjust * 2L
}
AddCounter <- AddCounter + 1L
HitCounter <- HitCounter + 1L
} else if (S.HitStarts[HitCounter] >= S.Start[z2] &
S.HitEnds[HitCounter] <= S.Stop[z2] &
SubjectHitIndex[HitCounter] == S.Index[z2]) {
# Hit occurs entirely within current subject gene
CurrentGene <- z2
QueryGenePosition <- QueryMap[HitCounter]
ExactOverLap <- S.HitEnds[HitCounter] - S.HitStarts[HitCounter] + 1L
CurrentQueryIndex <- QueryIndices[HitCounter]
CurrentSubjectIndex <- SubjectHitIndex[HitCounter]
SubjectHitLeft <- S.HitStarts[HitCounter]
SubjectHitRight <- S.HitEnds[HitCounter]
QueryHitLeft <- Q.HitStarts[HitCounter]
QueryHitRight <- Q.HitEnds[HitCounter]
# Add to vector !
OverLapMatrix[, AddCounter] <- c(QueryGenePosition,
CurrentGene,
ExactOverLap,
CurrentQueryIndex,
CurrentSubjectIndex,
QueryHitLeft,
QueryHitRight,
SubjectHitLeft,
SubjectHitRight)
if (AddCounter >= DimLimit) {
OverLapMatrix <- cbind(OverLapMatrix,
matrix(data = NA_integer_,
nrow = nrow(OverLapMatrix),
ncol = ncol(OverLapMatrix) * DimAdjust))
DimLimit <- ncol(OverLapMatrix) / 2
DimAdjust <- DimAdjust * 2L
}
AddCounter <- AddCounter + 1L
HitCounter <- HitCounter + 1L
} else if (S.HitStarts[HitCounter] >= S.Start[z2] &
S.HitStarts[HitCounter] <= S.Stop[z2] &
S.HitEnds[HitCounter] > S.Stop[z2] &
SubjectHitIndex[HitCounter] == S.Index[z2]) {
# Hit overlaps right bound of current subject gene
# Stay on the current hit and go to the next gene
CurrentGene <- z2
QueryGenePosition <- QueryMap[HitCounter]
ExactOverLap <- S.Stop[z2] - S.HitStarts[HitCounter] + 1L
CurrentQueryIndex <- QueryIndices[HitCounter]
CurrentSubjectIndex <- SubjectHitIndex[HitCounter]
TrimRight <- S.HitEnds[HitCounter] - S.Stop[z2]
SubjectHitLeft <- S.HitStarts[HitCounter]
SubjectHitRight <- S.HitEnds[HitCounter] - TrimRight
QueryHitLeft <- Q.HitStarts[HitCounter]
QueryHitRight <- Q.HitEnds[HitCounter] - TrimRight
# Add to vector !
OverLapMatrix[, AddCounter] <- c(QueryGenePosition,
CurrentGene,
ExactOverLap,
CurrentQueryIndex,
CurrentSubjectIndex,
QueryHitLeft,
QueryHitRight,
SubjectHitLeft,
SubjectHitRight)
if (AddCounter >= DimLimit) {
OverLapMatrix <- cbind(OverLapMatrix,
matrix(data = NA_integer_,
nrow = nrow(OverLapMatrix),
ncol = ncol(OverLapMatrix) * DimAdjust))
DimLimit <- ncol(OverLapMatrix) / 2
DimAdjust <- DimAdjust * 2L
}
AddCounter <- AddCounter + 1L
break
} else if (S.HitStarts[HitCounter] <= S.Start[z2] &
S.HitEnds[HitCounter] >= S.Stop[z2] &
SubjectHitIndex[HitCounter] == S.Index[z2]) {
# Hit eclipses current subject gene
CurrentGene <- z2
QueryGenePosition <- QueryMap[HitCounter]
ExactOverLap <- S.Stop[z2] - S.Start[z2] + 1L
CurrentQueryIndex <- QueryIndices[HitCounter]
CurrentSubjectIndex <- SubjectHitIndex[HitCounter]
TrimLeft <- S.Start[z2] - S.HitStarts[HitCounter]
TrimRight <- S.HitEnds[HitCounter] - S.Stop[z2]
SubjectHitLeft <- S.HitStarts[HitCounter] + TrimLeft
SubjectHitRight <- S.HitEnds[HitCounter] - TrimRight
QueryHitLeft <- Q.HitStarts[HitCounter] + TrimLeft
QueryHitRight <- Q.HitEnds[HitCounter] - TrimRight
# Add to vector !
OverLapMatrix[, AddCounter] <- c(QueryGenePosition,
CurrentGene,
ExactOverLap,
CurrentQueryIndex,
CurrentSubjectIndex,
QueryHitLeft,
QueryHitRight,
SubjectHitLeft,
SubjectHitRight)
if (AddCounter >= DimLimit) {
OverLapMatrix <- cbind(OverLapMatrix,
matrix(data = NA_integer_,
nrow = nrow(OverLapMatrix),
ncol = ncol(OverLapMatrix) * DimAdjust))
DimLimit <- ncol(OverLapMatrix) / 2
DimAdjust <- DimAdjust * 2L
}
AddCounter <- AddCounter + 1L
break
} else if (S.HitStarts[HitCounter] > S.Stop[z2]) {
# Hit occurs after current subject gene
break
} else if (SubjectHitIndex[HitCounter] < S.Index[z2]) {
# The indices do not match the only direction to go is forward
HitCounter <- HitCounter + 1L
} else if (SubjectHitIndex[HitCounter] > S.Index[z2]) {
# if you have outpaced the gene index with the hits go to the next gene
break
}
}
}
}
OverLapMatrix <- t(OverLapMatrix)
######
# Remove empty extra rows
######
OverLapMatrix <- OverLapMatrix[apply(OverLapMatrix,
1L,
function(x) !all(is.na(x))),
,
drop = FALSE]
######
# If the overlap matrix is empty, assign an empty single row matrix
# If it is not, sum up the overlap in nucleotide space and condense to a single row
# Select the smallest distances for all four displacements
######
if (dim(OverLapMatrix)[1] == 0L) {
OverLapMatrix <- matrix(NA_integer_,
nrow = 1L,
ncol = 9L) # was 11, does this need to be 11 somewhere else?
OutPutMatrix <- matrix(NA_integer_,
nrow = 1L,
ncol = 11L)
} else if (dim(OverLapMatrix)[1] >= 1L) {
OverLapMatrix <- OverLapMatrix[order(OverLapMatrix[, 1L, drop = FALSE],
OverLapMatrix[, 2L, drop = FALSE],
OverLapMatrix[, 4L, drop = FALSE],
OverLapMatrix[, 5L, drop = FALSE]),
,
drop = FALSE]
OutPutMatrix <- matrix(NA_integer_,
ncol = 11L,
nrow = nrow(OverLapMatrix))
######
# All recordings so far are by hit
# condense hits as appropriate, by the genes that they link
######
RowCount <- 1L
CondenseCount <- 1L
Row <- 2L
while (CondenseCount <= nrow(OverLapMatrix)) {
while (Row <= nrow(OverLapMatrix)) {
if (OverLapMatrix[Row, 5L] != OverLapMatrix[CondenseCount, 5L]) {
break
}
if (OverLapMatrix[Row, 4L] != OverLapMatrix[CondenseCount, 4L]) {
break
}
if (OverLapMatrix[Row, 2L] != OverLapMatrix[CondenseCount, 2L]) {
break
}
if (OverLapMatrix[Row, 1L] != OverLapMatrix[CondenseCount, 1L]) {
break
}
Row <- Row + 1L
}
z5 <- CondenseCount:(Row - 1L)
OutPutMatrix[RowCount, ] <- c(OverLapMatrix[CondenseCount, 1L],
OverLapMatrix[CondenseCount, 2L],
sum(OverLapMatrix[z5, 3L]),
OverLapMatrix[CondenseCount, 4L],
OverLapMatrix[CondenseCount, 5L],
min(OverLapMatrix[z5, 6L]),
max(OverLapMatrix[z5, 7L]),
min(OverLapMatrix[z5, 8L]),
max(OverLapMatrix[z5, 9L]),
max(OverLapMatrix[z5, 3L]),
nrow(OverLapMatrix[z5, , drop = FALSE]))
RowCount <- RowCount + 1L
CondenseCount <- CondenseCount + length(z5)
}
OutPutMatrix <- OutPutMatrix[apply(OutPutMatrix,
1L,
function(x) !all(is.na(x))),
,
drop = FALSE]
}
OutPutMatrix <- OutPutMatrix[order(OutPutMatrix[,
1L,
drop = FALSE]),
,
drop = FALSE]
# return(list(OutPutMatrix,
# OverLapMatrix))
colnames(OutPutMatrix) <- c("QueryGene",
"SubjectGene",
"ExactOverlap",
"QueryIndex",
"SubjectIndex",
"QLeftPos",
"QRightPos",
"SLeftPos",
"SRightPos",
"MaxKmerSize",
"TotalKmerHits")
colnames(OverLapMatrix) <- c("QueryGene",
"SubjectGene",
"ExactOverlap",
"QueryIndex",
"SubjectIndex",
"QLeftPos",
"QRightPos",
"SLeftPos",
"SRightPos")
# return(list(OverLapMatrix,
# OutPutMatrix))
# QueryStartDisplacement <- ifelse(test = QG.Strand[OutPutMatrix[, "QueryGene"]] == 1L,
# yes = abs(OutPutMatrix[, 7L] - Q.Stop[OutPutMatrix[, 1L]]),
# no = abs(OutPutMatrix[, 6L] - Q.Start[OutPutMatrix[, 1L]]))
# QueryStopDisplacement <- ifelse(test = QG.Strand[OutPutMatrix[, "QueryGene"]] == 1L,
# yes = abs(OutPutMatrix[, 6L] - Q.Start[OutPutMatrix[, 1L]]),
# no = abs(OutPutMatrix[, 7L] - Q.Stop[OutPutMatrix[, 1L]]))
# SubjectStartDisplacement <- ifelse(test = SG.Strand[OutPutMatrix[, "SubjectGene"]] == 1L,
# yes = abs(OutPutMatrix[, 9L] - S.Stop[OutPutMatrix[, 2L]]),
# no = abs(OutPutMatrix[, 8L] - S.Start[OutPutMatrix[, 2L]]))
# SubjectStopDisplacement <- ifelse(test = SG.Strand[OutPutMatrix[, "SubjectGene"]] == 1L,
# yes = abs(OutPutMatrix[, 8L] - S.Start[OutPutMatrix[, 2L]]),
# no = abs(OutPutMatrix[, 9L] - S.Stop[OutPutMatrix[, 2L]]))
# DisplacementMatrix <- cbind(QueryStartDisplacement,
# QueryStopDisplacement,
# SubjectStartDisplacement,
# SubjectStopDisplacement)
if (Verbose) {
TotalCounter <- TotalCounter + 1L
setTxtProgressBar(pb = pBar,
value = TotalCounter/TotalLength)
}
if (nrow(OutPutMatrix) == 1L &
all(is.na(OutPutMatrix))) {
OutPutMatrix <- OutPutMatrix[-1L, ] # i'm not sure this is safe in the long run?
# double check this with erik
# DisplacementMatrix[1, ] <- rep(0L, ncol(DisplacementMatrix))
# OverLapMatrix[1, ] <- rep(0L, ncol(OverLapMatrix))
OverLapMatrix <- OverLapMatrix[-1L, ]
}
ResultMatrix[m1, m2] <- list(OutPutMatrix)
ResultMatrix[m2, m1] <- list(OverLapMatrix)
} # end of columns loop
} # end of rows loop
if (Verbose) {
TotalTimeStop <- Sys.time()
cat("\n")
print(TotalTimeStop - TotalTimeStart)
}
dimnames(ResultMatrix) <- dimnames(SyntenyObject)
class(ResultMatrix) <- "LinkedPairs"
attr(ResultMatrix, "GeneCalls") <- FeatureRepresentations
return(ResultMatrix)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.