Nothing
.chainGenes <- function(allstarts,
totScore,
codScore,
sameScores,
oppoScores,
maxOverlapSame,
maxOverlapOpposite,
minScore,
includeLength) {
maxFracOverlap <- 0.2 # max overlap with adjacent genes
maxFracOverlapInclude <- 0.05 # max overlap for forcing inclusion
ORFanCutoff <- 300L # length cutoff for ORFans
ORFanDensity <- 0.2 # allowed density of ORFans
ORFanScore <- seq(minScore[2],
minScore[1],
length.out=ORFanDensity*100) # score cutoffs for ORFans
ORFanDist <- 10L # minimum distance from nearest non-ORFan
minScore <- minScore[1]
alllengths <- allstarts[, 4L] - allstarts[, 3L] + 1L
w <- which(totScore >= minScore)
topstarts <- allstarts[w,, drop=FALSE]
o <- order(topstarts[, 1L], topstarts[, 3L])
topstarts <- topstarts[o,]
toplengths <- alllengths[w[o]]
topScore <- totScore[w[o]]
maxL <- (length(sameScores) - 1L)/2L
if (maxL==maxOverlapSame &&
maxL==maxOverlapOpposite) {
scoreIntergenic <- TRUE
} else {
scoreIntergenic <- FALSE
}
res <- .Call("chainGenes",
topstarts,
topScore,
toplengths,
scoreIntergenic,
maxOverlapSame,
maxOverlapOpposite,
maxFracOverlap,
sameScores,
oppoScores,
PACKAGE="DECIPHER")
pointers <- res[[1]]
cumscore <- res[[2]]
pointer <- which.max(cumscore)
indices <- integer(pointer)
position <- 1L
indices[position] <- pointer
repeat {
if (pointer==pointers[pointer]) {
break
} else {
pointer <- pointers[pointer]
position <- position + 1L
indices[position] <- pointer
}
}
length(indices) <- position
indices <- rev(indices)
# eliminate ORFans (small genes in low density regions)
remove <- logical(length(indices))
currIndex <- 0L
i <- 1L
while (i < length(indices)) {
if (currIndex != topstarts[indices[i], 1L]) {
currIndex <- topstarts[indices[i], 1L]
last <- 0L
}
if (toplengths[indices[i]] < ORFanCutoff &&
topstarts[indices[i], 3L] - last >= ORFanDist) {
j <- i + 1L
while (j < length(indices) &&
topstarts[indices[j], 1L]==topstarts[indices[i], 1L]) {
if (toplengths[indices[j]] < ORFanCutoff) {
j <- j + 1L
} else {
break
}
}
if (topstarts[indices[j], 1L]==topstarts[indices[i], 1L]) {
if (topstarts[indices[j], 3L] - topstarts[indices[j - 1L], 4L] < ORFanDist)
j <- j - 1L
if (j > i) {
r <- i:(j - 1L)
density <- sum(toplengths[indices[r]])/(topstarts[indices[j], 3L] - last)
if (density < ORFanDensity)
remove[r] <- topScore[indices[r]] < ORFanScore[ceiling(density*100)]
}
}
i <- j
}
last <- topstarts[indices[i], 4L]
i <- i + 1L
}
remove <- which(remove)
if (length(remove) > 0)
indices <- indices[-remove]
indices <- w[o[indices]]
missing <- which(alllengths >= includeLength &
totScore < minScore)
currIndex <- 0L
while (length(missing) > 0) {
if (currIndex != allstarts[missing[1L], 1L]) {
currIndex <- allstarts[missing[1L], 1L]
W <- which(allstarts[, 1L]==allstarts[missing[1L], 1L])
}
if (allstarts[missing[1], 2L]==1L) { # reverse strand
w <- W[allstarts[W, 3L]==allstarts[missing[1L], 3L]]
} else { # forward strand
w <- W[allstarts[W, 4L]==allstarts[missing[1L], 4L]]
}
if (all(totScore[w] < minScore)) { # could not be included already
i <- w[which.max(totScore[w] - codScore[w])]
overlap <- which(allstarts[indices, 3L] <= allstarts[i, 3L] &
allstarts[indices, 4L] >= allstarts[i, 4L] &
allstarts[indices, 1L] == allstarts[i, 1L])
if (length(overlap)==0) { # check beginning overlap
overlap <- which(allstarts[indices, 3L] > allstarts[i, 3L] &
allstarts[indices, 3L] < allstarts[i, 4L] &
allstarts[indices, 1L] == allstarts[i, 1L])
if (length(overlap) > 0) {
delta <- allstarts[i, 4L] - allstarts[indices[overlap], 3L] + 1
overlap <- which(delta > alllengths[i]*maxFracOverlapInclude |
delta > alllengths[indices[overlap]]*maxFracOverlapInclude)
}
}
if (length(overlap)==0) { # check ending overlap
overlap <- which(allstarts[indices, 4L] > allstarts[i, 3L] &
allstarts[indices, 4L] < allstarts[i, 4L] &
allstarts[indices, 1L] == allstarts[i, 1L])
if (length(overlap) > 0) {
delta <- allstarts[indices[overlap], 4L] - allstarts[i, 3L] + 1
overlap <- which(delta > alllengths[i]*maxFracOverlapInclude |
delta > alllengths[indices[overlap]]*maxFracOverlapInclude)
}
}
if (length(overlap)==0)
indices <- c(indices, i)
}
missing <- missing[!(missing %in% w)]
}
return(sort(indices))
}
.extractRBS <- function(dna,
deltaGrulesRNA,
upstreamWidth,
pattern=DNAString("AAGGAGG"),
precision=2,
spacing=3) {
mapping <- diag(4) # don't penalize mismatches
dimnames(mapping) <- list(DNA_BASES,
DNA_BASES)
mapping["G", "G"] <- 4
mapping["C", "C"] <- 4
w <- which(width(dna)==upstreamWidth)
p <- pairwiseAlignment(dna[w],
pattern,
type="local-global",
substitutionMatrix=mapping,
gapOpening=10,
gapExtension=4)
pattern <- as.character(pattern(p))
subject <- as.character(subject(p))
dG <- .Call("calculateDeltaG",
pattern,
subject,
deltaGrulesRNA,
PACKAGE="DECIPHER")
dG <- round(dG/precision)*precision
pos <- (start(pattern(p)) + end(pattern(p)))/2
pos <- round(pos/spacing)*spacing
motif <- rep(NA_character_,
length(dna))
motif[w] <- paste(dG, pos)
return(motif)
}
.performFolding <- function(foldLeft,
foldRight,
deltaGrulesRNA) {
mapping <- diag(4)
mapping[mapping==0] <- -4
dimnames(mapping) <- list(DNA_BASES,
DNA_BASES)
mapping["G", "G"] <- 6
mapping["C", "C"] <- 6
p <- pairwiseAlignment(foldLeft,
foldRight,
type="overlap",
substitutionMatrix=mapping,
gapOpening=5,
gapExtension=1)
pattern <- as.character(pattern(p))
subject <- as.character(subject(p))
dG <- .Call("calculateDeltaG",
pattern,
subject,
deltaGrulesRNA,
PACKAGE="DECIPHER")
return(dG)
}
.logisticRegression <- function(response,
predictor,
indices,
remove,
penalty,
interactions=FALSE,
bins=25) {
other <- seq_len(length(response))[-c(indices, remove)]
w <- c(indices,
sample(other,
min(length(other),
1e4)))
# remove any remaining NA values from indices
sub_data <- cbind(response=response[w],
predictor[w,, drop=FALSE])
sub_data <- na.omit(sub_data)
# fit a logistic regression model
if (interactions) { # include pairwise interactions
suppressWarnings(model <- glm(response ~ .^2,
binomial,
data=sub_data,
control=glm.control(1e-6)))
} else {
suppressWarnings(model <- glm(response ~ .,
binomial,
data=sub_data,
control=glm.control(1e-6)))
}
# perform stepwise feature selection
suppressWarnings(model <- step(model,
k=penalty,
trace=0,
direction="forward"))
# convert probabilies into scores
suppressWarnings(p <- predict(model,
predictor,
"response"))
p <- .bincode(p,
c(0,
quantile(p[indices],
seq(1/bins,
1 - 1/bins,
1/bins),
na.rm=TRUE),
1))
fg <- tabulate(p[indices], bins)
fg <- ifelse(fg==0, 1, fg)
fg <- fg/sum(fg)
bg <- tabulate(p[-indices], bins)
bg <- ifelse(bg==0, 1, bg)
bg <- bg/sum(bg)
score <- log(fg/bg)
score[p]
}
FindGenes <- function(myDNAStringSet,
geneticCode=getGeneticCode("11"),
minGeneLength=60,
allowEdges=TRUE,
allScores=FALSE,
showPlot=FALSE,
verbose=TRUE) {
# set default parameters
threshold <- c(0.995, 0.999) # lower/upper cutoff quantiles for background scoring
noiseCutoff <- 150L # cutoff length for background scoring
backgroundLength <- 200L # background for lenScr
initialCodons <- 8L # initial codon frequencies
terminationCodons <- 6L # termination codon frequencies
maxL <- 1000L # maximum gene overlap
maxOverlapSame <- 60L # region for spline fitting overlap
maxOverlapOpposite <- 200L # region for spline fitting overlap
upstreamWidth <- 20L # upstream nucleotides for motif search
upstreamMotifs <- 30L # upstream of upstreamWidth
upstreamKmerSize <- 4L # must be <= 8
RBSk <- 6L # length of possible RBS motifs
foldingWidth <- 35L # nucleotides either side of start
offset <- c(-20L, 0L, 20L, 40L) # folding sites
temp <- 37 # degrees Celsius
ions <- 1 # Na+ equivalents
penalty <- 7.879 # qchisq(0.005, 1, lower.tail=FALSE)
length_params <- c(-4, 1400, 0.25, -10, 200) # initial parameters for sigmoid fitting the distribution of gene lengths
includeMultiplier <- 1.5 # multiplier on x_intercept to seek inclusion despite negative score
startCodonDist <- c("ATG"=80, "GTG"=14, "TTG"=5.995, "CTG"=0.002, "ATA"=0.001, "ATT"=0.001, "ATC"=0.001) # prior distribution of initial codon frequencies
signals <- matrix(c(3, 8, 9, 1, 12, 10, 4, 0, 2, 6, 7, 11),
nrow=3)
codonFreqCutoff <- 0.3 # average distance of codon frequencies within models
foldBetter <- 4 # fold-increase required to choose an alternate codon model
lenScrMultiplier <- 2 # multiplier on positive length scores
staScrMultiplier <- 2 # multiplier on start scores
intergenicCount <- 50 # number of intergenic observations to use a precise score
maxIterations <- 9L # maximum number of iterations (must be > 1)
# error checking
if (!is(myDNAStringSet, "DNAStringSet"))
stop("myDNAStringSet must be a DNAStringSet.")
if (sum(width(myDNAStringSet))==0)
stop("myDNAStringSet must contain sequences.")
a <- vcountPattern("-", myDNAStringSet)
if (any(a > 0))
stop("Gap characters ('-') must be removed from myDNAStringSet.")
a <- vcountPattern(".", myDNAStringSet)
if (any(a > 0))
stop("Unknown characters ('.') must be removed from myDNAStringSet.")
codons <- c('AAA', 'AAC', 'AAG', 'AAT',
'ACA', 'ACC', 'ACG', 'ACT',
'AGA', 'AGC', 'AGG', 'AGT',
'ATA', 'ATC', 'ATG', 'ATT',
'CAA', 'CAC', 'CAG', 'CAT',
'CCA', 'CCC', 'CCG', 'CCT',
'CGA', 'CGC', 'CGG', 'CGT',
'CTA', 'CTC', 'CTG', 'CTT',
'GAA', 'GAC', 'GAG', 'GAT',
'GCA', 'GCC', 'GCG', 'GCT',
'GGA', 'GGC', 'GGG', 'GGT',
'GTA', 'GTC', 'GTG', 'GTT',
'TAA', 'TAC', 'TAG', 'TAT',
'TCA', 'TCC', 'TCG', 'TCT',
'TGA', 'TGC', 'TGG', 'TGT',
'TTA', 'TTC', 'TTG', 'TTT')
if (!is.character(geneticCode))
stop("geneticCode must be a character vector.")
if (length(geneticCode) != 64)
stop("geneticCode must contain 64 codons.")
if (is.null(names(geneticCode)))
stop("geneticCode must be a named character vector.")
startCodons <- names(geneticCode[which(geneticCode=="M")])
startCodons <- c(startCodons,
attr(geneticCode, "alt_init_codons"))
if (length(startCodons)==0)
stop("geneticCode must contain at least one start codon.")
if (length(startCodons) >= 10)
stop("geneticCode must contain fewer than 10 start codons.")
startIndex <- match(startCodons, codons) - 1L
if (any(is.na(startIndex)))
stop("geneticCode may only contain standard 3-letter start codons.")
stopIndex <- names(geneticCode[which(geneticCode=="*")])
stopIndex <- match(stopIndex, codons) - 1L
if (length(stopIndex)==0)
stop("No stop codons (*) found in the geneticCode.")
if (length(stopIndex) >= 10)
stop("geneticCode must contain fewer than 10 stop codons.")
if (any(startIndex %in% stopIndex))
stop("Start codons cannot also be stop codons in the geneticCode.")
gC <- geneticCode
geneticCode <- geneticCode[codons]
if (any(is.na(geneticCode)))
stop("geneticCode is missing one or more codons.")
geneticCode <- match(geneticCode, AA_STANDARD) - 1L
if (!is.numeric(minGeneLength))
stop("minGeneLength must be a numeric.")
if (length(minGeneLength) != 1)
stop("minGeneLength must contain a single integer.")
if (minGeneLength < (2 + initialCodons)*3)
stop("minGeneLength must be at least ",
(2 + max(initialCodons, terminationCodons))*3,
".")
if (minGeneLength > 0.9*noiseCutoff)
stop("minGeneLength can be at most ",
floor(0.9*noiseCutoff),
".")
if (minGeneLength != floor(minGeneLength))
stop("minGeneLength must be a whole number.")
minGeneLength <- as.integer(minGeneLength)
if (!is.logical(allowEdges))
stop("allowEdges must be a logical.")
if (!is.logical(allScores))
stop("allScores must be a logical.")
if (!is.logical(showPlot))
stop("showPlot must be a logical.")
if (!is.logical(verbose))
stop("verbose must be a logical.")
if (verbose) {
time.1 <- Sys.time()
cat("Iter Models Start Motif Fold Init UpsNt Term RBS Auto Stop Genes\n1 1")
flush.console()
iter <- 1L
}
# find all possible genes
ORFs <- .Call("getORFs",
myDNAStringSet,
startIndex,
stopIndex,
minGeneLength,
allowEdges,
PACKAGE="DECIPHER")
colnames(ORFs) <- c("Index", "Strand", "Begin", "End")
if (nrow(ORFs)==0)
stop("myDNAStringSet does not contain any open reading frames.")
longest <- .Call("longestORFs",
ORFs,
PACKAGE="DECIPHER")
l <- ORFs[, 4] - ORFs[, 3] + 1L
# determine the background intergenic length distribution
o <- longest[l[longest] >= noiseCutoff]
o <- o[order(ORFs[o, 1], ORFs[o, 3])]
intergenic <- ORFs[o[-1], 3] - ORFs[o[-length(o)], 4] + 1L
sameStrand <- ORFs[o[-1], 2]==ORFs[o[-length(o)], 2]
sameIndex <- ORFs[o[-1], 1]==ORFs[o[-length(o)], 1]
r <- -maxL:maxL # symmetric length range of scores
bg_same <- tabulate(intergenic[sameStrand & sameIndex] + maxL + 1L, length(r))
first <- maxL - maxOverlapSame
bg_same[seq_len(first)] <- 0L
bg_same <- bg_same/sum(bg_same)
bg_oppo <- tabulate(intergenic[!sameStrand & sameIndex] + maxL + 1L, length(r))
bg_oppo[seq_len(first)] <- 0L
bg_oppo <- bg_oppo/sum(bg_oppo)
same_scores <- oppo_scores <- rep(0, length(r))
# fit the background ORF length distribution
t <- tabulate(l[longest])
x <- which(t[seq_len(backgroundLength)] > 0)
weight <- sqrt(t[x])
logt <- log10(t)
# fit logt = slope*x + y_intercept
mx <- mean(x)
mt <- mean(logt[x])
slope <- sum(weight*(x - mx)*(logt[x] - mt))/sum(weight*(x - mx)^2)
y_intercept <- mt - slope*mx
x_intercept <- -y_intercept/slope
x_intercept <- as.integer(x_intercept)
.PDF2 <- function(x, p)
-((exp(p[1]*log(x/p[2]))+1)^(-p[3]-1)*(exp(p[4]*log(x/p[5]))+1)^(-p[3]-1)*(((p[4]*p[3]+p[1]*p[3])*exp(p[1]*log(x/p[2]))+p[4]*p[3])*exp(p[4]*log(x/p[5]))+p[1]*p[3]*exp(p[1]*log(x/p[2]))))/x
.CDF2 <- function(x, p) {
sig1 <- (1 + exp(p[1]*log(x/p[2])))^-p[3]
sig2 <- (1 + exp(p[4]*log(x/p[5])))^-p[3]
sig1*sig2
}
.fitSigmoid <- function(x, y, ini) {
weights <- diff(c(0, log(x)))
.SSE <- function(p) {
pdf <- .PDF2(x, p)
if (any(is.nan(pdf)))
return(sum(y^2))
expected <- .CDF2(x, p)
sum(weights*(y - expected)^2)
}
o <- suppressWarnings(optim(ini,
.SSE,
control=list(maxit=1e4)))
return(o$par)
}
# fit the distribution of approximate gene lengths
s <- seq(3, length(t), 3)
bg <- 10^(slope*seq_along(t) + y_intercept)
observed <- t[s] - bg[s] # expected number of genes
observed[seq_len(backgroundLength/3)] <- 0
observed[observed < 0] <- 0
observed <- cumsum(observed)
observed <- observed/observed[length(observed)]
o <- .fitSigmoid(s,
observed,
length_params)
# calculate log-odds model of gene length
fg <- .PDF2(s, o)
fg <- fg/sum(fg)
bg <- bg[s]
bg <- bg/sum(bg)
lenScr <- log(fg/bg)
w <- which(is.infinite(lenScr))
if (length(w) > 0)
lenScr[w] <- (lenScr[w[1L] - 1] - lenScr[w[1L] - 2])*seq_along(w) + lenScr[w[1L] - 1]
lenScr <- lenScr[l/3]
# compute single codon log-odds model
codon_scores <- .Call("codonModel",
myDNAStringSet,
ORFs,
stopIndex,
x_intercept,
PACKAGE="DECIPHER")
codScr <- .Call("scoreCodonModel",
myDNAStringSet,
ORFs,
codon_scores,
PACKAGE="DECIPHER")
orf_scores <- lenScr + codScr
# add preliminary start codon scoring
starts <- .Call("getRegion",
myDNAStringSet,
ORFs,
3L,
3L,
PACKAGE="DECIPHER")
bgStarts <- table(starts)[startCodons]
bgStarts[is.na(bgStarts)] <- 0L
bgStarts <- bgStarts/sum(bgStarts)
fgStarts <- startCodonDist[startCodons]
fgStarts[is.na(fgStarts)] <- 0.01 # add missing start codons
fgStarts <- fgStarts/sum(fgStarts)
start_scores <- log(fgStarts/bgStarts)
staScr <- unname(start_scores[starts])
if (allowEdges) # potential non-canonical start
staScr[is.na(staScr)] <- 0
orf_scores <- orf_scores + staScr
# chain high-scoring ORFs into putative genes
cutoff <- quantile(orf_scores[longest[which(l[longest] <= noiseCutoff)]],
threshold)
indices <- .chainGenes(ORFs,
orf_scores,
codScr,
sameScores=same_scores,
oppoScores=oppo_scores,
maxOverlapSame=maxOverlapSame,
maxOverlapOpposite=maxOverlapOpposite,
minScore=cutoff,
includeLength=includeMultiplier*x_intercept)
if (verbose) {
deltaSta <- mean(staScr[indices]) - mean(staScr[-indices])
show <- formatC(deltaSta,
width=6,
digits=2,
format="f")
cat(show)
cat(" ")
show <- formatC(length(indices),
width=6,
digits=0,
format="f")
cat(show)
flush.console()
}
# split the upstream region into k-mers
upstream <- .Call("getRegion",
myDNAStringSet,
ORFs,
upstreamWidth,
0L,
PACKAGE="DECIPHER")
data("deltaHrulesRNA", envir=environment(), package="DECIPHER")
data("deltaSrulesRNA", envir=environment(), package="DECIPHER")
deltaSrulesRNA <- deltaSrulesRNA + 0.368*log(ions)/1000
deltaGrulesRNA <- deltaHrulesRNA - (273.15 + temp)*deltaSrulesRNA
# Shine-Delgarno free energy
upstream <- DNAStringSet(upstream)
dG_RBS <- .extractRBS(upstream,
deltaGrulesRNA,
upstreamWidth)
# tile subsequences 3 bases apart
foldLeft <- foldRight <- vector("list", length(offset))
for(i in seq_along(offset)) {
foldLeft[[i]] <- .Call("getRegion",
myDNAStringSet,
ORFs,
foldingWidth,
offset[i],
PACKAGE="DECIPHER")
foldRight[[i]] <- .Call("getRegion",
myDNAStringSet,
ORFs,
-foldingWidth,
offset[i],
PACKAGE="DECIPHER")
}
foldLeft <- DNAStringSet(unlist(foldLeft))
foldRight <- DNAStringSet(unlist(foldRight))
foldRight <- reverseComplement(foldRight)
# mRNA folding free energy
w <- which(width(foldLeft)==foldingWidth &
width(foldRight)==foldingWidth &
l >= offset[length(offset)] + foldingWidth + 3L)
dG_Fold <- matrix(NA_real_,
nrow=nrow(ORFs),
ncol=length(offset))
dG_Fold[w] <- .performFolding(foldLeft[w],
foldRight[w],
deltaGrulesRNA)
reject <- which(rowSums(is.na(dG_Fold)) > 0)
dG_Fold <- as.data.frame(dG_Fold)
names(dG_Fold) <- paste("p",
offset,
sep="")
.getGenes <- function(indices,
multiModel=FALSE,
startCodonModel=FALSE,
initialCodons=0L,
terminationCodons=0L,
scoreRBS=FALSE,
scoreAutocorr=FALSE,
upstreamMotifs=0L,
scoreFolding=FALSE,
scoreIntergenicLength=FALSE,
runLengthModel=FALSE,
stopCodonModel=FALSE,
scoreUpstream=FALSE,
allScores=FALSE,
showPlot=FALSE) {
genes <- ORFs[indices,, drop=FALSE]
# compute gene length model
observed <- tabulate(l[indices], s[length(s)])
observed <- observed[s]
observed <- cumsum(observed)
observed <- observed/observed[length(observed)]
# reset initial condition for quicker convergence
length_params <<- .fitSigmoid(s,
observed,
length_params)
fg <- .PDF2(s, length_params)
fg <- fg/sum(fg)
lenScr <- log(fg/bg)
w <- which(is.infinite(lenScr))
if (length(w) > 0)
lenScr[w] <- (lenScr[w[1L] - 1] - lenScr[w[1L] - 2])*seq_along(w) + lenScr[w[1L] - 1]
lenScr[lenScr > 0] <- lenScrMultiplier*lenScr[lenScr > 0]
lenScr <- lenScr[l/3]
if (showPlot) {
layout(matrix(c(1, 3, 2, 4),
nrow=2))
plot(logt,
xlab='ORF length (nucleotides)',
ylab=expression('log'[10]*'(Frequency)'),
xlim=c(minGeneLength, length(logt)),
pch=46,
log="x")
abline(a=y_intercept,
b=slope,
h=0,
v=x_intercept,
col=c(6, 5, 5),
lty=c(1, 2, 2),
untf=TRUE)
legend("topright",
c("Background", "Intercepts", "ORF"),
col=c(6, 5, 1),
lty=c(1, 2, NA),
pch=c(NA, NA, 15),
bg="white")
plot(s,
100*diff(c(0, observed)),
xlab="Length (nucleotides)",
ylab="Frequency (%)",
xlim=c(minGeneLength,
max(1e4,
s[length(s)])),
pch=46,
col="darkgreen",
log="x")
lines(s,
100*fg,
lwd=2,
col="darkgreen")
lines(s,
100*bg,
col="magenta")
legend("topright",
c("Background", "Estimated", "Gene"),
col=c("magenta", "darkgreen", "darkgreen"),
lty=c(1, 1, NA),
lwd=c(1, 2, NA),
pch=c(NA, NA, 15),
bg="white")
}
# compute codon log-odds models
codFreqs <- .Call("codonFrequencies",
myDNAStringSet,
ORFs,
indices,
PACKAGE="DECIPHER")
freqs <- colMeans(codFreqs)
minSize <- quantile(freqs[freqs > 0], 0.1)
minSize <- c(90*minSize^-1, # rare codons ~30 times
0.3*minSize^-2) # rare dicodons ~0.1 times
if (multiModel) {
if (nrow(codFreqs) > 10000) # use a subset
codFreqs <- codFreqs[seq_len(10000),]
dists <- dist(codFreqs)
clusts <- IdClusters(dists,
cutoff=codonFreqCutoff,
verbose=FALSE)
clusts <- clusts$cluster
clusts <- lapply(seq_len(max(clusts)),
function(x)
which(clusts==x))
size <- sapply(clusts,
function(x)
sum(l[indices[x]]))
keep <- size >= minSize[1]
clusts <- clusts[keep]
size <- size[keep]
if (length(clusts) > 1 ||
length(clusts)==0 ||
(length(clusts)==1 &&
length(clusts[[1]]) < length(indices))) {
clusts <- c(list(seq_along(indices)),
clusts)
size <- c(sum(l[indices]),
size)
}
} else {
clusts <- list(seq_along(indices))
size <- sum(l[indices])
}
for (i in seq_along(clusts)) {
if (size[i] >= minSize[2]) { # use dicodon model
codon_scores <- .Call("dicodonModel",
myDNAStringSet,
genes[clusts[[i]],, drop=FALSE],
stopIndex,
PACKAGE="DECIPHER")
#dimnames(codon_scores) <- list(codons, codons)
# codon_scores matrix is [next, prev]
} else { # use unicodon model
codon_scores <- .Call("unicodonModel",
myDNAStringSet,
genes[clusts[[i]],, drop=FALSE],
stopIndex,
PACKAGE="DECIPHER")
}
tempScr <- .Call("scoreCodonModel",
myDNAStringSet,
ORFs,
codon_scores,
PACKAGE="DECIPHER")
if (i==1L) {
codScr <- tempScr
# add run length into model
if (runLengthModel) {
run_scores <- .Call("runLengthModel",
myDNAStringSet,
genes,
codon_scores,
PACKAGE="DECIPHER")
runScr <- .Call("scoreRunLengthModel",
myDNAStringSet,
ORFs,
codon_scores,
run_scores,
PACKAGE="DECIPHER")
orf_scores <- orf_scores + runScr
}
} else if (i==2) {
altScr <- tempScr
} else { # record the best alternative codScr
altScr <- ifelse(tempScr > altScr,
tempScr,
altScr)
}
}
if (length(clusts) > 1) {
w <- which(altScr > 0 &
altScr/foldBetter > codScr)
if (length(w) > 0)
codScr[w] <- altScr[w]
}
orf_scores <- lenScr + codScr
# add start codon preferences into model
if (startCodonModel) {
start_scores <- .Call("startCodonModel",
myDNAStringSet,
ORFs,
indices,
startIndex,
PACKAGE="DECIPHER")
staScr <- .Call("scoreStartCodonModel",
myDNAStringSet,
ORFs,
start_scores,
PACKAGE="DECIPHER")
}
staScr <- staScrMultiplier*staScr
deltaSta <- mean(staScr[indices]) - mean(staScr[-indices])
if (deltaSta > 0) {
orf_scores <- orf_scores + staScr
} else if (verbose) {
deltaSta <- "-"
}
# add initial codon bias into model
if (initialCodons) {
ini_codon_scores <- .Call("initialCodonModel",
myDNAStringSet,
ORFs,
indices,
initialCodons,
PACKAGE="DECIPHER")
iniScr <- .Call("scoreInitialCodonModel",
myDNAStringSet,
ORFs,
ini_codon_scores,
PACKAGE="DECIPHER")
deltaIni <- mean(iniScr[indices]) - mean(iniScr[-indices])
if (deltaIni < 0) {
deltaIni <- "-"
} else {
orf_scores <- orf_scores + iniScr
}
}
# add termination codon bias into model
if (terminationCodons) {
ter_codon_scores <- .Call("terminationCodonModel",
myDNAStringSet,
ORFs,
indices,
terminationCodons,
PACKAGE="DECIPHER")
terScr <- .Call("scoreTerminationCodonModel",
myDNAStringSet,
ORFs,
ter_codon_scores,
PACKAGE="DECIPHER")
deltaTer <- mean(terScr[indices]) - mean(terScr[-indices])
if (deltaTer < 0) {
deltaTer <- "-"
} else {
orf_scores <- orf_scores + terScr
}
}
# add stop codon preferences into model
if (stopCodonModel) {
stop_scores <- .Call("stopCodonModel",
myDNAStringSet,
ORFs,
indices,
stopIndex,
PACKAGE="DECIPHER")
stoScr <- .Call("scoreStopCodonModel",
myDNAStringSet,
ORFs,
stop_scores,
PACKAGE="DECIPHER")
deltaSto <- mean(stoScr[indices]) - mean(stoScr[-indices])
if (deltaSto < 0) {
deltaSto <- "-"
} else {
orf_scores <- orf_scores + stoScr
}
}
# add autocorrelation into model
if (scoreAutocorr) {
autocorr_scores <- .Call("autocorrelationModel",
myDNAStringSet,
ORFs,
indices,
geneticCode,
PACKAGE="DECIPHER")
autScr <- .Call("scoreAutocorrelationModel",
myDNAStringSet,
ORFs,
autocorr_scores,
geneticCode,
PACKAGE="DECIPHER")
deltaAut <- mean(autScr[indices]) - mean(autScr[-indices])
if (deltaAut < 0) {
deltaAut <- "-"
} else {
orf_scores <- orf_scores + autScr
}
}
# add upstream nucleotide bias model
if (scoreUpstream) {
up_nuc_scores <- .Call("nucleotideBiasModel",
myDNAStringSet,
ORFs,
indices,
upstreamWidth,
PACKAGE="DECIPHER")
upsScr <- .Call("scoreNucleotideBiasModel",
myDNAStringSet,
ORFs,
up_nuc_scores,
PACKAGE="DECIPHER")
deltaUps <- mean(upsScr[indices]) - mean(upsScr[-indices])
if (deltaUps < 0) {
deltaUps <- "-"
} else {
orf_scores <- orf_scores + upsScr
}
}
# add upstream motifs into model
if (upstreamMotifs) {
upstream_motif_scores <- .Call("upstreamMotifModel",
myDNAStringSet,
ORFs,
indices,
upstreamWidth + 1L,
upstreamMotifs,
upstreamKmerSize,
PACKAGE="DECIPHER")
motScr <- .Call("scoreUpstreamMotifModel",
myDNAStringSet,
ORFs,
upstream_motif_scores,
upstreamWidth + 1L,
upstreamMotifs,
upstreamKmerSize,
PACKAGE="DECIPHER")
deltaMot <- mean(motScr[indices]) - mean(motScr[-indices])
if (deltaMot < 0) {
deltaMot <- "-"
} else {
orf_scores <- orf_scores + motScr
}
}
if (scoreFolding ||
scoreRBS) {
response <- integer(nrow(ORFs))
response[indices] <- 1L
}
# add folding around start into model
if (scoreFolding) {
folScr <- .logisticRegression(response,
dG_Fold,
indices,
reject,
penalty,
interactions=TRUE)
if (length(reject) > 0)
folScr[reject] <- 0
deltaFol <- mean(folScr[indices]) - mean(folScr[-indices])
if (deltaFol < 0) {
deltaFol <- "-"
} else {
orf_scores <- orf_scores + folScr
}
}
# add ribosome binding sites into model
if (scoreRBS) {
t <- unique(dG_RBS)
t <- setNames(rep(1,
length(t)),
t)
obs <- t
temp <- table(dG_RBS[indices])
obs[names(temp)] <- temp
obs[is.na(names(obs))] <- 0
obs <- obs/sum(obs)
bg <- t
temp <- table(dG_RBS[-indices])
bg[names(temp)] <- temp
bg[is.na(names(bg))] <- 0
bg <- bg/sum(bg)
rbsScr <- log(obs/bg)
rbsScr <- rbsScr[dG_RBS]
rbsScr[is.na(dG_RBS)] <- 0
# learn alternative RBS sites
fg <- oligonucleotideFrequency(upstream[indices],
RBSk,
simplify.as="collapse")
fg <- fg/sum(fg)
bg <- oligonucleotideFrequency(upstream[-indices],
RBSk,
simplify.as="collapse")
bg[bg==0] <- 1L # add pseudocounts
bg <- bg/sum(bg)
score <- log(fg/bg)
score <- sort(score,
decreasing=TRUE)
score <- score[score >= 1]
if (length(score) > 0) {
if (length(score) > 1000)
score <- score[1:1000]
motif <- names(score)
motif <- DNAStringSet(motif)
dists <- DistanceMatrix(motif,
verbose=FALSE)
clusts <- IdClusters(dists,
method="complete",
cutoff=1/RBSk, # 1 mismatch
verbose=FALSE)
clusts <- clusts$cluster
o <- tapply(score,
clusts,
sum)
o <- order(o,
decreasing=TRUE)
if (length(o) > 20)
o <- o[1:20]
PWMs <- lapply(o,
function(x) {
w <- which(clusts==x)
# weight motifs approximately by score
m <- rep(motif[w],
score[w])
pwm <- PWM(m)
ns <- ConsensusSequence(m)
ns <- as.character(ns)
list(pwm, ns)
})
hits <- vector("list",
length(PWMs) + 1L)
names(hits) <- c(sapply(PWMs,
`[`,
2L),
"dG_SD")
PWMs <- lapply(PWMs,
`[[`,
1L)
begin <- cumsum(width(upstream)) - width(upstream)
merged <- unlist(upstream)
pos <- seq(0L,
upstreamWidth - RBSk,
1L)
for (i in seq_along(PWMs)) {
scores <- matrix(0,
nrow=length(starts),
ncol=length(pos))
for (j in seq_along(pos)) {
w <- which(pos[j] + RBSk - 1L <= width(upstream))
scores[w, j] <- PWMscoreStartingAt(PWMs[[i]],
merged,
starting.at=begin[w] + pos[j] + 1L)
}
hits[[i]] <- apply(scores,
1L,
max)
}
hits[[length(hits)]] <- rbsScr
hits <- data.frame(hits)
rbsScr <- .logisticRegression(response,
hits,
indices,
integer(),
penalty)
}
deltaRbs <- mean(rbsScr[indices]) - mean(rbsScr[-indices])
if (deltaRbs < 0) {
deltaRbs <- "-"
} else {
orf_scores <- orf_scores + rbsScr
}
}
# add intergenic length into model
if (scoreIntergenicLength) {
o <- order(ORFs[indices, 1], ORFs[indices, 3])
intergenic <- ORFs[indices[o][-1], 3] - ORFs[indices[o][-length(o)], 4] + 1L
sameStrand <- ORFs[indices[o][-1], 2]==ORFs[indices[o][-length(o)], 2]
sameIndex <- ORFs[indices[o][-1], 1]==ORFs[indices[o][-length(o)], 1]
obs_same <- tabulate(intergenic[sameStrand & sameIndex] + maxL + 1L, length(r))
obs_same[seq_len(first)] <- 0L
# accumulate background
sum <- 0
for (i in (maxL - 1):(first + 1)) {
if (obs_same[i]==0) {
sum <- sum + bg_same[i]
} else if (sum > 0) {
bg_same[i] <- bg_same[i] + sum
sum <- 0
}
}
obs_oppo <- tabulate(intergenic[!sameStrand & sameIndex] + maxL + 1L, length(r))
obs_oppo[seq_len(first)] <- 0L
# accumulate background
sum <- 0
for (i in (maxL - 1):(first + 1)) {
if (obs_oppo[i]==0) {
sum <- sum + bg_oppo[i]
} else if (sum > 0) {
bg_oppo[i] <- bg_oppo[i] + sum
sum <- 0
}
}
obs_sum <- sum(obs_same, obs_oppo)
odds_same <- log(obs_same/obs_sum/bg_same)
odds_same[is.nan(odds_same) | is.infinite(odds_same)] <- 0
t <- try(same_scores <- predict(smooth.spline(r,
odds_same,
obs_same,
df=20))[[2]],
silent=TRUE)
if (is(t, "try-error")) {
same_scores <- numeric(length(odds_same))
} else {
sub <- which(obs_same >= intergenicCount)
if (length(sub) > 0)
same_scores[sub] <- odds_same[sub]
}
odds_oppo <- log(obs_oppo/obs_sum/bg_oppo)
odds_oppo[is.nan(odds_oppo) | is.infinite(odds_oppo)] <- 0
t <- try(oppo_scores <- predict(smooth.spline(r,
odds_oppo,
obs_oppo,
df=20))[[2]],
silent=TRUE)
if (is(t, "try-error")) {
oppo_scores <- numeric(length(odds_oppo))
} else {
sub <- which(obs_oppo >= intergenicCount)
if (length(sub) > 0)
oppo_scores[sub] <- odds_oppo[sub]
}
maxOverlapSame <- maxL
maxOverlapOpposite <- maxL
if (showPlot) {
halfMaxObs <- max(obs_same, obs_oppo)/2
plot(r,
odds_same,
xlab="Intergenic length (nucleotides)",
ylab="Log-odds",
col="darkblue",
cex=obs_same/halfMaxObs,
ylim=c(-5, 5),
xlim=c(-200, 200))
points(r,
odds_oppo,
cex=obs_oppo/halfMaxObs,
pch=0,
col="darkred")
lines(r,
same_scores,
col="darkblue")
lines(r,
oppo_scores,
lty=3,
lwd=2,
col="darkred")
legend("topleft",
c("Same", "Oppo."),
col=c("darkblue", "darkred"),
lty=c(1, 3),
pch=c(1, 0),
lwd=c(1, 2),
title=expression(underline("Strand")),
bg="white")
}
}
cutoff <- quantile(orf_scores[l <= noiseCutoff],
threshold)
cutoff <- cutoff + same_scores[maxL + 1L] # add typical intergenic score
# chain high-scoring ORFs into genes
indices <- .chainGenes(ORFs,
orf_scores,
codScr,
sameScores=same_scores,
oppoScores=oppo_scores,
maxOverlapSame=maxOverlapSame,
maxOverlapOpposite=maxOverlapOpposite,
minScore=cutoff,
includeLength=includeMultiplier*x_intercept)
if (showPlot) {
plot(l[longest],
orf_scores[longest],
xlab="ORF length (nucleotides)",
ylab="Total score",
log="x",
pch=46)
points(l[indices],
orf_scores[indices],
col="darkgreen",
pch=46)
abline(h=cutoff[1],
lty=2,
col="orange")
legend("topleft",
c("ORF", "Gene", "Threshold"),
pch=c(15, 15, NA),
lty=c(NA, NA, 2),
col=c("black", "darkgreen", "orange"),
bg="white")
}
if (verbose) {
cat("\r")
iter <<- iter + 1L
show <- formatC(iter,
width=-6,
digits=0,
format="f")
cat(show)
show <- formatC(length(size),
width=6,
digits=0,
format="f")
cat(show)
show <- formatC(deltaSta,
width=6,
digits=2,
format="f")
cat(show)
if (upstreamMotifs) {
show <- formatC(deltaMot,
width=6,
digits=2,
format="f")
cat(show)
} else {
cat(" ")
}
if (scoreFolding) {
show <- formatC(deltaFol,
width=6,
digits=2,
format="f")
cat(show)
} else {
cat(" ")
}
if (initialCodons) {
show <- formatC(deltaIni,
width=6,
digits=2,
format="f")
cat(show)
} else {
cat(" ")
}
if (scoreUpstream) {
show <- formatC(deltaUps,
width=6,
digits=2,
format="f")
cat(show)
} else {
cat(" ")
}
if (terminationCodons) {
show <- formatC(deltaTer,
width=6,
digits=2,
format="f")
cat(show)
} else {
cat(" ")
}
if (scoreRBS) {
show <- formatC(deltaRbs,
width=6,
digits=2,
format="f")
cat(show)
} else {
cat(" ")
}
if (scoreAutocorr) {
show <- formatC(deltaAut,
width=6,
digits=2,
format="f")
cat(show)
} else {
cat(" ")
}
if (stopCodonModel) {
show <- formatC(deltaSto,
width=6,
digits=2,
format="f")
cat(show)
} else {
cat(" ")
}
show <- formatC(length(indices),
width=6,
digits=0,
format="f")
cat(show)
flush.console()
}
if (allScores) {
ans <- cbind(ORFs,
TotalScore=orf_scores,
LengthScore=lenScr,
CodingScore=codScr)
if (startCodonModel)
ans <- cbind(ans,
StartScore=staScr)
if (stopCodonModel)
ans <- cbind(ans,
StopScore=stoScr)
if (initialCodons > 0)
ans <- cbind(ans,
InitialCodonScore=iniScr)
if (terminationCodons > 0)
ans <- cbind(ans,
TerminationCodonScore=terScr)
if (scoreRBS)
ans <- cbind(ans,
RibosomeBindingSiteScore=rbsScr)
if (scoreAutocorr)
ans <- cbind(ans,
AutocorrelationScore=autScr)
if (scoreUpstream)
ans <-cbind(ans,
UpstreamNucleotideScore=upsScr)
if (upstreamMotifs)
ans <- cbind(ans,
UpstreamMotifScore=motScr)
if (scoreFolding)
ans <- cbind(ans,
FoldingScore=folScr)
if (runLengthModel)
ans <- cbind(ans,
RunLengthScore=runScr)
return(list(ans, indices))
} else {
return(indices)
}
}
params <- list()
for (j in seq_len(ncol(signals))) {
for (i in seq_len(nrow(signals))) {
if (signals[i, j]==1) {
params <- c(params,
list(multiModel=TRUE))
} else if (signals[i, j]==2) {
params <- c(params,
list(startCodonModel=TRUE))
} else if (signals[i, j]==3) {
params <- c(params,
list(initialCodons=initialCodons))
} else if (signals[i, j]==4) {
params <- c(params,
list(terminationCodons=terminationCodons))
} else if (signals[i, j]==5) {
params <- c(params,
list(runLengthModel=TRUE))
} else if (signals[i, j]==6) {
params <- c(params,
list(scoreRBS=TRUE))
} else if (signals[i, j]==7) {
params <- c(params,
list(scoreAutocorr=TRUE))
} else if (signals[i, j]==8) {
params <- c(params,
list(upstreamMotifs=upstreamMotifs))
} else if (signals[i, j]==9) {
params <- c(params,
list(scoreFolding=TRUE))
} else if (signals[i, j]==10) {
params <- c(params,
list(scoreIntergenicLength=TRUE))
} else if (signals[i, j]==11) {
params <- c(params,
list(stopCodonModel=TRUE))
} else if (signals[i, j]==12) {
params <- c(params,
list(scoreUpstream=TRUE))
}
}
indices <- do.call(.getGenes,
c(params, list(indices)))
}
bootstraps <- integer(nrow(ORFs))
bootstraps[indices] <- 1
prev_indices <- indices
for (i in 2L:maxIterations) {
indices <- do.call(.getGenes,
c(params, list(indices)))
bootstraps[indices] <- bootstraps[indices] + 1
indices <- which(bootstraps==i)
if (length(prev_indices)==length(indices) &&
all(prev_indices==indices))
break
prev_indices <- indices
}
params <- c(params,
list(indices=indices, # indices no longer changes
allScores=TRUE,
showPlot=showPlot))
ans <- do.call(.getGenes,
params)
indices <- ans[[2]]
bootstraps[indices] <- bootstraps[indices] + 1
ans <- ans[[1]]
if (allScores) {
gene <- numeric(nrow(ans))
gene[indices] <- 1
ans <- cbind(ans,
FractionReps=bootstraps/(i + 1),
Gene=gene)
} else {
ans <- ans[indices,]
ans <- cbind(ans,
FractionReps=bootstraps[indices]/(i + 1),
Gene=rep(1, nrow(ans)))
}
o <- order(ans[, "Index"], ans[, "Begin"])
ans <- ans[o,]
rownames(ans) <- seq_len(nrow(ans))
class(ans) <- "Genes"
attr(ans, "widths") <- setNames(width(myDNAStringSet),
names(myDNAStringSet))
attr(ans, "geneticCode") <- gC
attr(ans, "minGeneLength") <- minGeneLength
attr(ans, "allowEdges") <- allowEdges
if (verbose) {
cat("\n\n")
time.2 <- Sys.time()
print(round(difftime(time.2,
time.1,
units='secs'),
digits=2))
cat("\n")
}
return(ans)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.