## Used for suppressing dplyr/ggplot variable names...
globalVariables(c("AdjustedBasePairs", "AggregateQuality", "Allele", "BasePairs", "Coverage",
"ExpandedRegion", "ForwardFlank", "LUS", "Marker", "Motif", "MotifLength",
"NeighbourAllele", "Quality", "Region", "Repeats", "ReverseFlank", "Type",
"AlternativeRegion", "Observed", "Start"))
#' Identified STR regions
#'
#' The identified STR regions of the sampleSequences.fastq file, created by the \link{identifySTRRegions} function.
#'
#' @name identifiedSTRs
#' @docType data
#' @author Søren B. Vilsen \email{svilsen@@math.aau.dk}
#' @keywords data
#' @usage data("identifiedSTRs")
#' @format A list with an element for each of the 10 identified markers indicating which sequences were identified for each marker.
NULL
#' Aggregated string coverage.
#'
#' The aggregated string coverage of the \link{identifiedSTRs} data, created by the \link{stringCoverage} function.
#'
#' @name stringCoverageList
#' @docType data
#' @author Søren B. Vilsen \email{svilsen@@math.aau.dk}
#' @keywords data
#' @usage data("stringCoverageList")
#' @format A list of \link{tibble}'s one for each of the 10 markers, showing the aggregated information on a string-by-string basis.
NULL
#' Genotype list
#'
#' The identified genotypes of the \link{stringCoverageList} data, created by the \link{getGenotype} function.
#'
#' @name genotypeList
#' @docType data
#' @author Søren B. Vilsen \email{svilsen@@math.aau.dk}
#' @keywords data
#' @usage data("genotypeList")
#' @format A list of \link{tibble}'s one for each of the 10 markers, showing which strings are the potential alleles based on the 'Coverage'.
NULL
#' Noise list
#'
#' The identified noise of the \link{stringCoverageList} data, created by the \link{identifyNoise} function.
#'
#' @name noiseList
#' @docType data
#' @author Søren B. Vilsen \email{svilsen@@math.aau.dk}
#' @keywords data
#' @usage data("noiseList")
#' @format A list of \link{tibble}'s one for each of the 10 markers, showing which strings can be safely classified as noise based on the 'Coverage'.
NULL
#' Combined string coverage and genotype information
#'
#' A merge of the \link{stringCoverageList} and \link{genotypeList} data.
#'
#' @name stringCoverageGenotypeList
#' @docType data
#' @author Søren B. Vilsen \email{svilsen@@math.aau.dk}
#' @keywords data
#' @usage data("stringCoverageGenotypeList")
#' @format A list of \link{tibble}'s one for each of the 10 markers containing the combined string coverage and genotypic information.
NULL
#' Flanking regions
#'
#' The flanking regions searched for to identify the markers and STR regions of all autosomal/X/Y STR's in the Illumina ForenSeq prep kit.
#'
#' @name flankingRegions
#' @docType data
#' @author Søren B. Vilsen \email{svilsen@@math.aau.dk}
#' @keywords data
#' @usage data("flankingRegions")
#' @format A \link{tibble} containing the flanks (forward and reverse), motif, motif length, adjustment need to make it compatible with CE, and the shifts needed for further trimming, for each marker
NULL
##
.appendExtractLists <- function (x, val, addRCIndex = TRUE, addRCIndexRef = "matchedSeq") {
if (is.null(val)) {
val <- structure(vector("list", length(x)), .Names = as.character(names(x)))
}
else if (is.null(x)) {
x <- structure(vector("list", length(val)), .Names = as.character(names(val)))
}
xnames <- names(x)
for (v in names(val)) {
if (v == "name") {
x[[v]] <- if (is.null(x[[v]])) as.character(val[[v]]) else as.character(x[[v]])
}
else {
if (is.null(x[[v]])) {
x[[v]] <- val[[v]]
}
else if (is.null(val[[v]])) {
x[[v]] <- x[[v]]
}
else {
if (is_tibble(x[[v]]) & is_tibble(val[[v]])) {
x[[v]] <- bind_rows(x[[v]], val[[v]])
}
else {
x[[v]] <- append(x[[v]], val[[v]])
}
}
}
}
if (addRCIndex & (addRCIndexRef %in% names(val))) {
x[["ReverseComplement"]] <- data.frame(SeqID = c(x[[addRCIndexRef]]), ReverseComplement = rep(c(FALSE, TRUE), c(length(x[[addRCIndexRef]]) - length(val[[addRCIndexRef]]), length(val[[addRCIndexRef]]))))
}
return(x)
}
#' @title Block length of the missing motif.
#'
#' @description Given a motif length and a string it finds the blocks of the string.
#'
#' @param s a string of either class: 'character' or 'DNAString'.
#' @param motifLength the known motif length of the STR region.
#' @param returnType the type of return wanted. Takes three values 'numeric', 'string', or 'fullList' (or any other combination cased letters).
#'
#' @details If returnType is 'numeric', the function returns the numeric value of the LUS.
#' If returnType is instead chosen as 'string', the function returns "[AATG]x" i.e. the motif, AATG, is repeated 'x' times.
#' Lastly if the returnType is set to fullList, the function returns a list of data.frames containing every possible repeat structure their start and the numeric value of the repeat unit length.
#'
#' @return Depending on returnType it return an object of class 'numeric', 'string', or 'fulllist'.
#' @example inst/examples/blmm.R
BLMM <- function(s, motifLength = 4, returnType = "numeric") {
motifLength <- if (!is.integer(motifLength)) as.integer(motifLength) else motifLength
if (class(s) == "character") {
sD <- DNAString(s)
} else if (class(s) == "DNAString"){
sD <- s
s <- as.character(s)
} else {
stop("LUS only implemented for class 'character' or 'DNAString'")
}
typesOfMotifs <- oligonucleotideFrequency(sD, motifLength)
typesOfMotifs <- typesOfMotifs[typesOfMotifs > 0]
motifs <- names(typesOfMotifs)
positionOfMotifs <- lapply(as.list(motifs), function(y, text = s) unlist(gregexpr2(y, text = text)))
allRepeats <- structure(vector("list", length(positionOfMotifs)), .Names = motifs)
for (i in seq_along(positionOfMotifs)) {
y = positionOfMotifs[[i]]
rleValues <- rle(y-(motifLength*(0:(length(y) - 1))))
end <- y[cumsum(rleValues$lengths)] + motifLength
startEndFrame <- data.frame(Start = end - rleValues$length*motifLength, End = end, Repeats = rleValues$length)
j = 1
while (j <= dim(startEndFrame)[1]) {
whichExtending <- which(startEndFrame$Start == startEndFrame$End[j])
if (length(whichExtending) > 0) {
startEndFrame$End[j] <- startEndFrame$End[whichExtending]
startEndFrame$Repeats[j] <- startEndFrame$Repeats[j] + startEndFrame$Repeats[whichExtending]
startEndFrame <- startEndFrame[-whichExtending, ]
}
j = j + 1
}
allRepeats[[i]] <- startEndFrame
}
if (length(allRepeats) == 0) {
allRepeats <- list("NA" = data.frame(Start = NA, End = NA, Repeats = NA))
}
allRepeats <- enframe(allRepeats, name = "Motif") %>% unnest()
reducedRepeats <- allRepeats
i = 1
while ((i <= dim(reducedRepeats)[1]) & !is.na(reducedRepeats$Repeats[1])) {
whichWithin <- rep(FALSE, dim(reducedRepeats)[1])
whichWithin[-i] <- IRanges(reducedRepeats$Start[-i], reducedRepeats$End[-i]) %within% IRanges(reducedRepeats$Start[i], reducedRepeats$End[i])
reducedRepeats <- reducedRepeats[!whichWithin, ]
i = i + 1
}
lusOfMotifs <- reducedRepeats %>% group_by(Motif) %>% filter(Repeats == max(Repeats)) %>% ungroup()
lus <- which.max(lusOfMotifs$Repeats)
if (tolower(returnType) == "numeric") {
numeric_format <- lusOfMotifs$Repeats[lus]
return(numeric_format)
}
else if (tolower(returnType) == "string") {
string_format <- paste("[", lusOfMotifs$Motif[lus], "]", lusOfMotifs$Repeats[lus], sep="")
return(string_format)
}
else if (tolower(returnType) == "fulllist") {
return(reducedRepeats %>% arrange(Start))
}
else
stop(paste(returnType, "is not valid. Please use 'numeric', 'string', or 'fullList'."))
}
.cyclicRotation <- function(x, y) {
(nchar(y) == nchar(x)) && (grepl(y, strrep(x, 2), fixed = TRUE))
}
.loadRData <- function(fileName) {
load(fileName)
get(ls()[ls() != "fileName"])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.