###############################################################################
# functions to read the read counts
###############################################################################
BedToGenomicRanges <- function(panelBedFilepath,
ampliconColumn,
split = "_",
doReduce = TRUE,
rangeExtend = 0,
dropChromossomes = NA,
skip = 1) {
#load the bed file
segments <- read.table(panelBedFilepath,
sep = "\t",
as.is = TRUE,
skip = skip)
gr <- GRanges(segments[, 1], IRanges(segments[, 2], segments[, 3]))
#make sure that all regions are more than one read apart
GenomicRanges::start(gr) <- GenomicRanges::start(gr) - rangeExtend
GenomicRanges::end(gr) <- GenomicRanges::end(gr) + rangeExtend
if(doReduce) {
reducedGr <- GenomicRanges::reduce(gr, ignore.strand = TRUE)
#get the genes for each reduced range
hitsAnnotation <- GenomicRanges::findOverlaps(reducedGr, gr,
select = "first",
ignore.strand=TRUE)
#get the amplicon names from the segments
amplicons = segments[hitsAnnotation, ampliconColumn]
#replace the original gr with the reduced gr
gr <- reducedGr
} else{
amplicons = segments[ , ampliconColumn]
}
#extract the gene names from the amplicon names (text before the first split character)
splitted = strsplit(amplicons, split = split)
# Required because of package checking complaints
i <- NULL
genes = foreach(i = seq_along(splitted)) %do% {
splitted[[i]][1]
}
genes = unlist(genes)
# use elementMetadata() instead of the '$' accessor (the '$' caused problems in a downstream groovy script)
elementMetadata(gr)["geneNames"] = genes
elementMetadata(gr)["ampliconNames"] = paste(seqnames(gr),
start(gr),
end(gr),
sep = "_",
amplicons)
if (!anyNA(dropChromossomes)) {
gr <- dropSeqlevels(gr, dropChromossomes)
}
return(gr)
}
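# Usage sketch (not run): build the panel GRanges from a BED file.
# "panel.bed" and ampliconColumn = 4 are hypothetical; adjust them to the panel design.
# gr <- BedToGenomicRanges("panel.bed", ampliconColumn = 4, split = "_")
# head(elementMetadata(gr)$geneNames)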
#read the per-amplicon counts from the bam files you want to analyze
ReadCountsFromBam <- function(bamFilenames,
sampleNames,
gr,
ampliconNames,
minimumMappingQuality = 20,
removeDup = FALSE) {
if (missing(sampleNames)) {
sampleNames = bamFilenames
}
if (missing(ampliconNames)) {
ampliconNames = elementMetadata(gr)$ampliconNames
}
bamIndexFilepaths <- BamIndexFilepaths(bamFilenames)
# Because of package check complaints
i <- NULL
curbam = foreach(i = seq_along(bamFilenames), .combine = cbind) %do% {
if (!file.exists(bamIndexFilepaths[i])) {
message(paste("Bai file not found. Creating bai file for", bamFilenames[i], "..."))
IndexMultipleBams(bamFilenames)
}
message(paste("Reading counts for bam file: ", bamFilenames[i]))
countBamInGRanges(bamFilenames[i],
gr,
remove.dup = removeDup,
min.mapq = minimumMappingQuality,
get.width = TRUE)
}
listOfBamsHasOnlyOneElement <- length(bamFilenames) == 1
if (listOfBamsHasOnlyOneElement) {
curbam <- matrix(curbam)
}
colnames(curbam) = sampleNames
rownames(curbam) = ampliconNames
return(curbam)
}
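# Usage sketch (not run): count reads per amplicon for a set of BAM files.
# The BAM paths below are hypothetical; 'gr' is the GRanges returned by BedToGenomicRanges().
# bams <- c("sample1.bam", "sample2.bam")
# counts <- ReadCountsFromBam(bamFilenames = bams, sampleNames = bams, gr = gr, removeDup = FALSE)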
#normalize sample and reference counts together and return them separately
CombinedNormalizedCounts <- function (sampleCounts,
referenceCounts,
method = "tmm",
ampliconNames = NULL) {
sampleCounts <- as.matrix(sampleCounts)
referenceCounts <- as.matrix(referenceCounts)
allCounts <- cbind(sampleCounts, referenceCounts)
classes <- rep(c("samples", "reference"), c(ncol(sampleCounts),
ncol(referenceCounts)))
bamDataRangesNorm <- NormalizeCounts(allCounts, method = method)
# bamDataRangesNorm <- NormalizeCounts(allCounts)
if (!is.null(ampliconNames)) {
rownames(bamDataRangesNorm) = ampliconNames
}
normalizedSamples <- bamDataRangesNorm[, which(classes == "samples"), drop = FALSE]
normalizedReference <- bamDataRangesNorm[, which(classes == "reference"), drop = FALSE]
return(list(samples = normalizedSamples, reference = normalizedReference))
}
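# Usage sketch (not run): normalize sample and reference counts together, then split them again.
# 'sampleCounts' and 'referenceCounts' are hypothetical matrices from ReadCountsFromBam().
# normalized <- CombinedNormalizedCounts(sampleCounts, referenceCounts, method = "tmm")
# samplesNormalizedReadCounts <- normalized$samples
# referenceNormalizedReadCounts <- normalized$reference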
###############################################################################
# helper functions
###############################################################################
VerifiyIfOutputDirectoryExistsOrIsNotEmpty <- function(outputDir) {
outputDirectoryExists <- file.exists(outputDir)
outputDirectoryIsNotEmpty <- length(dir(outputDir, all.files=TRUE)) != 0
if (outputDirectoryExists || outputDirectoryIsNotEmpty) {
stop("Output directory already exists or is not empty. Please delete or change output directory")
}
}
tss <- function(allCounts) {
return(apply(allCounts, 2, function(x) {x / sum(x)}))
}
NormalizeCounts <- function(allCounts, method = "tmm") {
# TODO check if normalization method is supported
allowedNormalizationMethods <- c("tmm", # trimmed mean of M-values
"tss") # total sum scaling
tinyValueToAvoidZeroReadCounts <- 1e-05
bamDataRangesNorm <- NULL
if (method == "tmm") {
bamDataRangesNorm <- tmm(allCounts, refColumn = NULL, k = tinyValueToAvoidZeroReadCounts)
} else if (method == "tss") {
# bamDataRangesNorm <- apply(allCounts, 2, function(x) {x / sum(x)})
bamDataRangesNorm <- tss(allCounts)
} else {
message(paste("method", method, "not included in the allowed Normalization methods", allowedNormalizationMethods))
}
return(bamDataRangesNorm)
}
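# Minimal sketch (not run) of the two supported methods on a toy count matrix (values are made up):
# toyCounts <- matrix(c(100, 200, 300, 110, 190, 310), ncol = 2,
#                     dimnames = list(paste0("amplicon", 1:3), c("s1", "s2")))
# NormalizeCounts(toyCounts, method = "tmm")  # trimmed mean of M-values
# NormalizeCounts(toyCounts, method = "tss")  # each column divided by its sum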
#get the positions for each gene as a list
IndexGenesPositions = function(genes) {
positions = seq_along(genes)
genesPos = split(positions, genes)
return(genesPos)
}
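# Minimal sketch (not run): group amplicon positions by gene name.
# IndexGenesPositions(c("EGFR", "EGFR", "KRAS"))
# # returns list(EGFR = c(1, 2), KRAS = 3)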
# #how many numbers in a vector are above a given cutoff
# PercentAboveValue <- function(vector, value) {
# sum(vector > value)/length(vector)
# }
#
# #how many numbers in a vector are below a given cutoff
# PercentBelowValue <- function(vector, value) {
# sum(vector < value)/length(vector)
# }
BamIndexFilepaths <- function(bamFilepaths, indexType = ".bam.bai", ignoreCase = FALSE) {
return(gsub(pattern = ".bam$",
replacement = indexType,
x = bamFilepaths,
ignore.case = ignoreCase))
}
# check these params with Thomas
#index the bam files if there is no index yet
IndexMultipleBams <- function(bams,
index_type = ".bam.bai") {
#check which bam files are missing an index and need to be indexed
# potentialBaiFilenames <- gsub(".bam$",
# bams,
# replacement = index_type,
# ignore.case = TRUE)
potentialBaiFilenames <- BamIndexFilepaths(bams, indexType = index_type)
print(paste("potentialBaiFilenames", potentialBaiFilenames))
bamsToBeIndexed <- bams[!sapply(potentialBaiFilenames, file.exists)]
if(length(bamsToBeIndexed) > 0) {
#index the bams
#if(multicore) {
# check with Thomas if we can replace by this..
# bplapply(bamsToBeIndexed, indexBam)
#mclapply(bamsToBeIndexed, indexBam, mc.cores = ncores)
#} else {
lapply(bamsToBeIndexed, indexBam)
#}
}
}
WriteListToXLSX <- function(listOfDataFrames, multipleFiles = FALSE, outputFolder = file.path(getwd(), "xlsx"), filepath = "list.xlsx") {
if (multipleFiles) {
dir.create(outputFolder, recursive = TRUE, showWarnings = FALSE)
for(name in names(listOfDataFrames)){
message(paste0("Saving file to '", file.path(outputFolder, paste0(name, ".xlsx"))))
write.xlsx(listOfDataFrames[[name]], file = file.path(outputFolder, paste0(name, ".xlsx")), rowNames = TRUE)
}
} else {
message(paste0("Saving file to '", filepath, "'"))
write.xlsx(listOfDataFrames, file = filepath, rowNames = TRUE)
}
}
ReadXLSXToList <- function(filepath, rowNames = TRUE, colNames = TRUE) {
listOfDataFrames <- list()
for(name in getSheetNames(filepath)) {
listOfDataFrames[[name]] <- read.xlsx(filepath,
rowNames = rowNames,
colNames = colNames,
sheet = name)
}
return(listOfDataFrames)
}
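# Usage sketch (not run): round-trip a list of data frames through an xlsx file.
# The file path is hypothetical.
# WriteListToXLSX(reportTables, filepath = "reportTables.xlsx")
# reportTablesAgain <- ReadXLSXToList("reportTables.xlsx")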
###############################################################################
# functions for bootstrapping
###############################################################################
adjustedLength <- function(a) {
round(sqrt(length(a)))
}
ratiosMean <- function(ratios) {
return(exp(mean(log(ratios))))
}
#a function that randomly samples positions from a vector using subsampling
SubsamplingPositions <- function(pos, mtry = adjustedLength(pos)) {
return(sample(pos, mtry, replace = FALSE))
}
#calculates the mean from a vector given specific positions
PositionMean <- function(position, vector) {
return(ratiosMean(vector[position]))
}
#calculate the mean for each gene
GenePositionMean <- function(genesPos, vector) {
means = sapply(genesPos, PositionMean, vector = vector)
names(means) = names(genesPos)
return(means)
}
#calculates the mean from a vector given specific positions for a matrix
GeneMeanRatioMatrix <- function(genesPos, ratioMatrix) {
# Package check complaints
i <- NULL
ratioList = foreach(i = 1:ncol(ratioMatrix), .combine = cbind) %do% {
GenePositionMean(genesPos,ratioMatrix[, i])
}
ratioList <- as.matrix(ratioList)
colnames(ratioList) <- colnames(ratioMatrix)
return(ratioList)
}
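# Minimal sketch (not run): collapse an amplicon-level ratio matrix to gene level
# using the geometric mean (ratiosMean). The gene labels and ratio values are toy data.
# genesPos <- IndexGenesPositions(c("EGFR", "EGFR", "KRAS"))
# ratioMatrix <- matrix(c(2, 2.2, 1, 1.9, 2.1, 1.1), ncol = 2,
#                       dimnames = list(NULL, c("s1", "s2")))
# GeneMeanRatioMatrix(genesPos, ratioMatrix)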
# ReferenceWeights <- function(refmat, varianceFunction = sd) {
# variance = apply(refmat, 2, varianceFunction)
# weights = 1/variance
# return(weights)
# }
# split into a function generating the bootstrap distribution
# and a function averaging over amplicons.
# define the function to generate bootstrap distributions and
# return a list with the genewise bootstrapping
BootList <- function(geneNames, sampleMatrix, refmat, replicates) {
enforceDeterministicResult()
# get the genes positions in the matrix as a list from a gene name vector
genesPos <- IndexGenesPositions(geneNames)
if (!is.matrix(sampleMatrix)) {
stop(paste("Parameter 'sampleMatrix' has to be a matrix but is of class", paste(class(sampleMatrix), collapse = "/")))
}
if (is.null(colnames(sampleMatrix)) || length(colnames(sampleMatrix)) != ncol(sampleMatrix)) {
stop("All columns of 'sampleMatrix' have to be named")
}
# a vector and a matrix are not the same: iterating over columns makes no
# sense for a vector, so check whether a matrix or a vector was passed
# (ncol() only works for a matrix, not for a vector)
if (is.matrix(sampleMatrix)) {
iterator <- 1:ncol(sampleMatrix)
# } else {
# iterator <- 1
}
i <- NULL
j <- NULL
bootListSamples <- foreach(i = iterator) %:%
foreach(j = rep(1, replicates), .combine = rbind) %do% {
# a vector and a matrix are not the same: iterating over columns makes
# no sense for a vector, so check whether a matrix or a vector was
# passed.
if (is.matrix(sampleMatrix)) {
testSample <- sampleMatrix[, i]
# } else {
# testSample <- sampleMatrix
}
# for each gene subsample the amplicon positions independently
# sample the samples using bootstrapping
sampleBootPos <- sample(1:ncol(refmat),
ncol(refmat),
replace = TRUE)
geneBootPos <- c(lapply(genesPos,
SubsamplingPositions),
recursive = TRUE)
# given the obtained sampling using the bootstraps calculated
refMatPos <- refmat[geneBootPos, sampleBootPos, drop=FALSE]
bootRatio <- testSample[geneBootPos]/rowMeans(refMatPos)
# after the bootstrapping the gene positions in the vector
# changes so recalculate them
splitClass <- rep(names(genesPos), sapply(genesPos,
adjustedLength))
newGenesPos <- split(seq_along(splitClass), splitClass)
sapply(newGenesPos, PositionMean, vector = bootRatio)
}
names(bootListSamples) <- basename(colnames(sampleMatrix))
# In case a single step of bootstrap is done (for testing purposes),
# the elements of each sample will be converted to a matrix
for (i in seq_along(bootListSamples)) {
bootListSamples[[i]] <- matrix(bootListSamples[[i]], ncol = length(names(genesPos)))
colnames(bootListSamples[[i]]) <- names(genesPos)
}
return(bootListSamples)
}
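# Usage sketch (not run): genewise bootstrap distributions of sample/reference ratios.
# 'samplesNormalizedReadCounts' and 'referenceNormalizedReadCounts' come from
# CombinedNormalizedCounts(); 'geneNames' comes from the panel GRanges.
# replicates = 10000 is typical; use smaller values only for quick checks.
# bootList <- BootList(geneNames, samplesNormalizedReadCounts,
#                      referenceNormalizedReadCounts, replicates = 10000)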
ConsensusCheck <- function(genomicRangesFromBed, sampleMatrix, referenceMatrix) {
# This should not be needed here, but just in case the column names still
# contain full file paths, keep only the basenames
colnames(sampleMatrix) <- basename(colnames(sampleMatrix))
colnames(referenceMatrix) <- basename(colnames(referenceMatrix))
geneNames <- genomicRangesFromBed$geneNames
allSampleRatios <- data.frame()
for (smpl in colnames(sampleMatrix)) {
ampliconsRatio <- sampleMatrix[, smpl]/apply(referenceMatrix, 1, median)
allSampleRatios <- rbind(allSampleRatios, ampliconsRatio)
}
allSampleRatios <- t(allSampleRatios)
rownames(allSampleRatios) <- rownames(sampleMatrix)
colnames(allSampleRatios) <- colnames(sampleMatrix)
genesPositions <- IndexGenesPositions(geneNames)
geneConsensus <- data.frame()
for (smpl in colnames(allSampleRatios)) {
for (gene in names(genesPositions)) {
geneAmpliconsRatios <- allSampleRatios[,smpl][genesPositions[[gene]]]
ampliconConsensus <- all(geneAmpliconsRatios>1) | all(geneAmpliconsRatios<1)
geneConsensus[gene, smpl] <- ampliconConsensus
}
}
# TODO: if the reference has amplicons with 0 read counts, the ratio comparison yields NA instead of a boolean, so treat NA as FALSE
geneConsensus[is.na(geneConsensus)] <- FALSE
return(geneConsensus)
}
HaveMininumNumberOfAmplicons <- function(genomicRangesFromBed, numberOfAmplicons) {
geneNames <- genomicRangesFromBed$geneNames
genesPositions <- IndexGenesPositions(geneNames)
sapply(genesPositions, function(x) {length(x)>=numberOfAmplicons})
}
#calculate significance
CheckSignificance <- function(bootList, significanceLevel = 0.05) {
# Bonferroni correction
significanceLevel <- significanceLevel/ncol(bootList[[1]])
margin <- significanceLevel/2
# package check complains
i <- NULL
j <- NULL
sigTables = foreach(i = seq_along(bootList)) %:%
foreach(j = 1:ncol(bootList[[i]]), .combine = rbind) %do% {
bootRatioDistribution <- bootList[[i]][,j]
# meanNoise <- exp(mean(log(bootRatioDistribution)))
meanNoise <- ratiosMean(bootRatioDistribution)
lowerBound <- quantile(bootRatioDistribution, margin, type = 1)
upperBound <- quantile(bootRatioDistribution, 1 - margin, type = 1)
bounds <- c(lowerBound, meanNoise, upperBound)
roundedBounds <- round(bounds, digits = 2)
maybeAmplification <- roundedBounds[1] > 1
maybeDeletion <- roundedBounds[3] < 1
roundedSignificant <- maybeAmplification | maybeDeletion
copyNumberPutativeStatus <- "Normal"
if(maybeAmplification) {
copyNumberPutativeStatus <- "Amplification"
} else {
if (maybeDeletion) {
copyNumberPutativeStatus <- "Deletion"
}
}
sigTable <- c(roundedBounds, roundedSignificant, copyNumberPutativeStatus)
names(sigTable) <- c("lowerBound", "mean", "upperBound", "isSig", "putativeStatus")
sigTable
}
names(sigTables) <- names(bootList)
# name the genes in the list. All samples should share the same gene names
geneNames <- colnames(bootList[[1]])
for (i in seq_along(sigTables)) {
rownames(sigTables[[i]]) <- geneNames
}
return(sigTables)
}
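# Usage sketch (not run): Bonferroni-corrected bootstrap confidence bounds per gene.
# A gene is flagged as putative amplification (deletion) when its lower (upper)
# bound is above (below) 1.
# sigList <- CheckSignificance(bootList, significanceLevel = 0.05)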
#SignificantGenes <- function(sigList, genesPositionsIndex) {
# # package check complains
# i <- NULL
# sigGenes = foreach(i = seq_along(sigList)) %do% {
# names(genesPositionsIndex)[sigList[[i]][, "isSig"] == TRUE]
# }
# return(sigGenes)
#}
###############################################################################
# functions for background noise estimation
###############################################################################
NonSignificantGeneIndex <- function(sigList, genesPositionsIndex) {
# package check complains
i <- NULL
genePosNonSig = foreach(i = seq_along(sigList)) %do% {
sigGenes = names(genesPositionsIndex)[sigList[[i]][, "isSig"] == TRUE]
selectedIndex <- RemSigGenes(genesPositionsIndex, sigGenes)
if (length(selectedIndex) == 0) {
genesPositionsIndex
} else {
selectedIndex
}
}
names(genePosNonSig) <- names(sigList)
return(genePosNonSig)
}
AmplProbMultipeSamples <- function(genePosNonSig) {
# package check complains
i <- NULL
amplWeights = foreach(i = seq_along(genePosNonSig)) %do% {
AmplProb(genePosNonSig[[i]])
}
names(amplWeights) <- names(genePosNonSig)
return(amplWeights)
}
enforceDeterministicResult <- function() {
set.seed(1)
}
Background <- function(geneNames,
samplesNormalizedReadCounts,
referenceNormalizedReadCounts,
bootList,
replicates = 1000,
significanceLevel = 0.05,
robust = FALSE) {
enforceDeterministicResult()
#which genes showed significant changes
sigList <- CheckSignificance(bootList)
# gene index
genesPositionsIndex <- IndexGenesPositions(geneNames)
# calculate the reference mean
#refMean <- apply(referenceNormalizedReadCounts, 1, mean)
refMean <- rowMeans(referenceNormalizedReadCounts)
# calculate the ratio matrix for each sample
ratioMatrix <- RatioMatrix(samplesNormalizedReadCounts, refMean)
# remove the significant genes from the noise estimation
genesPosNonSig <- NonSignificantGeneIndex(sigList, genesPositionsIndex)
# calculate the weight for each amplicon of the non-significantly changed genes
amplWeights <- AmplProbMultipeSamples(genesPosNonSig)
uniqueAmpliconNumbers <- NumberOfUniqueAmplicons(genesPositionsIndex)
sampleIndex <- NULL
backgroundObject <- foreach(sampleIndex = seq_along(genesPosNonSig)) %do% {
message(paste0("Calculating Background for ", colnames(samplesNormalizedReadCounts)[sampleIndex]))
nonSigAmpliconRatios <- ratioMatrix[unlist(genesPosNonSig[[sampleIndex]]), sampleIndex]
IterateAmplNum(uniqueAmpliconNumbers,
nonSigAmpliconRatios,
replicates = replicates,
probs = amplWeights[[sampleIndex]],
significanceLevel = significanceLevel,
robust = robust)
}
return(backgroundObject)
}
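# Usage sketch (not run): estimate the residual background noise from the
# non-significant genes; the inputs are the same objects used for BootList().
# backgroundNoise <- Background(geneNames, samplesNormalizedReadCounts,
#                               referenceNormalizedReadCounts, bootList,
#                               replicates = 10000, significanceLevel = 0.05,
#                               robust = TRUE)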
roundDf <- function(x, digits) {
# round all numeric variables
# x: data frame
# digits: number of digits to round
# numeric_columns <- sapply(x, mode) == 'numeric'
numeric_columns <- sapply(x, is.numeric)
x[numeric_columns] <- round(x[numeric_columns], digits)
x
}
# Helper function to add column to reportTables
ReliableStatus <- function(putative, numberOfThresholdsPassed) {
status <- as.vector(putative)
# print(status)
if (numberOfThresholdsPassed < 2) {
status <- "Normal"
}
return(status)
}
ReportTables <- function(geneNames,
samplesNormalizedReadCounts,
referenceNormalizedReadCounts,
bootList,
backgroundNoise) {
# get the background noise in a format that can be used
# for a report table
backgroundReport <- BackgroundReport(backgroundNoise, geneNames)
# gene index
genesPositionsIndex <- IndexGenesPositions(geneNames)
# calculate the reference mean
#refMean <- apply(referenceNormalizedReadCounts, 1, mean)
refMean <- rowMeans(referenceNormalizedReadCounts)
# calculate the ratio matrix for each sample
ratioMatrix <- RatioMatrix(samplesNormalizedReadCounts, refMean)
# calculate the genewise ratio matrix from the ratio_mat
ratioMatGene <- GeneMeanRatioMatrix(genesPositionsIndex, ratioMatrix)
sigList <- CheckSignificance(bootList)
# because package generation complains..
i <- NULL
reportTables <- foreach(i = seq_along(backgroundReport)) %do% {
#isSig <- (sigList[[i]][, "upperBound"] < 1) | (sigList[[i]][, "lowerBound"] > 1)
putativeStatus <- sigList[[i]][, "putativeStatus"]
isSig <- putativeStatus != "Normal"
backgroundUp <- backgroundReport[[i]][, "UpperNoise"]
backgroundDown <- backgroundReport[[i]][, "LowerNoise"]
rMatGene <- ratioMatGene[, i]
# TODO is this correct?! why compare rMatGene > 1 ?
aboveNoise <- (rMatGene > 1 & sigList[[i]][, "lowerBound"] > backgroundUp) |
(rMatGene < 1 & sigList[[i]][, "upperBound"] < backgroundDown)
dfTemp <- data.frame(meanRatio = ratioMatGene[, i],
lowerBoundBootstrapRatio = sigList[[i]][, "lowerBound"],
meanBootstrapRatio = sigList[[i]][, "mean"],
upperBoundBootstrapRatio = sigList[[i]][, "upperBound"],
# lowerNoise = backgroundReport[[i]][, 1],
# meanNoise = backgroundReport[[i]][, 2],
# upperNoise = backgroundReport[[i]][, 3],
lowerNoise = backgroundReport[[i]][, "LowerNoise"],
meanNoise = backgroundReport[[i]][, "MeanNoise"],
upperNoise = backgroundReport[[i]][, "UpperNoise"],
significant = isSig,
aboveNoise = aboveNoise,
amplNum = as.vector(table(geneNames)),
putativeStatus = putativeStatus,
passed = isSig + aboveNoise)
significantDigits <- 2
dfTemp <- roundDf(dfTemp, significantDigits)
names(dfTemp) <- c("MeanRatio",
"LowerBoundBoot",
"MeanBoot",
"UpperBoundBoot",
"LowerNoise",
"MeanNoise",
"UpperNoise",
"Signif.",
"AboveNoise",
"Amplicons",
"PutativeStatus",
"Passed")
dfTemp
}
# TODO Check if this column added is correct
for(i in 1:length(reportTables)) {
tmpReportTable <- reportTables[[i]]
# tmpReportTable <- reportTables[[1]]
status <- mapply(ReliableStatus, tmpReportTable$PutativeStatus, tmpReportTable$Passed)
# print(i)
reportTables[[i]] <- cbind(reportTables[[i]], "ReliableStatus" = status)
}
# names(reportTables) <- colnames(samplesNormalizedReadCounts)
names(reportTables) <- names(bootList)
return(reportTables)
}
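# Usage sketch (not run): combine bootstrap bounds and background noise into one
# report table per sample.
# reportTables <- ReportTables(geneNames, samplesNormalizedReadCounts,
#                              referenceNormalizedReadCounts, bootList, backgroundNoise)
# reportTables[[1]][, c("MeanBoot", "Signif.", "AboveNoise", "ReliableStatus")]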
BackgroundReport <- function(background, geneNames) {
# because package generation complains..
i <- NULL
j <- NULL
backgroundReport = foreach(i = seq_along(background)) %:%
foreach(j = table(geneNames), .combine = rbind) %do% {
background[[i]][[as.character(j)]]
}
return(backgroundReport)
}
#remove genes that were considered significant by the algorithm
RemSigGenes <- function(genesPos, sigGenes) {
#genes that were reported with a significantly different read count
#should not be used for background testing
#which genes are in the genes pos object
geneNames= names(genesPos)
#define a function for not in
`%ni%` = Negate(`%in%`)
#which of these genes did not show a significant read count change
nonSigGenes = which(geneNames %ni% sigGenes)
#keep only the genes that did not show a significant read count change
nonSigGenesPos = genesPos[nonSigGenes]
return(nonSigGenesPos)
}
AmplProb <- function(genesPos) {
#how many amplicons were used for each of the genes
geneCounts = countAmplicons(genesPos)
#adjust the probability depending on the number of amplicons for each gene
genePerc = 1/geneCounts
#this information has to be available for each amplicon position
ampliconProb = rep(genePerc, geneCounts)
return(ampliconProb)
}
RatioMatrix <- function(sampleMat, refMean) {
apply(sampleMat, 2, `/`, refMean)
}
SampleRatio <- function(ratios, numAmpl, amplWeights = NULL) {
replace <- FALSE
if(numAmpl >= length(ratios)) {
replace <- TRUE
}
randomPos = sample(seq_along(ratios),
numAmpl,
prob = amplWeights,
replace = replace)
randomlySelectedRatios = ratios[randomPos]
# randomMean = exp(mean(log(randomlySelectedRatios)))
randomMean <- ratiosMean(randomlySelectedRatios)
return(randomMean)
}
DescriptiveStatistics <- function(distribution, robust = FALSE) {
# print(paste("Robust is ", robust))
if (robust) {
centralTendency <- median(distribution)
variability <- mad(distribution, constant = 1) # median absolute deviation
} else {
centralTendency <- mean(distribution)
variability <- sd(distribution)
}
return(list(centralTendency = centralTendency,
variability = variability))
}
SampleNoiseGenes <- function(numAmpl = 2,
ratios,
replicates = 100,
probs = NULL,
significanceLevel = 0.05,
robust = FALSE) {
margin <- significanceLevel / 2
# now repeat the sampling for the selected number of
# amplicons replicates
sampleNoiseDistribution = replicate(replicates,
SampleRatio(ratios,
numAmpl,
amplWeights = probs))
#get the upper, mean and lower values of the noise distribution
logSampleNoiseDistribution <- log(sampleNoiseDistribution)
ds <- DescriptiveStatistics(logSampleNoiseDistribution, robust)
logMeanNoise <- ds$centralTendency
logSdNoise <- ds$variability
# if (robust) {
# logMeanNoise <- median(logSampleNoiseDistribution)
# logSdNoise <- mad(logSampleNoiseDistribution, constant = 1)
# } else {
# logMeanNoise <- mean(logSampleNoiseDistribution)
# logSdNoise <- sd(logSampleNoiseDistribution)
# }
lowerBound <- exp(logMeanNoise + qnorm(margin) * logSdNoise)
meanNoise <- exp(logMeanNoise)
upperBound <- exp(logMeanNoise + qnorm(1 - margin) * logSdNoise)
sampledNoise = c(lowerBound, meanNoise, upperBound)
names(sampledNoise) = paste0(c("Lower", "Mean", "Upper"), "Noise")
return(sampledNoise)
}
countAmplicons <- function(x) {
return(sapply(x, NROW))
}
# Returns a list with the unique number of amplicons for all genes
NumberOfUniqueAmplicons <- function(genesPos) {
# calcUniqueAmpliconNumbers <-function(genesPos) {
ampliconNumbers = countAmplicons(genesPos)
uniqueAmpliconNumbers = sort(unique(ampliconNumbers))
return(uniqueAmpliconNumbers)
}
#now calculate the background for each of the unique amplicon numbers
# ratios: the amplicon ratios (sample vs. reference) used to sample the noise distribution
IterateAmplNum <- function(uniqueAmpliconNumbers,
ratios,
replicates = 100,
probs = NULL,
significanceLevel = 0.05,
robust = FALSE) {
# Needed because of package check complaints..
i <- NULL
noiseResults = foreach(i = seq_along(uniqueAmpliconNumbers)) %do% {
sampledNoise = SampleNoiseGenes(uniqueAmpliconNumbers[i],
ratios = ratios,
replicates = replicates,
probs = probs,
significanceLevel = significanceLevel,
robust = robust)
sampledNoise
}
names(noiseResults) = as.character(uniqueAmpliconNumbers)
return(noiseResults)
}
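# Minimal sketch (not run): noise bounds for genes with 2 and 3 amplicons from a
# toy ratio vector (values are made up).
# toyRatios <- c(1.1, 0.9, 1.05, 0.95, 1.2, 0.8)
# IterateAmplNum(c(2, 3), toyRatios, replicates = 100)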
SelectReferenceSetByPercentil <- function(allSamplesReadCounts,
normalizationMethod = "tmm",
lowerBoundPercentage = 1,
upperBoundPercentage = 99) {
allSamplesReadCountsNormalized <-NormalizeCounts(allSamplesReadCounts,
method = normalizationMethod)
selectedSamplesFilepath <- NULL
allSamples <- round(allSamplesReadCountsNormalized)
samplesColnames <- colnames(allSamples)
selectedRange <- list()
for (i in 1:nrow(allSamples)) {
# selectedRange[[i]] <- quantile(allSamples[i,])
selectedRange[[i]] <- quantile(allSamples[i,], c(lowerBoundPercentage, upperBoundPercentage) / 100)
}
selectedSamplesIndex <- NULL
excludedSamplesIndex <- NULL
for (j in 1:ncol(allSamples)) {
addSample <- TRUE
for(i in 1:nrow(allSamples)) {
if ((allSamples[i, j] < selectedRange[[i]][1]) | (allSamples[i, j] > selectedRange[[i]][2])) {
addSample <- FALSE
# print(paste("Add sample?", addSample, i, selectedRange[[i]][1], allSamples[i, j], selectedRange[[i]][2]))
break
}
}
if (addSample) {
selectedSamplesIndex <- cbind(selectedSamplesIndex, j)
} else {
excludedSamplesIndex <- cbind(excludedSamplesIndex, j)
}
}
selectedSamplesFilepath <- colnames(allSamples)[selectedSamplesIndex]
return(selectedSamplesFilepath)
}
SelectReferenceSetByInterquartileRange <- function(allSamplesReadCounts, normalizationMethod = "tmm", iqrFactor = 1) {
allSamplesReadCounts <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod)
fivenumReads <- t(apply(allSamplesReadCounts, 1, fivenum))
colnames(fivenumReads) <- c("SampleMinimum", "LowerQuartile", "Median", "UpperQuartile", "SampleMaximum")
iqr <- fivenumReads[, "UpperQuartile"] - fivenumReads[, "LowerQuartile"]
dispersion <- iqrFactor * iqr
sampleIndexes <- NULL
for (i in 1:ncol(allSamplesReadCounts)) {
if (all((allSamplesReadCounts[, i] >= (fivenumReads[,"LowerQuartile"] - dispersion )) & ((dispersion + fivenumReads[,"UpperQuartile"]) >= allSamplesReadCounts[, i]))) {
sampleIndexes <- c(sampleIndexes, i)
}
}
return(colnames(allSamplesReadCounts)[sampleIndexes])
}
SelectReferenceSetByKmeans <- function(allSamplesReadCounts, normalizationMethod = "tmm", referenceNumberOfElements) {
# referenceSamples <- function(referenceNumberOfElements, allSamples) {
allSamples <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod)
# allSamples <- allSamplesReadCounts
numberOfSamples <- ncol(allSamples)
numberOfOutliers <- numberOfSamples - referenceNumberOfElements
if (numberOfOutliers < 0) {
message(paste("Number of samples", numberOfSamples, "is smaller than the size of the reference set requested", referenceNumberOfElements, ", so all samples will be selected"))
return(colnames(allSamples))
} else if (numberOfOutliers == 0) {
message(paste("Number of samples", numberOfSamples, "is equal to the size of the reference set requested", referenceNumberOfElements))
return(colnames(allSamples))
} else {
allSamples <- t(allSamples)
kmeans.result <- kmeans(allSamples, centers=1)
# "centers" is a data frame with one center
centers <- kmeans.result$centers[kmeans.result$cluster, ]
distances <- sqrt(rowSums((allSamples - centers)^2))
outliers <- order(distances, decreasing=TRUE)[1:numberOfOutliers]
# these rows are top outliers
message(cat("outliers:", colnames(allSamplesReadCounts)[outliers]))
allSamples <- t(allSamples)
# return(colnames(allSamples[, -outliers, drop = FALSE]))
return(colnames(allSamples)[-outliers])
}
}
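# Usage sketch (not run): the three reference selection strategies applied to a
# read count matrix with samples in columns ('allSamplesReadCounts' is hypothetical).
# All of them return the column names of the selected reference samples.
# SelectReferenceSetByPercentil(allSamplesReadCounts, lowerBoundPercentage = 1, upperBoundPercentage = 99)
# SelectReferenceSetByInterquartileRange(allSamplesReadCounts, iqrFactor = 1)
# SelectReferenceSetByKmeans(allSamplesReadCounts, referenceNumberOfElements = 10)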
SelectReferenceSet <- function(samplesFilepaths,
genomicRanges,
removePcrDuplicates = FALSE,
normalizationMethod = "tmm",
referenceMaximumNumberOfElements = 10,
referenceSelectionMethod = "percentil",
lowerBoundPercentage = 1,
upperBoundPercentage = 99) {
# samplesFilepaths <- allSamplesFilteredFilepaths[1:3]
# genomicRanges <- genomicRangesFromBed
allSamplesReadCounts <- ReadCountsFromBam(bamFilenames = samplesFilepaths,
sampleNames = samplesFilepaths,
gr = genomicRanges,
# ampliconNames = ampliconNames,
removeDup = removePcrDuplicates)
selectedSamplesFilepath <- SelectReferenceSetFromReadCounts(allSamplesReadCounts,
# genomicRanges,
# removePcrDuplicates = removePcrDuplicates,
normalizationMethod = normalizationMethod,
referenceMaximumNumberOfElements = referenceMaximumNumberOfElements,
referenceSelectionMethod = referenceSelectionMethod,
lowerBoundPercentage = lowerBoundPercentage,
upperBoundPercentage = upperBoundPercentage)
return(selectedSamplesFilepath)
}
SelectReferenceSetFromReadCounts <- function(allSamplesReadCounts,
# genomicRanges,
# removePcrDuplicates = FALSE,
normalizationMethod = "tmm",
referenceMaximumNumberOfElements = 30,
# referenceSelectionMethod = "percentil",
referenceSelectionMethod = "kmeans",
lowerBoundPercentage = 1,
upperBoundPercentage = 99) {
selectedSamplesFilepath <- NULL
if (referenceSelectionMethod == "percentil") {
message("Selecting reference by percentil")
selectedSamplesFilepath <- SelectReferenceSetByPercentil(allSamplesReadCounts,
normalizationMethod = normalizationMethod,
# referenceSelectionMethod = referenceSelectionMethod,
lowerBoundPercentage = lowerBoundPercentage,
upperBoundPercentage = upperBoundPercentage)
} else if (referenceSelectionMethod == "kmeans") {
message("Selecting reference by kmeans")
selectedSamplesFilepath <- SelectReferenceSetByKmeans(allSamplesReadCounts,
normalizationMethod = normalizationMethod,
referenceMaximumNumberOfElements)
} else {
message(paste("method", referenceSelectionMethod, "not supported"))
}
return(selectedSamplesFilepath)
}
CollectColumnFromAllReportTables <- function(reportTables, columnName) {
compilation <- NULL
mySampleNames <- names(reportTables)
for(name in mySampleNames) {
tmpReportTable <- reportTables[[name]]
compilation <- cbind(compilation, as.vector(tmpReportTable[, columnName]))
}
colnames(compilation) <- mySampleNames
rownames(compilation) <- rownames(reportTables[[1]])
return(compilation)
}
revalueDF <- function(myDataFrame, mappings) {
myDataFrameColnames <- colnames(myDataFrame)
for(columnName in myDataFrameColnames) {
myDataFrame[, columnName] <- revalue(myDataFrame[, columnName], mappings)
}
mode(myDataFrame) <- "numeric"
return(myDataFrame)
}
## TODO check this function
# ReliableAberrationStatusHeatMap <- function(meanBootResults,
# realiableResults,
# filepath = NULL) {
# meanBootResults <- melanomaMeanBootResults
# reliableResults <- melanomaCNVPanelizerResults
# test_that("testing correct dimensions", {
# expect_equal(colnames(meanBootResults), colnames(reliableResults))
# expect_equal(rownames(meanBootResults), rownames(reliableResults))
# })
#
# kNormalDefaultValue <- 1.0
# kAmplificationMaximalValue <- 4.0
# kAmplificationMinimumValue <- 2.0
#
# for(i in rownames(reliableResults)) {
# for(j in colnames(reliableResults)) {
# if (reliableResults[i,j] == "Normal") {
# # print(paste("Normal", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as", kNormalDefaultValue))
# meanBootResults[i,j] <- kNormalDefaultValue
# }
#
# if ((reliableResults[i,j] == "Amplification") & (meanBootResults[i,j] > kAmplificationMaximalValue)) {
# print(paste("Too high ", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as ", kAmplificationMaximalValue))
# meanBootResults[i,j] <- kAmplificationMaximalValue
# }
#
# if ((reliableResults[i,j] == "Amplification") & (meanBootResults[i,j] < kAmplificationMinimumValue)) {
# print(paste("Amplification too low", i, reliableResults[i,j], "setting ", meanBootResults[i,j],"as", kNormalDefaultValue))
# meanBootResults[i,j] <- kNormalDefaultValue
# }
# }
# }
#
# df.team_data <- melt(meanBootResults)
# colnames(df.team_data) <- c("Gene", "Sample", "Bootratio")
# g <- ggplot(data = df.team_data,
# aes(x = Gene, y = Sample)) +
# # geom_tile(aes(fill = scale(Bootratio))) +
# geom_tile(aes(fill = log(Bootratio))) +
# # + opts(theme(axis.text.x = element_text(angle = 180, hjust = 1)))
# # scale_fill_gradient2(low="darkblue", high="darkgreen", guide="colorbar") +
# # scale_fill_gradient2(low="green4", high="red4", guide="colorbar") +
# scale_fill_gradient2(low = "darkgreen", mid = "white", high = "darkred") +
# theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
# theme(axis.text.y = element_text(size=5))
# if (!missing(filepath)) {
# ggsave(filepath)
# }
# }
# default colors for each copy number status:
StatusHeatmap <- function(dfData,
statusColors = c("Deletion" = "blue",
"Normal" = "green",
# " " = "green",
"Amplification" = "red"),
header = "Status Heatmap",
filepath = "CNVPanelizerHeatMap.png") {
mappings <- seq(length(statusColors))
# mappings <- c(-1, 0, 1)
names(mappings) <- names(statusColors)
matrixData <- revalueDF(dfData, mappings)
valuesUsed <- table(matrixData)
indexOfColorsUsed <- as.numeric(names(valuesUsed))
colorsRequired <- unname(statusColors[indexOfColorsUsed])
# TODO find out why this hack is needed when only one status is available
if (length(colorsRequired) <= 1) {
colorsRequired <- c("green", "yellow")
}
# colorsRequired <- as.vector(statusColors[as.numeric(names(table(matrixData)))])
png(filename = file.path(filepath), # create PNG for the heat map
width = 5*300, # 5 x 300 pixels
height = 5*300,
res = 300, # 300 pixels per inch
pointsize = 8) # smaller font size
lmat <- rbind( c(5,3,4), c(2,1,4) )
lhei <- c(1.5, 4)
lhei <- c(1, 10)
lwid <- c(1.5, 4, 0.75)
myCexRow <- 0.2
heatmap.2(matrixData,
main = header,
# plot layout
lmat = lmat,
lhei = lhei,
lwid = lwid,
# scale = "row",
# ColSideColors=heatColors,
key = FALSE, # whether a color-key should be shown
# key = TRUE, # whether a color-key should be shown
# key.par=list(match=c(0),
# amplification_deletion=c(1, 2, 3, 4),
# deletion_amplification=c(5, 6)),
dendrogram = "none",
tracecol=NA,
# density.info=c("histogram","density","none"),
# col = unname(statusColors),
# col = c("green"),
col = colorsRequired, # if the values are not all revalued the colors have to be passed selectively
# col = heatColors,
# col = heatColors[matchPoints],
colsep=1:ncol(matrixData),
rowsep=1:nrow(matrixData),
margins = c(15, 5),
Rowv=FALSE,
Colv=FALSE,
# cexRow=myCexRow
)
# legend("topleft", inset=.02, title= "UOUOU",
# "this is legend",
# # fill=topo.colors(3),
# # fill=unname(statusColors),
# horiz=TRUE, cex=0.8)
#bottomleft
# legend("bottom", inset=.01, title="CNV Status",
#
#
# legend("left", inset=.01, title="CNV Status",
# # legend("left", inset=0, title="CNV Status",
# names(statusColors),
# # fill=topo.colors(3),
# fill=unname(statusColors),
# cex=0.8)
# # horiz=TRUE, cex=0.8)
#
# legend("bottomleft", legend=c("Line 1", "Line 2"),
# col=c("red", "blue"), lty=1:2, cex=0.8)
# legend(1, 95, legend=c("Line 1", "Line 2"),
# col=c("red", "blue"), lty=1:2, cex=0.8)
# if plotting fails, dev.off() may need to be called repeatedly until it returns the null device instead of png
dev.off()
}
## TODO check
# StatusStability <- function(geneNames, sampleNormalizedReadCounts, tmpReferenceNormalizedReadCounts) {
# centralTendency <- matrix(rowMeans(tmpReferenceNormalizedReadCounts), ncol = 1)
# rownames(centralTendency) <- rownames(tmpReferenceNormalizedReadCounts)
# referenceAndSampleDifferences <- centralTendency - sampleNormalizedReadCounts
# rownames(referenceAndSampleDifferences) <- rownames(tmpReferenceNormalizedReadCounts)
# colnames(referenceAndSampleDifferences) <- "difference"
# uniqueGeneNames <- unique(geneNames)
# geneStabilityStatus <- c()
# for (i in seq(uniqueGeneNames)) {
# geneAmpliconIndexes <- grep(uniqueGeneNames[i], rownames(myReference))
# geneAmpliconStatus <- referenceAndSampleDifferences[geneAmpliconIndexes, ]
# allGeneAmpliconsHaveSameStatus <- length(unique(geneAmpliconStatus >= 0))==1
# geneStabilityStatus <- c(geneStabilityStatus, allGeneAmpliconsHaveSameStatus)
# }
# names(geneStabilityStatus) <- uniqueGeneNames
# return(geneStabilityStatus)
# }
## TODO CHECK
# StatusStabilityTable <- function(geneNames, tmpSamplesNormalizedReadCounts, tmpReferenceNormalizedReadCounts) {
# statusStabilityTable <- NULL
# for (i in seq(ncol(tmpSamplesNormalizedReadCounts))) {
# statusStabilityTable <- cbind(statusStabilityTable, StatusStability(geneNames, matrix(tmpSamplesNormalizedReadCounts[, i], ncol = 1), tmpReferenceNormalizedReadCounts))
# }
# colnames(statusStabilityTable) <- colnames(tmpSamplesNormalizedReadCounts)
# # statusStabilityTable <- t(statusStabilityTable)
# # colnames(statusStabilityTable) <- colnames(tmpSamplesNormalizedReadCounts)
# return(statusStabilityTable)
# }
PlotBootstrapDistributions <- function(bootList,
reportTables,
outputFolder = getwd(),
sampleNames = NULL,
save = FALSE,
# scale = 7) {
scale = 10) {
selSample <- NULL
plotList <- foreach(selSample = seq_along(bootList)) %do% {
test <- as.factor(reportTables[[selSample]][, "Passed"])
levelLabels <- c("NoChange", "NonReliableChange", "ReliableChange")
names(levelLabels) <- c(0, 1, 2)
test <- suppressMessages(revalue(test, levelLabels))
namedColors <- c("#56B4E9", "#CC79A7" ,"#D55E00")
names(namedColors) <- levelLabels
ratios <- NULL
testsPassed <- NULL
df <- data.frame(class = as.factor(colnames(bootList[[selSample]])),
ratios = (as.vector(t(bootList[[selSample]]))),
testsPassed = test)
#if a genomic ranges object has been
#if(!is.null(gr)) {
# newGeneOrder <- bedToGeneOrder(gr)
# df$class = with(df,factor(class,levels(class)[newGeneOrder]))
#}
ylim1 <- boxplot.stats((df$ratios))$stats[c(1, 5)]
ylim1 <- c(max(ylim1),max(ylim1))
ylim1 <- log2(ylim1)
ylim1 <- ylim1 * c(-scale,scale)
if(is.null(sampleNames)) {
filename <- names(bootList[selSample])
filepath <- paste0(outputFolder, "/", filename, "_plot.pdf")
} else {
filename <- sampleNames[selSample]
filepath <- paste0(outputFolder, "/", filename, ".pdf")
}
bootPlot <- ggplot(df, aes(x = class, y = log2(ratios), fill = testsPassed)) +
geom_violin() + ggtitle(filename) +
theme(plot.title = element_text(lineheight = 0.8, face = "bold"),
text = element_text(size = 15),
axis.text.x = element_text(angle = 90)) +
scale_fill_manual(name = "CNV Reliability",
values = namedColors) +
coord_cartesian(ylim = ylim1) +
theme(axis.text=element_text(size=10),
axis.title=element_text(size=20,face="bold")) +
scale_x_discrete("Gene Names") +
# geom_hline(yintercept=log2(1.5), color="#009E73") +
# geom_hline(yintercept=log2(0.5), color="#009E73") +
geom_hline(yintercept=0, color="#009E73")
if(save == TRUE) {
message(paste0("Saving plot to '", filepath, "'"))
# dir.create(outputFolder, recursive = TRUE, showWarnings = TRUE)
dir.create(outputFolder, recursive = TRUE, showWarnings = FALSE)
ggsave(filename = filepath,
plot = bootPlot,
height = 7.42 * 1,
width = 8.11 * 2,
limitsize = FALSE)
}
# dev.off()
return(bootPlot)
}
names(plotList) <- names(reportTables)
return(plotList)
}
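# Usage sketch (not run): violin plots of the log2 bootstrap ratios per gene,
# colored by how many tests (significance, above background noise) were passed.
# The output folder is hypothetical; with save = TRUE one PDF per sample is written.
# plots <- PlotBootstrapDistributions(bootList, reportTables,
#                                     outputFolder = "CNVPanelizer/plots", save = TRUE)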
CNVPanelizerFromReadCountsHELPER <- function(sampleReadCounts,
referenceReadCounts,
genomicRangesFromBed,
numberOfBootstrapReplicates = 10000,
normalizationMethod = "tmm",
robust = TRUE,
backgroundSignificanceLevel = 0.05,
outputDir = file.path(getwd(), "CNVPanelizer"),
splitSize = 5) {
myCNVPanelizerResults <- list()
splits <- split(1:ncol(sampleReadCounts), ceiling(seq_along(1:ncol(sampleReadCounts))/splitSize))
# for (i in 1:ncol(sampleReadCounts)) {
for (i in 1:length(splits)) {
myCNVPanelizerResults[[i]] <- CNVPanelizerFromReadCounts(sampleReadCounts[, splits[[i]], drop = FALSE],
referenceReadCounts,
genomicRangesFromBed,
numberOfBootstrapReplicates = numberOfBootstrapReplicates,
normalizationMethod = normalizationMethod,
robust = robust,
backgroundSignificanceLevel = backgroundSignificanceLevel,
outputDir = outputDir)
}
myCNVPanelizerResultsCompiled <- myCNVPanelizerResults[[1]]
if(length(myCNVPanelizerResults) > 1) {
for (i in 2:length(myCNVPanelizerResults)) {
myCNVPanelizerResultsCompiled$sampleReadCounts <- cbind(myCNVPanelizerResultsCompiled$sampleReadCounts, myCNVPanelizerResults[[i]]$sampleReadCounts)
myCNVPanelizerResultsCompiled$bootList <- append(myCNVPanelizerResultsCompiled$bootList, myCNVPanelizerResults[[i]]$bootList)
myCNVPanelizerResultsCompiled$backgroundNoise <- append(myCNVPanelizerResultsCompiled$backgroundNoise, myCNVPanelizerResults[[i]]$backgroundNoise)
myCNVPanelizerResultsCompiled$reportTables <- append(myCNVPanelizerResultsCompiled$reportTables, myCNVPanelizerResults[[i]]$reportTables)
}
}
return(myCNVPanelizerResultsCompiled)
}
######################################################################################
# still being tested..
######################################################################################
# CNVPanelizerFromReadCountsHelperParallel <- function(sampleReadCounts,
# referenceReadCounts,
# genomicRangesFromBed,
# numberOfBootstrapReplicates = 10000,
# normalizationMethod = "tmm",
# robust = TRUE,
# backgroundSignificanceLevel = 0.05,
# outputDir = file.path(getwd(), "CNVPanelizer"),
# splitSize = 5) {
#
# myCNVPanelizerResults <- list()
#
# splits <- split(1:ncol(sampleReadCounts), ceiling(seq_along(1:ncol(sampleReadCounts))/splitSize))
#
# # Use the detectCores() function to find the number of cores in system
# no_cores <- detectCores()
# # Setup cluster
# clust <- makeCluster(no_cores - 1)
# clusterExport(clust,
# varlist = c(
# "sampleReadCounts",
# "referenceReadCounts",
# "genomicRangesFromBed",
# "numberOfBootstrapReplicates",
# "normalizationMethod",
# "robust",
# "backgroundSignificanceLevel",
# "outputDir",
# "CNVPanelizerFromReadCounts"),
# envir = environment())
#
# myCNVPanelizerResults <- parLapply(clust, 1:length(splits), function(i) {
# CNVPanelizerFromReadCounts(sampleReadCounts[, splits[[i]]],
# referenceReadCounts,
# genomicRangesFromBed,
# numberOfBootstrapReplicates = numberOfBootstrapReplicates,
# normalizationMethod = normalizationMethod,
# robust = robust,
# backgroundSignificanceLevel = backgroundSignificanceLevel,
# outputDir = outputDir)
# })
# stopCluster(clust)
#
# n <- length(myCNVPanelizerResults)
# res <- NULL
# for (i in seq(n)) {
# res <- cbind(res, myCNVPanelizerResults[[i]])
# }
#
# myCNVPanelizerResultsCompiled <- myCNVPanelizerResults[[1]]
#
# if(length(myCNVPanelizerResults) > 1) {
# for (i in 2:length(myCNVPanelizerResults)) {
# myCNVPanelizerResultsCompiled$sampleReadCounts <- cbind(myCNVPanelizerResultsCompiled$sampleReadCounts, myCNVPanelizerResults[[i]]$sampleReadCounts)
# myCNVPanelizerResultsCompiled$bootList <- append(myCNVPanelizerResultsCompiled$bootList, myCNVPanelizerResults[[i]]$bootList)
# myCNVPanelizerResultsCompiled$backgroundNoise <- append(myCNVPanelizerResultsCompiled$backgroundNoise, myCNVPanelizerResults[[i]]$backgroundNoise)
# myCNVPanelizerResultsCompiled$reportTables <- append(myCNVPanelizerResultsCompiled$reportTables, myCNVPanelizerResults[[i]]$reportTables)
# }
# }
# return(myCNVPanelizerResultsCompiled)
# }
CNVPanelizerFromReadCounts <- function(sampleReadCounts,
referenceReadCounts,
genomicRangesFromBed,
numberOfBootstrapReplicates = 10000,
normalizationMethod = "tmm",
robust = TRUE,
backgroundSignificanceLevel = 0.05,
outputDir = file.path(getwd(), "CNVPanelizer")) {
# VerifiyIfOutputDirectoryExistsOrIsNotEmpty(outputDir)
metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed)
geneNames = metadataFromGenomicRanges["geneNames"][, 1]
#
# # metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed)
# # geneNames = metadataFromGenomicRanges["geneNames"][, 1]
# # ampliconNames = metadataFromGenomicRanges["ampliconNames"][, 1]
#
# sampleReadCounts <- ReadCountsFromBam(bamFilenames = sampleBamFilepaths,
# sampleNames = sampleBamFilepaths,
# gr = genomicRangesFromBed,
# removeDup = FALSE)
normalizedReadCounts <- CombinedNormalizedCounts(sampleReadCounts,
referenceReadCounts,
method = normalizationMethod
# , ampliconNames = ampliconNames
)
# After normalization the data sets need to be split again to perform the bootstrap
samplesNormalizedReadCounts = normalizedReadCounts["samples"][[1]]
referenceNormalizedReadCounts = normalizedReadCounts["reference"][[1]]
bootList <- BootList(geneNames,
samplesNormalizedReadCounts,
referenceNormalizedReadCounts,
replicates = numberOfBootstrapReplicates)
# Estimate the background noise left after normalization
backgroundNoise <- Background(geneNames,
samplesNormalizedReadCounts,
referenceNormalizedReadCounts,
bootList,
replicates = numberOfBootstrapReplicates,
significanceLevel = backgroundSignificanceLevel,
robust = robust)
# Build report tables
reportTables <- ReportTables(geneNames,
samplesNormalizedReadCounts,
referenceNormalizedReadCounts,
bootList,
backgroundNoise)
plots = PlotBootstrapDistributions(bootList,
reportTables,
sampleNames = names(reportTables),
outputFolder = file.path(outputDir, "plots"),
save = TRUE)
WriteListToXLSX(reportTables,
multipleFiles = TRUE,
outputFolder = file.path(outputDir, "xlsx"))
# WriteListToXLSX(reportTables, outputFolder = "", filepath = "list.xlsx") {
# write.xlsx(listOfDataFrames, , rowNames = TRUE)
# }
#
#
#
#
# WriteListToXLSX(tmp1@reportTables, filepath = file.path(outputDir, "reportTables.xlsx"))
# WriteListToXLSX(tmp1@reportTables, filepath = file.path(getwd(), "CNVPanelizer", "reportTables.xlsx"))
# CNVPanelizerResults <- setClass("CNVPanelizerResults",
# # setClass("CNVPanelizerResults",
#
# # use this function to extract the slot names of the slotNames(results_CNVPanelizer[[1]])
# slots = c(sampleReadCounts="matrix",
# referenceReadColunts="matrix",
# genomicRangesFromBed="GRanges",
# bootList="list",
# backgroundNoise="list",
# plots = "list",
# reportTables="list"))
#
# myCNVPanelizerResults <- CNVPanelizerResults(sampleReadCounts = sampleReadCounts,
# referenceReadColunts = referenceReadCounts,
# genomicRangesFromBed = genomicRangesFromBed,
# bootList = bootList,
# backgroundNoise = backgroundNoise,
# plots = plots,
# reportTables = reportTables)
myCNVPanelizerResults <- list(sampleReadCounts = sampleReadCounts,
referenceReadCounts = referenceReadCounts,
genomicRangesFromBed = genomicRangesFromBed,
bootList = bootList,
backgroundNoise = backgroundNoise,
plots = plots,
reportTables = reportTables)
return(myCNVPanelizerResults)
}
CNVPanelizer <- function(sampleBamFilepaths,
referenceBamFilepaths,
bedFilepath,
amplColumnNumber = 6,
minimumMappingQuality = 20,
numberOfBootstrapReplicates = 10000,
removePcrDuplicates = TRUE,
# analysisMode = "gene", # analysisMode can be "gene" or "amplicon"
robust = TRUE,
backgroundSignificanceLevel = 0.05,
outputDir = file.path(getwd(), "CNVPanelizer")) {
# VerifiyIfOutputDirectoryExistsOrIsNotEmpty(outputDir)
genomicRangesFromBed <- BedToGenomicRanges(bedFilepath,
ampliconColumn = amplColumnNumber,
split = "_")
# metadataFromGenomicRanges <- elementMetadata(genomicRangesFromBed)
# geneNames = metadataFromGenomicRanges["geneNames"][, 1]
# ampliconNames = metadataFromGenomicRanges["ampliconNames"][, 1]
sampleReadCounts <- ReadCountsFromBam(bamFilenames = sampleBamFilepaths,
sampleNames = sampleBamFilepaths,
gr = genomicRangesFromBed,
minimumMappingQuality = minimumMappingQuality,
removeDup = removePcrDuplicates)
referenceReadCounts <- ReadCountsFromBam(bamFilenames = referenceBamFilepaths,
sampleNames = referenceBamFilepaths,
gr = genomicRangesFromBed,
minimumMappingQuality = minimumMappingQuality,
removeDup = removePcrDuplicates)
results <- CNVPanelizerFromReadCounts(sampleReadCounts = sampleReadCounts,
referenceReadCounts = referenceReadCounts,
genomicRangesFromBed = genomicRangesFromBed,
# bedFilepath = bedFilepath,
# amplColumnNumber = amplColumnNumber,
# minimumMappingQuality = minimumMappingQuality,
numberOfBootstrapReplicates = numberOfBootstrapReplicates,
# removePcrDuplicates = removePcrDuplicates,
# analysisMode = analysisMode,
robust = robust,
backgroundSignificanceLevel = backgroundSignificanceLevel,
outputDir = outputDir)
return(results)
}
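# End-to-end usage sketch (not run); all file paths are hypothetical.
# results <- CNVPanelizer(sampleBamFilepaths = c("tumor1.bam", "tumor2.bam"),
#                         referenceBamFilepaths = c("normal1.bam", "normal2.bam"),
#                         bedFilepath = "panel.bed",
#                         amplColumnNumber = 6,
#                         numberOfBootstrapReplicates = 10000,
#                         outputDir = file.path(getwd(), "CNVPanelizer"))
# results$reportTables[[1]]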
Strict <- function(cnvPanelizerResults,
amplificationMinimumThreshold = 2,
deletionMaximumThreshold = 0.6,
minimumNumberOfAmplicons = 2
# , fullConsensusAmongAmplicons = TRUE
) {
# myCNVPanelizerTableResults <- CollectColumnFromAllReportTables(cnvPanelizerResults@reportTables, "ReliableStatus")
myCNVPanelizerTableResults <- CollectColumnFromAllReportTables(cnvPanelizerResults$reportTables, "ReliableStatus")
myCNVPanelizerTableResults <- t(myCNVPanelizerTableResults)
myCNVPanelizerTableResults <- revalueDF(myCNVPanelizerTableResults, c("Normal"=""))
interestingColumn <- "MeanBoot"
# meanBootResults <- CollectColumnFromAllReportTables(cnvPanelizerResults@reportTables, interestingColumn)
meanBootResults <- CollectColumnFromAllReportTables(cnvPanelizerResults$reportTables, interestingColumn)
class(meanBootResults) <- "numeric"
meanBootResults <- t(meanBootResults)
# consensusResults <- ConsensusCheck(cnvPanelizerResults@genomicRangesFromBed, cnvPanelizerResults@sampleReadCounts, cnvPanelizerResults@referenceReadCounts)
consensusResults <- ConsensusCheck(cnvPanelizerResults$genomicRangesFromBed, cnvPanelizerResults$sampleReadCounts, cnvPanelizerResults$referenceReadCounts)
consensusResults <- t(consensusResults)
test_that("colnames from CNVPanelizer results and Consensus match", {
expect_equal(colnames(consensusResults), colnames(myCNVPanelizerTableResults))
})
test_that("rownames from CNVPanelizer results and Consensus match", {
expect_equal(rownames(consensusResults), rownames(myCNVPanelizerTableResults))
})
cNVPanelizerResultsConsensus <- myCNVPanelizerTableResults
for (rowname in rownames(myCNVPanelizerTableResults)) {
for (colname in colnames(myCNVPanelizerTableResults)) {
# print(paste(rowname, colname))
if (!consensusResults[rowname, colname]) {
cNVPanelizerResultsConsensus[rowname, colname] <- ""
print(paste("Setting ", myCNVPanelizerTableResults[rowname, colname], "to", cNVPanelizerResultsConsensus[rowname, colname]))
}
}
}
# table(unlist(myCNVPanelizerTableResults))
# table(unlist(cNVPanelizerResultsConsensus))
test_that("colnames from CNVPanelizer results and meanBootResults match", {
expect_equal(colnames(meanBootResults), colnames(cNVPanelizerResultsConsensus))
})
test_that("rownames from CNVPanelizer results and meanBootResults match", {
expect_equal(rownames(meanBootResults), rownames(cNVPanelizerResultsConsensus))
})
cNVPanelizerResultsConsensusAndThreshold <- cNVPanelizerResultsConsensus
for (rowname in rownames(cNVPanelizerResultsConsensusAndThreshold)) {
for (colname in colnames(cNVPanelizerResultsConsensusAndThreshold)) {
# print(cNVPanelizerResultsConsensus[rowname, colname])}}
if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Amplification")
& ((meanBootResults[rowname, colname] <= amplificationMinimumThreshold))
# & ((meanBootResults[rowname, colname] <= 3))
) {
cNVPanelizerResultsConsensusAndThreshold[rowname, colname] <- ""
print(paste("Setting ", cNVPanelizerResultsConsensus[rowname, colname], "to", cNVPanelizerResultsConsensusAndThreshold[rowname, colname], "because", meanBootResults[rowname, colname]))
}
if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Deletion")
& ((meanBootResults[rowname, colname] >= deletionMaximumThreshold))
#& ((meanBootResults[rowname, colname] >= 0.7))
) {
cNVPanelizerResultsConsensusAndThreshold[rowname, colname] <- ""
print(paste("Setting ", cNVPanelizerResultsConsensus[rowname, colname], "to", cNVPanelizerResultsConsensusAndThreshold[rowname, colname]))
}
}
}
# table(unlist(cNVPanelizerResultsConsensus))
# table(unlist(cNVPanelizerResultsConsensusAndThreshold))
cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons <- cNVPanelizerResultsConsensusAndThreshold
# genesWithMinimumNumberOfAmplicons <- HaveMininumNumberOfAmplicons(cnvPanelizerResults@genomicRangesFromBed, minimumNumberOfAmplicons)
genesWithMinimumNumberOfAmplicons <- HaveMininumNumberOfAmplicons(cnvPanelizerResults$genomicRangesFromBed, minimumNumberOfAmplicons)
for (rowname in rownames(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons)) {
for (colname in names(genesWithMinimumNumberOfAmplicons)) {
if ((cNVPanelizerResultsConsensusAndThreshold[rowname, colname] == "Deletion") &
!genesWithMinimumNumberOfAmplicons[colname]) {
cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons[rowname, colname] <- ""
print(paste("Setting ", cNVPanelizerResultsConsensusAndThreshold[rowname, colname], "to", cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons[rowname, colname]))
}
}
}
# table(unlist(cNVPanelizerResultsConsensusAndThreshold))
# table(unlist(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons))
# write.xlsx(cNVPanelizerResults, file=file.path(outputDir, paste0("cNVPanelizerResults", ".xlsx")), row.names = TRUE)
# write.xlsx(cNVPanelizerResultsConsensus, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensus", ".xlsx")), row.names = TRUE)
# write.xlsx(cNVPanelizerResultsConsensusAndThreshold, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensusAndThreshold", ".xlsx")), row.names = TRUE)
# write.xlsx(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons, file=file.path(outputDir, paste0("cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons", ".xlsx")), row.names = TRUE)
return(cNVPanelizerResultsConsensusAndThresholdAndMinimumNumberOfAmplicons)
}
# TODO Could be provided as a measurement of quality for the run
# Overview <- function(
# # sampleIdentifier,
# sampleReadCounts1,
# referenceReadCounts1,
# allSamplesReadCounts,
# genomicRangesFromBed,
# # normalizationMethod = "tss",
# normalizationMethod = "tmm",
# outputDir = file.path(getwd(), "CNVPanelizer", "overviewKNNReference30")) {
#
# # sampleIdentifier <- "I:/Run Data/Run S5-56/M1_S5-56_IonXpress_074_rawlib.bam"
#
# # Overview(allSamplesReadCounts[, filepaths],
# # allSamplesReadCounts[, referenceReadCounts],
# # # allSamplesReadCounts,
# # genomicRangesFromBed,
# # # normalizationMethod = "tss",
# # # normalizationMethod = "tmm",
# # outputDir = file.path(getwd(), "CNVPanelizer", "overview"))
#
# sampleReadCounts1 <- allSamplesReadCounts[, filepaths]
# # referenceReadCounts1 <- allSamplesReadCounts[, referenceReadCounts]
# referenceReadCounts1 <- allSamplesReadCounts[, referenceFilepathsKmeans]
#
# allSamplesReadCountsNormalized <- NormalizeCounts(allSamplesReadCounts, method = normalizationMethod)
# samplesNormalizedReadCounts = allSamplesReadCountsNormalized[, colnames(sampleReadCounts1)]
# referenceNormalizedReadCounts = allSamplesReadCountsNormalized[, colnames(referenceReadCounts1)]
#
# runSamples <- samplesNormalizedReadCounts
# refSamples <- referenceNormalizedReadCounts
# entireSetOfSamples <- allSamplesReadCountsNormalized
#
# geneNames <- unique(genomicRangesFromBed$geneNames)
# ampliconNames <- genomicRangesFromBed$ampliconNames
# # TODO validate if ampliconNames are the rownames of samples matrices
# # ampliconNames <- rownames(allSamplesNormalized)
#
# sampleIdentifiers <- colnames(runSamples)
# #sampleIdentifiers <- sampleIdentifiers[1]
# for (sampleIdentifier in sampleIdentifiers) {
# myDir <- file.path(outputDir, basename(sampleIdentifier))
# dir.create(myDir, recursive = TRUE)
# for (geneName in geneNames) {
# # geneName <- "BRAF"
# myAmpliconsIndexes <- grep(geneName, ampliconNames)
# png(file=file.path(myDir, paste0(geneName,'_amplicons.png')),
# width=1000,
# height=(ceiling(length(myAmpliconsIndexes)/2) * 500))
#
# # par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3,3,5,1))
# par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3, 3, 5, 1), oma=c(0,0,7,0))
#
# for (ampliconName in ampliconNames[myAmpliconsIndexes]) {
# # ampliconName <- "chr7_140453102_140453221_BRAF_Ex15-NM_004333"
# tmp <- data.frame(y = c(as.vector(runSamples[ampliconName, ])
# ,as.vector(refSamples[ampliconName, ])
# ,as.vector(entireSetOfSamples[ampliconName, ])
# ),
#
# x = factor(
# c(rep("runSamples", length(as.vector(runSamples[ampliconName, ])))
# ,rep("refSamples",length(as.vector(refSamples[ampliconName, ])))
# ,rep("allSamples",length(as.vector(entireSetOfSamples[ampliconName, ])))
# ), levels = c("runSamples", "refSamples", "allSamples"))
#
# # x = c(rep("runSamples", length(as.vector(runSamples[ampliconName, ])))
# # ,rep("refSamples",length(as.vector(refSamples[ampliconName, ])))
# # )
# )
#
# myValues <- runSamples[ampliconName, sampleIdentifier]
#
# boxplot(y~x,
# data=tmp,
# ylab="Read Counts",
# xlab="Sample set",
# names = c("Run Samples", "Reference", "All Samples"),
# # main = ampliconName)
# main = paste(myValues, ampliconName))
# stripchart(y ~ x,
# vertical = TRUE,
# data = tmp,
# method = "jitter",
# add = TRUE,
# # pch = 20,
# pch = 20,
# col = 'blue')
#
#
# # stripchart(list(0.5, 1), vertical=TRUE, add=TRUE, method="stack", col='red', pch="*")
# stripchart(myValues, vertical=TRUE, add=TRUE, method="stack", col='red', pch=16, cex=2)
# }
# title(main = paste(geneName, "-", sampleIdentifier), font.main = 4, outer=TRUE)
# dev.off()
# }
# }
# }
## TODO CHECK THIS
#
#
# #rowIndexesToMerge <- IndexGenesPositions(lungGenomicRanges$geneNames)
# #GeneMultilinePlot(normalSamples, rowIndexesToMerge, normalizationMethod = "tss")
# GeneMultilinePlot <- function(dataSamples, rowIndexesToMerge, normalizationMethod = "tmm") {
# dataSamples <- round(NormalizeCounts(dataSamples))
# result <- NULL
# for (i in 1:ncol(dataSamples)) {
# result <- cbind(result, sapply( rowIndexesToMerge, function(x) {mean(dataSamples[x, i])} ))
# }
# colnames(result) <- colnames(dataSamples)
# dataSamples <- result
# # dataSamples <- cbind(geneReadCounts = seq_along(rownames(dataSamples)), dataSamples)
#
# melted <- melt(dataSamples)
#
# ggplot(data=melted, aes(x=Var1, y=value, group=Var2, colour=Var2, linetype = Var2)) +
# #geom_line() +
# # scale_colour_manual(values = c('pink','orange','white', 'red')) +
# theme(legend.position="left") +
# geom_point(aes(color=Var2)) +
# geom_line(size=1) +
# theme(axis.text.x = element_text(angle = 90, hjust = 1))
# }
#
# #Example for testing MultiLinePlot
# dataSamples <- read.table(text = "CN CNVPanelizer panelcn.mops ioncopy
# CN1 0.1 0.4 0.4
# CN2 0.2 0.1 0.3
# CN3 0.1 0.2 0.1
# CN4 0.1 0.4 0.4
# CN5 0.2 0.1 0.3
# CN6 0.1 0.3 0.5
# CN7 0.6 0.6 0.6
# CN8 0.6 0.3 0.2
# CN9 0.6 0.8 0.8
# CN10 1 0.9 0.8", header=TRUE)
# TODO reuse this code at GeneMultilinePlot ...
# MultiLinePlot <- function(dataSamples, idColumnIdentifier = NULL, title = "", legendPosition = "left", xLabel = "", yLabel = "", smooth = FALSE) {
# rs <- dataSamples
# if (is.null(idColumnIdentifier)) {
# melted = melt(rs)
# } else {
# melted = melt(rs, id.vars=idColumnIdentifier)
# }
# melted$CN <- as.character(melted$CN)
# melted$CN <-factor(melted$CN, levels = unique(melted$CN))
# myPlot <- ggplot(data=melted, aes(x=CN, y=value, group=variable, colour=variable, linetype = variable)) +
# scale_colour_discrete(name = "Method") + # scale_colour_manual(values = c('pink','orange','white', 'red')) +
# ggtitle(title) +
# geom_point(aes(color=variable)) + # plots the dots..
# theme(text = element_text(size=10),
# axis.text.x = element_text(angle=90, hjust=1)) +
# theme(panel.border = element_blank(),
# panel.grid.major = element_blank(),
# panel.grid.minor = element_blank(),
# axis.line = element_line(colour = "black")) +
# theme_bw() + # sets the background colour
# theme(plot.title = element_text(hjust = 0.5)) +
# theme(legend.position=legendPosition) + xlab(xLabel) + ylab(yLabel)
# if(smooth) {
# # myPlot = myPlot + geom_smooth(method = "loess", se = FALSE, span = 1.5, show.legend = FALSE)
# myPlot = myPlot + geom_smooth(method = "auto", se = FALSE, span = 1.5, show.legend = FALSE)
# } else {
# myPlot = myPlot + geom_line(size = 1, show.legend = FALSE) # links the dots with lines..
# }
# return(myPlot)
# }
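# A minimal usage sketch for MultiLinePlot with the example data frame above
# (kept commented out like the function itself); "CN" is passed as the id
# column so the remaining columns are melted into one series per method:
# MultiLinePlot(dataSamples,
#               idColumnIdentifier = "CN",
#               title = "Copy number calls per method",
#               xLabel = "CN",
#               yLabel = "Value")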
# Splits a sample's amplicon read counts into numberOfPools pools, assuming the
# amplicons alternate between pools (round-robin assignment).
SampleReadCountsPools <- function(ampliconsReadCounts, numberOfPools) {
listOfPools <- list()
for (i in 1:numberOfPools) {
listOfPools[[i]] <- ampliconsReadCounts[seq(from = i, to = length(ampliconsReadCounts), by = numberOfPools)]
}
return(listOfPools)
}
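# A minimal usage sketch with hypothetical read counts, assuming the amplicons
# alternate between pools (pool 1, pool 2, pool 1, pool 2, ...):
# ampliconsReadCounts <- c(120, 340, 95, 410, 205, 180)
# SampleReadCountsPools(ampliconsReadCounts, numberOfPools = 2)
# # [[1]] holds amplicons 1, 3, 5 and [[2]] holds amplicons 2, 4, 6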
# Splits a sample's amplicon read counts into the pools given by the 'pools'
# vector, which must contain one pool label per amplicon.
SampleReadCountsByPool <- function(ampliconReadCounts, pools) {
# existingPools <- as.numeric(names(table(pools)))
# if (any(is.na(existingPools))) {
#   stop("Pools are not defined as 1, 2, ...")
# }
existingPools <- names(table(pools))
if (length(pools) != length(ampliconReadCounts)) {
stop("Length of pools differs from length of ampliconReadCounts")
}
listOfPools <- list()
for (i in existingPools) {
listOfPools[[i]] <- (ampliconReadCounts[which(pools==i)])
}
return(listOfPools)
}
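# A minimal usage sketch with hypothetical counts and explicit pool labels
# (one label per amplicon); the result is a list with one element per pool:
# ampliconReadCounts <- c(120, 340, 95, 410, 205)
# pools <- c(1, 2, 1, 2, 1)
# SampleReadCountsByPool(ampliconReadCounts, pools)
# # $`1` -> 120 95 205, $`2` -> 340 410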
# Barplot of the mean read count per pool.
PoolsPlots <- function(sampleReadCountsByPool) {
plotValues <- as.integer(unlist(lapply(sampleReadCountsByPool, mean)))
barplot(plotValues,
ylab = "Read Counts mean",
xlab = "Pools",
names.arg = names(sampleReadCountsByPool),
col = "dodgerblue4"
# , ylim = c(0, trunc(max(plotValues)))
)
}
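# A minimal usage sketch chaining SampleReadCountsByPool and PoolsPlots
# (hypothetical data); the barplot shows the mean read count of each pool:
# readCountsByPool <- SampleReadCountsByPool(c(120, 340, 95, 410, 205),
#                                            pools = c(1, 2, 1, 2, 1))
# PoolsPlots(readCountsByPool)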
################################################################################
# KaryotypeAberrationPlot
################################################################################
# KaryotypeAberrationPlot(bedFilepath = bedFilepath,
# ampliconColumn = 4,
# filepath = "TESTANDO_APAGAR_KARYOTYPE.png")
KaryotypeAberrationPlot <- function(bedFilepath,
ampliconColumn,
status = sample(c("red", "green", "blue"), 30, replace = TRUE),
filepath = "KaryotypeAberrationPlot.png",
width = 1200,
height = 800) {
#source("https://bioconductor.org/biocLite.R")
#biocLite("chromPlot")
#library(chromPlot)
#library(stringr)
hg_gap <- NULL # placeholder so R CMD check does not flag 'hg_gap' as an undefined global; data() fills it in below
data(hg_gap)
getGeneRegionsStatusSummary <- GetGeneRegionsStatusSummary(bedFilepath = bedFilepath,
ampliconColumn = ampliconColumn,
status = status)
geneRegionsBands <- GetGeneRegionsBand(getGeneRegionsStatusSummary)
png(filepath, width = width, height = height)
# chromPlot(gaps=hg_gap)
# chromPlot(gaps=hg_gap, stat=geneRegions, statCol="Value")
# chromPlot(gaps=hg_gap, stat=geneRegions, statCol="Value", chr=unique(geneRegions$Chrom))
# chromPlot(gaps=hg_gap, bands=hg_cytoBandIdeo, stat=geneRegions, statCol="Value", chr=unique(geneRegions$Chrom), statName="Value", noHist=TRUE, figCols=5, cex=0.7, statTyp="n", chrSide=c(1,1,1,1,1,1,-1,1))
# TODO temporary workaround because chromPlot cannot be installed on version 3.2.0 ...
# karyotypeAberrationPlot <- chromPlot(gaps = hg_gap,
# bands = geneRegionsBands,
# chr = unique(getGeneRegionsStatusSummary$Chrom),
# stat = getGeneRegionsStatusSummary,
# statCol = "Value",
# statName = "Value",
# noHist = TRUE,
# figCols = 7,
# cex = 0.7,
# statTyp = "n",
# chrSide = c(1,1,1,1,1,1,-1,1))
karyotypeAberrationPlot <- NA # placeholder until the chromPlot call above can be re-enabled
dev.off()
return(karyotypeAberrationPlot)
}
# Merges all regions related to the same gene and assigns each merged region a color describing its aberration status.
# status should be a vector of colors representing the aberration status (red for amplification, green for deletion and blue for normal).
GetGeneRegionsStatusSummary <- function(bedFilepath, ampliconColumn = 4, split = "_", status = c("blue")) {
a <- read.csv(bedFilepath, sep="\t", skip=1, header = FALSE)
a[, 1] <- str_replace(a$V1, "chr","")
a[, ampliconColumn] <- vapply(strsplit(as.vector(a[, ampliconColumn]), split, fixed = TRUE), "[", "", 1)
a <- a[,1:4]
colnames(a) <- c("Chrom", "Start", "End", "ID"
# , "Colors"
)
geneNames <- unique(a$ID)
geneRegions <- NULL
for (geneName in geneNames) {
genomicRegions <- a[a$ID==geneName, ]
chromossome <- unique(genomicRegions$Chrom)
start <- min(genomicRegions$Start)
end <- max(genomicRegions$End)
gene <- unique(genomicRegions$ID)
geneRegions <- rbind(geneRegions, c(chromossome, start, end, gene))
}
colnames(geneRegions) <- colnames(a)
geneRegions <- as.data.frame(geneRegions,
# as.is = TRUE)
stringsAsFactors = FALSE)
geneRegions <- cbind(geneRegions, Colors = status)
# geneRegions$Chrom <- as.numeric(geneRegions$Chrom)
geneRegions$Start <- as.numeric(geneRegions$Start)
geneRegions$End <- as.numeric(geneRegions$End)
geneRegions$ID <- as.factor(geneRegions$ID)
return(geneRegions)
}
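# A minimal usage sketch, assuming a hypothetical 'panel.bed' with a track line
# to skip and gene names prefixed to the amplicon IDs in column 4; a single
# color is recycled for every gene (red = amplification, green = deletion,
# blue = normal):
# geneRegions <- GetGeneRegionsStatusSummary(bedFilepath = "panel.bed",
#                                            ampliconColumn = 4,
#                                            status = "blue")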
# Returns the object needed for plotting: renames the columns and enlarges the
# regions so that they are visible in the plot.
GetGeneRegionsBand <- function(geneRegions) {
geneRegionsBands <- geneRegions
colnames(geneRegionsBands) <- c("Chrom", "Start", "End", "Name", "Colors")
geneRegionsBands$Chrom <- paste0("chr", geneRegionsBands$Chrom)
geneRegionsBands$Name <- as.character(geneRegionsBands$Name)
geneRegionsBands$Colors <- as.character(geneRegionsBands$Colors)
# the regions are far too small to be visible (roughly a million times smaller than a chromosome), so extend them
geneRegionsBands$End <- geneRegionsBands$End + 1E6
return(geneRegionsBands)
}
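# A minimal usage sketch chaining the two helpers above (hypothetical BED path);
# the resulting band object is what the commented chromPlot call expects for
# its 'bands' argument:
# geneRegions <- GetGeneRegionsStatusSummary("panel.bed", ampliconColumn = 4)
# geneRegionsBands <- GetGeneRegionsBand(geneRegions)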
# TODO
# Could take into consideration the size of the regions and
# the whole coverage of the gene (number of amplicons * seqLength)
# > 50 bp
# panelStatistics(genomicRangesFromBed) {
# # hist(width(ranges(genomicRangesFromBed)))
# # genomicRangesFromBed$geneNames
#
# }
# boxplotGeneLevel <- function(allSamplesNormalized,
# cohortFilepaths,
# referenceFilepathsKNN,
# referenceFilepathsPercentil,
# geneName,
# outputDir = file.path(getwd(),"batchAnalysisNoMininumMappingQuality2")) {
# #boxplotPerAmplicon <- function(ampliconIndex) {
# #geneName <- "BRAF"
# #geneName <- "APC"
# # geneName <- "ERBB4"
#
# # setwd(outputDir)
#
# # cohortFilepaths <- filepaths
# martina <- allSamplesNormalized[, (colnames(allSamplesNormalized) %in% cohortFilepaths)]
#
# nonMartina <- allSamplesNormalized[, !(colnames(allSamplesNormalized) %in% cohortFilepaths)]
#
#
# for (geneName in geneNames) {
#
#
#
# ampliconNames <- rownames(allSamplesNormalized)
# myAmpliconsIndexes <- grep(geneName, ampliconNames)
#
# # pdf(file=paste0(geneName,'_plot.pdf'), width=10, height= ceiling(length(myAmpliconsIndexes)/2) * 5)
# png(file=paste0(geneName,'_plot.png'), width=10, height= ceiling(length(myAmpliconsIndexes)/2) * 5)
#
# par(mfrow=c(ceiling(length(myAmpliconsIndexes)/2), 2), mar=c(3,3,5,1))
#
# for (ampliconName in ampliconNames[myAmpliconsIndexes]) {
# # ampliconName <- "chr7_140453102_140453221_BRAF_Ex15-NM_004333"
# tmp <- data.frame(y = c(as.vector(nonMartina[ampliconName, ]),
# as.vector(nonMartina[ampliconName, referenceFilepathsKNN]),
# as.vector(nonMartina[ampliconName, referenceFilepathsPercentil]),
# as.vector(martina[ampliconName, ])),
# x = c(rep("nonMartina",length(as.vector(nonMartina[ampliconName, ]))),
# rep("referenceKNN", length(as.vector(nonMartina[ampliconName, referenceFilepathsKNN]))),
# rep("referencePercentil", length(as.vector(nonMartina[ampliconName, referenceFilepathsPercentil]))),
# rep("martina", length(as.vector(martina[ampliconName, ])))))
#
# boxplot(y~x, data=tmp, ylab="Read Counts", xlab="Sample set", main = ampliconName)
# stripchart(y ~ x,
# vertical = TRUE,
# data = tmp,
# method = "jitter",
# add = TRUE,
# pch = 20,
# col = 'blue')
# }
# dev.off()
# }
# }
# i <- 0
# for (x in 1:100) {
# i <- i+1
# if (i %% 10 == 0) {
# Sys.sleep(1)
# cat(".")
# }
# }
#