#Required packages (Bioconductor: VariantAnnotation, GenomicRanges, SomaticSignatures,
#BSgenome.Hsapiens.UCSC.hg19; CRAN: deconstructSigs):
library(VariantAnnotation)           #readVcfAsVRanges(), softFilterMatrix()
library(GenomicRanges)               #GRanges(), findOverlaps(), subsetByOverlaps(), keepSeqlevels()
library(SomaticSignatures)           #mutationContext()
library(BSgenome.Hsapiens.UCSC.hg19) #reference genome used for the mutational contexts
library(deconstructSigs)             #whichSignatures(), signatures.cosmic

#main function: gives estimates for the missing mutations in each copy number state and for
#each peak, for the missing mutations accumulated over all peaks, the mutational signatures,
#and estimates for the numbers of observations in the different peaks depending on their
#mutational context
#note that the default for 'c' relies on R's lazy evaluation: it is only evaluated once 'SNV'
#exists inside the function body, at the call to find_missing() below
missing_mutations <- function(vcf_path, copy_number_path, p, c=min(SNV$allele_counts)) {
  vcf <- readVcfAsVRanges(vcf_path, "hg19")
  copy_number <- read.csv(copy_number_path)
  select <- select_func(vcf, copy_number)
  relevant <- relevant_copy_number_states(select)
  SNV <- SNV_info(select, relevant)
  ML <- max_lik(p, SNV, relevant)
  numbers <- find_missing(SNV, relevant, ML, p, c)
  sigs <- signatures(select)
  emp <- empirical_distr(relevant, SNV, p)
  p0_bin <- peak_0_given_bin(relevant, SNV, ML, numbers, emp, p)
  p0_af <- peak_0_given_af(SNV, p0_bin)
  peak_probs <- prob_of_peaks(relevant, SNV, ML, p0_af, p)
  contexts <- mutational_contexts_in_peaks(SNV, peak_probs)
  return(list(missing=numbers,
              in_each_peak=round(apply(numbers[,3:(ncol(numbers)-1)],2,sum)),
              signatures=sigs,
              mutational_contexts_in_peaks=contexts))
}
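
#A minimal usage sketch; the file paths and the purity value below are hypothetical
#placeholders, not files shipped with this code:
#  res <- missing_mutations(vcf_path = "tumour.vcf",            #hypothetical VCF
#                           copy_number_path = "tumour_cn.csv", #hypothetical copy number CSV
#                           p = 0.7)                            #assumed tumour purity
#  res$missing                       #missing mutations per copy number state and peak
#  res$in_each_peak                  #missing mutations summed over the copy number states
#  res$signatures                    #COSMIC signature weights
#  res$mutational_contexts_in_peaks  #estimated context counts per peak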
#peak centres: the expected allele fraction of a mutation present on i of the mutated copies,
#for i = 1, ..., Total_CN - Minor_CN, at tumour purity p is (i*p)/(p*Total_CN + 2*(1-p))
get_peak_centres <- function(Total_CN,Minor_CN,p) {
return_vector = c()
for (i in 1:(Total_CN-Minor_CN)){
return_vector[i]=(i*p)/(p*Total_CN+2*(1-p))
}
return(return_vector)
}
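
#A quick worked example with illustrative values: for Total_CN = 3, Minor_CN = 1 and p = 0.5
#there are two peaks (mutations on 1 or 2 copies), and get_peak_centres(3, 1, 0.5) returns
#c(0.5, 1)/(0.5*3 + 2*0.5) = c(0.2, 0.4), the expected allele fractions of the two peaks.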
select_func <- function(vcf,copy_number) {
  autosomes <- paste0("chr",1:22)
  #restrict to the autosomes; pruning.mode="coarse" drops records on the removed chromosomes
  vcf <- keepSeqlevels(vcf, autosomes, pruning.mode="coarse")
  #keep the second half of the records (assumed: the tumour sample records follow those of
  #the matched normal in the VRanges)
  vcf <- vcf[(length(vcf)/2+1):length(vcf)]
  #keep only variants that pass all soft filters
  vcf <- vcf[apply(softFilterMatrix(vcf),1,prod)==1]
  x <- GRanges(copy_number)
  seqlevels(x) <- paste0("chr",c(1:22,"X","Y"))
  x <- keepSeqlevels(x, autosomes, pruning.mode="coarse")
  mc <- mutationContext(vcf,BSgenome.Hsapiens.UCSC.hg19)
  #COSMIC-style trinucleotide context label for each SNV
  SMT <- paste(subseq(elementMetadata(mc)$context,1,1), "[",
               subseq(elementMetadata(mc)$alteration,1,1), ">",
               subseq(elementMetadata(mc)$alteration,2,2), "]",
               subseq(elementMetadata(mc)$context,3,3), sep="")
  return(list(vcf,x,SMT))
}
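
#For illustration: an SNV with reference trinucleotide context "ACG" and alteration "CT"
#(a C>T change) is encoded as "A[C>T]G", matching the column labels of signatures.cosmic.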
#we exclude all copy number states with fewer than 20 SNVs
relevant_copy_number_states <- function(select) {
  f <- function(row) {
    if(length(subsetByOverlaps(select[[1]],select[[2]][which(select[[2]]@elementMetadata$Total_CN==row[1]
                                                             & select[[2]]@elementMetadata$Minor_CN==row[2])]))>19)
      c(row[1],row[2]) #keep this copy number state
  }
  rel=data.frame(matrix(unlist(apply(unique(select[[2]]@elementMetadata),1,f)),ncol=2,byrow=TRUE))
  colnames(rel)=c("Total_CN","Minor_CN")
  return(rel[order(rel$Total_CN,rel$Minor_CN),])
}
#gives data frame with SNVs, their respective allele counts, read counts, allele fractions, copy number states
#and mutational contexts
SNV_info <- function(select,relevant) {
VAF = select[[1]]$VariantAlleleFrequency
VAC = select[[1]]$VariantAlleleCount
RC = select[[1]]$ReadCount
SNVs_in_copy_number_states <- function(row) {
regions_with_desired_cns <- select[[2]][which(select[[2]]@elementMetadata$Total_CN==row[1] & select[[2]]@elementMetadata$Minor_CN==row[2])]
indices <- as.data.frame(findOverlaps(select[[1]],regions_with_desired_cns))$queryHits #gives indices of SNVs in desired copy number state
h = cbind(as.data.frame(cbind(VAF,VAC,RC)[indices,]), mut = select[[3]][indices])
delete_outliers = mean(h$RC)-3*sd(h$RC) <= h$RC & h$RC <= mean(h$RC)+3*sd(h$RC) #delete outliers in number of reads
number_of_reads = h$RC[delete_outliers]
allele_fractions = h$VAF[delete_outliers]
allele_counts = h$VAC[delete_outliers]
mutational_context = h$mut[delete_outliers]
h_result = cbind(as.data.frame(cbind(Total_CN=row[1],Minor_CN=row[2],allele_fractions,allele_counts,
number_of_reads)),mutational_context)
}
return(as.data.frame(do.call("rbind",apply(relevant,1,SNVs_in_copy_number_states))))
}
max_lik <- function(p,SNV,relevant) {
f <- function(row) {
peak_centres = get_peak_centres(row[1],row[2],p)
#choose sensible starting values for the maximum likelihood estimation:
find_closest_peaks <- function(m) {max(which(abs(peak_centres-m)==min(abs(peak_centres-m))))}
#closest is a vector showing, for each allele fraction in the copy number state given, which peak it is closest to
closest=mapply(find_closest_peaks,SNV[which(SNV$Total_CN==row[1] & SNV$Minor_CN==row[2]),]$allele_fractions)
#if there is more than one peak, find a good starting value; otherwise the single weight is 1
#the weights are parameterised as unconstrained log-ratios (a softmax), so the back-transformed
#weights always lie in [0,1] and sum to 1
if (row[1]-row[2]>1) {
a=rep(0,length(peak_centres))
for (i in 1:length(closest)) a[closest[i]]=a[closest[i]]+1
b=(a/sum(a))+0.0001 #so we do not take log(0)
c=(b/sum(b))
start=log(c[1:(length(c)-1)]/c[length(c)])} else start=numeric(0)
#exclude observations to the left of peak 1, as an extra "peak 0" of missed mutations there would violate the mixture of binomials
SNV_cut=SNV[SNV$Total_CN==row[1] & SNV$Minor_CN==row[2] & SNV$allele_fractions>=peak_centres[1],]
  #negative log-likelihood of the binomial mixture truncated at the centre of peak 1
  #(observations below it were removed above, so the mixture has to be renormalised)
  likelihood <- function(weights) {
    weights_mod=c(exp(weights)/(1+sum(exp(weights))),1/(1+sum(exp(weights)))) #softmax back-transformation
    g1=0 #mixture density at the observed allele counts
    for (j in 1:length(weights_mod))
      g1=g1+weights_mod[j]*dbinom(SNV_cut$allele_counts,SNV_cut$number_of_reads,peak_centres[j])
    g2=1 #probability mass above the truncation point, used for renormalisation
    for (k in 1:length(weights_mod))
      g2=g2-(weights_mod[k]*pbinom(round(peak_centres[1]*SNV_cut$number_of_reads),
                                   SNV_cut$number_of_reads,peak_centres[k]))
    g3=g1/g2
    return(-sum(log(g3)))
  }
  #with a single peak there is no free weight to optimise (start has length 0)
  if (length(start)>0) fit = optim(start,likelihood,method="BFGS") else fit = list(par=numeric(0))
  numbers = c(peak=c(exp(fit$par)/(1+sum(exp(fit$par))), 1/(1+sum(exp(fit$par))),
              rep(0,max(relevant$Total_CN-relevant$Minor_CN)-length(fit$par)-1)),row[1],row[2])}
  numbers = t(apply(relevant,1,f)) #one row per copy number state
return(as.data.frame(numbers))
}
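
#Sketch of the softmax back-transformation used in max_lik (illustrative values, assuming
#three peaks and hence two free parameters):
#  w <- c(0.5, -1)                                       #unconstrained values, as optimised by optim()
#  w_mod <- c(exp(w)/(1+sum(exp(w))), 1/(1+sum(exp(w)))) #each weight lies in [0,1]
#  sum(w_mod)                                            #equals 1 for any choice of w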
#estimate the number of missing mutations in each peak separately
find_missing <- function(SNV,relevant,ML,p,c){
h <- function(row) {
peak_centres = get_peak_centres(row[1],row[2],p)
SNV_cut = SNV[SNV$Total_CN==row[1] & SNV$Minor_CN==row[2] & SNV$allele_fractions>=peak_centres[1],]
weights = ML[ML$Total_CN==row[1] & ML$Minor_CN==row[2],]
prob_1 = 0
#P(allele_fractions>peak_centre_1)
for (i in 1:(row[1]-row[2]))
prob_1 = prob_1+sum((as.numeric(weights[i])*(1-pbinom(round(peak_centres[1]*SNV_cut$number_of_reads),
SNV_cut$number_of_reads,peak_centres[i])))*
(1/length(SNV_cut$number_of_reads)))
total_number_estimated = round(nrow(SNV_cut)/prob_1) #estimated total number of observations we would have with the mixture of binomials
prob_2 =rep(0,max(relevant$Total_CN-relevant$Minor_CN))
#vector containing P(allele_counts <= c) for each peak (c defaults to min(SNV$allele_counts))
for (i in 1:(row[1]-row[2]))
prob_2[i] = sum(pbinom(c,SNV_cut$number_of_reads,peak_centres[i])*
(1/length(SNV_cut$number_of_reads)))
numbers_in_peaks = rep(0,max(relevant$Total_CN-relevant$Minor_CN))
#vector containing estimated total number of observations we would have in each peak with mixture of binomials; without missing mutations
for (i in 1:(row[1]-row[2]))
numbers_in_peaks[i] = total_number_estimated*as.numeric(weights[i])
missing_numbers = c(row[1],row[2],peak=numbers_in_peaks*prob_2,total_number_estimated=total_number_estimated)
#vector containing numbers of missing mutations in each peak and total number estimated to be actually there
}
  return(as.data.frame(t(apply(relevant,1,h))))
}
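
#Numerical illustration with invented counts: if 800 mutations are observed to the right of
#the peak 1 centre and the fitted mixture puts P(allele fraction >= peak 1 centre) at 0.8,
#the estimated total is 800/0.8 = 1000. With weight 0.6 on peak i, that peak should hold
#1000*0.6 = 600 mutations; if P(allele count <= c) = 0.05 for that peak, an estimated
#600*0.05 = 30 of its mutations fall below the detection threshold and count as missing.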
signatures <- function(select) {
  SMT <- factor(select[[3]],levels=colnames(signatures.cosmic))
  #mutational contexts of all SNVs in the vcf; using the COSMIC labels as factor levels makes
  #table() return counts for all 96 contexts in the expected order
SMTdf <- as.data.frame(t(matrix(table(SMT))))
colnames(SMTdf) <- colnames(signatures.cosmic)
rownames(SMTdf) <- 1
sigs <- whichSignatures(tumor.ref = SMTdf, signatures.ref =
signatures.cosmic,
sample.id = 1, contexts.needed = TRUE)
return(sigs$weights)
}
empirical_distr <- function(relevant,SNV,p) {
breaks = seq(0,1,by=1/100)
g <- function(row) {
peak_centres = get_peak_centres(row[1],row[2],p)
SNV_need = SNV[SNV$Total_CN==row[1] & SNV$Minor_CN==row[2],]
m = rep(0,100)
for (j in 1:100)
m[j] = length(which(SNV_need$allele_fractions>=breaks[j] &
SNV_need$allele_fractions<breaks[j+1]))
res = c(prob=m/sum(m),row[1],row[2])
}
  return(as.data.frame(t(apply(relevant,1,g)))) #data frame with empirical probabilities, based
  #on a histogram with 100 bins, for each copy number state
}
#The output of this function is a matrix that, depending on the copy number state, gives P("peak_0"|bin), where we consider 100 bins
peak_0_given_bin <- function(relevant,SNV,ML,find,emp,p) {
breaks=seq(0,1,by=1/100)
g <- function(row) {
peak_centres = get_peak_centres(row[1],row[2],p)
weights = ML[ML$Total_CN==row[1] & ML$Minor_CN==row[2],]
SNV_need = SNV[SNV$Total_CN==row[1] & SNV$Minor_CN==row[2],]
SNV_cut = SNV[SNV$Total_CN==row[1] & SNV$Minor_CN==row[2] & SNV$allele_fractions>=peak_centres[1],]
emp_need = emp[emp$Total_CN==row[1] & emp$Minor_CN==row[2],]
b = rep(0,100)
for (i in 1:100) {
for (j in 1:(row[1]-row[2])) {
b[i] = b[i]+sum(as.numeric(weights[j])*pbinom(round(breaks[i+1]*SNV_cut$number_of_reads),SNV_cut$number_of_reads,peak_centres[j])-
as.numeric(weights[j])*pbinom(round(breaks[i]*SNV_cut$number_of_reads),SNV_cut$number_of_reads,peak_centres[j]))
}
}
    res = b/sum(b) #probability distribution implied by the fitted binomial mixture, accumulated over the 100 bins
    total_number_estimated = find[find$Total_CN==row[1] & find$Minor_CN==row[2],]$total_number_estimated
    #estimated total number of SNVs under the binomial mixture
    n = round(res*total_number_estimated) #expected number of observations in each bin under the mixture
    m = round(as.numeric(emp_need[1:100]*nrow(SNV_need))) #observed number of observations in each bin (empirical)
m_cut = c(m[1:floor(peak_centres[1]*100)],rep(0,100-length(m[1:floor(peak_centres[1]*100)])))
n_cut = c(n[1:floor(peak_centres[1]*100)],rep(0,100-length(n[1:floor(peak_centres[1]*100)])))
prob=rep(0,100)
prob[m_cut==0 | n_cut>m_cut] = 0
prob[m_cut!=0 & n_cut<m_cut] = (m_cut[m_cut!=0 & n_cut<m_cut]-n_cut[m_cut!=0 & n_cut<m_cut])/m_cut[m_cut!=0 & n_cut<m_cut]
#P("peak_0"|bin) for each bin; depending on copy number state
erg=c(bin=prob,row[1],row[2])
}
  return(as.data.frame(t(apply(relevant,1,g))))
}
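
#Numerical illustration with invented counts: if a bin left of the peak 1 centre holds
#m = 50 observed mutations but the fitted mixture predicts only n = 30, then
#P("peak_0"|bin) = (50-30)/50 = 0.4, i.e. an estimated 40% of the mutations in that bin
#are attributed to the extra "peak 0" rather than to the binomial mixture.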
#gives P("peak_0"|allele_fraction) for all allele fractions
peak_0_given_af <- function(SNV,peak_0_given_bin) {
breaks=seq(0,1,by=1/100)
f <- function(row) {
prob_peak0_need=peak_0_given_bin[peak_0_given_bin$Total_CN==row[1] & peak_0_given_bin$Minor_CN==row[2],]
    bin = min(length(which(breaks<=row[3])),100) #index of the bin containing the allele fraction (an allele fraction of exactly 1 falls into bin 100)
    a = as.numeric(prob_peak0_need[bin])
}
return(apply(SNV[,1:5],1,f))
}
#gives matrix with rows P("peak_0"|allele_fraction),...,P(peak_(Total_CN-Minor_CN)|allele_fraction) for all allele fractions
prob_of_peaks <- function(relevant,SNV,ML,peak_0_given_af,p) {
  read_count_tab = as.data.frame(table(SNV$number_of_reads)) #empirical probability distribution of the read counts
  colnames(read_count_tab) <- c("read_count","frequency")
  f <- function(row) {
    peak_centres = get_peak_centres(row[1],row[2],p)
    weights = ML[ML$Total_CN==row[1] & ML$Minor_CN==row[2],]
    denominator = 0
    for (i in 1:length(peak_centres))
      denominator = denominator+as.numeric(weights[i])*dbinom(row[4],row[5],peak_centres[i])*
        (read_count_tab[which(read_count_tab$read_count==row[5]),]$frequency)/nrow(SNV)
    vector = c()
    for (i in 1:length(peak_centres))
      vector = c(vector,(as.numeric(weights[i])*dbinom(row[4],row[5],peak_centres[i])*
        ((read_count_tab[which(read_count_tab$read_count==row[5]),]$frequency)/nrow(SNV))/
        denominator)*as.numeric(1-row[6]))
    vector = c(row[6],vector,rep(0,max(relevant$Total_CN-relevant$Minor_CN)-length(vector)))
  }
  result = data.frame(matrix(unlist(apply(cbind(SNV[,1:5],peak_0_given_af),1,f)),
                             ncol=max(relevant$Total_CN-relevant$Minor_CN)+1,byrow=TRUE))
  colnames(result) <- paste("peak",0:max(relevant$Total_CN-relevant$Minor_CN),sep="")
return(result)
}
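
#The computation in prob_of_peaks is Bayes' rule: for an SNV with allele count k and read
#count n, P(peak_i | k, n) = weights_i*dbinom(k,n,centre_i) / sum_j weights_j*dbinom(k,n,centre_j),
#rescaled by (1 - P("peak_0"|allele fraction)) so that all peak probabilities, including
#"peak_0", sum to 1. (The empirical read count frequency appears in both numerator and
#denominator and therefore cancels.)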
mutational_contexts_in_peaks <- function(SNV, prob_of_peaks) {
  z = cbind(SNV,prob_of_peaks) #data frame with copy number state, allele fractions, allele counts,
  #numbers of reads, mutational contexts and peak probabilities
  z$mutational_context = factor(z$mutational_context,levels=colnames(signatures.cosmic))
  #ensure all 96 COSMIC contexts are present as factor levels (character columns are no
  #longer factors by default since R 4.0)
  a = matrix(0,nrow=96,ncol=ncol(z)-6) #one column per peak, including "peak 0" (the first 6 columns of z are SNV info)
  for (i in 1:(ncol(z)-6)) {
    for (j in 1:96) {
      a[j,i] = sum(z[z$mutational_context==levels(z$mutational_context)[j],][,i+6])
    }
  }
u = a/apply(a,1,sum)
for (j in 1:96)
u[j,] = round(u[j,]*nrow(z[z$mutational_context==levels(z$mutational_context)[j],]))
  return(u) #96 x (max number of peaks + 1) matrix containing the estimated numbers of
  #mutational contexts within each peak, with "peak 0" in the first column
}