#' Run CNCDriver coding region (CDS) p-value calculation with mutation pre-filtering
#'
#' @param inputFileDir The path for prepared R object file [for example,reducedFunseqOutputCDS_GBM.Rd]
#' @param outputFileDir The path for output files
#' @param codingRegionBedFile The defintion of coding regions in bed file format
#' @param elementKeyWord Default is "Promoter", the keyword of promoter annotation in FunSeq2
#' @param proteinDomainFile protein domain information from pfam
#' @param proteinLengthFile protein length information from ensembl
#' @param minPoints default is 2 [ unit: samples ]
#' @param dRadius default is 50 [ unit: bp ]
#' @param triNucleotideDistributionFile Cancer type specific mutation counts in 96 trinucleotide category
#' @param filterOutBlacklistMutations TRUE or FALSE
#' @param mutationBlacklistFile The file for mutation blacklist regions
#' @param replicationTimingGenomeBinnedFile Replication timing value at 1MB bin
#' @param replicationTimingElementBinnedFile Replication timing value of each element within 1MB bin
#' @param tumorType Study name
#' @param mutationType User provided mutated gene list
#' @param cellType Cell type of the study, [ BJ, GM12878, HeLaS3, HepG2, IMR90, K562, MCF7, SKNSH or average ]
#' @param replicationTimingCutOff Default is 0.2, a numeric value ranging from 0 to 1
#' @param seedNum User provided random number seed, default is 42
#' @param reSampleIterations User provided re-sapling iteration numbers
#' @param reRunPvalueCutOff Default is 0.1, a numeric value ranging from 0 to 1
#' @param useCores Default is 1, number of cpu to use
#' @param taskNum Default is 0, 0 means to run all elements
#' @param unitSize Number of elements to run per task
#' @param debugMode TRUE or FALSE
#'
#' @return results data frame
#'
#' @examples
#' #date<-getRunDates(latest=TRUE)
#' cancerType<-"KIRC"
#' selectedSampleId<-NA
#' #worDir<-getwd()
#' mutSig2CVthreshold<-0.1
#' rareMutationUpperLimit<-0.3
#' rareMutationLowerLimit<-0.1
#' rareMutationFreq<-0.02
#'
#' #runNetBox2(dataDir,cancerType,
#' # mutationList,ampGeneList,delGeneList,epiSilencedList,
#' # mutationFreq,ampGeneFreq,delGeneFreq,epiSilencedFreq,
#' # pathwayCommonsDb,directed,
#' # linkerPValThreshold,communityDetectionMethod,
#' # keepIsolatedNodes,verbose=TRUE)
#'
#' @concept CNCDriver
#' @export
#' @import doRNG
#' @import IRanges
#' @import GenomicRanges
#' @importFrom plyr rbind.fill
#' @importFrom stats p.adjust
#' @importFrom utils write.table
#' @importFrom data.table data.table
#' @importFrom data.table fread
#'
getCDSPvalueWithPreFilter<-function(inputFileDir,outputFileDir,
codingRegionBedFile,elementKeyWord="CDS",
proteinDomainFile,proteinLengthFile,
minPoints=2,dRadius=50,
triNucleotideDistributionFile,
filterOutBlacklistMutations,mutationBlacklistFile,
replicationTimingGenomeBinnedFile,replicationTimingElementBinnedFile,
tumorType,mutationType,cellType,replicationTimingCutOff,
seedNum=42,reSampleIterations,reRunPvalueCutOff=0.1,
useCores=1,taskNum=0,unitSize,debugMode=FALSE){
#library(data.table)
#library(plyr)
######
#inputDir<-"~/work/Ekta_lab/Priyanka_project/fsigsnv_results"
#tumorType<-"Prostate"
replicationTimingCutOff<-as.numeric(replicationTimingCutOff)
replicationTimingParm<-paste("RT_",replicationTimingCutOff,sep="")
reSampleIterations<-as.numeric(reSampleIterations)
reRunPvalueCutOff<-as.numeric(reRunPvalueCutOff)
useCores<-as.numeric(useCores)
taskNum<-as.numeric(taskNum)
unitSize<-as.numeric(unitSize)
workDir<-file.path(outputFileDir,"output_result_triMut_match",mutationType,tumorType,replicationTimingParm)
#mutationType<-"CDS"
#seedNum<-42
#reSampleNum<-1000
reSampleNum<-reSampleIterations
set.seed(seedNum)
if( !file.exists(paste(workDir,sep="/")) ){
dir.create(paste(workDir,sep=""),recursive=TRUE)
}
cat(sprintf("Start processing %s - %s\n",tumorType,mutationType))
########
# Load pre-processed Rd file
########
filePath<-file.path(inputFileDir)
fileName<-paste("reducedFunseqOutputCDS_",tumorType,".Rd",sep="")
fileName<-file.path(filePath,fileName)
cat(sprintf("Load pre-processed Rd file\n"))
load(fileName)
#####
## load CDS bed file
#####
###
## not load pre-defined bed file. We used cds definitions from VAT.
##
###
if(FALSE){
cat(sprintf("Load elementBedfile\n"))
#filePath<-"~/work/Ekta_lab/FunSeq2_compositeDriver_SEP_2017/data_context_SEP_2017/gencode"
#fileName<-"gencode.v19.promoter.bed"
#fileName<-codingRegionBedFile
elementBedfileName<-file.path(codingRegionBedFile)
#reducedFunseqOutput<-reducedFunseqOutputNCDS
elementBedfile<-read.table(elementBedfileName,sep="\t",header=FALSE,stringsAsFactors = FALSE)
colnames(elementBedfile)<-c("chr","posStart","posEnd","name")
#colnames(dat)<-c("chr","posStart","posEnd","name","score","strand","thickStart","thickEnd","itemRgb","blockCount","blockSizes","blockStarts")
## extract mutations overlap with pre-defined element regions
reducedFunseqOutputCDS<-extractElementMutations(elementBedfile,reducedFunseqOutputCDS)
}
######
# load triNucleotideDistribution file
######
cat(sprintf("Load triNucleotideDistribution file\n"))
#filePath<-"~/work/Ekta_lab/compositeDriver_data/triNucleotidedistribution"
#fileName<-"compositeDriver_mutationDistributionMatrix_all_kmer_3_counts_Mar_2018_v0.txt"
#fileName<-file.path(filePath,fileName)
fileName<-triNucleotideDistributionFile
mutationDistMatrix<-read.table(fileName,sep="\t",header=TRUE,check.names = FALSE,stringsAsFactors = FALSE)
categoryName<-rownames(mutationDistMatrix)
categoryName<-gsub(" ","@",categoryName)
rownames(mutationDistMatrix)<-categoryName
######
# This section is for pan-cancer calculation
# This is not required for single cancer type calculatin
######
if(FALSE){
filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018/sample_summary")
fileName<-"Eric_sampleDist_Mar_2018.txt"
fileName<-file.path(filePath,fileName)
tumorDistTable<-read.table(fileName,header=TRUE,sep="\t",stringsAsFactors=FALSE)
#tumorDistTable<-tumorDistTable[order(-tumorDistTable$totalMutCounts),]
mutationBurden<-sort(colSums(mutationDistMatrix),decreasing = TRUE)
mutationBurdenDF<-data.frame(names(mutationBurden),mutationBurden,stringsAsFactors = FALSE)
colnames(mutationBurdenDF)<-c("histologyName","totalMutationCounts")
tumorDistTable<-merge(tumorDistTable,mutationBurdenDF,by="histologyName")
tumorDistTable$mutationBurdenAverage<-tumorDistTable$totalMutationCounts/tumorDistTable$histologySize
tumorDistTable<-tumorDistTable[order(-tumorDistTable$mutationBurdenAverage),]
######
#tumorDistTable<-tumorDistTable[tumorDistTable$histologyName %in% tumorType,]
#sampleSizeFactor<-tumorDistTable$histologySize
#names(sampleSizeFactor)<-tumorDistTable$histologyName
#sampleSizeFactor<-tumorDistTable$totalMutationCounts
#names(sampleSizeFactor)<-tumorDistTable$histologyName
#sampleSizeFactor<-tumorDistTable$mutationBurdenAverage
#names(sampleSizeFactor)<-tumorDistTable$histologyName
#sampleSizeFactor<-rep(1,nrow(tumorDistTable))
#names(sampleSizeFactor)<-tumorDistTable$histologyName
sampleSizeFactor<-(log10(tumorDistTable$totalMutationCounts/min(tumorDistTable$totalMutationCounts)))+1
names(sampleSizeFactor)<-tumorDistTable$histologyName
#countsSizeFactor<-tumorDistTable$histologySize
#names(countsSizeFactor)<-tumorDistTable$histologyName
countsSizeFactor<-tumorDistTable$totalMutationCounts
names(countsSizeFactor)<-tumorDistTable$histologyName
#countsSizeFactor<-tumorDistTable$mutationBurdenAverage
#names(countsSizeFactor)<-tumorDistTable$histologyName
}
######
######
# normalize tri-nucleotide context per cancer type
if(TRUE){
tmp_mm<-mutationDistMatrix
#tmp_mm<-tmp_mm[,c(1:2)]
tumorTypeSum<-colSums(tmp_mm)
#overallSum<-sum(tumorTypeSum)
# normalization on columns for each tumorType
tmp_mm_t<-lapply(1:ncol(tmp_mm), function(x){
tmp_mm[,x]/tumorTypeSum[colnames(tmp_mm)[x]]
})
tmp_mm_t<-do.call(rbind,tmp_mm_t)
tmp_mm2<-t(tmp_mm_t)
rownames(tmp_mm2)<-rownames(tmp_mm)
colnames(tmp_mm2)<-colnames(tmp_mm)
tmp_mm<-tmp_mm2
tmp_mm_normalized<-tmp_mm
mutationDistMatrix<-tmp_mm_normalized
}
cat(sprintf("Finish processing triNucleotideDistribution matrix\n"))
######
parseFunseqGeneField<-function(field,keyword,useCores=1){
geneSymbol<-{}
#a1<-strsplit(aa,",")
a1<-strsplit(field,"\\)")
#a1<-unlist(a1)
tmpStr<-mclapply(1:length(a1), function(x){
#cat(sprintf("gene:%s\t",i))
idx <- which(grepl(keyword, a1[[x]]))
a1[[x]]<-gsub("\\,","",a1[[x]])
t3 <- strsplit(a1[[x]][idx], "\\(")
#cat(sprintf("%s\n",t3[1][1]))
tmpStr<-t3[[1]][1]
},mc.cores=useCores)
geneSymbol<-unlist(tmpStr)
return(geneSymbol)
}
#cat(sprintf("Start parsing GENE field annotations\n"))
#reducedFunseqOutputNCDS$GENEparsed<-parseFunseqGeneField(field=reducedFunseqOutputNCDS$GENE,keyword=elementKeyWord,useCores=1)
tmpString<-strsplit(as.character(reducedFunseqOutputCDS$CDSS),":",fixed=TRUE)
tmpStringFrame<-data.frame(do.call("rbind",tmpString),stringsAsFactors=FALSE)
reducedFunseqOutputCDS$score<-as.numeric(tmpStringFrame[,1])
#######
#######
# filter out mutations on blacklist
######
#filePath<-"~/work/Ekta_lab/compositeDriver_data/genome_blacklist"
#fileName<-"wgEncodeDacMapabilityConsensusExcludable.bed"
#fileName<-file.path(filePath,fileName)
fileName<-mutationBlacklistFile
testDT<-data.table(reducedFunseqOutputCDS)
setkeyv(testDT,c("chr","posStart","posEnd"))
dacDT<-fread(fileName)
colnames(dacDT)<-c("chr","posStart","posEnd","annotation","score","strand")
setkeyv(dacDT,c("chr","posStart","posEnd"))
mutExcludable<-foverlaps(testDT,dacDT,nomatch=0)
numOfBlacklistMutations<-nrow(mutExcludable)
cat(sprintf("there are %s mutations on blacklist\n",numOfBlacklistMutations))
removeIdx<-mutExcludable[,c("chr","i.posStart","i.posEnd"),with=FALSE]
if(filterOutBlacklistMutations){
woBlacklistDT<-testDT[ !list(removeIdx),]
reducedFunseqOutputCDS<-setDF(woBlacklistDT)
remove(woBlacklistDT)
}
remove(dacDT)
remove(mutExcludable)
remove(testDT)
remove(removeIdx)
#####
# sanity check
if(FALSE){
reducedFunseqOutput<-reducedFunseqOutputCDS
reducedFunseqOutput$index<-paste(reducedFunseqOutput$sampleID,reducedFunseqOutput$chr,paste(reducedFunseqOutput$posStart,reducedFunseqOutput$posEnd,sep="-"),sep=":")
duplicatedLine<-duplicated(reducedFunseqOutput$index)
educedFunseqOutput<-reducedFunseqOutput[!duplicatedLine,]
reducedFunseqOutput<-reducedFunseqOutput[,c(1:26)]
numOfWhitelistMutations<-nrow(reducedFunseqOutput)
cat(sprintf("there are %s mutations on the regions of element bed file\n",numOfWhitelistMutations))
}
#####
# Add triNucleotideDistribution
#####
cat(sprintf("Add triNucleotideDistribution information\n"))
triNucleotideDat<-addTriNucleotideDistribution(reducedFunseqOutputCDS)
reducedFunseqOutputCDS<-cbind(reducedFunseqOutputCDS,triNucleotideDat)
####
## associate replication timing data with each variant
## note: there is no chromosome Y or chromosome M data in the replication timing
####
#cellType<-"BJ"
cat(sprintf("Add replication timing signal for cellType %s\n",cellType))
filePath<-"~/work/Ekta_lab/compositeDriver_data/replication_timing/UW/processed"
fileName<-"UW_RepliSeq_wavelet_1Mb_windows.txt"
fileName<-file.path(filePath,fileName)
fileName<-replicationTimingGenomeBinnedFile
replicationTimingDF<-read.table(fileName,header=TRUE,sep="\t",stringsAsFactors=FALSE)
selectedColumn<-which(colnames(replicationTimingDF) %in% cellType)
replicationTimingDF<-replicationTimingDF[,c(1:4,selectedColumn)]
colnames(replicationTimingDF)<-c("chr","start","end","name","signalValue")
replicationTimingDF$index<-paste(replicationTimingDF$chr,paste(replicationTimingDF$start,replicationTimingDF$end,sep="-"),sep=":")
reducedFunseqOutputCDS<-addReplicationTimingSignal(reducedFunseqOutputCDS,replicationTimingDF)
reducedFunseqOutputCDS<-reducedFunseqOutputCDS[!is.na(reducedFunseqOutputCDS$signalValue),]
#####
# Set element-wise replication timing signal boundary
#####
cat(sprintf("Set element-wise replication timing signal boundary at %s similarity [range: 0-1]\n",replicationTimingCutOff))
#filePath<-"~/work/Ekta_lab/compositeDriver_data/replication_timing/UW/processed"
#fileName<-"UW_RepliSeq_wavelet_1Mb_windows_promoter.txt"
fileName<-replicationTimingElementBinnedFile
elementReplicationTimingDF<-fread(fileName,header=TRUE,sep="\t",stringsAsFactors=FALSE,data.table=FALSE)
selectedColumn<-which(colnames(elementReplicationTimingDF) %in% cellType)
elementReplicationTimingDF<-elementReplicationTimingDF[,c(1,selectedColumn)]
colnames(elementReplicationTimingDF)<-c("elementName","signalValue")
#replicationTimingCutOff<-0.3
if(FALSE){
elementMat<-elementReplicationTimingDF
featureMat<-data.frame(elementMat[,2:ncol(elementMat)])
colnames(featureMat)<-c("replicationTiming")
rownames(featureMat)<-elementMat[,1]
rangeMat<-getNearestElement(featureMat,replicationTimingCutOff,useCores=useCores)
rangeMat$elementName<-rownames(rangeMat)
elementReplicationTimingDF<-merge(elementReplicationTimingDF,rangeMat,by="elementName")
}
if(TRUE){
#signalConst<-elementReplicationTimingDF$signalValue
#names(signalConst)<-elementReplicationTimingDF$elementName
# the original formula used in the cds calculation
#maxValue<-max(replicationTimingDF$signalValue)
#minValue<-min(replicationTimingDF$signalValue)
# the formula used in promoter and other elements
maxValue<-max(elementReplicationTimingDF$signalValue)
minValue<-min(elementReplicationTimingDF$signalValue)
interval<-0.5*(maxValue-minValue)*replicationTimingCutOff
#signalUpper<-rep(maxValue,nrow(elementReplicationTimingDF))
#signalLower<-rep(minValue,nrow(elementReplicationTimingDF))
signalUpper<-elementReplicationTimingDF$signalValue+interval
signalLower<-elementReplicationTimingDF$signalValue-interval
rangeValue<-data.frame(elementReplicationTimingDF,signalLower,signalUpper,stringsAsFactors = FALSE)
colnames(rangeValue)<-c("elementName","signalValue","signalLower","signalUpper")
idx<-which(rangeValue$signalUpper>=maxValue)
deltaUpper<-rangeValue[idx,]$signalUpper-maxValue
rangeValue[idx,]$signalLower<-rangeValue[idx,]$signalLower-deltaUpper
rangeValue[idx,]$signalUpper<-rep(maxValue,length(idx))
idx<-which(rangeValue$signalLower<=minValue)
deltaLower<-minValue-rangeValue[idx,]$signalLower
rangeValue[idx,]$signalUpper<-rangeValue[idx,]$signalUpper+deltaLower
rangeValue[idx,]$signalLower<-rep(minValue,length(idx))
rangeValue<-rangeValue[,-which(colnames(rangeValue) %in% "signalValue")]
}
elementReplicationTimingDF<-merge(elementReplicationTimingDF,rangeValue,by="elementName")
# debug for plotting
if(FALSE){
cc<-elementReplicationTimingDF
dd<-cc[order(cc$signalValue),]
dd$index<-seq(1,nrow(cc),by=1)
#name<-"TERT"
#name<-"HOXC5"
name<-"POTEB2"
#name<-"PCDH15"
ss<-dd[dd$elementName %in% name,]
#nameList<-dd[dd$signalValue<=ss$dmax & dd$signalValue>=ss$dmin,]$elementName
nameList<-dd[dd$signalValue<=ss$signalUpper & dd$signalValue>=ss$signalLower,]$elementName
kk<-dd[dd$elementName %in% nameList,]
plot(dd$index,dd$signalValue,col="black")
points(kk$index,kk$signalValue,col="red")
points(ss$index,ss$signalValue,col="green",pch=19)
plot(geneDFunique$signalValue,geneDFunique$occurence)
kk<-geneDFunique[geneDFunique$signalValue<=ss$signalUpper & geneDFunique$signalValue>=ss$signalLower,]
points(kk$signalValue,kk$occurence,col="red")
#points(ss$signalValue,ss$signalValue,col="green",pch=19)
plot(geneDFunique$signalValue,geneDFunique$compositeScore)
points(kk$signalValue,kk$compositeScore,col="red")
}
cat(sprintf("Finish processing replication Timing file \n"))
####
# CDS calculation specific section
# Add pfam protein domain information
# Add protein length information
####
#filePath<-"~/work/Ekta_lab/compositeDriver_data/ensembl_hg37"
#fileName<-"hg37_ensembl_txCDSwithProteinLength.txt"
#fileName<-file.path(filePath,fileName)
fileName<-proteinLengthFile
cdsExonLength<-read.table(fileName,sep="\t",header=TRUE,stringsAsFactors = FALSE)
parseVATannotation<-function(vector,index){
a2<-strsplit(vector,":")
alteredType<-sapply(a2,"[",5)
tmp<-sapply(a2,"[",6)
tmp<-strsplit(tmp,"\\/")
alteredRatio<-(as.numeric(sapply(tmp,"[",1))/as.numeric(sapply(tmp,"[",2)))
tmp<-sapply(a2,"[",8)
tmp<-strsplit(tmp,"\\.")
alteredTranscriptId<-sapply(tmp,"[",1)
tmp<-sapply(a2,"[",9)
tmp<-strsplit(tmp,"\\_")
alteredPos<-sapply(tmp,"[",3)
tmp<-sapply(tmp,"[",4)
tmp<-strsplit(tmp,"->")
refNu<-sapply(tmp,"[",1)
altNu<-sapply(tmp,"[",2)
tmp<-data.frame(alteredType,alteredRatio,alteredTranscriptId,alteredPos,refNu,altNu,stringsAsFactors = FALSE)
maxRatio<-max(tmp$alteredRatio)
tmpVector<-tmp[tmp$alteredRatio==maxRatio,]
if(nrow(tmpVector)>1){
tmpVector<-tmpVector[1,]
}
tmpVector$index<-index
return(tmpVector)
}
arrangeVATannotation<-function(field,useCores){
tmpVector<-{}
a1<-strsplit(field,",")
tmpVector<-mclapply(1:length(a1), function(x){
cat(sprintf("variant:%s /%s\n",x,length(a1)))
if(length(a1[[x]])>1){
tmpVector[[x]]<-parseVATannotation(a1[[x]],x)
}else{
if(!is.na(a1[[x]])){
tmpVector[[x]]<-parseVATannotation(a1[[x]],x)
}else{
tmpStr<-rep(NA,6)
tmpFrame<-data.frame(t(tmpStr),x,stringsAsFactors = FALSE)
colnames(tmpFrame)<-c("alteredType","alteredRatio","alteredTranscriptId","alteredPos","refNu","altNu","index")
tmpVector[[x]]<-tmpFrame
}
}
},mc.cores=useCores)
#alteredDf<-rbindlist(tmpVector)
alteredDf<-rbind.fill(tmpVector)
return(alteredDf)
}
#filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018","compositeDriver_input",mutationType)
filePath<-file.path(inputFileDir,mutationType)
fileName<-paste("vatDF_",tumorType,"_",mutationType,".Rd",sep="")
fileName<-file.path(filePath,fileName)
if(file.exists(fileName)){
load(fileName)
}else{
vatDF<-arrangeVATannotation(reducedFunseqOutputCDS$VA,useCores)
if( !file.exists(paste(filePath,sep="/")) ){
dir.create(paste(filePath,sep=""),recursive=TRUE)
}
save(vatDF,file=fileName)
}
## debug check
#tt<-cbind(vatDF,reducedFunseqOutputCDS)
####
#filePath<-"~/work/Ekta_lab/compositeDriver_data/cds_data/domainData"
#fileName<-"pfam28.9606.processed_w_eValue_cutOff.txt"
#fileName<-file.path(filePath,fileName)
fileName<-proteinDomainFile
domainDF<-read.table(fileName,header=TRUE,sep="\t",stringsAsFactors = FALSE)
domainDF<-split(domainDF,domainDF$geneSymbol)
#####
#geneName<-"CDKN2A"
addDomainScore<-function(domainDF,geneDF,useCores=1){
variantAnno<-{}
nameVector<-unique(names(geneDF))
#for(geneName in nameVector){
variantAnno<-mclapply(1:length(nameVector), function(x){
#variantAnno<-lapply(1:length(nameVector), function(x){
geneName<-nameVector[x]
#cat(sprintf("name: %s\n",geneName))
domainDat<-domainDF[[geneName]]
if(!is.null(domainDat)){
domainIRanges<-IRanges(start=(domainDat$envelopeStart),
end=(domainDat$envelopeEnd))
mcols(domainIRanges)$names<-domainDat$hmmName
variantDat<-geneDF[[geneName]]
#splice site debug
if(nrow(variantDat[is.na(variantDat$alteredPos),])>0){
variantDat[is.na(variantDat$alteredPos),]$alteredPos<-0
}
variantIRanges<-IRanges(start=(as.numeric(variantDat$alteredPos)),
end=(as.numeric(variantDat$alteredPos)))
hit<-findOverlaps(variantIRanges,domainIRanges)
variantDat$domain<-rep(NA,nrow(variantDat))
variantDat$domain[queryHits(hit)]<-mcols(domainIRanges)$names[subjectHits(hit)]
variantDat$newScore<-variantDat$score
#withinDomainIndex<-which(!variantDat$domain %in% "NA" & !variantDat$alteredType %in% "synonymous_variant")
withinDomainIndex<-which(!is.na(variantDat$domain) & !variantDat$alteredType %in% "synonymous_variant")
if(length(withinDomainIndex)>0){
# cds score add one when the variant is located within pfam protein domain
variantDat[withinDomainIndex,]$newScore<-variantDat[withinDomainIndex,]$newScore+1
}
variantAnno[[geneName]]<-variantDat
}else{
variantDat<-geneDF[[geneName]]
variantDat$domain<-rep(NA,nrow(variantDat))
variantDat$newScore<-variantDat$score
variantAnno[[geneName]]<-variantDat
}
#}
#})
},mc.cores=useCores)
names(variantAnno)<-nameVector
#variantAnno<-rbind.fill(variantAnno)
return(variantAnno)
}
####
mergeDF<-reducedFunseqOutputCDS
#fileName<-paste(tumorType,"_",mutationType,"_merge_variant_details.txt",sep="")
#fileName<-file.path(workDir,fileName)
#write.table(mergeDF,fileName,sep="\t",quote=FALSE,row.names = FALSE,col.names = TRUE)
groupType<-c("allSamples")
groupDF<-{}
groupDF[[groupType[1]]]<-mergeDF
#tmpDF<-reducedFunseqOutputNCDS
for(i in 1:length(groupType)){
tmpDF<-groupDF[[groupType[i]]]
groupName<-groupType[i]
cat(sprintf("Processing %s\n",groupName))
posIndex<-paste(tmpDF$chr,paste(tmpDF$posStart,tmpDF$posEnd,sep="-"),sep=":")
geneDF<-data.frame(tmpDF$sampleID,posIndex,tmpDF$chr,tmpDF$posStart,
tmpDF$posEnd,tmpDF$GENE,tmpDF$ref,tmpDF$alt,
tmpDF$score,tmpDF$RECUR,tmpDF$index,tmpDF$signalValue,
tmpDF$tumorType,stringsAsFactors = FALSE)
#tmpDF$tumorType,stringsAsFactors = FALSE)
colnames(geneDF)<-c("sampleID","posIndex","chr","posStart",
"posEnd","geneSymbol","ref","alt",
"score","recur","triMutIndex","signalValue","tumorType")
geneDF<-cbind(geneDF,vatDF)
#kk<-geneDF[geneDF$geneSymbol=="EGFR",]
#cdsExonLength$geneWithTranscript<-paste(cdsExonLength$external_gene_name,cdsExonLength$ensembl_transcript_id,sep=":")
cdsExonLengthSimple<-cdsExonLength[,c(4,5,7)]
colnames(cdsExonLengthSimple)<-c("alteredTranscriptId","cds_length","protein_length")
#geneDF0<-merge(geneDF,cdsExonLengthSimple,by="alteredTranscriptId",all.x=TRUE)
## some SNV without VA annotation, we will remove them. It may be the problem of VA software
geneDF<-merge(geneDF,cdsExonLengthSimple,by="alteredTranscriptId")
## remove splice_variant mutations
geneDF<-geneDF[!(geneDF$alteredType %in% "splice_variant"),]
# debug
# checkScoreDF<-geneDF[is.na(geneDF$score),]
# dim(checkScoreDF)
if( nrow(geneDF[is.na(geneDF$score),])>0 ){
geneDF[is.na(geneDF$score),]$score<-0
}
geneNameVector<-unique(geneDF$geneSymbol)
geneDF<-split(geneDF,geneDF$geneSymbol)
## Add new score it variant located within pfam domain
geneDF<-addDomainScore(domainDF,geneDF)
########
# additional filter
########
#filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018/compositeDriver_input",mutationType)
filePath<-file.path(inputFileDir,mutationType)
fileName<-paste("geneDFcluster_",tumorType,"_",mutationType,".Rd",sep="")
fileName<-file.path(filePath,fileName)
if(file.exists(fileName)){
load(fileName)
}else{
#minPoints<-2
#dRadius<-50
cat(sprintf("dbscan cluster parameter: minPoints:%s, dRadius:%s\n",minPoints,dRadius))
geneDFcluster<-{}
cat(sprintf("Processing geneDFcluster object for calculation\n"))
## this number is 1 for single cancer type calculation
## A correction factor only used in pancancer calculation
if(TRUE){
sampleSizeFactor<-1
names(sampleSizeFactor)<-tumorType
}
geneDFcluster<-mclapply(1:length(geneNameVector), function(x){
#geneDFcluster<-lapply(1:length(geneNameVector), function(x){
#geneDFunique<-mclapply(1:6, function(x){
#for(x in 1:length(geneNameVector)){
#cat(sprintf("%s\n",x))
geneName<-geneNameVector[x]
tmp0DF<-geneDF[[geneName]]
tmp0DF$oldScore<-tmp0DF$score
tmp0DF$score<-tmp0DF$newScore
## when variant cross multiple seqments of replication timing bins, take mean value.
tmp0DF$signalValue<-mean(tmp0DF$signalValue)
tmpDFdbscan<-data.frame(tmp0DF$posStart,rep(1,nrow(tmp0DF)),stringsAsFactors = FALSE)
colnames(tmpDFdbscan)<-c("posStart","dummy")
#dRadius<-(max(tmpDFdbscan$posStart)-min(tmpDFdbscan$posStart))/nrow(tmpDFdbscan)
dbscanOut<-dbscan(tmpDFdbscan,eps=dRadius,minPts = minPoints)
tmpDFdbscan$cluster<-dbscanOut$cluster
table(tmpDFdbscan$cluster)
if(FALSE){
plot(tmpDFdbscan$pos,tmpDFdbscan$cluster,col=tmpDFdbscan$cluster+1,pch=20)
points(tmpDFdbscan[tmpDFdbscan$cluster==0,]$pos,tmpDFdbscan[tmpDFdbscan$cluster==0,]$cluster,col="grey",pch=20)
}
tmpDFdbscan<-tmpDFdbscan[order(tmpDFdbscan$posStart),]
tmpDFdbscan<-tmpDFdbscan[!duplicated(tmpDFdbscan$posStart),]
tmp0DF<-merge(tmp0DF,tmpDFdbscan,all.x=TRUE)
tmp0DF<-tmp0DF[,c(2:5,1,6:25,27)]
#tmp0DFoutlier<-tmp0DF[tmp0DF$cluster==0,]
# cluster 0 as outlier
#tmp0DF<-tmp0DF[tmp0DF$cluster!=0,]
geneDFcluster[[geneName]]<-mergeVariantPositionClusterV2(tmp0DF,sampleSizeFactor,countsCutOff = 2)
#})
},mc.cores=useCores)
names(geneDFcluster)<-geneNameVector
if( !file.exists(paste(filePath,sep="/")) ){
dir.create(paste(filePath,sep=""),recursive=TRUE)
}
save(geneDFcluster,file=fileName)
}
geneDFcluster<-rbind.fill(geneDFcluster)
geneNameVector<-unique(geneDFcluster$geneSymbol)
geneDFcluster<-split(geneDFcluster,geneDFcluster$geneSymbol)
#geneDFbackup<-geneDF
#geneDF<-geneDFcluster
#geneNameVector_backup<-geneNameVector
#geneNameVector<-names(geneDF)
########
geneDFpatient<-{}
geneDFpatient<-mclapply(1:length(geneNameVector), function(x){
#tmpDat<-geneDF[[geneName]][!(duplicated(geneDF[[geneName]]$posIndex)),]
geneName<-geneNameVector[x]
tmpDF<-geneDF[[geneName]]
npat<-length(unique(tmpDF$sampleID))
geneDFpatient[[geneName]]<-npat
},mc.cores=useCores)
geneDFpatient<-unlist(geneDFpatient)
names(geneDFpatient)<-geneNameVector
compositeScoreVector<-{}
compositeScoreScaledVector<-{}
uniqueVariantPos<-{}
####
if(FALSE){
#filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018/compositeDriver_input",mutationType)
filePath<-file.path(inputFileDir,mutationType)
fileName<-paste("geneDFunique_",tumorType,"_",mutationType,".Rd",sep="")
fileName<-file.path(filePath,fileName)
if(file.exists(fileName)){
cat(sprintf("Load %s\n",fileName))
load(fileName)
}else{
cat(sprintf("Processing geneDFunique object for calculation\n"))
geneDFunique<-{}
## this number is 1 for single cancer type calculation
## A correction factor only used in pancancer calculation
if(TRUE){
sampleSizeFactor<-1
names(sampleSizeFactor)<-tumorType
}
geneDFunique<-mclapply(1:length(geneNameVector), function(x){
#geneDFunique<-mclapply(1:6, function(x){
#for(x in 1:length(geneNameVector)){
#cat(sprintf("%s\n",x))
geneName<-geneNameVector[x]
tmp0DF<-geneDF[[geneName]]
tmp0DF$oldScore<-tmp0DF$score
tmp0DF$score<-tmp0DF$newScore
## when variant cross multiple seqments of replication timing bins, take mean value.
tmp0DF$signalValue<-mean(tmp0DF$signalValue)
geneDF[[geneName]]<-tmp0DF
geneDFunique[[geneName]]<-geneDF[[geneName]][!(duplicated(geneDF[[geneName]]$posIndex)),]
geneDFunique[[geneName]]<-geneDFunique[[geneName]][order(geneDFunique[[geneName]]$posIndex),]
#recurrenceVector<-table(geneDF[[geneName]]$posIndex)
collapsedDF<-mergeVariantPosition(geneDF[[geneName]],sampleSizeFactor,countsCutOff = 2)
#geneDFunique[[geneName]]$occurence<-mergedDF[geneDFunique[[geneName]]$posIndex,]$occurence
#geneDFunique[[geneName]]$compositeScore<-mergedDF[geneDFunique[[geneName]]$posIndex,]$compositeScore
#geneDFunique[[geneName]]$tumorCounts<-mergedDF[geneDFunique[[geneName]]$posIndex,]$tumorCounts
tmp2DF<-geneDFunique[[geneName]]
#tmp3DF<-cbind(tmp2DF[,2:12],tmp2DF[,14:22])
#tmp3DF<-cbind(tmp2DF[,c(1,3:13)],tmp2DF[,15:18])
#tmp3DF$oldScore<-tmp2DF$oldScore
#geneDFunique[[geneName]]<-cbind(tmp3DF,collapsedDF[,2:7])
tmp3DF<-cbind(tmp2DF[,c(1,3:12)],tmp2DF[,15:24])
tmp3DF$oldScore<-tmp2DF$oldScore
geneDFunique[[geneName]]<-cbind(tmp3DF,collapsedDF[,2:6])
#}
},mc.cores=useCores)
names(geneDFunique)<-geneNameVector
if( !file.exists(paste(filePath,sep="/")) ){
dir.create(paste(filePath,sep=""),recursive=TRUE)
}
save(geneDFunique,file=fileName)
}
}
geneDFunique<-geneDFcluster
#####
compositeScoreVector<-mclapply(1:length(geneNameVector), function(x){
geneName<-geneNameVector[x]
compositeScoreVector[[geneName]]<-sum(geneDFunique[[geneName]]$compositeScore)
},mc.cores=useCores)
compositeScoreVector<-unlist(compositeScoreVector)
names(compositeScoreVector)<-geneNameVector
#######
compositeScoreScaledVector<-mclapply(1:length(geneNameVector), function(x){
geneName<-geneNameVector[x]
compositeScoreScaledVector[[geneName]]<-sum(geneDFunique[[geneName]]$compositeScoreScaled)
},mc.cores=useCores)
compositeScoreScaledVector<-unlist(compositeScoreScaledVector)
names(compositeScoreScaledVector)<-geneNameVector
#######
uniqueVariantPos<-mclapply(1:length(geneNameVector), function(x){
geneName<-geneNameVector[x]
uniqueVariantPos[[geneName]]<-nrow(geneDFunique[[geneName]])
},mc.cores=useCores)
uniqueVariantPos<-unlist(uniqueVariantPos)
names(uniqueVariantPos)<-geneNameVector
#######
numOfAlteration<-mclapply(1:length(geneNameVector), function(x){
geneName<-geneNameVector[x]
numOfAlteration<-sum(geneDFunique[[geneName]]$occurence)
},mc.cores=useCores)
numOfAlteration<-unlist(numOfAlteration)
names(numOfAlteration)<-geneNameVector
#######
#geneDF<-rbind.fill(geneDF)
geneDFunique<-rbind.fill(geneDFunique)
#####
#filePath<-file.path("~/work/Ekta_lab/cncdriver_analysis_Mar_2018/compositeDriver_input",mutationType)
filePath<-file.path(inputFileDir,mutationType)
fileName<-paste("variantTriMutCategoryParsed_",tumorType,"_",mutationType,".Rd",sep="")
fileName<-file.path(filePath,fileName)
if(!file.exists(fileName)){
cat(sprintf("Processing variantTriMutCategoryParsed object for calculation\n"))
variantTriMutCategoryParsed<-parsePosVariantByTriMutContextWithAnnotationCDS(geneDFunique,mutationDistMatrix,useCores=useCores)
if( !file.exists(paste(filePath,sep="/")) ){
dir.create(paste(filePath,sep=""),recursive=TRUE)
}
save(variantTriMutCategoryParsed,file=fileName)
}else{
cat(sprintf("Load %s\n",fileName))
load(fileName)
}
####
compositeScoreDF<-data.frame(uniqueVariantPos,numOfAlteration,geneDFpatient,compositeScoreVector,compositeScoreScaledVector,stringsAsFactors = FALSE)
rownames(compositeScoreDF)<-names(compositeScoreVector)
colnames(compositeScoreDF)<-c("uniqueVariantPos","numOfAlteration","numOfPatient","compositeScore","compositeScoreScaled")
cat(sprintf("Finish data preparation\n"))
######
#set.seed(42)
#mutationType<-"CDS"
#reSampleNum<-1000000
compositeScore<-{}
compositeScoreResample<-{}
numOfAlterationPos<-{}
numOfAlteration<-{}
numOfPatient<-{}
numOfAboveCScore<-{}
reSampleSize<-{}
pValue<-{}
outputDf<-{}
#######
cdsMutationCheckList<-unique(geneDFunique$geneSymbol)
# remove some genes not have replication timing data
cdsMutationCheckList<-intersect(cdsMutationCheckList,elementReplicationTimingDF$elementName)
#ncdsMutationCheckList<-c("TBX5","TP53","NRXN1","ITGB6","GRIA2","LATS1","YES1","DPP4","PAX7","EXOC1","NOS3","DST","BRCA2","BRCA1","CDK12","RB1")
#ncdsMutationCheckList<-c("ABCB10","AKT1","LAMA1","LRP2","PIK3CA","SF3B1","TP53","ZNF263","IARS","GATA3","COPB2","CFTR","ZAP70","ALK","NCOR1","PTEN","ARID1A","CBFB")
#ncdsMutationCheckList<-c("TP53","SPOP","FOXA1","PTEN","RB1","ABCC4","CSMD3","BRINP2","UMODL1")
numOfgeneCheck<-length(cdsMutationCheckList)
cat(sprintf("%s elements have alterations\n",numOfgeneCheck))
#numOfgeneCheck<-12
#####
# further split tasks into small units if large-scale parallelization calculations
####
#unitSize<-12
totalTaskNum<-ceiling(numOfgeneCheck/unitSize)
#totalTaskNum
loopVector<-{}
for(i in 1:totalTaskNum){
startNum<-1 + unitSize*(i-1)
endNum<-startNum + unitSize -1
if(endNum>numOfgeneCheck){
endNum<-numOfgeneCheck
}
loopVector[[i]]<-c(startNum:endNum)
}
if(taskNum!=0){
loopSequence<-loopVector[[taskNum]]
}else{
loopSequence<-c(1:numOfgeneCheck)
}
#######
# main p-value calculation section
#######
detectCores()
cl<-makeCluster(useCores)
registerDoParallel(cl)
time_start<-proc.time()
cat(sprintf("Use %s cores\n",useCores))
cat (sprintf ("Start calculating p-value for %s candidates\n", numOfgeneCheck) )
filePath<-workDir
fileName<-paste("log_",mutationType,"_triMut_distribution_RT_similarity_",replicationTimingCutOff,"_task_",taskNum,"_.txt",sep="")
fileName<-file.path(filePath,fileName)
cat(sprintf ("progress can be look up at \n"))
cat(sprintf("%s\n",fileName))
outputDf<-foreach(k=loopSequence, .options.RNG=seedNum, .packages=c("stringr","parallel","plyr","CNCDriver")) %dorng% {
#outputDf<-foreach(k=1:numOfgeneCheck, .options.RNG=seedNum, .packages=c("FNN","stringr","parallel","plyr")) %dorng% {
#for(k in 1:numOfgeneCheck){
#filePath<-file.path("~/work/Ekta_lab/JasonWong_dataset/compositeFunSeq_result_triMut_match",tumorType)
filePath<-workDir
fileName<-paste("log_",mutationType,"_triMut_distribution_RT_similarity_",replicationTimingCutOff,"_task_",taskNum,"_.txt",sep="")
fileName<-file.path(filePath,fileName)
#cat(sprintf("%s/%s\t",k,numOfgeneCheck),file=fileName,append=TRUE)
cat(sprintf("%s/%s\ttype:%s\tgene:%s\n",k,numOfgeneCheck,mutationType,cdsMutationCheckList[k]),file=fileName,append=TRUE)
#cat(sprintf("%s/%s\t",k,numOfgeneCheck))
#cat(sprintf("type:%s\tgene:%s\n",mutationType,ncdsMutationCheckList[k]))
numOfAlterationPos<-compositeScoreDF[rownames(compositeScoreDF) %in% cdsMutationCheckList[k],]$uniqueVariantPos
numOfAlteration<-sum(geneDFunique[geneDFunique$geneSymbol %in% cdsMutationCheckList[k],]$occurence)
numOfPatient<-geneDFpatient[[cdsMutationCheckList[k]]]
compositeScore<-compositeScoreDF[rownames(compositeScoreDF) %in% cdsMutationCheckList[k],]$compositeScoreScaled
gname<-cdsMutationCheckList[k]
#gname<-"TERT"
#a1<-geneDFunique[geneDFunique$geneSymbol==gname,]
a1<-variantTriMutCategoryParsed[variantTriMutCategoryParsed$geneSymbol==gname,]
#reSamplePosSize<-nrow(a1)
#compositeScore[k]<-sum(a1$compositeScore)
#####
# still need to modifiy for panCancer triMutIndex complexity
#####
#variantTable<-{}
#variantTable<-table(a1$triMutIndex)
#variantTriMutCategory<-data.frame(names(variantTable),as.numeric(variantTable),stringsAsFactors = FALSE)
#variantTriMutCategory<-makeTriMutCountsByPosition(a1$categoryCounts)
#colnames(variantTriMutCategory)<-c("categoryName","variantCounts")
#variantTriMutCategory<-parsePosVariantByTriMutContext(a1$categoryCounts)
#variantTriMutCategory$compositeScore<-a1$compositeScore
#variantTriMutCategory$posIndex<-a1$posIndex
#variantTriMutCategory$geneSymbol<-a1$geneSymbol
variantTriMutCategory<-a1
#####
#a2<-geneDFunique[geneDFunique$geneSymbol!=gname,]
#a2<-a2[!is.na(a2$signalValue),]
a2<-variantTriMutCategoryParsed[variantTriMutCategoryParsed$geneSymbol!=gname,]
a2<-a2[!is.na(a2$signalValue),]
#####
featureDF<-elementReplicationTimingDF[elementReplicationTimingDF$elementName==gname,]
#a3<-a2[a2$signalValue<=featureDF$signalUpper & a2$signalValue>=featureDF$signalLower,]
#geneDFuniqueSelected<-a3
backgroundPosVariant<-a2
#####
## this step need reduce calculation to gain speedup
#backgroundPosVariant<-parsePosVariantByTriMutContext(geneDFuniqueSelected$categoryCounts,useCores=1)
#backgroundPosVariant$compositeScore<-geneDFuniqueSelected$compositeScore
#backgroundPosVariant$posIndex<-geneDFuniqueSelected$posIndex
#backgroundPosVariant$geneSymbol<-geneDFuniqueSelected$geneSymbol
#backgroundPosVariant<-a3
#####
#filePath<-file.path("~/work/Ekta_lab/JasonWong_dataset/compositeFunSeq_result_triMut_match",tumorType)
filePath<-workDir
debugFileName<-paste("debug_",mutationType,"_triMut_distribution_RT_similarity_",replicationTimingCutOff,"_task_",taskNum,"_.txt",sep="")
debugFileName<-file.path(filePath,debugFileName)
## plotting correlation for debug
if(FALSE){
filePath<-file.path("~/work/Ekta_lab/JasonWong_dataset/checkCorrelation",tumorType)
fileName<-paste(tumorType,"_correlation_RT_",mutationType,"_",gname,".png",sep="")
fileName<-file.path(filePath,fileName)
png(fileName,width=800,height=600)
plot(geneDFunique$signalValue,geneDFunique$compositeScore,pch=19,xlab="",ylab="",cex=0.6)
title(main=paste(tumorType," (n=",sampleSize ,"), ",mutationType,sep=""),
xlab="replication timing [ late to early ]",
ylab="positional CompositeScore")
mtext(paste(gname,"num pos:",numOfAlterationPos,sep=" "))
points(geneDFuniqueSelected$signalValue,geneDFuniqueSelected$compositeScore,col="red",pch=19,cex=0.6)
#points(a22$signalValue,a22$compositeScore,col="blue",pch=19,cex=0.6)
points(a1$signalValue,a1$compositeScore,col="green",pch=19,cex=0.6)
dev.off()
}
#replicationTimingCutOff<-0.1
#system.time(
result<-calculatePvalueWithMutCategoryDistributionMatched7(variantTriMutCategory,backgroundPosVariant,mutationDistMatrix,featureDF,reSampleNum,replaceFlag = FALSE,replicationTimingCutOff,debugFileName,debugMode)
#)
if(result$pValue<=reRunPvalueCutOff){
reSampleNumExpanded<-reSampleNum*10
result<-calculatePvalueWithMutCategoryDistributionMatched7(variantTriMutCategory,backgroundPosVariant,mutationDistMatrix,featureDF,reSampleNumExpanded,replaceFlag = FALSE,replicationTimingCutOff,debugFileName,debugMode)
}
pValue<-result$pValue
numOfAboveCScore<-result$numOfAboveCScore
reSampleSize<-result$reSampleSize
tmpResult<-data.frame(gname,numOfAlterationPos,numOfAlteration,numOfPatient,compositeScore,numOfAboveCScore,reSampleSize,pValue,stringsAsFactors = FALSE)
#colnames(tmpResult)<-c("geneSymbol","numOfAlterationPos","numOfAlteration","numOfPatient","compositeDriverScore","numOfAboveCDscore","reSampleNum","pValue")
colnames(tmpResult)<-c("elementPos","numOfAlterationPos","numOfAlteration","numOfPatient","compositeDriverScore","numOfAboveCDscore","reSampleNum","pValue")
#filePath<-file.path("~/work/Ekta_lab/JasonWong_dataset/compositeFunSeq_result_triMut_match",tumorType)
filePath<-workDir
fileName<-paste("tmpOutput_",mutationType,"_triMut_distribution_RT_similarity_",replicationTimingCutOff,"_task_",taskNum,"_.txt",sep="")
fileName<-file.path(filePath,fileName)
write.table(tmpResult,file=fileName,sep="\t",quote=FALSE,row.names = FALSE,col.names = FALSE,append=TRUE)
return(tmpResult)
}
stopCluster(cl)
time_elapesed<-proc.time()-time_start
cat (sprintf ("Time for calculating p-value of %s candidates: %.2f sec\n", numOfgeneCheck, time_elapesed[3]) )
outputDf<-rbind.fill(outputDf)
#####
#outputDf<-data.frame(ncdsMutationCheckList[1:numOfgeneCheck],numOfAlterationPos,numOfAlteration,numOfPatient,compositeScore,numOfAboveCScore,reSampleSize,pValue)
outputDf<-outputDf[order(outputDf$pValue),]
outputDf$qValue<-p.adjust(outputDf$pValue,method = "BH")
#colnames(outputDf)<-c("geneSymbol","numOfAlterationPos","numOfAlteration","numOfPatient","compositeDriverScore","numOfAboveCDscore","reSampleNum","pValue","qValue")
colnames(outputDf)<-c("elementPos","numOfAlterationPos","numOfAlteration","numOfPatient","compositeDriverScore","numOfAboveCDscore","reSampleNum","pValue","qValue")
fileName<-paste(tumorType,"_outputDf_",mutationType,"_",groupName,"_with_RT_correction_similarity_",replicationTimingCutOff,"_triMut_match_distribution_task_",taskNum,"_iter_",reSampleNum,"_.txt",sep="")
fileName<-file.path(workDir,fileName)
write.table(outputDf,file=fileName,sep="\t",quote=FALSE,row.names =FALSE,col.names = TRUE)
}
return(outputDf)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.