R/MEDIPS.meth.R

##########################################################################
##Function calculates genome wide methylation signals for each provided set and
##calculates differential coverage between groups of MEDIPS SETs (if provided) and
##calculates CNV scores between groups of INPUT SETs (if provided)
##########################################################################
##Input:	Groups of MEDIPS and INPUT SETs
##Param:	MSet1, MSet2, CSet, ISet1, ISet2, chr, p.adj, diff.method, minRowSum, norm
##Output:	Result table
##Requires:	DNAcopy
##Modified:	04/02/2015
##Author:	Lukas Chavez

MEDIPS.meth = function(
		MSet1 = NULL, 
		MSet2 = NULL, 
		CSet = NULL, 
		ISet1 = NULL, 
		ISet2 = NULL, 
		chr = NULL, 
		p.adj="bonferroni", 
		diff.method="edgeR", 
		CNV=FALSE,
		MeDIP=FALSE,
		minRowSum=10,
		diffnorm="tmm"
		)
{
	nMSets1 = length(MSet1)	
	nMSets2 = length(MSet2)	
	nISets1 = length(ISet1)	
	nISets2 = length(ISet2)	
	if(is.list(CSet)) CSet=CSet[[1]]
	if(!is.list(MSet1)) MSet1=c(MSet1)
	if(!is.list(MSet2)) MSet2=c(MSet2)
	if(!is.list(ISet1)) ISet1=c(ISet1)
	if(!is.list(ISet2)) ISet2=c(ISet2)
	
	##Proof of correctness
	#######################
	if(MeDIP){
		if(class(CSet)!="COUPLINGset"){stop("You have to state a COUPLINGset object!")}
	}

	controlSet = MSet1[[1]]

	for(i in 1:nMSets1){
		if(class(MSet1[[i]])!="MEDIPSset" & class(MSet1[[i]])!="MEDIPSroiSet"){stop("You have to state a MEDIPSset or MEDIPSroiSet object!")}
		if(length(genome_count(MSet1[[i]]))!=length(genome_count(controlSet)))stop("MEDIPSset/MEDIPSroiSet objects are of different length!")
	}
	if(!is.null(MSet2)){
    		for(i in 1:nMSets2){
			if(class(MSet2[[i]])!="MEDIPSset" & class(MSet2[[i]])!="MEDIPSroiSet"){stop("You have to state a MEDIPSset or MEDIPSroiSet object!")}
			if(length(genome_count(MSet2[[i]]))!=length(genome_count(controlSet)))stop("MEDIPSset/MEDIPSroiSet objects are of different length!")
    		}
	}	
	if(!is.null(ISet1)){
		for(i in 1:nISets1){
			if(class(ISet1[[i]])!="MEDIPSset" & class(ISet1[[i]])!="MEDIPSroiSet"){stop("You have to state a MEDIPSset or MEDIPSroiSet object!")}
			if(length(genome_count(ISet1[[i]]))!=length(genome_count(controlSet)))stop("MEDIPSset/MEDIPSroiSet objects are of different length!")
		}
	}
	if(!is.null(ISet2)){
		for(i in 1:nISets2){
			if(class(ISet2[[i]])!="MEDIPSset" & class(ISet2[[i]])!="MEDIPSroiSet"){stop("You have to state a MEDIPSset or MEDIPSroiSet object!")}
			if(length(genome_count(ISet2[[i]]))!=length(genome_count(controlSet)))stop("MEDIPSset/MEDIPSroiSet objects are of different length!")
		}
	}


	##Data preparation
	###################
	##Calculate genomic coordinates
	##################################
	if(class(controlSet)=="MEDIPSset"){
		window_size = window_size(controlSet)
		no_chr_windows = ceiling(chr_lengths(controlSet)/window_size(controlSet))
		supersize_chr = cumsum(no_chr_windows)	
		GRanges.genome = MEDIPS.GenomicCoordinates(supersize_chr, no_chr_windows, chr_names(controlSet), chr_lengths(controlSet), window_size(controlSet))
	}
	else if(class(controlSet)=="MEDIPSroiSet"){
		GRanges.genome = rois(controlSet)
		window_size=as.numeric(width(GRanges.genome))
	} 
	
	##Create data frame for all genomic windows and MEDIPS SETs
	if(MeDIP){
		base = data.frame(chr=as.vector(seqnames(GRanges.genome)), start=start(GRanges.genome), stop=end(GRanges.genome), CF=genome_CF(CSet), stringsAsFactors=F)
	}
	else{
		base = data.frame(chr=as.vector(seqnames(GRanges.genome)), start=start(GRanges.genome), stop=end(GRanges.genome), stringsAsFactors=F)
	}

	rm(controlSet)
	gc()

	counts.medip = NULL
	rpkm.medip = NULL
	rms = NULL	
	counts.input = NULL
	rpkm.input = NULL
	
	##Add counts
	if(!is.null(MSet1)){
		for(i in 1:nMSets1){
			cat(paste("Preprocessing MEDIPS SET ", i, " in MSet1...\n", sep=""))
			counts.medip = cbind(counts.medip, MSet1=genome_count(MSet1[[i]]))
			rpkm.medip = cbind(rpkm.medip, round(((genome_count(MSet1[[i]])*10^9)/(window_size*number_regions(MSet1[[i]]))), digits=2))
			if(MeDIP){
				ccObj = MEDIPS.calibrationCurve(MSet=MSet1[[i]], CSet=CSet, input=F)
				rms = cbind(rms, MEDIPS.rms(MSet1[[i]], CSet, ccObj=ccObj))
			}		
		}
	}	
	if(!is.null(MSet2)){
		 for(i in 1:nMSets2){
			cat(paste("Preprocessing MEDIPS SET ", i, " in MSet2...\n", sep=""))
			counts.medip = cbind(counts.medip, MSet2=genome_count(MSet2[[i]]))
			rpkm.medip = cbind(rpkm.medip, round(((genome_count(MSet2[[i]])*10^9)/(window_size*number_regions(MSet2[[i]]))), digits=2))
			if(MeDIP){
				ccObj = MEDIPS.calibrationCurve(MSet=MSet2[[i]], CSet=CSet, input=F)
				rms = cbind(rms, MEDIPS.rms(MSet2[[i]], CSet, ccObj=ccObj))
			}
	  	 }
	}	
	if(!is.null(ISet1)){		
		for(i in 1:nISets1){
			cat(paste("Preprocessing INPUT SET ", i, " in ISet1...\n", sep=""))
			counts.input = cbind(counts.input, ISet1=genome_count(ISet1[[i]]))
			rpkm.input = cbind(rpkm.input, round(((genome_count(ISet1[[i]])*10^9)/(window_size*number_regions(ISet1[[i]]))), digits=2))
		   }
	}	
	if(!is.null(ISet2)){
		for(i in 1:nISets2){
			cat(paste("Preprocessing INPUT SET ", i, " in ISet2...\n", sep=""))
			counts.input = cbind(counts.input, ISet2=genome_count(ISet2[[i]]))
			rpkm.input = cbind(rpkm.input, round(((genome_count(ISet2[[i]])*10^9)/(window_size*number_regions(ISet2[[i]]))), digits=2))
		 	}
	}		
	
	##Extract data for selected chromosome
	#######################################
	if(!is.null(chr)){
		fi=base[,1]%in%chr
		
		cat("Extracting data for", chr, "...\n", sep=" ")
		if(length(fi)==0){stop("Stated chromosome does not exist in the COUPLING SET.")}
		if(!is.null(counts.medip)){
			counts.medip = counts.medip[fi,]
			rpkm.medip = rpkm.medip[fi,]
			rms = rms[fi,]
		}
		if(!is.null(counts.input)){
			counts.input = counts.input[fi,]
			rpkm.input = rpkm.input[fi,]
		}
		base = base[fi,]
		cat(nrow(base), "windows on", chr, "\n",sep=" ")		
	}
	
	##Set colnames and transform to data.frames
	############################################
	col.names.count = NULL
	col.names.rpkm = NULL
	col.names.rms = NULL
	if(nMSets1!=0){
		for(i in 1:nMSets1){
			col.names.count = c(col.names.count, paste(sample_name(MSet1[[i]]), ".counts", sep=""))
			col.names.rpkm = c(col.names.rpkm, paste(sample_name(MSet1[[i]]), ".rpkm", sep=""))
			if(MeDIP){
				col.names.rms = c(col.names.rms, paste(sample_name(MSet1[[i]]), ".rms", sep=""))
			}
		}		
	}
	if(nMSets2!=0){
		for(i in 1:nMSets2){
			col.names.count = c(col.names.count, paste(sample_name(MSet2[[i]]), ".counts", sep=""))
			col.names.rpkm = c(col.names.rpkm, paste(sample_name(MSet2[[i]]), ".rpkm", sep=""))
			if(MeDIP){
				col.names.rms = c(col.names.rms, paste(sample_name(MSet2[[i]]), ".rms", sep=""))
			}
		}		
	}
	if(nMSets1!=0 | nMSets2!=0){
		
		counts.medip = data.frame(counts.medip)
		colnames(counts.medip) = col.names.count
		rpkm.medip =  data.frame(rpkm.medip)
		colnames(rpkm.medip) = 	col.names.rpkm
		if(MeDIP){
			rms = data.frame(rms)
			colnames(rms) = col.names.rms
		}
	}		
	
	col.names.count.input = NULL
	col.names.rpkm.input = NULL
	if(nISets1!=0){
		for(i in 1:nISets1){
			col.names.count.input = c(col.names.count.input, paste(sample_name(ISet1[[i]]), ".counts", sep=""))
			col.names.rpkm.input = c(col.names.rpkm.input, paste(sample_name(ISet1[[i]]), ".rpkm", sep=""))
		}		
	}
	if(nISets2!=0){
		for(i in 1:nISets2){
			col.names.count.input = c(col.names.count.input, paste(sample_name(ISet2[[i]]), ".counts", sep=""))
			col.names.rpkm.input = c(col.names.rpkm.input, paste(sample_name(ISet2[[i]]), ".rpkm", sep=""))
		}		
	}
	
	if(nISets1!=0 | nISets2!=0){		
		
		counts.input = data.frame(counts.input)
		colnames(counts.input) = 	col.names.count.input
		rpkm.input = data.frame(rpkm.input)	
		colnames(rpkm.input) = 		col.names.rpkm.input
	}
		
	
	##If two groups of MEDIPS SETs are given 
	##calculate differential coverage
	##################################
	if(!is.null(MSet1) & !is.null(MSet2)){
		cat(paste("Differential coverage analysis...\n", sep=" "))
		
		##Correct for test selection if necessary
		if((nMSets1<3 | nMSets2<3) & diff.method=="ttest"){
			stop("Method 'ttest' is not valid for less than 3 replicates per group. Method 'edgeR' can be applied in this case.")
		}		
		
		if(diff.method=="edgeR"){
			
			if(diffnorm=="rpkm" | diffnorm=="rms"){
				stop("rpkm and rms normalization are not available for the edgeR mode. Here, differential enrichment is performed on the count data. Please set diffnorm to tmm or quantile.\n")
			}
			
			##Extract number of reads per sample
			if(!is.null(MSet1)){
				n.r.M1 = NULL
				for(i in 1:nMSets1){
					n.r.M1 = c(n.r.M1, number_regions(MSet1[[i]]))		
				}
			}	
			if(!is.null(MSet2)){
		 		n.r.M2 = NULL
		 		for(i in 1:nMSets2){
					n.r.M2 = c(n.r.M2, number_regions(MSet2[[i]]))	
	  	 		}
			}	

			#Quantile normalization
			if(diffnorm=="quantile"){
				cat("Performing quantile normalization on sequencing counts. Please note, the returned counts - but not the returned rpkm values - will be quantile normalized.\n")
				counts.medip.coln = colnames(counts.medip)				
				counts.medip = preprocessCore::normalize.quantiles(as.matrix(counts.medip), copy=FALSE)	
				counts.medip <- round(counts.medip)
				colnames(counts.medip) = counts.medip.coln
			}
		
			diff.results.list = MEDIPS.diffMeth(base=base, values=counts.medip, diff.method="edgeR", nMSets1=nMSets1, nMSets2=nMSets2, p.adj=p.adj, n.r.M1=n.r.M1, n.r.M2=n.r.M2, MeDIP=MeDIP, minRowSum=minRowSum, diffnorm=diffnorm)
		}
		else if(diff.method=="ttest"){

			if(diffnorm=="quantile" | diffnorm=="tmm"){
				stop("Quantile and TMM normalization are not available for the t-test mode. Please chose edgeR or set diffnorm to rpkm or rms.\n")
			}

			if(diffnorm=="rpkm"){
				diff.results.list = MEDIPS.diffMeth(base=base, values=rpkm.medip, diff.method="ttest", nMSets1=nMSets1, nMSets2=nMSets2, p.adj=p.adj, MeDIP=MeDIP, minRowSum=minRowSum)
			}
			else if(diffnorm=="rms"){
				if(MeDIP){
					diff.results.list = MEDIPS.diffMeth(base=base, values=rms, diff.method="ttest", nMSets1=nMSets1, nMSets2=nMSets2, p.adj=p.adj, MeDIP=MeDIP, minRowSum=minRowSum)
				}
				else{
					stop("Invalid specification for parameter diffnorm because parameter MeDIP is FALSE (no rms values have been calculated).")
				}
			}
			else{
				stop("Unknown specification for parameter diffnorm.")
			}
		}
		else{stop("Selected method for calculating differential coverage not supported")}
		
		cat("Please note, log2 ratios are reported as log2(MSet1/MSet2).\n")
		diff.results = diff.results.list$diff.results
		diff.index = diff.results.list$diff.index
				
		rm(diff.results.list)
		gc()
	
	}
	else{
		cat("No differential coverage will be calculated- only one group of MEDIPS SETs given.\n")
	}
	
	##If two groups of INPUT SETs are given 
	##calculate CNV on mean per set
	##################################
	if(CNV){
		if(!is.null(ISet1) & !is.null(ISet2)){
			cat(paste("CNV analysis...\n", sep=" "))		
			cnv.combined = MEDIPS.cnv(base=base, rpkm.input=rpkm.input, nISets1=nISets1, nISets2=nISets2)
		}
		else{
			cat("Cannot perform CNV analysis- please specify two groups of INPUT SETs!\n")
		}
	}
	
	##Create results table	
	##################################
	cat(paste("Creating results table...\n", sep=" "))
	if(!is.null(counts.medip)){
		if(MeDIP){
			results = data.frame(base, counts.medip, rpkm.medip, rms, stringsAsFactors=F)
		}
		else{
			results = data.frame(base, counts.medip, rpkm.medip, stringsAsFactors=F)
		}
	}
	if(!is.null(counts.input)){
		if(!is.null(counts.medip))
		{
			results = data.frame(results, counts.input, rpkm.input, stringsAsFactors=F)
		}
		else{
			results = data.frame(base, counts.input, rpkm.input, stringsAsFactors=F)
		}
	}
	
	##Add mean counts, rpkm, rms columns
	set1idx=1:(nMSets1)
	counts.mean.C=numeric(dim(counts.medip)[1])
	rpkm.mean.C=numeric(dim(rpkm.medip)[1])
	for (i in set1idx){
		counts.mean.C=counts.mean.C+counts.medip[,i]
		rpkm.mean.C=rpkm.mean.C+rpkm.medip[,i]
	}
	counts.mean.C=counts.mean.C/nMSets1
	rpkm.mean.C=rpkm.mean.C/nMSets1
	if(MeDIP){
		rms.mean.C =numeric(dim(rms)[1])
		for (i in set1idx){
			rms.mean.C=rms.mean.C+rms[,i]
		}
		rms.mean.C=rms.mean.C/nMSets1
		results = data.frame(results, MSets1.counts.mean=counts.mean.C, MSets1.rpkm.mean=rpkm.mean.C, MSets1.rms.mean=rms.mean.C, stringsAsFactors=F)
		rm(counts.mean.C,rpkm.mean.C,set1idx,rms.mean.C)
	}
	else{
		results = data.frame(results, MSets1.counts.mean=counts.mean.C, MSets1.rpkm.mean=rpkm.mean.C, stringsAsFactors=F)
		rm(counts.mean.C,rpkm.mean.C,set1idx)
	}

	if(nMSets2>0){
		set2idx=(nMSets1+1):(nMSets1+nMSets2)
		counts.mean.T=numeric(dim(counts.medip)[1])
		rpkm.mean.T=numeric(dim(rpkm.medip)[1])
		for (i in set2idx){
			counts.mean.T=counts.mean.T+counts.medip[,i]
			rpkm.mean.T=rpkm.mean.T+rpkm.medip[,i]
		}	
		counts.mean.T=counts.mean.T/nMSets2
		rpkm.mean.T=rpkm.mean.T/nMSets2

		if(MeDIP){
			rms.mean.T =numeric(dim(rms)[1])
			for (i in set2idx){
				rms.mean.T=rms.mean.T+rms[,i]
			}
			rms.mean.T=rms.mean.T/nMSets2
			results = data.frame(results, MSets2.counts.mean=counts.mean.T, MSets2.rpkm.mean=rpkm.mean.T, MSets2.rms.mean=rms.mean.T, stringsAsFactors=F)
			rm(counts.mean.T,rpkm.mean.T,set2idx,rms.mean.T)
		}
		else{
			results = data.frame(results, MSets2.counts.mean=counts.mean.T, MSets2.rpkm.mean=rpkm.mean.T, stringsAsFactors=F)
			rm(counts.mean.T,rpkm.mean.T,set2idx)
		}
	}

	if(nISets1>1){
		setI1idx=1:(nISets1)
		counts.input.mean.C = counts.input[,setI1idx[1]]
		rpkm.input.mean.C = rpkm.input[,setI1idx[1]]
		for (i in setI1idx[-1]){
		  counts.input.mean.C =counts.input.mean.C+counts.input[,i]
		  rpkm.input.mean.C =rpkm.input.mean.C+rpkm.input[,i]
		}
		counts.input.mean.C =counts.input.mean.C/nISets1
		rpkm.input.mean.C =rpkm.input.mean.C/nISets1
		results = data.frame(results, ISets1.counts.mean=counts.input.mean.C, ISets1.rpkm.mean=rpkm.input.mean.C, stringsAsFactors=F)
		rm(counts.input.mean.C,rpkm.input.mean.C,setI1idx)
	}
	if(nISets2>1){
		setI2idx=(nISets1+1):(nISets1+nISets2)
		counts.input.mean.T = counts.input[,setI2idx[1]]
		rpkm.input.mean.T = rpkm.input[,setI2idx[1]]
		for (i in setI2idx[-1]){
		  counts.input.mean.T =counts.input.mean.T+counts.input[,i]
		  rpkm.input.mean.T =rpkm.input.mean.T+rpkm.input[,i]
		}
		counts.input.mean.T =counts.input.mean.T/nISets2
		rpkm.input.mean.T =rpkm.input.mean.T/nISets2

		results = data.frame(results, ISets2.counts.mean=counts.input.mean.T, ISets2.rpkm.mean=rpkm.input.mean.T, stringsAsFactors=F)
		rm(counts.input.mean.T,rpkm.input.mean.T,setI2idx)
	}
	
	if(MeDIP){rm(base, counts.medip, rpkm.medip, rms, counts.input, rpkm.input)}
	else{rm(base, counts.medip, rpkm.medip, counts.input, rpkm.input)}
	gc()
	
	##Add diff.meth results
	if(nMSets1!=0 & nMSets2!=0){
		cat(paste("Adding differential coverage results...\n", sep=" "))
		dummy.results = matrix(ncol=ncol(diff.results), nrow=nrow(results))
		
		if(diff.method=="edgeR"){
			c.names = colnames(diff.results)
			diff.results <- matrix(unlist(diff.results), ncol=ncol(diff.results), byrow=FALSE)
			colnames(diff.results)=c.names
			rm(c.names)
		}
		
		dummy.results[diff.index,] = diff.results
		colnames(dummy.results)=colnames(diff.results)		
		results = data.frame(results, dummy.results, stringsAsFactors=F)
			
		rm(diff.results, dummy.results, diff.index)
		gc()
	}
	
	##Add CNV results
	if(!is.null(ISet1) & !is.null(ISet2)){
		if(CNV){
			cat(paste("Adding CNV results...\n", sep=" "))
			dummy.results = matrix(ncol=1, nrow=(nrow(results)))
			for(i in 1:nrow(cnv.combined)){
				dummy.results[cnv.combined[i,1]:cnv.combined[i,2]] = cnv.combined[i,3]		
			}
		colnames(dummy.results)="CNV.log2.ratio"
		results = data.frame(results, dummy.results, stringsAsFactors=F)
		
		rm(dummy.results)	
		gc()
		}
	}
	
	rownames(results) = seq(1, nrow(results))
    	gc()
	return(results)
		
}
emmecola/MEDIPS documentation built on May 16, 2019, 5:10 a.m.