COHCAP: CpG Island Analysis Pipeline for Illumina Methylation Array and Targeted BS-Seq Data

custom.mean = function(arr)
{
	return(mean(as.numeric(arr), na.rm = T))
}#end def custom.mean

custom.cor = function(arr, var1)
{
	if(length(arr[!is.na(arr)]) >= 0.5 * length(arr)){
		return(cor(as.numeric(arr), var1, use="complete.obs"))
	}else{
		return(NA)
	}
}#end def custom.cor

ttest2 = function(arr, grp1, grp2)
{
	group1 = as.numeric(arr[grp1])
	group2 = as.numeric(arr[grp2])
	#print(grp1)
	#print(grp2)
	#stop()
	#require at least 2 replicates
	if((length(group1[!is.na(group1)]) >=2) & (length(group2[!is.na(group2)]) >=2))
	{
		result = t.test(group1, group2)
		if(is.na(result$p.value)){
			return(1)
		}else{
			return(result$p.value)
		}
	}
	else
	{
		return(1)
	}
}#end def ttest2

anova.pvalue = function(arr, grp.levels)
{
	#print(arr)
	#print(grp.levels)
	
	grp.no.na = as.factor(as.character(grp.levels[!is.na(arr)]))
	if(length(levels(grp.no.na)) >= 2)
		{
			test = aov(arr ~ grp.levels) 
			result = summary(test)[[1]][['Pr(>F)']][1]
			if(is.null(result))
				{
					return(1)
				}
			else
				{
					return(result)
				}
		}
	else
		{
			return(1)
		}
}#end def anova.pvalue

lm.pvalue = function(arr, var1)
{
	if(length(arr[!is.na(arr)]) >= 0.5 * length(arr)){
		fit = lm(as.numeric(arr)~var1)
		result = summary(fit)
		pvalue = result$coefficients[2,4]
		return(pvalue)
	}else{
		return(NA)
	}
}#end def lm.pvalue


lm.pvalue2 = function(arr, var1, var2)
{
	if(typeof(var2) == "character"){
		var2 = as.factor(var2)
	} 
	if(length(arr[!is.na(arr)]) >= 0.5 * length(arr)){
		fit = lm(as.numeric(arr)~var1 + as.numeric(var2))
		result = summary(fit)
		pvalue = result$coefficients[2,4]
		return(pvalue)
	}else{
		return(NA)
	}
}#end def lm.pvalue

annova.2way.pvalue = function(arr, grp.levels, pairing.levels)
{
	#print(arr)
	#print(grp.levels)
	#print(pairing.levels)
	
	grp.no.na = as.factor(as.character(grp.levels[!is.na(arr)]))
	
	min.reps = min(table(grp.no.na))
	
	if((length(levels(grp.no.na)) >= 2) && (min.reps >= 2))
		{
			test = aov(arr ~ grp.levels + pairing.levels) 
			result = summary(test)[[1]][['Pr(>F)']][1]
			if(is.null(result))
				{
					return(1)
				}
			else
				{
					return(result)
				}
		}
	else
		{
			return(1)
		}
}#end def anova.pvalue

count.observed = function(arr){
	return(length(arr[!is.na(arr)]))
}#end def count.observed

cor.dist = function(mat){
	cor.mat = cor(as.matrix(t(mat)), use="pairwise.complete.obs")
	dis.mat = 1 - cor.mat
	dis.mat[is.na(dis.mat)]=2
	return(as.dist(dis.mat))
}#end def cor.dist

cpp.ANOVA.1way.wrapper = function(arr, grp.levels, ref){
	full_beta = arr[!is.na(arr)]
	betaT = arr[(grp.levels != ref)&!is.na(arr)]
	betaR = arr[(grp.levels == ref)&!is.na(arr)]
	result = .Call('_COHCAP_ANOVA_cpp_2group', PACKAGE = 'COHCAP', full_beta, betaT, betaR)
	return(result)
}#end def cpp.ANOVA.1way.wrapper

cpp.annova.2way.wrapper = function(arr, var1, var2, ref){
	full_beta = arr[!is.na(arr)]
	
	max.df = length(full_beta)-length(levels(as.factor(as.character(var1[!is.na(arr)]))))*length(levels(as.factor(as.character(var2[!is.na(arr)]))))
	
	if(max.df < 1){
		return(NA)
	}else{
		betaT = arr[(var1 != ref)&!is.na(arr)]
		betaR = arr[(var1 == ref)&!is.na(arr)]
		
		interaction.var = paste(var1, var2, sep="-")
		interaction.var = interaction.var[!is.na(arr)]
		
		#Rcpp code uses numeric array
		interaction.var = as.numeric(as.factor(interaction.var))
		if(min(table(interaction.var)) < 2){
			return(NA)
		}else{
			result = .Call('_COHCAP_ANOVA_cpp_2group_2way', PACKAGE = 'COHCAP',full_beta, betaT, betaR, interaction.var)
			return(result)		
		}	
	}#end else
}#end def cpp.annova.2way.wrapper

cpp.ttest.wrapper = function(arr, grp.levels, ref){
	betaT = arr[(grp.levels != ref)&!is.na(arr)]
	betaR = arr[(grp.levels == ref)&!is.na(arr)]
	result = .Call('_COHCAP_ttest_cpp', PACKAGE = 'COHCAP', betaT, betaR)
	return(result)
}#end def cpp.ttest.wrapper

cpp.paired.ttest.wrapper = function(arr, iTrt, iRef){
	pairedT = arr[iTrt]
	pairedR = arr[iRef]
	groupT=pairedT[!is.na(pairedT)&!is.na(pairedR)]
	groupR=pairedR[!is.na(pairedT)&!is.na(pairedR)]
	paired_diff = groupT - groupR
	result = .Call('_COHCAP_ttest_cpp_paired', PACKAGE = 'COHCAP', paired_diff)
	return(result)
}#end def cpp.paired.ttest.wrapper

cpp.lm.1var.wrapper = function(arr, var1){
	full_continuous= var1[!is.na(arr)]
	full_beta= arr[!is.na(arr)]
	result = .Call('_COHCAP_lmResidual_cpp_1var', PACKAGE = 'COHCAP', full_beta, full_continuous)
	return(result)
}#end def cpp.lm.1var.wrapper

fastLm_wrapper = function(arr, var1){
	var1= var1[!is.na(arr)]
	arr= arr[!is.na(arr)]
	fit_stats = fastLmPure(as.matrix(var1), arr)
	t_stat = fit_stats$coefficients / fit_stats$stderr
	return(pt(-abs(t_stat), fit_stats$df.residual))
}#end def fastLm_wrapper

fastLm_wrapper2 = function(arr, independent.mat){
	independent.mat= independent.mat[!is.na(arr),]
	arr= arr[!is.na(arr)]
	fit_stats = fastLmPure(independent.mat, arr)
	t_stat = fit_stats$coefficients[1] / fit_stats$stderr[1]
	return(pt(-abs(t_stat), fit_stats$df.residual))
}#end def fastLm_wrapper2

`COHCAP.site` = function (sample.file, beta.table, project.name, project.folder,
							methyl.cutoff=0.7, unmethyl.cutoff = 0.3, paired=FALSE,
							delta.beta.cutoff = 0.2, pvalue.cutoff=0.05, fdr.cutoff=0.05,
							ref="none", num.groups=2,lower.cont.quantile=0, upper.cont.quantile=1,
							create.wig = "avg", alt.pvalue="none", plot.heatmap=TRUE, output.format='txt',
							heatmap.dist.fun="Euclidian")
{
	fixed.color.palatte = c("green","orange","purple","cyan","pink","maroon","yellow","grey","red","blue","black",colors())
	
	if(heatmap.dist.fun =="Euclidian"){
		heatmap.dist.fun=dist
	}else if(heatmap.dist.fun =="Pearson Dissimilarity"){
		heatmap.dist.fun=cor.dist
	}else{
		stop("'heatmap.dist.fun' must be either 'Euclidian' or 'Pearson Dissimilarity'")
	}
	
	site.folder=file.path(project.folder,"CpG_Site")
	dir.create(site.folder, showWarnings=FALSE)
	
	data.folder=file.path(project.folder,"Raw_Data")
	dir.create(data.folder, showWarnings=FALSE)
	
	print("Reading Sample Description File....")
	sample.table = read.table(sample.file, header=F, sep = "\t", stringsAsFactors=TRUE)
	samples = as.character(sample.table[[1]])
	for (i in 1:length(samples))
		{
			if(length(grep("^[0-9]",samples[i])) > 0)
				{
					samples[i] = paste("X",samples[i],sep="")
				}#end if(length(grep("^[0-9]",samples[i])) > 0)
		}#end def for (i in 1:length(samples))
	sample.group = sample.table[[2]]
	sample.group = as.factor(gsub(" ",".",sample.group))
	pairing.group = NA
	if(paired == "continuous"){
		print("using continuous covariate")
		pairing.group = sample.table[[3]]
		pairing.group = as.numeric(pairing.group)
	}else if(paired){
		print("using pairing ID")
		pairing.group = sample.table[[3]]
		pairing.group = as.factor(gsub(" ",".",pairing.group))
	}
	ref = gsub(" ",".",ref)

	sample.names = names(beta.table)[6:ncol(beta.table)]
	beta.values = beta.table[,6:ncol(beta.table)]
	annotation.names = names(beta.table)[1:5]
	annotations = beta.table[,1:5]
	print(dim(beta.values))

	#print(samples)
	#print(sample.names)

	if(length(samples) != length(sample.names[match(samples, sample.names, nomatch=0)]))
		{
			warning("Some samples in sample description file are not present in the beta file!")
			warning(paste(length(samples),"items in sample description file",sep=" "))
			warning(paste(length(sample.names),"items in gene beta file",sep=" "))
			warning(paste(length(sample.names[match(samples, sample.names, nomatch=0)]),"matching items in gene beta file",sep=" "))
			#warning(sample.names[match(samples, sample.names, nomatch=0)])
			stop()
		}

	if(length(samples)>1)
		{
			beta.values = beta.values[,match(samples, sample.names, nomatch=0)]
			colnames(beta.values)=samples
		}
		
	print(dim(beta.values))
	#print(samples)
	#print(sample.names)
	#print(colnames(beta.values))
	#print(beta.values[1,])

	groups = levels(sample.group)
	
	if((length(groups) != num.groups) & (ref != "continuous"))
		{
			print(paste("Group:",groups, sep=" "))

			warning(paste("There are ",length(groups)," but user specified algorithm for ",num.groups," groups.",sep=""))
			warning("Please restart algorithm and specify the correct number of groups")
			stop()
		}
	
	if (ref == "continuous"){
		print("Continous Variable :")
		print(sample.table[[2]])
	}

	stat.table = annotations
	rm(annotations)

	if((create.wig == "sample")|(create.wig == "avg.and.sample")){
			#create .wig files for each sample
			wig.folder=file.path(site.folder,paste(project.name,"wig",sep="_"))
			dir.create(wig.folder, showWarnings=FALSE)

			temp.stat.file = file.path(wig.folder, "temp.txt")
			Perl.Path = file.path(path.package("COHCAP"), "Perl")
			perl.script = file.path(Perl.Path , "create_wig_files.pl")
			
			sample.beta = beta.table
			sample.beta = sample.beta[order(sample.beta$Chr, sample.beta$Loc), ]
			colnames(sample.beta) = paste(colnames(beta.table),"beta",sep=".")
			write.table(sample.beta, temp.stat.file, quote=F, row.names=F, sep="\t")
			cmd = paste("perl \"",perl.script,"\" \"", temp.stat.file,"\" \"", wig.folder,"\"", sep="")
			res = system(cmd, intern=TRUE, wait=TRUE)
			message(res)

			shortcut.beta.file = file.path(wig.folder, "SiteID.beta.wig")
			unlink(shortcut.beta.file)
			
			shortcut.beta.file = file.path(wig.folder, "Chr.beta.wig")
			unlink(shortcut.beta.file)

			shortcut.beta.file = file.path(wig.folder, "Gene.beta.wig")
			unlink(shortcut.beta.file)
			
			shortcut.beta.file = file.path(wig.folder, "Loc.beta.wig")
			unlink(shortcut.beta.file)

			shortcut.beta.file = file.path(wig.folder, "Island.beta.wig")
			unlink(shortcut.beta.file)
		}#end if(create.wig)
	
	rm(beta.table)
	
	if(ref == "continuous"){
			continous.var = sample.table[[2]]
			if ((paired == TRUE) | (paired == "continuous")){
				if(alt.pvalue == "RcppArmadillo.fastLmPure"){
					print("Using fastLm and pt() instead of lm(), with 2nd variable")
					independent.mat = cbind(continous.var, pairing.group)
					lm.pvalue = apply(beta.values, 1, fastLm_wrapper2, independent.mat)
				}else{		
					lm.pvalue = apply(beta.values, 1, lm.pvalue2, continous.var, pairing.group)
				}
			} else{
				if(alt.pvalue == "cppLmResidual.1var"){
					print("Using Rcpp residual t-test instead of lm()")
					lm.pvalue = apply(beta.values, 1, cpp.lm.1var.wrapper, continous.var)
				}else if(alt.pvalue == "RcppArmadillo.fastLmPure"){
					print("Using fastLm and pt() instead of lm()")
					lm.pvalue = apply(beta.values, 1, fastLm_wrapper, continous.var)
				}else{
					lm.pvalue = apply(beta.values, 1, lm.pvalue, continous.var)
				}
			}
			lm.fdr = p.adjust(lm.pvalue, "fdr")
			beta.cor = apply(beta.values, 1, custom.cor, var1=continous.var)
			#upper.beta is max if positive correlation, min if negative correlation
			#lower.beta is min if positive correlation, max if negative correlation
			#in both cases, delta.beta is upper.beta - lower.beta
			beta.min= apply(beta.values, 1, quantile, na.rm=TRUE, probs=c(lower.cont.quantile))
			beta.max= apply(beta.values, 1, quantile, na.rm=TRUE, probs=c(upper.cont.quantile))
			
			upper.beta = beta.max
			upper.beta[!is.na(beta.cor) & (beta.cor < 0)] = beta.min[!is.na(beta.cor) & (beta.cor < 0)]
			lower.beta = beta.min
			lower.beta[!is.na(beta.cor) & (beta.cor < 0)] = beta.max[!is.na(beta.cor) & (beta.cor < 0)]
			
			rm(beta.min)
			rm(beta.max)
			
			delta.beta = upper.beta - lower.beta

			#make format compatible with 2-group code
			stat.table = data.frame(stat.table, upper.beta = upper.beta, lower.beta = lower.beta,
									delta.beta = delta.beta, pvalue = lm.pvalue, fdr = lm.fdr,
									cor.coef = beta.cor)
	} else if(length(groups) == 1) {
	col.names = c(annotation.names)
	print("Averaging Beta Values for All Samples")

	#print(beta.values[1:3,])
	if(length(samples) > 1)
		{
			avg.beta = apply(beta.values, 1, custom.mean)
		}
	else
		{
			avg.beta= beta.values
		}
	#print(avg.beta)

	stat.table = data.frame(stat.table, avg.beta=avg.beta)
	} else if(length(groups) == 2) {
	print("Differential Methylation Stats for 2 Groups with Reference")
	trt = groups[groups != ref]
	#print(trt)
	#print(ref)
	trt.samples = samples[which(sample.group == trt)]
	#print(trt.samples)
	ref.samples = samples[which(sample.group == ref)]
	#print(ref.samples)

	all.indices = 1:length(samples)
	trt.indices = all.indices[which(sample.group == trt)]
	ref.indices = all.indices[which(sample.group == ref)]

	if (length(trt.indices) == 1){
		trt.avg.beta = beta.values[, trt.indices]
	} else {
		trt.beta.values = beta.values[, trt.indices]	
		trt.avg.beta = apply(trt.beta.values, 1, custom.mean)
	}
	
	if (length(ref.indices) == 1){
		ref.avg.beta = beta.values[, ref.indices]
	} else {
		ref.beta.values = beta.values[, ref.indices]
		ref.avg.beta = apply(ref.beta.values, 1, custom.mean)
	}

	delta.beta = trt.avg.beta - ref.avg.beta

	if(paired == "continuous"){
		print("Analysis of two numeric variables")
		beta.pvalue = unlist(apply(beta.values, 1, annova.2way.pvalue, grp.levels=as.numeric(sample.group), pairing.levels=pairing.group))
	}else if(paired){
		print("Factor in Paired Samples")
		#print(sample.group)
		#print(pairing.group)
		if (alt.pvalue == "cppPairedTtest"){
			print("Using Rcpp/Boost Paired t-test instead of 2-way ANOVA")
			pair.table = table(sample.table[,2], sample.table[,3])
			
			pairIDs=colnames(pair.table)[(as.numeric(pair.table[1,]) == 1)&(as.numeric(pair.table[2,]) == 1)]
			if(length(pairIDs) != length(levels(as.factor(pairing.group)))){
				print("Only pairings with 2 samples are used:")
				print("While incomplete pairs will be removed for missing values,")
				print("please only provide a sample description table with pairs that you would like to compare.")
				stop()
			}#end if(length(pair.table) != length(levels(as.factor(pairing.group)))

			pairedT = c()
			pairedR = c()
			for(i in 1:length(pairIDs)){
				pairID = pairIDs[i]
				pairedT[i] = samples[(pairing.group == pairID)&(sample.group != ref)]
				pairedR[i] = samples[(pairing.group == pairID)&(sample.group == ref)]
			}#end for(pairID in names(pair.table))
			pairTi = match(pairedT, samples)
			pairRi = match(pairedR, samples)
			beta.pvalue = apply(beta.values, 1, anova.pvalue, grp.levels=sample.group)
		}else if(alt.pvalue == "cppANOVA.2way"){
			print("Using Rcpp/Boost 2-way ANOVA")
			max.df = length(samples)-length(levels(as.factor(sample.group)))*length(levels(as.factor(pairing.group)))
			if(max.df < 1){
				print("Not enough degrees of freedom for this implementation.  Consider using 'alt.pvalue' = 'cppPairedTtest'")
				stop()
			}
			beta.pvalue = unlist(apply(beta.values, 1, cpp.annova.2way.wrapper, sample.group, pairing.group, ref))
		}else{
			beta.pvalue = unlist(apply(beta.values, 1, annova.2way.pvalue, grp.levels=sample.group, pairing.levels=pairing.group))
		}
	}else{
		if(pvalue.cutoff == 1){
			print("Skip p-value calculation to save time.  May help with Average-by-Site workflow, but generally not recommended.")
			beta.pvalue = rep(1, nrow(beta.values))
		}else if (alt.pvalue == "rANOVA.1way"){
			print("Using R-based ANOVA instead of t-test")
			beta.pvalue = apply(beta.values, 1, anova.pvalue, grp.levels=sample.group)
		}else if (alt.pvalue == "cppANOVA.1way"){
			print("Using Rcpp/Boost ANOVA instead of t-test")
			beta.pvalue = apply(beta.values, 1, cpp.ANOVA.1way.wrapper, grp.levels=sample.group, ref=ref)
		}else if (alt.pvalue == "cppWelshTtest"){
			print("Using Rcpp/Boost Welsh t-test instead of t.test()")
			beta.pvalue = apply(beta.values, 1, cpp.ttest.wrapper, grp.levels=sample.group, ref=ref)
		}else{
			beta.pvalue = apply(beta.values, 1, ttest2, grp1=trt.indices, grp2=ref.indices)
		}
	}#end else

	#print(beta.pvalue)
	
	beta.fdr = p.adjust(beta.pvalue, method="fdr")

	#print(names(beta.values))
	print(dim(stat.table))
	#print(length(annotation.names))
	#print(length(trt.avg.beta))
	#print(length(ref.avg.beta))
	#print(length(delta.beta))
	#print(length(beta.pvalue))
	#print(length(beta.fdr))

	stat.table = data.frame(stat.table, trt.beta = trt.avg.beta, ref.beta = ref.avg.beta, delta.beta = delta.beta, pvalue = beta.pvalue, fdr = beta.fdr)
	print(dim(stat.table))
	#print(names(stat.table))
	#test = c(annotation.names,
	#							paste(trt,"avg.beta",sep="."),
	#							paste(ref,"avg.beta",sep="."),
	#							paste(trt,"vs",ref,"delta.beta",sep="."),
	#							paste(trt,"vs",ref,"pvalue",sep="."),
	#							paste(trt,"vs",ref,"fdr",sep="."))
	#print(test)
	#print(names(stat.table))
	colnames(stat.table) = c(annotation.names,
								paste(trt,"avg.beta",sep="."),
								paste(ref,"avg.beta",sep="."),
								paste(trt,"vs",ref,"delta.beta",sep="."),
								paste(trt,"vs",ref,"pvalue",sep="."),
								paste(trt,"vs",ref,"fdr",sep="."))
	#print(names(stat.table))
	} else if(length(groups) > 2) {
	print("Calculating Average Beta and ANOVA p-values")
	col.names = c(annotation.names)

	for (i in 1:length(groups))
	{
		temp.grp = groups[i]
		grp.beta = beta.values[,which(sample.group == temp.grp)]
		avg.beta = apply(grp.beta, 1, custom.mean)
		col.names = col.names = c(col.names, paste(temp.grp,"avg.beta",sep="."))
		stat.table = data.frame(stat.table, avg.beta=avg.beta)
		colnames(stat.table) = col.names
	}#end for (i in 1:length(groups)

	if(paired == "continuous"){
		print("Analysis of two numeric variables")
		beta.pvalue = unlist(apply(beta.values, 1, annova.2way.pvalue, grp.levels=as.numeric(sample.group), pairing.levels=pairing.group))
	}else if(paired){
		print("Factor in Paired Samples")
		beta.pvalue = unlist(apply(beta.values, 1, annova.2way.pvalue, grp.levels=sample.group, pairing.levels=pairing.group))
	}else{
		pvalue = apply(beta.values, 1, anova.pvalue, grp.levels = sample.group)
	}#end else

	anova.fdr = p.adjust(pvalue, method="fdr")
	col.names = c(col.names, "anova.pvalue", "annova.fdr")
	stat.table = data.frame(stat.table, anova.pvalue = pvalue, anova.fdr = anova.fdr)
	colnames(stat.table) = col.names
	} else {
	print("Invalid groups specified in sample description file!")
	}

	stat.table = stat.table[order(stat.table$Chr, stat.table$Loc), ]
	if(output.format == 'xls'){
		xlsfile = file.path(data.folder, paste(project.name,"_CpG_site_stats.xlsx",sep=""))
		WriteXLS("stat.table", ExcelFileName = xlsfile)
	} else if(output.format == 'csv') {
		txtfile = file.path(data.folder, paste(project.name,"_CpG_site_stats.csv",sep=""))
		write.table(stat.table, file=txtfile, quote=F, row.names=F, sep=",")
	} else if(output.format == 'txt') {
		txtfile = file.path(data.folder, paste(project.name,"_CpG_site_stats.txt",sep=""))
		write.table(stat.table, file=txtfile, quote=F, row.names=F, sep="\t")
	} else {
		print(paste(output.format," is not a valid output format!  Please use 'txt' or 'xls'.",sep=""))
	}

	if((create.wig == "avg")|(create.wig == "avg.and.sample")){
		#create .wig file based upon average methylation values
		wig.folder=file.path(site.folder,paste(project.name,"wig",sep="_"))
		dir.create(wig.folder, showWarnings=FALSE)

		temp.stat.file = file.path(wig.folder, "temp.txt")
		Perl.Path = file.path(path.package("COHCAP"), "Perl")
		perl.script = file.path(Perl.Path , "create_wig_files.pl")
		write.table(stat.table, temp.stat.file, quote=F, row.names=F, sep="\t")
		cmd = paste("perl \"",perl.script,"\" \"", temp.stat.file,"\" \"", wig.folder,"\"", sep="")
		res = system(cmd, intern=TRUE, wait=TRUE)
		message(res)
	}#end if(create.wig)
	
	#filter sites
	print(dim(stat.table))
	filter.table = stat.table
	if((length(groups) == 2)|(ref == "continuous")) {
		temp.trt.beta = stat.table[[6]]
		temp.ref.beta = stat.table[[7]]
		temp.delta.beta = abs(stat.table[[8]])
		temp.pvalue = stat.table[[9]]
		temp.fdr = stat.table[[10]]
		
		if(unmethyl.cutoff > methyl.cutoff)
			{
				print("unmethyl.cutoff > methyl.cutoff ...")
				print("so, delta-beta decides which group should be subject to which cutoff")
				temp.delta.beta = stat.table[[8]]
				temp.methyl.up = filter.table[(temp.trt.beta >= methyl.cutoff) & (temp.ref.beta<=unmethyl.cutoff) & !is.na(temp.pvalue) & (temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff) & (temp.delta.beta >= delta.beta.cutoff),]
				print(dim(temp.methyl.up))
				temp.methyl.down = filter.table[(temp.ref.beta >= methyl.cutoff) & (temp.trt.beta<=unmethyl.cutoff) & !is.na(temp.pvalue) & (temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff) & (temp.delta.beta <= -delta.beta.cutoff),]
				print(dim(temp.methyl.down))
				filter.table = rbind(temp.methyl.up, temp.methyl.down)
			}
		else
			{
				temp.methyl.up = filter.table[(temp.trt.beta >= methyl.cutoff) & (temp.ref.beta<=unmethyl.cutoff) & !is.na(temp.pvalue) & (temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff) & (temp.delta.beta >= delta.beta.cutoff),]
				temp.methyl.down = filter.table[(temp.ref.beta >= methyl.cutoff) & (temp.trt.beta<=unmethyl.cutoff) & !is.na(temp.pvalue) & (temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff) & (temp.delta.beta >= delta.beta.cutoff),]
				filter.table = rbind(temp.methyl.up, temp.methyl.down)
			}
	}else if(length(groups) == 1) {
		temp.avg.beta = stat.table$avg.beta
		filter.table = filter.table[(temp.avg.beta >= methyl.cutoff) | (temp.avg.beta <=unmethyl.cutoff),]
	} else if(length(groups) > 2) {
		temp.pvalue = stat.table[[ncol(stat.table) -1]]
		temp.fdr = stat.table[[ncol(stat.table)]]
		filter.table = filter.table[(temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff),]
	} else {
	print("Invalid groups specified in sample description file!")
	}
	print(dim(filter.table))
	filter.table = filter.table[!is.na(filter.table$SiteID),]
	print(dim(filter.table))

	if(output.format == 'xls'){
		xlsfile = file.path(site.folder, paste(project.name,"_CpG_site_filter.xlsx",sep=""))
		WriteXLS("filter.table", ExcelFileName = xlsfile)
	} else if(output.format == 'csv') {
		txtfile = file.path(site.folder, paste(project.name,"_CpG_site_filter.csv",sep=""))
		write.table(filter.table, file=txtfile, quote=F, row.names=F, sep=",")
	} else if(output.format == 'txt') {
		txtfile = file.path(site.folder, paste(project.name,"_CpG_site_filter.txt",sep=""))
		write.table(filter.table, file=txtfile, quote=F, row.names=F, sep="\t")
	} else {
		warning(paste(output.format," is not a valid output format!  Please use 'txt' or 'xls'.",sep=""))
	}

if((plot.heatmap)& (nrow(filter.table) > 1)& (nrow(filter.table) < 10000)){
	temp.beta.mat = apply(beta.values[match(as.character(filter.table$SiteID),as.character(stat.table$SiteID)),], 1, as.numeric)
	
	probe.count.obs= apply(temp.beta.mat, 2, count.observed)
	if(length(probe.count.obs[probe.count.obs < 0.5 * nrow(temp.beta.mat)]) > 0){
		print(paste("filtering NA probes from heatmap: ",paste(filter.table$SiteID[probe.count.obs < 0.5 * nrow(temp.beta.mat)],collapse=","),sep=""))
		print(dim(temp.beta.mat))
		filter.table = filter.table[probe.count.obs >= 0.5 * nrow(temp.beta.mat), ]
		temp.beta.mat = temp.beta.mat[,probe.count.obs >= 0.5 * nrow(temp.beta.mat)]
		print(dim(temp.beta.mat))
	}#end if(length(probe.count.obs[probe.count.obs < 0.5 * nrow(temp.beta.mat)]) > 0)
	
	if(nrow(filter.table) < 25){
		colnames(temp.beta.mat) = filter.table$SiteID
	} else {
		colnames(temp.beta.mat) = rep("", nrow(filter.table))
	}
	rownames(temp.beta.mat) = samples
	
	labelColors = rep("black",times=nrow(temp.beta.mat))
	if(ref == "continuous"){
		continuous.color.breaks = 10	
		plot.var = as.numeric(continous.var)
		plot.var.min = min(plot.var, na.rm=T)
		plot.var.max = max(plot.var, na.rm=T)
		
		plot.var.range = plot.var.max - plot.var.min
		plot.var.interval = plot.var.range / continuous.color.breaks
		
		color.range = colorRampPalette(c("green","black","orange"))(n = continuous.color.breaks)
		plot.var.breaks = plot.var.min + plot.var.interval*(0:continuous.color.breaks)
		for (j in 1:continuous.color.breaks){
			labelColors[(plot.var >= plot.var.breaks[j]) &(plot.var <= plot.var.breaks[j+1])] = color.range[j]
		}#end for (j in 1:continuous.color.breaks)
	}else{
		color.palette = fixed.color.palatte[1:length(groups)]
		for (j in 1:length(groups)){
			labelColors[sample.group == as.character(groups[j])] = color.palette[j]
		}#end for (j in 1:length(groups))
	}
	
	beta.breaks = c(0:40*2.5)
	if(max(temp.beta.mat, na.rm=T) < 2){
		beta.breaks = beta.breaks / 100
	}else{
		print("Adjusting heatmap scale, assuming percent methylation between 0 and 100")
	}
	
	heatmap.file = file.path(site.folder, paste(project.name,"_CpG_site_heatmap.pdf",sep=""))
	pdf(file = heatmap.file)
	heatmap.2(temp.beta.mat,
				col=colorpanel(40, low="blue", mid="black", high="red"), breaks=beta.breaks,
				density.info="none", key=TRUE, distfun= heatmap.dist.fun,
				 RowSideColors=labelColors, trace="none", margins = c(5,15), cexCol=0.8, cexRow=0.8)
	if(ref == "continuous"){
		legend("topright",legend=c(round(plot.var.max,digits=1),rep("",length(color.range)-2),round(plot.var.min,digits=1)),
				col=rev(color.range), pch=15, y.intersp = 0.4, cex=0.8, pt.cex=1.5)
	}else{
		legend("topright",legend=groups,col=color.palette,  pch=15)
	}
	dev.off()
}#end if((plot.heatmap)& (nrow(island.avg.table) > 0))
	
	return(filter.table)
}#end def COHCAP.site
cwarden45/COHCAP documentation built on April 16, 2024, 3:17 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
cwarden45/COHCAP
CpG Island Analysis Pipeline for Illumina Methylation Array and Targeted BS-Seq Data

R/COHCAP.site.R
In cwarden45/COHCAP: CpG Island Analysis Pipeline for Illumina Methylation Array and Targeted BS-Seq Data

Defines functions fastLm_wrapper2 fastLm_wrapper cpp.lm.1var.wrapper cpp.paired.ttest.wrapper cpp.ttest.wrapper cpp.annova.2way.wrapper cpp.ANOVA.1way.wrapper cor.dist count.observed annova.2way.pvalue lm.pvalue2 lm.pvalue anova.pvalue ttest2 custom.cor custom.mean

R Package Documentation

Browse R Packages

We want your feedback!

cwarden45/COHCAP CpG Island Analysis Pipeline for Illumina Methylation Array and Targeted BS-Seq Data

R/COHCAP.site.R In cwarden45/COHCAP: CpG Island Analysis Pipeline for Illumina Methylation Array and Targeted BS-Seq Data

Defines functions fastLm_wrapper2 fastLm_wrapper cpp.lm.1var.wrapper cpp.paired.ttest.wrapper cpp.ttest.wrapper cpp.annova.2way.wrapper cpp.ANOVA.1way.wrapper cor.dist count.observed annova.2way.pvalue lm.pvalue2 lm.pvalue anova.pvalue ttest2 custom.cor custom.mean

R Package Documentation

Browse R Packages

We want your feedback!

cwarden45/COHCAP
CpG Island Analysis Pipeline for Illumina Methylation Array and Targeted BS-Seq Data

R/COHCAP.site.R
In cwarden45/COHCAP: CpG Island Analysis Pipeline for Illumina Methylation Array and Targeted BS-Seq Data