custom.mean = function(arr)
{
return(mean(as.numeric(arr), na.rm = T))
}#end def ttest2
custom.cor = function(arr, var1)
{
if(length(arr[!is.na(arr)]) >= 0.5 * length(arr)){
return(cor(as.numeric(arr), var1, use="complete.obs"))
}else{
return(NA)
}
}#end def custom.cor
ttest2 = function(arr, grp1, grp2)
{
group1 = as.numeric(arr[grp1])
group2 = as.numeric(arr[grp2])
#print(grp1)
#print(grp2)
#stop()
#require at least 2 replicates
if((length(group1[!is.na(group1)]) >=2) & (length(group2[!is.na(group2)]) >=2))
{
result = t.test(group1, group2)
if(is.na(result$p.value)){
return(1)
}else{
return(result$p.value)
}
}
else
{
return(1)
}
}#end def ttest2
anova.pvalue = function(arr, grp.levels)
{
#print(arr)
#print(grp.levels)
grp.no.na = as.factor(as.character(grp.levels[!is.na(arr)]))
if(length(levels(grp.no.na)) >= 2)
{
test = aov(arr ~ grp.levels)
result = summary(test)[[1]][['Pr(>F)']][1]
if(is.null(result))
{
return(1)
}
else
{
return(result)
}
}
else
{
return(1)
}
}#end def anova.pvalue
lm.pvalue = function(arr, var1)
{
if(length(arr[!is.na(arr)]) >= 0.5 * length(arr)){
fit = lm(as.numeric(arr)~var1)
result = summary(fit)
pvalue = result$coefficients[2,4]
return(pvalue)
}else{
return(NA)
}
}#end def lm.pvalue
lm.pvalue2 = function(arr, var1, var2)
{
if(typeof(var2) == "character"){
var2 = as.factor(var2)
}
if(length(arr[!is.na(arr)]) >= 0.5 * length(arr)){
fit = lm(as.numeric(arr)~var1 + as.numeric(var2))
result = summary(fit)
pvalue = result$coefficients[2,4]
return(pvalue)
}else{
return(NA)
}
}#end def lm.pvalue
annova.2way.pvalue = function(arr, grp.levels, pairing.levels)
{
#print(arr)
#print(grp.levels)
#print(pairing.levels)
grp.no.na = as.factor(as.character(grp.levels[!is.na(arr)]))
rep.flag = 1
if((length(levels(grp.no.na)) >= 2) && (rep.flag == 1))
{
test = aov(arr ~ grp.levels + pairing.levels)
result = summary(test)[[1]][['Pr(>F)']][1]
if(is.null(result))
{
return(1)
}
else
{
return(result)
}
}
else
{
return(1)
}
}#end def anova.pvalue
count.observed = function(arr){
return(length(arr[!is.na(arr)]))
}#end def count.observed
cor.dist = function(mat){
cor.mat = cor(as.matrix(t(mat)), use="pairwise.complete.obs")
dis.mat = 1 - cor.mat
dis.mat[is.na(dis.mat)]=2
return(as.dist(dis.mat))
}#end def cor.dist
cpp.ANOVA.1way.wrapper = function(arr, grp.levels, ref){
full_beta = arr[!is.na(arr)]
betaT = arr[(grp.levels != ref)&!is.na(arr)]
betaR = arr[(grp.levels == ref)&!is.na(arr)]
result = .Call('_COHCAP_ANOVA_cpp_2group', PACKAGE = 'COHCAP', full_beta, betaT, betaR)
return(result)
}#end def cpp.ANOVA.1way.wrapper
cpp.annova.2way.wrapper = function(arr, var1, var2, ref){
full_beta = arr[!is.na(arr)]
max.df = length(full_beta)-length(levels(as.factor(as.character(var1[!is.na(arr)]))))*length(levels(as.factor(as.character(var2[!is.na(arr)]))))
if(max.df < 1){
return(NA)
}else{
betaT = arr[(var1 != ref)&!is.na(arr)]
betaR = arr[(var1 == ref)&!is.na(arr)]
interaction.var = paste(var1, var2, sep="-")
interaction.var = interaction.var[!is.na(arr)]
#Rcpp code uses numeric array
interaction.var = as.numeric(as.factor(interaction.var))
if(min(table(interaction.var)) < 2){
return(NA)
}else{
result = .Call('_COHCAP_ANOVA_cpp_2group_2way', PACKAGE = 'COHCAP',full_beta, betaT, betaR, interaction.var)
return(result)
}
}#end else
}#end def cpp.annova.2way.wrapper
cpp.ttest.wrapper = function(arr, grp.levels, ref){
betaT = arr[(grp.levels != ref)&!is.na(arr)]
betaR = arr[(grp.levels == ref)&!is.na(arr)]
result = .Call('_COHCAP_ttest_cpp', PACKAGE = 'COHCAP', betaT, betaR)
return(result)
}#end def cpp.ttest.wrapper
cpp.paired.ttest.wrapper = function(arr, iTrt, iRef){
pairedT = arr[iTrt]
pairedR = arr[iRef]
groupT=pairedT[!is.na(pairedT)&!is.na(pairedR)]
groupR=pairedR[!is.na(pairedT)&!is.na(pairedR)]
paired_diff = groupT - groupR
result = .Call('_COHCAP_ttest_cpp_paired', PACKAGE = 'COHCAP', paired_diff)
return(result)
}#end def cpp.paired.ttest.wrapper
cpp.lm.1var.wrapper = function(arr, var1){
full_continuous= var1[!is.na(arr)]
full_beta= arr[!is.na(arr)]
result = .Call('_COHCAP_lmResidual_cpp_1var', PACKAGE = 'COHCAP', full_beta, full_continuous)
return(result)
}#end def cpp.lm.1var.wrapper
fastLm_wrapper = function(arr, var1){
var1= var1[!is.na(arr)]
arr= arr[!is.na(arr)]
fit_stats = fastLmPure(as.matrix(var1), arr)
t_stat = fit_stats$coefficients / fit_stats$stderr
return(pt(-abs(t_stat), fit_stats$df.residual))
}#end def fastLm_wrapper
fastLm_wrapper2 = function(arr, independent.mat){
independent.mat= independent.mat[!is.na(arr),]
arr= arr[!is.na(arr)]
fit_stats = fastLmPure(independent.mat, arr)
t_stat = fit_stats$coefficients[1] / fit_stats$stderr[1]
return(pt(-abs(t_stat), fit_stats$df.residual))
}#end def fastLm_wrapper2
`COHCAP.avg.by.island` =function (sample.file, site.table, beta.table, project.name,
project.folder, methyl.cutoff=0.7, unmethyl.cutoff = 0.3,
delta.beta.cutoff = 0.2, pvalue.cutoff=0.05, fdr.cutoff=0.05,
num.groups=2, num.sites=4, plot.box=TRUE, plot.heatmap=TRUE,
paired=FALSE, ref="none",lower.cont.quantile=0, upper.cont.quantile=1,
max.cluster.dist = NULL, alt.pvalue="none", output.format = "txt",
gene.centric=TRUE, heatmap.dist.fun="Euclidian")
{
fixed.color.palatte = c("green","orange","purple","cyan","pink","maroon","yellow","grey","red","blue","black",colors())
if(heatmap.dist.fun =="Euclidian"){
heatmap.dist.fun=dist
}else if(heatmap.dist.fun =="Pearson Dissimilarity"){
heatmap.dist.fun=cor.dist
}else{
stop("'heatmap.dist.fun' must be either 'Euclidian' or 'Pearson Dissimilarity'")
}
island.folder=file.path(project.folder,"CpG_Island")
dir.create(island.folder, showWarnings=FALSE)
data.folder=file.path(project.folder,"Raw_Data")
dir.create(data.folder, showWarnings=FALSE)
print("Reading Sample Description File....")
sample.table = read.table(sample.file, header=F, sep = "\t", stringsAsFactors=TRUE)
samples = as.character(sample.table[[1]])
for (i in 1:length(samples))
{
if(length(grep("^[0-9]",samples[i])) > 0)
{
samples[i] = paste("X",samples[i],sep="")
}#end if(length(grep("^[0-9]",samples[i])) > 0)
}#end def for (i in 1:length(samples))
sample.group = sample.table[[2]]
sample.group = as.factor(gsub(" ",".",sample.group))
pairing.group = NA
if(paired == "continuous"){
print("using continuous covariate")
pairing.group = sample.table[[3]]
pairing.group = as.numeric(pairing.group)
}else if(paired){
print("using pairing ID")
pairing.group = sample.table[[3]]
pairing.group = as.factor(gsub(" ",".",pairing.group))
}
ref = gsub(" ",".",ref)
sample.names = names(beta.table)[6:ncol(beta.table)]
beta.values = beta.table[,6:ncol(beta.table)]
annotation.names = names(beta.table)[1:5]
annotations = beta.table[,1:5]
print(dim(beta.values))
if(length(samples) != length(sample.names[match(samples, sample.names, nomatch=0)]))
{
warning("Some samples in sample description file are not present in the beta file!")
warning(paste(length(samples),"items in sample description file",sep=" "))
warning(paste(length(sample.names),"items in gene beta file",sep=" "))
warning(paste(length(sample.names[match(samples, sample.names, nomatch=0)]),"matching items in gene beta file",sep=" "))
#warning(sample.names[match(samples, sample.names, nomatch=0)])
stop()
}
if(length(samples)>1)
{
beta.values = beta.values[,match(samples, sample.names, nomatch=0)]
colnames(beta.values)=samples
}
print(dim(beta.values))
#print(samples)
#print(sample.names)
#print(colnames(beta.values))
#print(beta.values[1,])
rm(beta.table)
groups = levels(sample.group)
print(paste("Group:",groups, sep=" "))
if((length(groups) != num.groups) & (ref != "continuous"))
{
print(paste("Group:",groups, sep=" "))
warning(paste("There are ",length(groups)," but user specified algorithm for ",num.groups," groups.",sep=""))
warning("Please restart algorithm and specify the correct number of groups")
stop()
}
if (ref == "continuous"){
print("Continous Variable :")
print(sample.table[[2]])
}
print("Checking CpG Site Stats Table")
print(dim(site.table))
site.table = site.table[!is.na(site.table[[5]]), ]
print(dim(site.table))
if(!is.null(max.cluster.dist)){
if ((num.groups==2)|(ref=="continuous")){
print("Looking for clusters within provided CpG Island Annotations")
island.clusters = rep(NA,nrow(site.table))
cpg.islands = levels(as.factor(as.character(site.table[[5]])))
for (i in 1:length(cpg.islands))
{
island = cpg.islands[i]
cpg.sites = as.character(site.table[site.table[[5]] == island,1])
ann.mat = site.table[match(cpg.sites, as.character(site.table[,1]),nomatch=0),]
if(nrow(ann.mat) >= num.sites)
{
#print(island)
ann.mat = ann.mat[order(ann.mat$Loc),]
probe.pos = ann.mat$Loc
temp.cluster.probes = c(as.character(ann.mat$SiteID[1]))
temp.region.chr = as.character(ann.mat$Chr[1])
temp.region.start = ann.mat$Loc[1]
temp.region.stop = ann.mat$Loc[1]
temp.delta.beta = ann.mat[1,8]
temp.site.count = 1
for (i in 2:length(probe.pos)){
test.pos = probe.pos[i]
test.delta.beta = ann.mat[i,8]
sign.check = FALSE
if((temp.delta.beta > 0) & (test.delta.beta > 0)){
sign.check = TRUE
}else if((temp.delta.beta < 0) & (test.delta.beta < 0)){
sign.check = TRUE
}
if(sign.check){
if(((test.pos - temp.region.stop) <= max.cluster.dist) & (temp.site.count == 1)){
#start cluster
temp.region.stop = test.pos
temp.site.count = 2
temp.cluster.probes = c(temp.cluster.probes, as.character(ann.mat$SiteID[i]))
}else if (temp.site.count > 1){
if((test.pos - temp.region.stop) <= max.cluster.dist){
#expand region
temp.region.stop = test.pos
temp.site.count = temp.site.count + 1
temp.cluster.probes = c(temp.cluster.probes, as.character(ann.mat$SiteID[i]))
} else {
#define stop
if(temp.site.count >= num.sites){
cluster.chr = paste("chr",temp.region.chr,sep="")
cluster.start = temp.region.start
cluster.stop = temp.region.stop
new.island = paste(cluster.chr,":",cluster.start,"-",cluster.stop,sep="")
island.clusters[match(temp.cluster.probes,as.character(site.table$SiteID), nomatch=0)]=new.island
}#end if(temp.site.count >= num.sites)
temp.cluster.probes = c(as.character(ann.mat$SiteID[i]))
temp.region.chr = as.character(ann.mat$Chr[i])
temp.region.start = ann.mat$Loc[i]
temp.region.stop = ann.mat$Loc[i]
temp.site.count = 1
}#end else
}else{
#reset
temp.cluster.probes = c(as.character(ann.mat$SiteID[i]))
temp.region.chr = as.character(ann.mat$Chr[i])
temp.region.start = ann.mat$Loc[i]
temp.region.stop = ann.mat$Loc[i]
temp.site.count = 1
}#end else
}else{
#reset
if (temp.site.count >= num.sites){
cluster.chr = paste("chr",temp.region.chr,sep="")
cluster.start = temp.region.start
cluster.stop = temp.region.stop
new.island = paste(cluster.chr,":",cluster.start,"-",cluster.stop,sep="")
island.clusters[match(temp.cluster.probes,as.character(site.table$SiteID), nomatch=0)]=new.island
}#end if (temp.site.count >= num.sites)
temp.cluster.probes = c(as.character(ann.mat$SiteID[i]))
temp.region.chr = as.character(ann.mat$Chr[i])
temp.region.start = ann.mat$Loc[i]
temp.region.stop = ann.mat$Loc[i]
temp.site.count = 1
}#end else
temp.delta.beta = ann.mat[i,8]
}#end for (i in 2:length(test.pos))
if (temp.site.count >= num.sites){
cluster.chr = paste("chr",temp.region.chr,sep="")
cluster.start = temp.region.start
cluster.stop = temp.region.stop
new.island = paste(cluster.chr,":",cluster.start,"-",cluster.stop,sep="")
island.clusters[match(temp.cluster.probes,as.character(site.table$SiteID), nomatch=0)]=new.island
}#end if (temp.hits > 1)
}#end if(nrow(data.mat) >= num.sites)
}#end for (i in 1:length(cpg.islands))
#print(island.clusters)
mapping.table = data.frame(SiteID = as.character(site.table$SiteID), Chr =as.character(site.table$Chr),
Loc=as.character(site.table$Loc), updated.island=island.clusters)
if(output.format == 'xls'){
xlsfile = file.path(data.folder, paste(project.name,"_DeNovo_Site_to_Island_Mapping-Avg_by_Island.xlsx",sep=""))
WriteXLS("mapping.table", ExcelFileName = xlsfile)
} else if(output.format == 'csv'){
txtfile = file.path(data.folder, paste(project.name,"_DeNovo_Site_to_Island_Mapping-Avg_by_Island.csv",sep=""))
write.table(mapping.table, file=txtfile, quote=F, row.names=F, sep=",")
} else if(output.format == 'txt'){
txtfile = file.path(data.folder, paste(project.name,"_DeNovo_Site_to_Island_Mapping-Avg_by_Island.txt",sep=""))
write.table(mapping.table, file=txtfile, quote=F, row.names=F, sep="\t")
} else {
warning(paste(output.format," is not a valid output format! Please use 'txt' or 'xls'.",sep=""))
}
rm(mapping.table)
site.table[,5] = island.clusters
site.table = site.table[!is.na(site.table[[5]]), ]
} else{
print("Can only define clusters with 2-group (or continuous) comparison. Sorry.")
}
}#revise island annotations
cpg.islands = levels(as.factor(as.character(site.table[[5]])))
print(length(cpg.islands))
sites.per.island = tapply(as.character(site.table$SiteID),as.character(site.table[[5]]),length)
genes = array(dim=length(cpg.islands))
island.values= matrix(nrow=length(cpg.islands), ncol=ncol(beta.values))
colnames(island.values) = samples
#average CpG sites per CpG Island
print("Average CpG Sites per CpG Island")
for (i in 1:length(cpg.islands))
{
island = cpg.islands[i]
#print(island)
cpg.sites = site.table[site.table[[5]] == island,1]
#print(cpg.sites)
data.mat = beta.values[match(cpg.sites, annotations[[1]],nomatch=0),]
#print(data.mat)
if(nrow(data.mat) >= num.sites)
{
info.mat = site.table[site.table[5] == island,]
genes[i] = NA
if(length(levels(as.factor(as.character(info.mat[[4]])))) == 1)
{
genes[i] = as.character(info.mat[1,4])
}
island.values[i,]=apply(data.mat,2,mean, na.rm=T)
}#end if(nrow(data.mat) >= num.sites)
}#end for (i in 1:length(cpg.islands))
island.avg.table = data.frame(island=cpg.islands, gene=genes, island.values)
if(gene.centric)
{
island.avg.table = island.avg.table[!is.na(island.avg.table[[2]]), ]
}
annotations = island.avg.table[,1:2]
annotation.names = c("island","gene")
beta.values = island.avg.table[,3:ncol(island.avg.table)]
#print(beta.values)
rm(site.table)
rm(cpg.islands)
rm(genes)
rm(island.values)
#CpG Island statistical analysis
island.table = annotations
rm(annotations)
if(ref == "continuous"){
continous.var = sample.table[[2]]
if ((paired == TRUE) | (paired == "continuous")){
if(alt.pvalue == "RcppArmadillo.fastLmPure"){
print("Using fastLm and pt() instead of lm(), with 2nd variable")
independent.mat = cbind(continous.var, pairing.group)
lm.pvalue = apply(beta.values, 1, fastLm_wrapper2, independent.mat)
}else{
lm.pvalue = apply(beta.values, 1, lm.pvalue2, continous.var, pairing.group)
}
} else{
if(alt.pvalue == "cppLmResidual.1var"){
print("Using Rcpp residual t-test instead of lm()")
lm.pvalue = apply(beta.values, 1, cpp.lm.1var.wrapper, continous.var)
}else if(alt.pvalue == "RcppArmadillo.fastLmPure"){
print("Using fastLm and pt() instead of lm()")
lm.pvalue = apply(beta.values, 1, fastLm_wrapper, continous.var)
}else{
lm.pvalue = apply(beta.values, 1, lm.pvalue, continous.var)
}
}
lm.fdr = p.adjust(lm.pvalue, "fdr")
beta.cor = apply(beta.values, 1, custom.cor, var1=continous.var)
#upper.beta is max if positive correlation, min if negative correlation
#lower.beta is min if positive correlation, max if negative correlation
#in both cases, delta.beta is upper.beta - lower.beta
beta.min= apply(beta.values, 1, quantile, na.rm=TRUE, probs=c(lower.cont.quantile))
beta.max= apply(beta.values, 1, quantile, na.rm=TRUE, probs=c(upper.cont.quantile))
upper.beta = beta.max
upper.beta[beta.cor < 0] = beta.min[beta.cor < 0]
lower.beta = beta.min
lower.beta[beta.cor < 0] = beta.max[beta.cor < 0]
rm(beta.min)
rm(beta.max)
delta.beta = upper.beta - lower.beta
#make format compatible with 2-group code
island.table = data.frame(island.table, upper.beta = upper.beta, lower.beta = lower.beta,
delta.beta = delta.beta, island.pvalue = lm.pvalue, island.fdr = lm.fdr,
cor.coef = beta.cor)
} else if(length(groups) == 1) {
col.names = c(annotation.names)
print("Averaging Beta Values for All Samples")
if(length(sample.names) > 1)
{
avg.beta = apply(beta.values, 1, mean, na.rm = T)
}
else
{
avg.beta= beta.values
}
island.table = data.frame(island.table, avg.beta=avg.beta)
} else if(length(groups) == 2) {
print("Differential Methylation Stats for 2 Groups with Reference")
trt = groups[groups != ref]
#print(ref)
#print(trt)
#print(samples)
#print(sample.group)
trt.samples = samples[which(sample.group == trt)]
#print(trt.samples)
ref.samples = samples[which(sample.group == ref)]
#print(ref.samples)
all.indices = 1:length(sample.names)
trt.indices = all.indices[which(sample.group == trt)]
#print(trt.indices)
ref.indices = all.indices[which(sample.group == ref)]
#print(ref.indices)
trt.beta.values = beta.values[, trt.indices]
ref.beta.values = beta.values[, ref.indices]
if(length(trt.indices) > 1)
{
trt.avg.beta = apply(trt.beta.values, 1, mean, na.rm = T)
}
else
{
trt.avg.beta = trt.beta.values
}
if(length(ref.indices) > 1)
{
ref.avg.beta = apply(ref.beta.values, 1, mean, na.rm = T)
}
else
{
ref.avg.beta= ref.beta.values
}
delta.beta = trt.avg.beta - ref.avg.beta
if(paired == "continuous"){
print("Analysis of two numeric variables")
beta.pvalue = unlist(apply(beta.values, 1, annova.2way.pvalue, grp.levels=as.numeric(sample.group), pairing.levels=pairing.group))
}else if(paired){
print("Factor in Paired Samples")
if (alt.pvalue == "cppPairedTtest"){
print("Using Rcpp/Boost Paired t-test instead of 2-way ANOVA")
pair.table = table(sample.table[,2], sample.table[,3])
pairIDs=colnames(pair.table)[(as.numeric(pair.table[1,]) == 1)&(as.numeric(pair.table[2,]) == 1)]
if(length(pairIDs) != length(levels(as.factor(pairing.group)))){
print("Only pairings with 2 samples are used:")
print("While incomplete pairs will be removed for missing values,")
print("please only provide a sample description table with pairs that you would like to compare.")
stop()
}#end if(length(pair.table) != length(levels(as.factor(pairing.group)))
pairedT = c()
pairedR = c()
for(i in 1:length(pairIDs)){
pairID = pairIDs[i]
pairedT[i] = samples[(pairing.group == pairID)&(sample.group != ref)]
pairedR[i] = samples[(pairing.group == pairID)&(sample.group == ref)]
}#end for(pairID in names(pair.table))
pairTi = match(pairedT, samples)
pairRi = match(pairedR, samples)
beta.pvalue = apply(beta.values, 1, anova.pvalue, grp.levels=sample.group)
}else if(alt.pvalue == "cppANOVA.2way"){
print("Using Rcpp/Boost 2-way ANOVA")
max.df = length(samples)-length(levels(as.factor(sample.group)))*length(levels(as.factor(pairing.group)))
if(max.df < 1){
print("Not enough degrees of freedom for this implementation. Consider using 'alt.pvalue' = 'cppPairedTtest'")
stop()
}
beta.pvalue = unlist(apply(beta.values, 1, cpp.annova.2way.wrapper, sample.group, pairing.group, ref))
}else{
beta.pvalue = unlist(apply(beta.values, 1, annova.2way.pvalue, grp.levels=sample.group, pairing.levels=pairing.group))
}
}else{
if (alt.pvalue == "rANOVA.1way"){
print("Using R-based ANOVA instead of t-test")
beta.pvalue = apply(beta.values, 1, anova.pvalue, grp.levels=sample.group)
}else if (alt.pvalue == "cppANOVA.1way"){
print("Using Rcpp/Boost ANOVA instead of t-test")
beta.pvalue = apply(beta.values, 1, cpp.ANOVA.1way.wrapper, grp.levels=sample.group, ref=ref)
}else if (alt.pvalue == "cppWelshTtest"){
print("Using Rcpp/Boost Welsh t-test instead of t.test()")
beta.pvalue = apply(beta.values, 1, cpp.ttest.wrapper, grp.levels=sample.group, ref=ref)
}else{
beta.pvalue = apply(beta.values, 1, ttest2, grp1=trt.indices, grp2=ref.indices)
}
}#end else
beta.fdr = p.adjust(beta.pvalue, method="fdr")
#print(dim(island.table))
#print(length(trt.avg.beta))
#print(length(ref.avg.beta))
#print(length(delta.beta))
#print(length(beta.pvalue))
#print(length(beta.fdr))
island.table = data.frame(island.table, trt.beta = trt.avg.beta, ref.beta = ref.avg.beta, delta.beta = delta.beta, pvalue = beta.pvalue, fdr = beta.fdr)
colnames(island.table) = c(annotation.names,
paste(trt,"avg.beta",sep="."),
paste(ref,"avg.beta",sep="."),
paste(trt,"vs",ref,"delta.beta",sep="."),
"island.pvalue",
"island.fdr")
} else if(length(groups) > 2) {
print("Calculating Average Beta and ANOVA p-values")
col.names = c(annotation.names)
for (i in 1:length(groups))
{
temp.grp = groups[i]
all.indices = 1:length(sample.names)
grp.indices = all.indices[which(sample.group == temp.grp)]
grp.beta = beta.values[,which(sample.group == temp.grp)]
if(length(grp.indices) > 1)
{
avg.beta = apply(grp.beta, 1, mean, na.rm = T)
}
else
{
avg.beta= grp.beta
}
col.names = c(col.names, paste(temp.grp,"avg.beta",sep="."))
island.table = data.frame(island.table, avg.beta=avg.beta)
colnames(island.table) = col.names
}#end for (i in 1:length(groups)
if(paired == "continuous"){
print("Analysis of two numeric variables")
beta.pvalue = unlist(apply(beta.values, 1, annova.2way.pvalue, grp.levels=as.numeric(sample.group), pairing.levels=pairing.group))
}else if(paired){
print("Factor in Paired Samples")
beta.pvalue = unlist(apply(beta.values, 1, annova.2way.pvalue, grp.levels=sample.group, pairing.levels=pairing.group))
}else{
pvalue = apply(beta.values, 1, anova.pvalue, grp.levels = sample.group)
}#end else
anova.fdr = p.adjust(pvalue, method="fdr")
col.names = c(col.names, "island.pvalue", "island.fdr")
island.table = data.frame(island.table, anova.pvalue = pvalue, anova.fdr = anova.fdr)
colnames(island.table) = col.names
} else {
warning("Invalid groups specified in sample description file!")
}
island.table = data.frame(island.table, num.sites = sites.per.island[match(as.character(island.table$island),names(sites.per.island))])
if(output.format == 'xls'){
xlsfile = file.path(data.folder, paste(project.name,"_CpG_island_stats-Avg_by_Island.xlsx",sep=""))
WriteXLS("island.table", ExcelFileName = xlsfile)
} else if(output.format == 'csv'){
txtfile = file.path(data.folder, paste(project.name,"_CpG_island_stats-Avg_by_Island.csv",sep=""))
write.table(island.table, file=txtfile, quote=F, row.names=F, sep=",")
} else if(output.format == 'txt'){
txtfile = file.path(data.folder, paste(project.name,"_CpG_island_stats-Avg_by_Island.txt",sep=""))
write.table(island.table, file=txtfile, quote=F, row.names=F, sep="\t")
} else {
warning(paste(output.format," is not a valid output format! Please use 'txt' or 'xls'.",sep=""))
}
#filter CpG islands
print(dim(island.table))
filter.table = island.table
if((length(groups) == 2)|(ref == "continuous")){
temp.trt.beta = island.table[[3]]
temp.ref.beta = island.table[[4]]
temp.delta.beta = abs(island.table[[5]])
temp.pvalue = island.table[[6]]
temp.fdr = island.table[[7]]
if(unmethyl.cutoff > methyl.cutoff)
{
print("unmethyl.cutoff > methyl.cutoff ...")
print("so, delta-beta decides which group should be subject to which cutoff")
temp.delta.beta = island.table[[5]]
temp.methyl.up = filter.table[(temp.trt.beta >= methyl.cutoff) & (temp.ref.beta<=unmethyl.cutoff) & !is.na(temp.pvalue) & (temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff) & (temp.delta.beta >= delta.beta.cutoff),]
temp.methyl.down = filter.table[(temp.ref.beta >= methyl.cutoff) & (temp.trt.beta<=unmethyl.cutoff) & !is.na(temp.pvalue) & (temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff) & (temp.delta.beta <= -delta.beta.cutoff),]
filter.table = rbind(temp.methyl.up, temp.methyl.down) }
else
{
temp.methyl.up = filter.table[(temp.trt.beta >= methyl.cutoff) & (temp.ref.beta<=unmethyl.cutoff) & !is.na(temp.pvalue) & (temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff) & (temp.delta.beta >= delta.beta.cutoff),]
temp.methyl.down = filter.table[(temp.ref.beta >= methyl.cutoff) & (temp.trt.beta<=unmethyl.cutoff) & !is.na(temp.pvalue) & (temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff) & (temp.delta.beta >= delta.beta.cutoff),]
filter.table = rbind(temp.methyl.up, temp.methyl.down)
}
} else if(length(groups) == 1) {
temp.avg.beta = island.table$avg.beta
filter.table = filter.table[(temp.avg.beta >= methyl.cutoff) | (temp.avg.beta <=unmethyl.cutoff),]
} else if(length(groups) > 2) {
temp.pvalue = island.table[[ncol(island.table) -1]]
temp.fdr = island.table[[ncol(island.table)]]
filter.table = filter.table[(temp.pvalue <= pvalue.cutoff) & (temp.fdr <= fdr.cutoff),]
} else {
warning("Invalid groups specified in sample description file!")
}
print(dim(filter.table))
sig.islands = filter.table[[1]]
print(paste("There are ",length(sig.islands)," differentially methylated islands",sep=""))
if(output.format == 'xls'){
xlsfile = file.path(island.folder, paste(project.name,"_CpG_island_filtered-Avg_by_Island.xlsx",sep=""))
WriteXLS("filter.table", ExcelFileName = xlsfile)
} else if(output.format == 'csv'){
txtfile = file.path(island.folder, paste(project.name,"_CpG_island_filtered-Avg_by_Island.csv",sep=""))
write.table(filter.table, file=txtfile, quote=F, row.names=F, sep=",")
} else if(output.format == 'txt'){
txtfile = file.path(island.folder, paste(project.name,"_CpG_island_filtered-Avg_by_Island.txt",sep=""))
write.table(filter.table, file=txtfile, quote=F, row.names=F, sep="\t")
} else {
warning(paste(output.format," is not a valid output format! Please use 'txt' or 'xls'.",sep=""))
}
print(dim(island.avg.table))
island.avg.table = island.avg.table[match(sig.islands,island.avg.table[[1]],nomatch=0),]
print(dim(island.avg.table))
if(output.format == 'xls'){
xlsfile = file.path(data.folder, paste(project.name,"_CpG_island_filtered_beta_values-Avg_by_Island.xlsx",sep=""))
WriteXLS("island.avg.table", ExcelFileName = xlsfile)
} else if(output.format == 'csv'){
txtfile = file.path(data.folder, paste(project.name,"_CpG_island_filtered_beta_values-Avg_by_Island.csv",sep=""))
write.table(island.avg.table, file=txtfile, quote=F, row.names=F, sep=",")
} else if(output.format == 'txt'){
txtfile = file.path(data.folder, paste(project.name,"_CpG_island_filtered_beta_values-Avg_by_Island.txt",sep=""))
write.table(island.avg.table, file=txtfile, quote=F, row.names=F, sep="\t")
} else {
warning(paste(output.format," is not a valid output format! Please use 'txt' or 'xls'.",sep=""))
}
if((plot.box) & (nrow(island.avg.table) > 0))
{
#better to provide values between 0 and 1, but just in case user provided percent methylation values
plot.max = 1
if(max(island.avg.table[,3:ncol(island.avg.table)],na.rm=T) > 10){
plot.max = 100
}
if (ref == "continuous"){
print("Plotting Significant Islands for Continuous Variable..")
continous.var = sample.table[[2]]
plot.folder=file.path(island.folder,paste(project.name,"_Line_Plots",sep=""))
dir.create(plot.folder, showWarnings=FALSE)
for (i in 1:length(sig.islands))
{
island = as.character(island.avg.table[i,1])
gene = as.character(island.avg.table[i,2])
gene = gsub(";","_",gene)
island = gsub(":","_",island)
island = gsub("-","_",island)
plot.file = file.path(plot.folder, paste(gene,"_",island,".pdf", sep=""))
methyl.level = as.numeric(island.avg.table[i,3:ncol(island.avg.table)])
pdf(file=plot.file)
plot(continous.var, methyl.level, pch=16, col="black", ylim=c(0,plot.max),
main=paste(gene,sep=""), ylab="Methylation (Beta)", xlab="")
abline(lm(methyl.level~continous.var),lwd=2, col="red")
dev.off()
}#end for (1 in 1:length(sig.islands))
}else{
print("Plotting Significant Islands Box-Plots..")
plot.folder=file.path(island.folder,paste(project.name,"_Box_Plots",sep=""))
dir.create(plot.folder, showWarnings=FALSE)
labelColors = fixed.color.palatte[1:length(groups)]
#print(samples)
#print(sample.names)
for (i in 1:length(sig.islands))
{
island = as.character(island.avg.table[i,1])
gene = as.character(island.avg.table[i,2])
gene = gsub(";","_",gene)
island = gsub(":","_",island)
island = gsub("-","_",island)
plot.file = file.path(plot.folder, paste(gene,"_",island,".pdf", sep=""))
data = t(island.avg.table[i,3:ncol(island.avg.table)])
pdf(file=plot.file)
plot(sample.group, data, pch=20, col=labelColors, ylim=c(0,plot.max),
main=paste(gene,sep=""), ylab="Methylation (Beta)")
dev.off()
}#end for (1 in 1:length(sig.islands))
}#end else
}#end if((plot.box) & (nrow(island.avg.table) > 0))
integrate.tables = list(beta.table=island.avg.table, filtered.island.stats=filter.table)
if((plot.heatmap)& (nrow(island.avg.table) > 1)& (nrow(island.avg.table) < 10000)){
temp.beta.mat = apply(island.avg.table[,3:ncol(island.avg.table)], 1, as.numeric)
if(length(sig.islands) < 25){
colnames(temp.beta.mat) = sig.islands
} else {
colnames(temp.beta.mat) = rep("", length(sig.islands))
}
rownames(temp.beta.mat) = samples
labelColors = rep("black",times=nrow(temp.beta.mat))
if(ref == "continuous"){
continuous.color.breaks = 10
plot.var = as.numeric(continous.var)
plot.var.min = min(plot.var, na.rm=T)
plot.var.max = max(plot.var, na.rm=T)
plot.var.range = plot.var.max - plot.var.min
plot.var.interval = plot.var.range / continuous.color.breaks
color.range = colorRampPalette(c("green","black","orange"))(n = continuous.color.breaks)
plot.var.breaks = plot.var.min + plot.var.interval*(0:continuous.color.breaks)
for (j in 1:continuous.color.breaks){
labelColors[(plot.var >= plot.var.breaks[j]) &(plot.var <= plot.var.breaks[j+1])] = color.range[j]
}#end for (j in 1:continuous.color.breaks)
}else{
color.palette = fixed.color.palatte[1:length(groups)]
for (j in 1:length(groups)){
labelColors[sample.group == as.character(groups[j])] = color.palette[j]
}#end for (j in 1:length(groups))
}
beta.breaks = c(0:40*2.5)
if(max(temp.beta.mat, na.rm=T) < 2){
beta.breaks = beta.breaks / 100
}else{
print("Adjusting heatmap scale, assuming percent methylation between 0 and 100")
}
heatmap.file = file.path(island.folder, paste(project.name,"_CpG_island_heatmap-Avg_by_Island.pdf",sep=""))
pdf(file = heatmap.file)
heatmap.2(temp.beta.mat,
col=colorpanel(40, low="blue", mid="black", high="red"), breaks=beta.breaks,
density.info="none", key=TRUE, distfun = heatmap.dist.fun,
RowSideColors=labelColors, trace="none", margins = c(15,15), cexCol=0.8, cexRow=0.8)
if(ref == "continuous"){
legend("topright",legend=c(round(plot.var.max,digits=1),rep("",length(color.range)-2),round(plot.var.min,digits=1)),
col=rev(color.range), pch=15, y.intersp = 0.4, cex=0.8, pt.cex=1.5)
}else{
legend("topright",legend=groups,col=color.palette, pch=15)
}
dev.off()
}#end if((plot.heatmap)& (nrow(island.avg.table) > 0))
return(integrate.tables)
}#end def COHCAP.avg.by.island
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.