R/champ.DMR.R

if(getRversion() >= "3.1.0") utils::globalVariables(c("myNorm","myLoad","mylimma","detectCores","probe.features","illumina450Gr","seqlevels<-","seqlevels"))

champ.DMR <- function(beta=myNorm,
                      pheno=myLoad$pd$Sample_Group,
                      compare.group=NULL,
                      arraytype="450K",
                      method = "Bumphunter",
                      minProbes=7,
                      adjPvalDmr=0.05,
                      cores=3,
                      ## following parameters are specifically for Bumphunter method.
                      maxGap=300,
                      cutoff=NULL,
                      pickCutoff=TRUE,
                      smooth=TRUE,
                      smoothFunction=loessByCluster,
                      useWeights=FALSE,
                      permutations=NULL,
                      B=250,
                      nullMethod="bootstrap",
                      ## following parameters are specifically for probe ProbeLasso method.
                      meanLassoRadius=375,
                      minDmrSep=1000,
                      minDmrSize=50,
                      adjPvalProbe=0.05,
                      Rplot=T,
                      PDFplot=T,
                      resultsDir="./CHAMP_ProbeLasso/",
                      ## following parameters are specifically for DMRcate method.
                      rmSNPCH=T,
                      fdr=0.05,
                      dist=2,
                      mafcut=0.05,
                      lambda=1000,
                      C=2)
{
    message("[===========================]")
    message("[<<<<< ChAMP.DMR START >>>>>]")
    message("-----------------------------")

    message("!!! important !!! We just upgrate champ.DMR() function, since now champ.DMP() could works on multiple phenotypes, but ProbeLasso can only works on one DMP result, so if your pheno parameter contains more than 2 phenotypes, and you want to use ProbeLasso function, you MUST specify compare.group=c(\"A\",\"B\"). Bumphunter and DMRcate should not be influenced.")

    message("\n[ Section 1:  Check Input Pheno Start ]\n")

     if(length(which(is.na(beta)))>0) message(length(which(is.na(beta)))," NA are detected in your beta Data Set, which may cause fail or uncorrect of SVD analysis. You may want to impute NA with champ.impute() function first.")

    if(!class(pheno) %in% c("character","factor","numeric")) stop("pheno parameter must be a category vector, could be character, factor or numeric (numeric is not recommended).")

    message("  You pheno is ",class(pheno)," type.")
    message("    Your pheno information contains following groups. >>")
    sapply(unique(pheno),function(x) message("    <",x,">:",sum(pheno==x)," samples."))


    if(method == "ProbeLasso")
    {
        message("  ProbeLasso Method can only be done between two phenotypes. So we need to do more check here...")
        if(length(unique(pheno))>2)
        {
            message("    Your pheno contains more than two phenotypes.")
            message("    You may specify compare.group to do comparision between certain two phenotypes")
            if(is.null(compare.group))
            {
                stop("    You did not specifically compare.group parameter to specify which two phenotypes you want to analysis.")
            } else if(sum(compare.group %in% unique(pheno)) == 2) {
                message("    Your compare.group is in accord with your pheno parameter, which is good.")
                message("    Now champ.DMR() would extract values for only these two phenotypes to analysis.")
                beta <- beta[,which(pheno %in% compare.group)]
                pheno <- pheno[which(pheno %in% compare.group)]
            } else {
                stop("    Seems you specified compare.group, but elements in your compare.group are not all found in your pheno parameter. Please recheck your pheno or compare.group.")
            }
        } else if(length(unique(pheno))==2) {
            message("    Your pheno parameter contains extactly two phenotypes, which is good and compare.group is not needed, champ.DMR() would proceed with your whole data set.")
        } else {
            stop("    Seems something wrong with your pheno data. champ.DMR() can not proceed. Please recheck your pheno information.")
        }
    }
    message("\n[ Section 1:  Check Input Pheno Done ]\n")

    message("\n[ Section 2:  Run DMR Algorithm Start ]\n")

    if(arraytype=="EPIC"){
        RSobject <- RatioSet(beta, annotation = c(array = "IlluminaHumanMethylationEPIC",annotation = "ilm10b4.hg19"))
    }else{
        RSobject <- RatioSet(beta, annotation = c(array = "IlluminaHumanMethylation450k",annotation = "ilmn12.hg19"))
    }
    probe.features <- getAnnotation(RSobject)

    if(cores > detectCores()) cores <- detectCores()


    if(method=="Bumphunter")
    {
        message("<< Find DMR with Bumphunter Method >>")

        message(cores," cores will be used to do parallel Bumphunter computing.")
        registerDoParallel(cores = cores)
        

        cpg.idx <- intersect(rownames(beta),rownames(probe.features))
        Anno <- probe.features[cpg.idx,]
        Anno <- Anno[order(Anno$chr,Anno$pos),]
        cpg.idx <- rownames(Anno)

        cl <- clusterMaker(Anno$chr,Anno$pos,maxGap=maxGap)
        names(cl) <- cpg.idx
        bumphunter.idx <- cpg.idx[which(cl %in% names(which(table(cl)>minProbes)))]

        message("According to your data set, champ.DMR() detected ",sum(table(cl)>minProbes)," clusters contains MORE THAN ",minProbes," probes within",maxGap," maxGap. These clusters will be used to find DMR.\n")

        X <- cbind(1,(as.numeric(as.factor(pheno))-1))
        Beta <- beta[bumphunter.idx,]
        # Beta <- replace(Beta,which(Beta <= 0.001),0.001)
        # Beta <- replace(Beta,which(Beta >= 0.999),0.999)
        Beta[Beta <= 0.001] <- 0.001
        Beta[Beta >= 0.999] <- 0.999
        Y <- log((Beta/(1-Beta)),2)
        
        Bumps <- bumphunter(Y,
                            design=X,
                            chr=Anno[bumphunter.idx,]$chr,
                            pos=Anno[bumphunter.idx,]$pos,
                            cluster=cl[bumphunter.idx],
                            cutoff=cutoff,
                            pickCutoff=pickCutoff,
                            smooth=smooth,
                            smoothFunction=smoothFunction,
                            useWeights=useWeights,
                            permutations=permutations,
                            verbose=TRUE,
                            B=B,
                            nullMethod=nullMethod)

        message("<< Calculate DMR success. >>")
        DMR <- Bumps$table[which(Bumps$table$p.valueArea <= adjPvalDmr),]
        message("Bumphunter detected ",nrow(DMR)," DMRs with P value <= ",adjPvalDmr,".")

        if(nrow(DMR) == 0) stop("No DMR detected.")

        rownames(DMR) <- paste("DMR",1:nrow(DMR),sep="_")
        #DMRProbes <- apply(DMR,1,function(x) Anno[which(Anno$chr==x[1] & Anno$pos>= as.numeric(x[2]) & Anno$pos<= as.numeric(x[3])),])
        DMR <- data.frame(DMR[,1:3],width=DMR[,3]-DMR[,2],strand="*",DMR[,4:14])
        colnames(DMR)[1:3] <- c("seqnames","start","end") 

        #OutputDMR <- list(BumphunterDMR=DMR,BumphunterDMRProbes=DMRProbes)
        OutputDMR <- list(BumphunterDMR=DMR)

    } else if(method == "ProbeLasso")
    {
        if (!file.exists(resultsDir)) dir.create(resultsDir)
        message("champ.DMR Results will be saved in ",resultsDir)

        message("<< Find DMR with ProbeLasso Method >>")
        gc()

        DMP <- champ.DMP(beta=beta,pheno=pheno,adjPVal=1)
        if(length(DMP) > 1) stop("Your pheno parameter seems contains more than 2 phenotypes. champ.DMR() only take covariates with only 2 phenotypes. Please manually extract your sample and covariates, then retry champ.DMR()")
        DMP <- DMP[[1]]

        if(arraytype=="EPIC") data(illuminaEPICGr) else data(illumina450Gr)
        if(length(which(DMP$adj.P.Val < adjPvalProbe))==0) stop("There is no probe show significant difference from champ.DMP() function.")


        myResultsGr <- illumina450Gr[match(rownames(DMP), names(illumina450Gr))]
        myResultsGr$P.Value <- DMP$P.Value[match(names(myResultsGr), rownames(DMP))];
        myResultsGr$adj.P.Val <- DMP$adj.P.Val[match(names(myResultsGr), rownames(DMP))]
        seqlevels(myResultsGr) <- sort(seqlevels(myResultsGr));
        myResultsGr <- sort(myResultsGr,ignore.strand=T) # sort for later
        ### readjust pValues after masking
        myResultsGr$adj.P.Val <- p.adjust(mcols(myResultsGr)$P.Value, method = "BH")
        ### Probe spacing and quantile derivation

        message("<< Get closestProbe for each Probe >>")
        closestProbe <- as.data.frame(distanceToNearest(myResultsGr,ignore.strand=T))$distance
        closestProbeSp <- split(closestProbe, mcols(myResultsGr)$featureCgi); rm(closestProbe)

        message("<< Get lassoQuantileThreshold for each featureCgi >>")
        lassoQuantileDeviation <- abs(meanLassoRadius - rowMeans(as.data.frame(lapply(closestProbeSp,function(x) quantile(x,(1:1000)/1000)))))
        lassoQuantileThreshold <- which.min(lassoQuantileDeviation) / 1000;
        lassoSizes <- lapply(closestProbeSp, function(x) quantile(x, lassoQuantileThreshold, na.rm = T))

        message("<< Get expend ranges for each probe >>")
        myResultsGrSp <- split(myResultsGr, myResultsGr$featureCgi) # splits myResultsGr by 'featureCgi'; length = 28
        lassoGr <- mapply(function(x, y) promoters(x, upstream = y, downstream = y), x = myResultsGrSp, y = lassoSizes)
        lassoGr <- unlist(GRangesList(lassoGr)); rm(myResultsGrSp)
        myResultsSigGr <- myResultsGr[which(mcols(myResultsGr)$adj.P.Val < adjPvalProbe)]
        lassoProbeCountOverlap <- countOverlaps(lassoGr, myResultsSigGr, ignore.strand = T);rm(myResultsSigGr)

        message("<< Get DMR from overlapped probes >>")
        dmrGr <- reduce(lassoGr[which(lassoProbeCountOverlap >= minProbes)], min.gapwidth = minDmrSep, ignore.strand=TRUE);
        rm(lassoProbeCountOverlap, lassoGr) # lassos capturing 'minSigProbesLasso', merged 
        strand(dmrGr) <- '*'
        dmrGr <- dmrGr[which(width(dmrGr) > minDmrSize)] # remove DMRs < minDmrSize
        probeIndex <- as.data.frame(findOverlaps(dmrGr, myResultsGr))
        pValuesGr <- myResultsGr[probeIndex$subjectHits, "P.Value"]
        myBetas <- beta[match(names(pValuesGr), rownames(beta)), ]
        myBetas <- split(as.data.frame(myBetas), probeIndex$queryHits)
            
        message("<< Get adjusted P value for DMR >>")
        correl <- lapply(myBetas, function(x) cor(t(x)))
        weights <- lapply(correl, function(x) 1/apply(x^2,1,sum)); rm(correl)
        dmrQP <- qnorm(mcols(pValuesGr)$P.Value); dmrQP <- split(dmrQP, probeIndex$queryHits)
        dmrQPW <- mapply("*", dmrQP, weights); rm(dmrQP)
        if(class(dmrQPW) == "matrix") dmrStat <- sum(dmrQPW) else dmrStat <- lapply(dmrQPW, sum)
        rm(dmrQPW)
        dmrSd <- lapply(weights, function(x) sqrt(sum(x^2))); rm(weights)
        dmrP <- mapply(function(x,y) pnorm(x,0, sd=y), dmrStat, dmrSd); rm(dmrStat, dmrSd)
        dmrP <- p.adjust(dmrP, method = "BH")
        goodDmr <- which(dmrP < adjPvalDmr)
        dmrGr <- dmrGr[goodDmr] 
        dmrP <- dmrP[goodDmr]
        dmrpRank <- rank(dmrP, ties.method="min"); rm(goodDmr)



        ### get pvalues and betas for GOOD DMRs
        message("<< Get Start-End Ranges for each DMR >>")
        probeIndex <- as.data.frame(findOverlaps(dmrGr, myResultsGr))
        dmrProbesGr <- myResultsGr[probeIndex$subjectHits]
        myBetas <- beta[match(names(dmrProbesGr), rownames(beta)), ]; myBetas <- as.data.frame(myBetas)
        dmrCoreStart <- start(dmrProbesGr)
        dmrCoreEnd <- end(dmrProbesGr)
        myBetas <- split(myBetas, probeIndex$queryHits)
        dmrCoreStart <- split(dmrCoreStart, probeIndex$queryHits); dmrCoreStart <- sapply(dmrCoreStart, min)
        dmrCoreEnd <- split(dmrCoreEnd, probeIndex$queryHits); dmrCoreEnd <- sapply(dmrCoreEnd, max)
        ### calculate methylation scores for each DMR in each Sample_Group

        message("<< Calculate Methylation Scores for each DMR >>")
        groupIndex <- pheno
        dmrGroupMeans <- do.call(rbind, lapply(myBetas, function(x) sapply(split(t(x), groupIndex), mean)))
        colnames(dmrGroupMeans) <- paste("betaAv", colnames(dmrGroupMeans), sep = "_")
        probeGroupMeans <- lapply(myBetas, function(x) split(as.data.frame(t(x)), groupIndex)); rm(groupIndex, myBetas)
        probeGroupMeans <- lapply(probeGroupMeans, function(x) lapply(x, colMeans))
        probeGroupMeans <- do.call(rbind, lapply(probeGroupMeans, function(x) t(do.call(rbind, x))))
        colnames(probeGroupMeans) <- paste("betaAv", colnames(probeGroupMeans), sep = "_")

        ### probe-level data and DMR metadata

        message("<< Generate Probe-level Data >>")
        myDmrProbesGr <- myResultsGr[probeIndex$subjectHits]
        myDmrProbesGr <- as(cbind(as.data.frame(myDmrProbesGr), probeGroupMeans), "GRanges");
        rm(probeGroupMeans)
        myDmrProbesGr$dmrNo <- probeIndex$queryHits
        myDmrProbesGr$dmrP <- dmrP[probeIndex$queryHits]
        myDmrProbesGr$dmrpRank <- dmrpRank[probeIndex$queryHits]
        myDmrProbesGr$dmrChrom <- seqnames(dmrGr[probeIndex$queryHits])
        myDmrProbesGr$dmrStart <- start(dmrGr[probeIndex$queryHits])
        myDmrProbesGr$dmrEnd <- end(dmrGr[probeIndex$queryHits])
        myDmrProbesGr$dmrSize <- width(dmrGr[probeIndex$queryHits])
        myDmrProbesGr$dmrCoreStart <- dmrCoreStart[probeIndex$queryHits]
        myDmrProbesGr$dmrCoreEnd <- dmrCoreEnd[probeIndex$queryHits]    
        myDmrProbesGr$dmrCoreSize <- myDmrProbesGr$dmrCoreEnd - myDmrProbesGr$dmrCoreStart + 1

        ### DMR metadata
        message("<< Generate DMR metadata >>")
        myDmrGr <- dmrGr
        myDmrGr$dmrNo <- unique(probeIndex$queryHits)
        myDmrGr$dmrP <- dmrP; rm(dmrP)
        myDmrGr$dmrpRank <- dmrpRank; rm(dmrpRank)
        myDmrGr$dmrChrom <- seqnames(dmrGr) 
        myDmrGr$dmrStart <- start(dmrGr)
        myDmrGr$dmrEnd <- end(dmrGr)
        myDmrGr$dmrSize <- width(dmrGr); rm(dmrGr)
        myDmrGr$dmrCoreStart <- dmrCoreStart
        myDmrGr$dmrCoreEnd <- dmrCoreEnd
        myDmrGr$dmrCoreSize <- myDmrGr$dmrCoreEnd - myDmrGr$dmrCoreStart + 1
        genes <- split(as.data.frame(myResultsGr)[probeIndex$subjectHits, c("ensemblID", "geneSymbol")], probeIndex$queryHits); rm(probeIndex)
        myDmrGr$ensemblID <- sapply(genes, function(x) paste(unique(unlist(strsplit(x$ensemblID, ";"))), collapse = ";"))
        myDmrGr$geneSymbol <- sapply(genes, function(x) paste(unique(unlist(strsplit(x$geneSymbol, ";"))), collapse = ";")); rm(genes)
        myDmrGr <- as(cbind(as.data.frame(myDmrGr), dmrGroupMeans), "GRanges"); rm(dmrGroupMeans) 

	    # plot of lassos
        interplot <- function(illumina450Gr,lassoSizes)
        {
            cgiNames <- levels(illumina450Gr$cgi)
            featureNames <- levels(illumina450Gr$feature)
            sfo <- c(6, 7, 3, 1, 4, 2, 5)
            scgio <- c(1, 4, 3, 2)
            sfcgio <- rep((sfo - 1) *4, each = 4) + rep(scgio, 7)
            lassoSizes <- round(unlist(lassoSizes))
            #imageName <- paste(getwd(),"myLassos.pdf",sep="/")
            #pdf(imageName,width=9,height=9)

            par(mar = c(7, 4, 4, 3)+0.5)
            plot(c(1,28), y=c(range(0.3*sqrt(lassoSizes))[1]*0.8, range(0.3*sqrt(lassoSizes))[2]*1.2), type="n", xaxt="n", xlab="", yaxt="n", ylab="lasso radius [bp]", main=paste("lasso quantile = ", round(lassoQuantileThreshold,2), "\nmean lasso radius = ", meanLassoRadius, "bp", sep = ""), bty="n")
            segments(1:28, rep(0,28), 1:28, 0.3*sqrt(lassoSizes[sfcgio]), lty=3, col="grey")
            points(1:28, 0.3*sqrt(lassoSizes[sfcgio]), pch=16, cex=0.3*sqrt(lassoSizes[sfcgio]), col=rep(rainbow(7,alpha=0.5)[sfo], each=4))
            text(1:28, 0.3*sqrt(lassoSizes[sfcgio]), lassoSizes[sfcgio], pos=3, cex=0.8)
            axis(1, at = 1:28, labels = rep(cgiNames[scgio], 7), las = 2)
            par(xpd = T)
            segments(seq(1, 28, 4), rep(-2.5, 7), seq(4, 28, 4), rep(-2.5, 7))
            mtext(text = featureNames[sfo], side = 1, at = seq(2.5, 28, 4), line = 5.5, las = 1, cex.axis = 1)
            axis(2, at=c(0,max(0.3*sqrt(lassoSizes))), labels=F)
        }
        if(Rplot) interplot(illumina450Gr,lassoSizes)
        if(PDFplot)
        {
            pdf(paste(resultsDir,"myLassos.pdf",sep="/"),width=9,height=9)
            interplot(illumina450Gr,lassoSizes)
            dev.off()
        }
        DMRProbes <- as.data.frame(myDmrProbesGr)
        DMRProbes <- data.frame(probe.features[rownames(DMRProbes),],DMRProbes[,which(colnames(DMRProbes)=="P.Value"):which(colnames(DMRProbes)=="dmrNo")])
        DMRProbes <- split(DMRProbes,DMRProbes$dmrNo)
        DMR <- as.data.frame(myDmrGr)

        message("ProbeLasso detected ",nrow(DMR)," DMRs with P value <= ",adjPvalDmr,".")
        if(nrow(DMR) == 0) stop("No DMR detected.")

        rownames(DMR) <- paste("DMR",DMR$dmrNo,sep="_")
        names(DMRProbes) <- rownames(DMR)

        if(arraytype=="EPIC")
            DMR[,1] <- paste("chr",DMR[,1],sep="")

        #OutputDMR <- list(ProbeLassoDMR=DMR,ProbeLassoDMRProbes=DMRProbes)
        OutputDMR <- list(ProbeLassoDMR=DMR)

    }else if(method=="DMRcate")
    {

        message(cores," cores will be used to do parallel DMRcate computing.")

        message("<< Find DMR with DMRcate Method >>")
        myMs <- logit2(beta)
        if(rmSNPCH) myMs <- rmSNPandCH(myMs, dist=dist, mafcut=mafcut)
        design <- model.matrix(~ pheno)

        if(arraytype == "450K"){
            myannotation <- cpg.annotate(datatype="array",fdr=fdr, myMs,design=design,coef=ncol(design), analysis.type="differential",annotation=c(array = "IlluminaHumanMethylation450k", annotation = "ilmn12.hg19"),what="M")
        } else{
            myannotation <- cpg.annotate(datatype="array",fdr=fdr, myMs,design=design,coef=ncol(design), analysis.type="differential",annotation=c(array = "IlluminaHumanMethylationEPIC", annotation = "ilm10b4.hg19"),what="M")
        }
        M <- do.call("cbind", lapply(myannotation, as.data.frame))
        colnames(M) <- names(myannotation)

        dmrcoutput <- dmrcate(myannotation, min.cpgs = minProbes, lambda=lambda, C=C,mc.cores = cores)
        data(dmrcatedata)
        DMR <- as.data.frame(extractRanges(dmrcoutput, genome = "hg19"))

        message("DMRcate detected ",nrow(DMR)," DMRs with mafcut as= ",adjPvalDmr,".")
        if(nrow(DMR) == 0) stop("No DMR detected.")

        if(nrow(DMR)!=0)
        {
            #DMRProbes <- apply(DMR,1,function(x) M[which(M[,3]==x[1] & M[,4]>= as.numeric(x[2]) & M[,4]<= as.numeric(x[3])),])
            rownames(DMR) <- paste("DMR",1:nrow(DMR),sep="_")
            #names(DMRProbes) <- rownames(DMR)
            #for(i in names(DMRProbes)) rownames(DMRProbes[[i]]) <- DMRProbes[[i]]$ID
            #X <- lapply(DMRProbes,as.data.frame)
            #DMRProbes <- lapply(X,function(x) cbind(ID=x[,1],probe.features[rownames(x),],x[,c(2,5,6,7)]))
            
            OutputDMR <- list(DMRcateDMR=DMR)
        }else
        {
            OutputDMR <- NULL
        }
        #OutputDMR <- list(DMRcateDMR=DMR,DMRcateDMRProbes=DMRProbes)

    } else {
        stop("Please assign correct DMR method: 'Bumphunter' or 'ProbeLasso'")
    }
    message("\n[ Section 2:  Run DMR Algorithm Done ]\n")

    message("[<<<<<< ChAMP.DMR END >>>>>>]")
    message("[===========================]")
    message("[You may want to process DMR.GUI() or champ.GSEA() next.]\n")
    return(OutputDMR)
}
gaberosser/ChAMP documentation built on May 9, 2019, 2:22 a.m.