inst/doc/recountmethylation_data_analyses.R

## ----setup, echo = FALSE------------------------------------------------------
suppressMessages(library(rhdf5))
suppressMessages(library(minfi))
suppressMessages(library(recountmethylation))
suppressMessages(library(knitr))
suppressMessages(library(ggplot2))
suppressMessages(library(gridExtra))
suppressMessages(library(GenomicRanges))
suppressMessages(library(limma))
suppressMessages(library(HDF5Array))
opts_chunk$set(eval = FALSE, echo = TRUE, 
               warning = FALSE, message = FALSE)

## ---- eval = TRUE-------------------------------------------------------------
sf <- system.file(file.path("extdata", "data_analyses"), 
                  package = "recountmethylation")
load(file.path(sf, "data_analyses.RData"))

## ---- eval = TRUE-------------------------------------------------------------
# get local metadata
path <- system.file("extdata", "metadata", package = "recountmethylation")
mdpath <- paste(path, list.files(path)[1], sep = "/")
md <- get(load(mdpath))

## -----------------------------------------------------------------------------
#  # load methylset
#  gmdn <- "remethdb-h5se_gm_0-0-1_1590090412"
#  gm <- loadHDF5SummarizedExperiment(gmdn)
#  # load grset
#  grdn <- "remethdb-h5se_gr_0-0-1_1590090412"
#  gr <- loadHDF5SummarizedExperiment(grdn)

## ---- eval = TRUE-------------------------------------------------------------
mdf <- md[!md$age == "valm:NA",]
mdf$chron.age <- as.numeric(gsub(";.*", "", gsub("^valm:", "", mdf$age)))
mdf$predage <- as.numeric(mdf$predage)
mdf <- mdf[!is.na(mdf$chron.age),]
mdf <- mdf[!is.na(mdf$predage),]

## ---- eval = TRUE-------------------------------------------------------------
mdf$stype <- as.character(gsub(";.*", "", 
  gsub("^msraptype:", "", mdf$sampletype)))
mdf <- mdf[!is.na(mdf$stype),]

## ---- eval = TRUE-------------------------------------------------------------
mdf$is.cx <- ifelse(grepl(".*cancer.*", mdf$disease), TRUE, FALSE)

## ---- eval = TRUE-------------------------------------------------------------
xdif <- ngsm <- c()
for(g in unique(mdf$gseid)){
    mdff <- mdf[mdf$gseid==g, ]
    xdif <- c(xdif, mean(abs(mdff$chron.age - as.numeric(mdff$predage))))
    ngsm <- c(ngsm, nrow(mdff))
}
names(xdif) <- names(ngsm) <- unique(mdf$gseid)

## ---- eval = TRUE-------------------------------------------------------------
filt <- mdf$stype == "tissue" & !mdf$is.cx
filt <- filt & !mdf$gseid %in% names(xdif[xdif > 10])
mdff <- mdf[filt, ]

## ---- eval = TRUE-------------------------------------------------------------
lm1 <- lm(mdf$predage ~ mdf$chron.age + mdf$gseid + mdf$stype + mdf$is.cx)
lm2 <- lm(mdff$predage ~ mdff$chron.age + mdff$gseid)

## ---- eval = TRUE-------------------------------------------------------------
# anovas
av1 <- anova(lm1)
av2 <- anova(lm2)
# results summaries
sperc1 <- round(100*av1$`Sum Sq`[1:4]/sum(av1$`Sum Sq`), 2)
pval1 <- format(av1$`Pr(>F)`[1:4], scientific = TRUE, digits = 3)
sperc2 <- round(100*av2$`Sum Sq`[1:2]/sum(av2$`Sum Sq`), 2)
pval2 <- format(av2$`Pr(>F)`[1:2], scientific = TRUE, digits = 3)
# summary table
dan <- data.frame(Vperc1 = c(sperc1), 
                  Pval1 = c(pval1),
                  Vperc2 = c(sperc2, "-", "-"), 
                  Pval2 = c(pval2, "-", "-"), 
                  stringsAsFactors = FALSE)
rownames(dan) <- c("Chron.Age", "GSEID", "SampleType", "Cancer")
knitr::kable(dan, align = "c")

## ---- eval = TRUE-------------------------------------------------------------
# rsquared
rsq1 <- round(summary(lm1)$r.squared, 2)
rsq2 <- round(summary(lm2)$r.squared, 2)
# correlation coefficient
rho1 <- round(cor.test(mdf$predage, mdf$chron.age, 
                      method = "spearman")$estimate, 2)
rho2 <- round(cor.test(mdff$predage, mdff$chron.age, 
                       test = "spearman")$estimate, 2)
# mean absolute difference
mad1 <- round(mean(abs(mdf$chron.age - mdf$predage)), 2)
mad2 <- round(mean(abs(mdff$chron.age - mdff$predage)), 2)

## ---- eval = TRUE-------------------------------------------------------------
dss <- data.frame(group = c("1", "2"),
                  ngsm = c(nrow(mdf), nrow(mdff)),
                  ngse = c(length(unique(mdf$gseid)), 
                    length(unique(mdff$gseid))),
                  r.squared = c(rsq1, rsq2), rho = as.character(c(rho1, rho2)),
                  mad = c(mad1, mad2), stringsAsFactors = FALSE)
knitr::kable(dss, align = "c")

## ---- eval = TRUE, fig.width = 3.8, fig.height = 3.4--------------------------
plot(xdif, ngsm, ylab = "Study Size (Num. GSM)", 
     xlab = "Age Difference, MAD[Chron, Pred]")
abline(v = 10, col = "red")

## ---- eval = TRUE, fig.width = 3.4, fig.height = 3.1--------------------------
ggplot(mdff, aes(x = chron.age, y = predage)) +
  geom_point(size = 1.2, alpha = 0.2) + geom_smooth(method = "lm", size = 1.2) +
  theme_bw() + xlab("Chronological Age") + ylab("Epigenetic (DNAm) Age")

## ---- eval = TRUE-------------------------------------------------------------
mdf <- md[!md$storage == "NA",]
mdf$sgroup <- ifelse(grepl("FFPE", mdf$storage), "ffpe", "frozen")

## -----------------------------------------------------------------------------
#  # get summary table
#  sst <- get_sst(sgroup.labs = c("ffpe", "frozen"), mdf)
#  knitr::kable(sst, align = "c") # table display

## -----------------------------------------------------------------------------
#  gmf <- gm[, gm$gsm %in% mdf$gsm] # filt h5se object
#  mdf <- mdf[order(match(mdf$gsm, gmf$gsm)),]
#  identical(gmf$gsm, mdf$gsm)
#  gmf$storage <- mdf$storage # append storage info

## -----------------------------------------------------------------------------
#  meth.all <- getMeth(gmf)
#  unmeth.all <- getUnmeth(gmf)

## -----------------------------------------------------------------------------
#  blocks <- getblocks(slength = ncol(gmf), bsize = 1000)

## -----------------------------------------------------------------------------
#  ms <- matrix(nrow = 0, ncol = 2)
#  l2meth <- l2unmeth <- c()
#  for(i in 1:length(blocks)){
#    b <- blocks[[i]]
#    gmff <- gmf[, b]
#    methb <- as.matrix(meth.all[, b])
#    unmethb <- as.matrix(unmeth.all[, b])
#    l2meth <- c(l2meth, apply(methb, 2, function(x){
#      log2(median(as.numeric(x)))
#    }))
#    l2unmeth <- c(l2unmeth, apply(unmethb, 2, function(x){
#      log2(median(as.numeric(x)))
#    }))
#    ms <- rbind(ms, matrix(c(l2meth, l2unmeth), ncol = 2))
#    message(i)
#  }
#  rownames(ms) <- colnames(meth.all)
#  colnames(ms) <- c("meth.l2med", "unmeth.l2med")
#  ds <- as.data.frame(ms)
#  ds$storage <- ifelse(grepl("FFPE", gmf$storage), "ffpe", "frozen")

## ---- eval = TRUE, fig.width = 4.3, fig.height = 3.1--------------------------
ggplot(ds, aes(x = meth.l2med, y = unmeth.l2med, color = storage)) + 
  geom_point(alpha = 0.35, cex = 3) + theme_bw() +
  scale_color_manual(values = c("ffpe" = "orange", "frozen" = "purple"))

## ---- eval = TRUE, fig.width = 4.5, fig.height = 2.5--------------------------
vp <- matrix(nrow = 0, ncol = 2)
vp <- rbind(vp, matrix(c(ds$meth.l2med, paste0("meth.", ds$storage)), 
  ncol = 2))
vp <- rbind(vp, matrix(c(ds$unmeth.l2med, paste0("unmeth.", ds$storage)), 
  ncol = 2))
vp <- as.data.frame(vp, stringsAsFactors = FALSE)
vp[,1] <- as.numeric(vp[,1])
colnames(vp) <- c("signal", "group")
vp$col <- ifelse(grepl("ffpe", vp$group), "orange", "purple")
# make plot
ggplot(vp, aes(x = group, y = signal, color = group)) + 
  scale_color_manual(values = c("meth.ffpe" = "orange", 
    "unmeth.ffpe" = "orange", "meth.frozen" = "purple", 
    "unmeth.frozen" = "purple")) +
  geom_violin(draw_quantiles = c(0.5)) + theme_bw() + 
    theme(legend.position = "none")

## ---- eval = TRUE-------------------------------------------------------------
gsmv <- c(adipose.gsmv, liver.gsmv)
mdf <- md[md$gsm %in% gsmv,]
mdf$sgroup <- ifelse(mdf$gsm %in% adipose.gsmv, "adipose", "liver")
sst.tvar <- get_sst(sgroup.labs = c("liver", "adipose"), mdf)
knitr::kable(sst.tvar, align = "c")

## -----------------------------------------------------------------------------
#  ms <- gm[,colnames(gm) %in% rownames(mdf)]
#  ms <- ms[,order(match(colnames(ms), rownames(mdf)))]
#  identical(colnames(ms), rownames(mdf))
#  # [1] TRUE
#  ms$sgroup <- mdf$sgroup
#  ms <- mapToGenome(ms)
#  dim(ms)
#  # [1] 485512    252

## -----------------------------------------------------------------------------
#  # get log2 medians
#  meth.tx <- getMeth(ms)
#  unmeth.tx <- getUnmeth(ms)
#  blocks <- getblocks(slength = ncol(ms), bsize = 50)
#  # process data in blocks
#  l2m <- matrix(nrow = 0, ncol = 2)
#  for(i in 1:length(blocks)){
#    b <- blocks[[i]]
#    gmff <- ms[, b]
#    methb <- as.matrix(meth.tx[, b])
#    unmethb <- as.matrix(unmeth.tx[, b])
#    l2meth <- l2unmeth <- c()
#    l2meth <- c(l2meth, apply(methb, 2, function(x){
#      log2(median(as.numeric(x)))
#    }))
#    l2unmeth <- c(l2unmeth, apply(unmethb, 2, function(x){
#      log2(median(as.numeric(x)))
#    }))
#    l2m <- rbind(l2m, matrix(c(l2meth, l2unmeth), ncol = 2))
#    message(i)
#  }
#  ds2 <- as.data.frame(l2m)
#  colnames(ds2) <- c("l2med.meth", "l2med.unmeth")
#  ds2$tissue <- as.factor(ms$sgroup)

## ---- eval = TRUE, fig.width = 4.5, fig.height = 3.1--------------------------
ggplot(ds2, aes(x = l2med.meth, y = l2med.unmeth, color = tissue)) + 
  geom_point(alpha = 0.3, cex = 3) + theme_bw()

## -----------------------------------------------------------------------------
#  lmv <- lgr <- lmd <- lb <- lan <- list()
#  tv <- c("adipose", "liver")
#  # get noob norm data
#  gr <- gr[,colnames(gr) %in% colnames(ms)]
#  gr <- gr[,order(match(colnames(gr), colnames(ms)))]
#  identical(colnames(gr), colnames(ms))
#  gr$sgroup <- ms$sgroup
#  # do study ID adj
#  for(t in tv){
#    lmv[[t]] <- gr[, gr$sgroup == t]
#    msi <- lmv[[t]]
#    madj <- limma::removeBatchEffect(getM(msi), batch = msi$gseid)
#    # store adjusted data in a new se object
#    lgr[[t]] <- GenomicRatioSet(GenomicRanges::granges(msi), M = madj,
#                                annotation = annotation(msi))
#    # append samples metadata
#    lmd[[t]] <- pData(lgr[[t]]) <- pData(lmv[[t]])
#    # append preprocessing metadata
#    metadata(lgr[[t]]) <- list("preprocess" = "noobbeta;removeBatchEffect_gseid")
#    # make betavals list
#    lb[[t]] <- getBeta(lgr[[t]]) # beta values list
#  }

## -----------------------------------------------------------------------------
#  anno <- getAnnotation(gr)
#  chr.xy <-c("chrY", "chrX")
#  cg.xy <- rownames(anno[anno$chr %in% chr.xy,])
#  lbf <- list()
#  for(t in tv){
#    bval <- lb[[t]]
#    lbf[[t]] <- bval[!rownames(bval) %in% cg.xy,]
#  }
#  bv <- lbf[[1]]

## -----------------------------------------------------------------------------
#  lvar <- list()
#  cnf <- c("gseid", "predsex", "predage", "predcell.CD8T",
#           "predcell.CD4T", "predcell.NK", "predcell.Bcell",
#           "predcell.Mono", "predcell.Gran")
#  for(t in tv){
#    for(c in cnf){
#      if(c %in% c("gseid", "predsex")){
#        lvar[[t]][[c]] <- as.factor(pData(lgr[[t]])[,c])
#      } else{
#        lvar[[t]][[c]] <- as.numeric(pData(lgr[[t]])[,c])
#      }
#    }
#  }

## -----------------------------------------------------------------------------
#  bv <- lbf[[1]]
#  blocks <- getblocks(slength = nrow(bv), bsize = 100000)
#  mr <- matrix(nrow = 0, ncol = 18)
#  lan <- list("adipose" = mr, "liver" = mr)
#  t1 <- Sys.time()
#  for(bi in 1:length(blocks)){
#    for(t in tv){
#      datr <- lbf[[t]][blocks[[bi]],]
#      tvar <- lvar[[t]]
#      newchunk <- t(apply(datr, 1, function(x){
#        # do multiple regression and anova
#        x <- as.numeric(x)
#        ld <- lm(x ~ tvar[[1]] + tvar[[2]] + tvar[[3]] + tvar[[4]] +
#                   tvar[[5]] + tvar[[6]] + tvar[[7]] + tvar[[8]] + tvar[[9]])
#        an <- anova(ld)
#        # get results
#        ap <- an[c(1:9),5] # pval
#        av <- round(100*an[c(1:9),2]/sum(an[,2]), 3) # percent var
#        return(as.numeric(c(ap, av)))
#      }))
#      # append new results
#      lan[[t]] <- rbind(lan[[t]], newchunk)
#    }
#    message(bi, "tdif: ", Sys.time() - t1)
#  }
#  # append colnames
#  for(t in tv){colnames(lan[[t]]) <- rep(cnf, 2)}

## -----------------------------------------------------------------------------
#  pfilt <- 1e-3
#  varfilt <- 10
#  lcgkeep <- list() # list of filtered probe sets
#  for(t in tv){
#    pm <- lan[[t]][,c(1:9)]
#    vm <- lan[[t]][,c(10:18)]
#    # parse variable thresholds
#    cm <- as.data.frame(matrix(nrow = nrow(pm), ncol = ncol(pm)))
#    for(c in 1:ncol(pm)){
#      pc <- pm[,c];
#      pc.adj <- as.numeric(p.adjust(pc))
#      pc.filt <- pc.adj < pfilt
#      vc.filt <- vm[,c] >= varfilt
#      cm[,c] <- (pc.filt & vc.filt)
#    }
#    cgkeep <- apply(cm, 1, function(x){return((length(x[x == TRUE]) == 0))})
#    lcgkeep[[t]] <- rownames(pm)[cgkeep]
#  }
#  lgr.filt <- list("adipose" = lgr[[1]][lcgkeep[[1]],],
#                   "liver" = lgr[[2]][lcgkeep[[2]],])

## -----------------------------------------------------------------------------
#  cnv <- c("min", "max", "mean", "median", "sd", "var")
#  bv <- getBeta(lgr.filt[[t]])
#  lbt <- lcg.ss <- list()
#  bsize = 100000
#  for(t in tv){
#    lcg.ss[[t]] <- matrix(nrow = 0, ncol = 6)
#    lbt[[t]] <- bt <- as.matrix(getBeta(lgr.filt[[t]]))
#    blockst <- getblocks(slength = nrow(bt), bsize = bsize)
#    for(bi in 1:length(blockst)){
#      bc <- bt[blockst[[bi]],]
#      newchunk <- t(apply(bc, 1, function(x){
#        newrow <- c(min(x), max(x), mean(x), median(x), sd(x), var(x))
#        return(as.numeric(newrow))
#      }))
#      lcg.ss[[t]] <- rbind(lcg.ss[[t]], newchunk)
#      message(t, ";", bi)
#    }
#    colnames(lcg.ss[[t]]) <- cnv
#  }

## -----------------------------------------------------------------------------
#  qiv = seq(0, 1, 0.01)
#  qwhich = c(100)
#  lmvp.abs <- list()
#  lci <- list()
#  for(t in tv){
#    cgv <- c()
#    sa <- lcg.ss[[t]]
#    sa <- as.data.frame(sa, stringsAsFactors = FALSE)
#    q <- quantile(sa$var, qiv)[qwhich]
#    lmvp.abs[[t]] <- rownames(sa[sa$var > q,])
#  }

## -----------------------------------------------------------------------------
#  # binned quantiles method
#  qiv = seq(0, 1, 0.01) # quantile filter
#  qwhich = c(100)
#  bin.xint <- 0.1
#  binv = seq(0, 1, bin.xint)[1:10] # binned bval mean
#  # iter on ncts
#  lmvp.bin = list()
#  for(t in tv){
#    sa <- as.data.frame(lcg.ss[[t]])
#    cgv <- c()
#    # iterate on betaval bins
#    for(b in binv){
#      bf <- sa[sa$mean >= b & sa$mean < b + bin.xint, ] # get probes in bin
#      q <- qf <- quantile(bf$var, qiv)[qwhich] # do bin filter
#      cgv <- c(cgv, rownames(bf)[bf$var > q]) # append probes list
#    }
#    lmvp.bin[[t]] <- cgv
#  }

## ---- eval = TRUE-------------------------------------------------------------
cgav <- c()
for(t in tv){
  txcg <- unique(c(lmvp.abs[[t]], lmvp.bin[[t]]))
  cgav <- c(cgav, txcg)
}
cgdf <- as.data.frame(table(cgav))
cgdf$type <- ifelse(cgdf[,2] > 1, "non-specific", "tissue-specific")
table(cgdf$type)

## -----------------------------------------------------------------------------
#  cgfilt <- cgdf$type == "non-specific"
#  cgdff <- cgdf[!cgfilt,]
#  ltxcg <- list()
#  for(t in tv){
#    cgtx <- c()
#    cgabs <- lmvp.abs[[t]]
#    cgbin <- lmvp.bin[[t]]
#    st <- as.data.frame(lcg.ss[[t]])
#    # get t tissue specific probes
#    filtbt <- rownames(st) %in% cgdff[,1]
#    st <- st[filtbt,]
#    # get top 1k t tissue specific abs probes
#    filt.bf1 <- rownames(st) %in% cgabs
#    sf1 <- st[filt.bf1,]
#    sf1 <- sf1[rev(order(sf1$var)),]
#    cgtx <- rownames(sf1)[1:1000]
#    # get top 1k t tissue specific bin probes, after filt
#    filt.bf2 <- rownames(st) %in% cgbin &
#                !rownames(st) %in% rownames(sf1)
#    sf2 <- st[filt.bf2,]
#    sf2 <- sf2[rev(order(sf2$var)),]
#    cgtx <- c(cgtx, rownames(sf2)[1:1000])
#    ltxcg[[t]] <- cgtx
#  }

## -----------------------------------------------------------------------------
#  # filtered cg summaries
#  lfcg <- lapply(lcg.ss,
#    function(x){x <- x[rownames(x) %in% unique(unlist(ltxcg)),]})
#  # annotation subset
#  anno <- getAnnotation(gr) # save anno for cga
#  anno <- anno[,c("Name", "UCSC_RefGene_Name", "UCSC_RefGene_Group",
#    "Relation_to_Island")]
#  anno <- anno[rownames(anno) %in% unique(unlist(ltxcg)),]
#  # filtered beta values
#  lcgssf <- list()
#  for(t in tv){
#    bv <- lcg.ss[[t]]
#    bvf <- bv[rownames(bv) %in% ltxcg[[t]],]
#    lcgssf[[t]] <- bvf
#  }

## ---- eval = TRUE, fig.width = 3.8, fig.height = 4.5--------------------------
lvp <- makevp(lfcg, ltxcg)
grid.arrange(lvp[[1]], lvp[[2]], ncol = 1, bottom = "Tissue")

## ---- eval = TRUE-------------------------------------------------------------
tcgss <- matrix(nrow = 0, ncol = 6)
for(t in tv){
  datt <- apply(lcgssf[[t]], 2, function(x){
      round(mean(x), digits = 2)
  })
  mt <- matrix(datt, nrow = 1)
  tcgss <- rbind(tcgss, mt)
}
colnames(tcgss) <- colnames(lcgssf$adipose)
rownames(tcgss) <- tv
knitr::kable(t(tcgss), align = "c")

## ---- eval = TRUE, fig.width = 8.2, fig.height = 4.7--------------------------
cga <- get_cga(anno)
lhmset <- hmsets(ltxcg, lfcg, cga)
lhmplots <- hmplots(lhmset$hm.mean, lhmset$hm.var, lhmset$hm.size)
grid.arrange(lhmplots$hm.mean.plot, lhmplots$hm.var.plot, 
             layout_matrix = matrix(c(1, 1, 1, 1, 1, 2, 2), nrow = 1),
             bottom = "Tissue", left = "Annotation/Region Type")

## ----get_sessioninfo, eval = TRUE---------------------------------------------
sessionInfo()

Try the recountmethylation package in your browser

Any scripts or data that you put into this service are public.

recountmethylation documentation built on Nov. 8, 2020, 4:59 p.m.