inst/doc/camthc.R

## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ---- eval=FALSE--------------------------------------------------------------
#  rCAM <- CAM(data, K = 2:5, thres.low = 0.30, thres.high = 0.95)

## ---- echo=TRUE---------------------------------------------------------------
library(CAMTHC)

data(ratMix3) 
#ratMix3$X: X matrix containing mixture expression profiles to be analyzed
#ratMix3$A: ground truth A matrix containing proportions
#ratMix3$S: ground truth S matrix containing subpopulation-specific expression profiles

data <- ratMix3$X
#10000 genes * 21 tissues
#meet the input data requirements

## ---- echo=TRUE, message=FALSE------------------------------------------------
set.seed(111) # set seed for internal clustering to generate reproducible results
rCAM <- CAM(data, K = 2:5, thres.low = 0.30, thres.high = 0.95)
#CAM return three sub results: 
#rCAM@PrepResult contains details corresponding to data preprocessing.
#rCAM@MGResult contains details corresponding to marker gene clusters detection.
#rCAM@ASestResult contains details corresponding to A and S matrix estimation.

## ---- fig.height=6, fig.width=6-----------------------------------------------
plot(MDL(rCAM), data.term = TRUE)

## ---- echo=TRUE---------------------------------------------------------------
Aest <- Amat(rCAM, 3)
Sest <- Smat(rCAM, 3)

## ---- echo=TRUE---------------------------------------------------------------
MGlist <- MGsforA(rCAM, K = 3) #for three subpopulations

## ---- echo=TRUE---------------------------------------------------------------
MGstat <- MGstatistic(data, Aest, boot.alpha = 0.05, nboot = 1000)
MGlist.FC <- lapply(seq_len(3), function(x) 
    rownames(MGstat)[MGstat$idx == x & MGstat$OVE.FC > 10])
MGlist.FCboot <- lapply(seq_len(3), function(x) 
    rownames(MGstat)[MGstat$idx == x & MGstat$OVE.FC.alpha > 10])

## ---- echo=TRUE---------------------------------------------------------------
MGlist.re <- reselectMG(data, MGlist, fc.thres='q0.2', err.thres='q1.2')
#q0.2: 0.2-quantile
#q1.2: 1-quantile (maximum) times 1.2

## ---- echo=TRUE---------------------------------------------------------------
rre <- redoASest(data, MGlist.re, maxIter = 2, methy = FALSE)
#rre$Aest: re-estimated A matrix
#rre$Sest: re-estimated S matrix

## ---- fig.height=6, fig.width=6-----------------------------------------------
layout(matrix(c(1,2,3,4), 2, 2, byrow = TRUE))
simplexplot(data, Aest, MGlist, main=c('Initially detected markers'))
simplexplot(data, Aest, MGlist.FC, main=c('fc > 10'))
simplexplot(data, Aest, MGlist.FCboot, 
            main=c(expression(bold(paste('fc(bootstrap,',alpha,'=0.05) > 10')))))
simplexplot(data, Aest, MGlist.re, main=c("fc >= 'q0.2', error <= 'q1.2'"))

## ---- fig.height=6, fig.width=6-----------------------------------------------
simplexplot(data, Aest, MGlist.FCboot, 
            data.extra=rbind(t(ratMix3$A),t(Aest)),
            corner.order = c(2,1,3), col = "blue", 
            mg.col = c("red","orange","green"),
            ex.col = "black", ex.pch = c(19,19,19,17,17,17))
legend("bottomright", cex=1.2, inset=.01, c("Ground Truth","CAM Estimation"),
       pch=c(19,17), col="black")

## ---- eval=FALSE--------------------------------------------------------------
#  library(rgl)
#  Xp <- data %*% t(PCAmat(rCAM))
#  plot3d(Xp[, 1:3], col='gray', size=3,
#         xlim=range(Xp[,1]), ylim=range(Xp[,2:3]), zlim=range(Xp[,2:3]))
#  abclines3d(0,0,0, a=diag(3), col="black")
#  for(i in seq_along(MGlist)){
#      points3d(Xp[MGlist[[i]], 1:3], col= rainbow(3)[i], size = 8)
#  }

## ---- eval=FALSE--------------------------------------------------------------
#  library(rgl)
#  clear3d()
#  Xproj <- XWProj(data, PCAmat(rCAM))
#  Xp <- Xproj[,-1]
#  plot3d(Xp[, 1:3], col='gray', size=3,
#         xlim=range(Xp[,1:3]), ylim=range(Xp[,1:3]), zlim=range(Xp[,1:3]))
#  abclines3d(0,0,0, a=diag(3), col="black")
#  for(i in seq_along(MGlist)){
#      points3d(Xp[MGlist[[i]], 1:3], col= rainbow(3)[i], size = 8)
#  }

## ---- echo=TRUE---------------------------------------------------------------
cor(Aest, ratMix3$A)
cor(Sest, ratMix3$S)

## ---- echo=TRUE---------------------------------------------------------------
unlist(lapply(seq_len(3), function(k) {
    k.match <- which.max(cor(Aest[,k], ratMix3$A));
    mgk <- MGlist.FCboot[[k]];
    corr <- cor(Sest[mgk, k], ratMix3$S[mgk, k.match]);
    names(corr) <- colnames(ratMix3$A)[k.match];
    corr}))


## ---- echo=TRUE---------------------------------------------------------------
set.seed(111)

#Data preprocession
rPrep <- CAMPrep(data, thres.low = 0.30, thres.high = 0.95)

#Marker gene cluster detection with a fixed K
rMGC <- CAMMGCluster(3, rPrep)

#A and S matrix estimation
rASest <- CAMASest(rMGC, rPrep, data)

#Obtain A and S matrix
Aest <- Amat(rASest)
Sest <- Smat(rASest)

#obtain marker gene list detected by CAM and used for A estimation
MGlist <- MGsforA(PrepResult = rPrep, MGResult = rMGC)

#obtain a full list of marker genes 
MGstat <- MGstatistic(data, Aest, boot.alpha = 0.05, nboot = 1000)
MGlist.FC <- lapply(seq_len(3), function(x) 
    rownames(MGstat)[MGstat$idx == x & MGstat$OVE.FC > 10])
MGlist.FCboot <- lapply(seq_len(3), function(x) 
    rownames(MGstat)[MGstat$idx == x & MGstat$OVE.FC.alpha > 10])
MGlist.re <- reselectMG(data, MGlist, fc.thres='q0.2', err.thres='q1.2')

## ---- echo=TRUE, message=FALSE------------------------------------------------
#clustering
library(apcluster)
apres <- apclusterK(negDistMat(r=2), t(data),  K = 10) 
#You can also use apcluster(), but need to make sure the number of clusters is large than potential subpopulation number.

data.clusterMean <- lapply(slot(apres,"clusters"), 
                           function(x) rowMeans(data[, x, drop = FALSE]))
data.clusterMean <- do.call(cbind, data.clusterMean)

set.seed(111)
rCAM <- CAM(data.clusterMean, K = 2:5, thres.low = 0.30, thres.high = 0.95)
# or rPrep <- CAMPrep(data.clusterMean, thres.low = 0.30, thres.high = 0.95)

## ---- echo=TRUE---------------------------------------------------------------
Sest <- Smat(rCAM,3)
MGlist <- MGsforA(rCAM, K = 3)
Aest <- AfromMarkers(data, MGlist)
#(Optional) alternative re-estimation of A and S matrix
rre <- redoASest(data, MGlist, maxIter = 10) 

## ---- echo=TRUE, message=FALSE------------------------------------------------
#download data and phenotypes
library(GEOquery)
gsm<- getGEO('GSE11058')
pheno <- pData(phenoData(gsm[[1]]))$characteristics_ch1
mat <- exprs(gsm[[1]])
mat <- mat[-grep("^AFFX", rownames(mat)),]
mat.aggre <- sapply(unique(pheno), function(x) rowMeans(mat[,pheno == x]))
data <- mat.aggre[,5:8]

#running CAM
set.seed(111)
rCAM <- CAM(data, K = 4, thres.low = 0.70, thres.high = 0.95)
Aest <- Amat(rCAM, 4)
Aest

#Use ground truth A to validate CAM-estimated A matrix
Atrue <- matrix(c(2.50, 0.50, 0.10, 0.02,
              1.25, 3.17, 4.95, 3.33,
              2.50, 4.75, 1.65, 3.33,
              3.75, 1.58, 3.30, 3.33), 4,4,
              dimnames = list(c("MixA", "MixB", "MixC","MixD"),
                               c("Jurkat", "IM-9", "Raji", "THP-1")))
Atrue <- Atrue / rowSums(Atrue)
Atrue
cor(Aest, Atrue)

## ---- echo=TRUE, message=FALSE, eval=FALSE------------------------------------
#  #download data
#  library(GEOquery)
#  gsm <- getGEO('GSE41826')
#  mixtureId <- unlist(lapply(paste0('Mix',seq_len(9)),
#      function(x) gsm[[1]]$geo_accession[gsm[[1]]$title==x]))
#  data <- gsm[[1]][,mixtureId]
#  
#  #Remove CpG sites in sex chromosomes if tissues are from both males and females
#  #Not necessary in this example as mixtures are from the same subject
#  #gpl<- getGEO('GPL13534')
#  #annot<-dataTable(gpl)@table[,c('Name','CHR')]
#  #rownames(annot) <- annot$Name
#  #annot <- annot[rownames(data),]
#  #data <- data[annot$CHR != 'X' & annot$CHR != 'Y',]
#  
#  #downsample CpG sites
#  featureId <- sample(seq_len(nrow(data)), 50000)
#  
#  #Running CAM
#  #When input data has too many probes, lof has to be disabled due to space limit.
#  #When cluster.num is large, quick.select can be enable to increase the speed
#  rCAM <- CAM(data[featureId,], K = 2, thres.low = 0.10, thres.high = 0.60,
#              cluster.num = 100, MG.num.thres = 10,
#              lof.thres = 0, quick.select = 20)
#  MGlist <- MGsforA(rCAM, K = 2)
#  
#  #Identify markers from all CpG sites
#  MGlist.re <- reselectMG(data, MGlist, fc.thres='q0.2', err.thres='q1.2')
#  
#  #re-esitmation with methylation constraint
#  rre <- redoASest(data, MGlist.re, maxIter = 20, methy = TRUE)
#  
#  #Validation using ground truth A matrix
#  Atrue <- cbind(seq(0.1, 0.9, 0.1), seq(0.9, 0.1, -0.1))
#  cor(rre$Aest, Atrue)

## ---- echo=TRUE, message=FALSE, eval=FALSE------------------------------------
#  rCAM <- CAM(1 - exprs(data[featureId,]),
#              K = 2, thres.low = 0.10, thres.high = 0.60,
#              cluster.num = 100, MG.num.thres = 10,
#              lof.thres = 0, quick.select = 20)

## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  Aest <- AfromMarkers(data, MGlist)
#  #MGlist is a list of vectors, each of which contains known markers for one subpopulation

## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  rre <- redoASest(data, MGlist, maxIter = 10)
#  #MGlist is a list of vectors, each of which contains known markers for one subpopulation
#  #maxIter = 0: No re-estimation by ALS
#  #rre$Aest: estimated A matrix
#  #rre$Sest: estimated S matrix

## ---- echo=TRUE---------------------------------------------------------------
data <- ratMix3$X
S <- ratMix3$S #known S matrix

#Identify markers
pMGstat <- MGstatistic(S, c("Liver","Brain","Lung"))
pMGlist.FC <- lapply(c("Liver","Brain","Lung"), function(x) 
    rownames(pMGstat)[pMGstat$idx == x & pMGstat$OVE.FC > 10])

#Estimate A matrix
Aest <- AfromMarkers(data, pMGlist.FC)

#(Optional) alternative re-estimation
rre <- redoASest(data, pMGlist.FC, maxIter = 10) 

## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  Aest <- redoASest(data, pMGlist.FC, S=S, maxIter = 0)$Aest

## ---- echo=TRUE---------------------------------------------------------------
data <- ratMix3$X
A <- ratMix3$A #known A matrix

#Estimate S matrix
Sest <- t(NMF::.fcnnls(A, t(data))$coef)

#Identify markers
pMGstat <- MGstatistic(data, A)
pMGlist.FC <- lapply(unique(pMGstat$idx), function(x) 
    rownames(pMGstat)[pMGstat$idx == x & pMGstat$OVE.FC > 10])

#(Optional) alternative re-estimation
rre <- redoASest(data, pMGlist.FC, A=A,  maxIter = 10) 

## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  Aest <- AfromMarkers(data, MGlist)
#  #MGlist is a list of vectors, each of which contains known markers and/or CAM-detected markers for one subpopulation

## ---- echo=TRUE, eval=FALSE---------------------------------------------------
#  rre <- redoASest(data, MGlist, maxIter = 10)
#  #MGlist is a list of vectors, each of which contains known markers and/or CAM-detected markers for one subpopulation
#  #maxIter = 0: No re-estimation by ALS
#  #rre$Aest: estimated A matrix
#  #rre$Sest: estimated S matrix

Try the CAMTHC package in your browser

Any scripts or data that you put into this service are public.

CAMTHC documentation built on April 29, 2020, 2:29 a.m.