vignettes/sims.R

# Create simulated single cell RNA-seq data to test out the various algorithms
# A key criterion is ability to recover batch effects even after censoring.
# Also want to see that the latent variables are correctly estimated.
# Data is generated from small variations on the same model used for inference
# This file a slight modification of https://github.com/willtownes/vamf-paper/blob/master/simulations/sims.R

library(Matrix)

rm_zero_rowcol<-function(Y){
  #remove all rows and columns containing all zeros
  Y<-Y[rowSums(Y>0)>0,] #remove rows with zeros all the way across
  Y<-Y[,colSums(Y>0)>0]
  Y
}

genCluster<-function(n,id,mu=c(0,0),sigma=diag(2)){
  d<-data.frame(MASS::mvrnorm(n,mu,sigma))
  d$id<-id
  return(d)
}

simulate_data<-function(L,G,N,Gsignal=G,U=NULL,y0=10,std=list(a=2,w=2,u=2,v=1/sqrt(L))){
  #L is dimension of latent subspace
  #G is number of rows, N is number of columns
  #Gsignal is number of rows that have signal, rest are noise
  #U is optionally prespecified matrix of latent factors (MxN)
  #default if NULL: all columns will be in same batch and no batch effect
  #y0: overall mean (bias/intercept term)
  #std: list of standard deviation parameters for random effects
  #RETURN: a list of parameters:
  # U is MxN matrix of latent factors (ie column/ cell factors)
  # V is MxG "factor loading" matrix (ie row/ gene factors)
  # a is Nx1 column biases
  # w is Gx1 row biases
  # y0 is overall bias/intercept term
  # Y is "true" latent expression matrix (uncensored)
  # E[Y_ng] = y0+a_n+w_g+u_n'v_g
  stopifnot(Gsignal>0 && Gsignal<=G)
  a<-if(std$a==0) 0 else rnorm(N,0,std$a)
  w<-rnorm(G,0,std$w)
  if(is.null(U)) U<-matrix(rnorm(N*L,0,std$u),nrow=L,ncol=N)
  V<-matrix(rnorm(Gsignal*L,0,std$v),nrow=L,ncol=Gsignal)
  Y<-t(V)%*%U
  if(Gsignal<G){
    Ynoise<-matrix(0,nrow=G-Gsignal,ncol=N)
    Y<-rbind(Y,Ynoise)
  }
  #rows=genes,columns=cells
  Y<-Y+w #take advantage of recycling to add same value to each row
  Y<-t(t(Y)+a) #again taking advantage of recycling
  Y<-y0+Y
  mget(c("U","V","a","w","y0","Y")) #combine into list of parameters
}

censor_dat_mar<-function(dat,censor_rate,s_y=1){
  noise<-rnorm(prod(dim(dat)),0,s_y)
  Z<-rsparsematrix(nrow=nrow(dat),ncol=ncol(dat),1-censor_rate,rand.x=NULL)
  #binary "pattern" matrix
  (dat+noise)*Z
}

expit<-function(x){1/(1+exp(-x))}
#inv_probit<-function(x){pnorm(x)}
#inv_cauchit<-function(x){.5+atan(x)/pi}
#INV_LINK_FNS<-list(logit=expit,probit=inv_probit,cauchit=inv_cauchit)
#QUANTILE_FNS<-list(logit=qlogis,probit=qnorm,cauchit=qcauchy)

rbeta2<-function(N,mu,phi=max(1/mu,1/(1-mu))+0.1){
  #return beta distributed variates with a given mean and precision
  #If phi not provided, shape chosen such that there is maximal dispersion for the given mean while still perserving a unimodal distribution
  # large values of phi mean smaller variance (higher precision)
  # more details on this parameterization
  # https://cran.r-project.org/web/packages/betareg
  #stopifnot(mu>0 && mu<1)
  #guarantee both shape pars >1 ==> unimodal
  rbeta(N, phi*mu, phi*(1-mu))
}

censor_dat_mnar1<-function(ptrue,cens_rates=0.5,mcar=0.3,capture_eff=0.5,sigma_y=2){
  # apply censoring using following formula
  # ptrue is a list of true parameters from simulate_data()
  # cens_rates is overall rate of missing data, including MCAR and MNAR parts
  # mcar is the fraction of missing data that is MCAR
  # ie, data mcar as a fraction of total data = mcar*censor_rate = "(1-k)"
  # capture_eff is slope parameter for probit MNAR model. Typically >0
  # sigma_y is stdev of observed Y values (measurement error noise)
  # MNAR model determined as follows
  # P(missing|eta_ng) = k*inv_probit(dn + capture_eff * eta_ng)
  # k=1-mcar*censor_rate
  # eta_ng = y0+an+wg+un'vg (from simulate_data function)
  # analytically integrating over all gene specific parameters (wg and vg), we get a formula for marginal missingness for cell n. This is set equal to the censor_rate
  # invert this formula to get implied dn values for each cell
  # use average of these as the intercept in the probit model
  # this leads to heterogeneity of the censoring rates, roughly centered around the desired censor_rate
  # see also http://arxiv.org/abs/1603.06045v1
  N<-ncol(ptrue$Y)
  G<-nrow(ptrue$Y)
  Yt<-t(ptrue$Y)
  b1<-capture_eff
  Qn<-1-cens_rates #overall detection rates for each cell
  kn<-1-mcar*cens_rates #1-k = overall MCAR rates
  stopifnot(all(kn>Qn))
  #dn<-sapply(1:N,function(n){det2thresh(n,ptrue,Q,k,b1)})
  dng<-t(qnorm(Qn/kn)-b1*Yt)
  dn<-colMeans(dng)
  probs<-t(kn*pnorm(dn + b1*Yt)) #implicit vectorization
  U<-matrix(runif(N*G),nrow=G)
  Z<-Matrix(U<probs)
  #hist(colMeans(Z))
  #plot(Qn,colMeans(Z))
  noise<-matrix(rnorm(N*G,0,sigma_y),nrow=G)
  res<-(ptrue$Y+noise)*Z
}

noise_only<-function(N,G,Gsignal,y0=5,std=list(a=0,w=2,v=1/sqrt(2)),cens=c(.56,.95),mcar=0.1,capture_eff=1,sigma_y=1){
  #creates a dataset based on the noise-only model (two batches with variable detection rates)
  stopifnot(N%%2==0) #even number of cells required
  dat<-matrix(rnorm(N*2,0,1/sqrt(2)),N)
  colnames(dat)<-paste0("dim",1:2)
  ptrue<-simulate_data(2,G,N,Gsignal,U=t(dat),y0=y0,std=std)
  batch1<-1:floor(N/2)
  batch2<-(floor(N/2)+1):N
  cens_rates<-rep(NA,N)
  cens_rates[batch1]<-rbeta2(length(batch1),cens[1],phi=200)
  cens_rates[batch2]<-rbeta2(length(batch2),cens[2],phi=200)
  ref<-data.frame(dat,cens_rates=cens_rates)
  ref$batch<-factor(rep(c("high_detection","low_detection"),each=N/2))
  Y_obs<-censor_dat_mnar1(ptrue,cens_rates,mcar=mcar,capture_eff=capture_eff,sigma_y=sigma_y)
  list(Y_obs=rm_zero_rowcol(Y_obs),ref=ref)
}

latent_clusters<-function(N,G,Gsignal,y0=5,std=list(a=0,w=2,v=1/sqrt(2)),cens=c(.56,.95),mcar=0.1,capture_eff=1,sigma_y=1){
  #generate samples from the four clusters, two batches scenario
  stopifnot(N%%4==0)
  #sigma<-matrix(c(1.5,1.3,1.3,1.5),nrow=2)
  sigma<-matrix(c(1,0,0,1),nrow=2)
  mu<-list(c(-2.5,2.5),c(2.5,2.5),c(2.5,-2.5),c(-2.5,-2.5))
  dat<-do.call("rbind",lapply(1:4,function(x){genCluster(N/4,x,mu[[x]],sigma)}))
  colnames(dat)[1:2]<-paste0("dim",1:2)
  ref<-data.frame(dat[,1:2],id=as.factor(as.character(dat$id)))
  ptrue<-simulate_data(2,G,N,Gsignal,U=t(as.matrix(ref[,1:2])),y0=y0,std=std)
  #randomly assign half of cells to a low-censoring regime and other half to high-censoring
  batch1<-seq(from=1,to=N,by=2)
  batch2<-seq(from=2,to=N,by=2)
  cens_rates<-rep(NA,N)
  cens_rates[batch1]<-rbeta2(length(batch1),cens[1],phi=200)
  cens_rates[batch2]<-rbeta2(length(batch2),cens[2],phi=200)
  ref$batch<-factor(rep(c("high_detection","low_detection"),N/2))
  ref$cens_rates<-cens_rates
  Y_obs<-censor_dat_mnar1(ptrue,cens_rates,mcar=mcar,capture_eff=capture_eff,sigma_y=sigma_y)
  Y_obs<-rm_zero_rowcol(Y_obs)
  list(Y_obs=Y_obs,ref=ref)
}
willtownes/vamf documentation built on May 8, 2019, 9:31 a.m.