R/distribution.R
In RnaSeqSampleSize: RnaSeqSampleSize

Documented in est_count_dispersion est_power_distribution

##' est_count_dispersion
##'
##' A function to estitamete the gene read count and dispersion distribution of RNA-seq data.
##'
##' A function to estitamete the gene read count and dispersion distribution of RNA-seq data.
##'
##' @param counts numeric matrix of read counts.
##' @param group vector or factor giving the experimental group/condition for each sample/library.
##' @param subSampleNum number of samples used to estitamete distribution.
##' @param minAveCount Only genes with avarage read counts above this value are used in the estimation of distribution.
##' @param convertId logical, whether to convert th gene Id into entrez gene Id. If set as True, then dataset and filters parameter should also be set.
##' @inheritParams convertIdOneToOne
##' @return A DEGlist from edgeR package.
##' @importFrom edgeR DGEList calcNormFactors estimateCommonDisp estimateTagwiseDisp
##' @export
##' @examples counts<-matrix(sample(1:1000,6000,replace=TRUE),ncol=6)
##' est_count_dispersion(counts=counts,group=rep(0,6))
est_count_dispersion<-function(counts,group=rep(1,NCOL(counts)),subSampleNum=20,minAveCount=1,convertId=FALSE,dataset="hsapiens_gene_ensembl",filters="hgnc_symbol") {
	subSample<-if (ncol(counts)<=subSampleNum) 1:ncol(counts) else sample(1:ncol(counts),subSampleNum)
	countsSub<-counts[,subSample]
	groupSub<-group[subSample]
	countsSub<-countsSub[which(rowMeans(counts)>=minAveCount),]

	y<-DGEList(countsSub, group=groupSub )
	y <- calcNormFactors(y)
	y <- estimateCommonDisp(y, verbose=TRUE)
	y <- estimateTagwiseDisp(y)
	y$pseudo.counts.mean<-rowMeans(y$pseudo.counts)
	if (convertId) {
		y$converted.entrezgene<-convertIdOneToOne(names(y$pseudo.counts.mean),dataset=dataset,filters=filters,verbose=FALSE)
	}
	y$pseudo.counts<-NULL
	y$counts<-NULL
	return(y)
}

##' est_power_distribution
##'
##' A function to estitamete the power for differential expression analysis of RNA-seq data.
##'
##' A function to estitamete the power for differential expression analysis of RNA-seq data.
##'
##' @param n Numer of samples.
##' @param repNumber Number of genes used in estimation of read counts and dispersion distribution.
##' @param dispersionDigits Digits of dispersion.
##' @param distributionObject A DGEList object generated by est_count_dispersion function. RnaSeqSampleSizeData package contains 13 datasets from TCGA, you can set distributionObject as any one of "TCGA_BLCA","TCGA_BRCA","TCGA_CESC","TCGA_COAD","TCGA_HNSC","TCGA_KIRC","TCGA_LGG","TCGA_LUAD","TCGA_LUSC","TCGA_PRAD","TCGA_READ","TCGA_THCA","TCGA_UCEC" to use them.
##' @param libSize numeric vector giving the total count for each sample. If not specified, the libsize in distributionObject will be used.
##' @param minAveCount Minimal average read count for each gene. Genes with smaller read counts will not be used.
##' @param maxAveCount Maximal average read count for each gene. Genes with larger read counts will be taken as maxAveCount.
##' @param selectedGenes Optianal. Name of interesed genes. Only the read counts and dispersion distribution for these genes will be used in power estimation.
##' @param pathway Optianal. ID of interested KEGG pathway. Only the read counts and dispersion distribution for genes in this pathway will be used in power estimation.
##' @param species Optianal. Species of interested KEGG pathway.
##' @param countFilterInRawDistribution Logical. If the count filter will be applied on raw count distribution. If not, count filter will be applied on libSize scaled count distribution.
##' @param selectedGeneFilterByCount Logical. If the count filter will be applied to selected genes when selectedGenes parameter was used.
##' @param removedGene0Power Logical. When selectedGenes or pathway are used, some genes may have read count less than minAveCount and will be removed by count filter. This parameter indicates if they will be used as 0 power in power estimation. If not, they will not be used in power estimation.
##' @return Average power or a list including count ,distribution and power for each gene.
##' @inheritParams est_power
##' @inheritParams sample_size
##' @import RnaSeqSampleSizeData
##' @export
##' @examples
##' #Please note here the parameter repNumber was very small (2) to make the example code faster.
##' #We suggest repNumber should be at least set as 100 in real analysis.
##' est_power_distribution(n=65,f=0.01,rho=2,distributionObject="TCGA_READ",repNumber=2)
##' #Power estimation based on some interested genes. We use storeProcess=TRUE to return the details for all selected genes.
##' selectedGenes<-names(TCGA_READ$pseudo.counts.mean)[c(1,3,5,7,9,12:30)]
##' powerDistribution<-est_power_distribution(n=65,f=0.01,rho=2,distributionObject="TCGA_READ",selectedGenes=selectedGenes,minAveCount=1,storeProcess=TRUE,repNumber=2)
##' str(powerDistribution)
##' mean(powerDistribution$power)
##' #Power estimation based on genes in interested pathway
##' powerDistribution<-est_power_distribution(n=65,f=0.01,rho=2,distributionObject="TCGA_READ",pathway="00010",minAveCount=1,storeProcess=TRUE,repNumber=2)
##' mean(powerDistribution$power)
est_power_distribution<-function(n,f=0.1,m=10000,m1=100, w=1, rho=2,repNumber=100,dispersionDigits=1,distributionObject,libSize,minAveCount=5,maxAveCount=2000,selectedGenes,pathway,species="hsa",storeProcess=FALSE,countFilterInRawDistribution=TRUE,selectedGeneFilterByCount=FALSE,removedGene0Power=TRUE) {
	temp<-selectDistribution(distributionObject=distributionObject,libSize=libSize,repNumber=repNumber,dispersionDigits=dispersionDigits,minAveCount=minAveCount,maxAveCount=maxAveCount,selectedGenes=selectedGenes,pathway=pathway,species=species,countFilterInRawDistribution=countFilterInRawDistribution,
			selectedGeneFilterByCount=selectedGeneFilterByCount,removedGene0Power=removedGene0Power)
	dispersionDistribution<-temp$selectedDispersion
	countDistribution<-temp$selectedCount

	#determin alpha from most consertive power
	phi0<-temp$maxDispersionDistribution
	lambda0<-max(min(countDistribution),1)
	leastPower<-est_power(n=n, w=w, rho=rho,lambda0=lambda0,phi0=phi0)
	alpha<-leastPower*m1*f/((m-m1)*(1-f))

	#power distribution
	powerDistribution<-est_power_distribution_root(n=n,alpha=alpha,w=w, rho=rho,dispersionDistribution=dispersionDistribution,countDistribution=countDistribution)

	#Add removed genes as 0 power
	if (removedGene0Power) {
		powerDistribution<-c(powerDistribution,rep(0,(temp$inputGeneNumber-length(powerDistribution))))
	}

	if (storeProcess) {
		return(list(power=powerDistribution,count=countDistribution,dispersion=dispersionDistribution))
	} else {
		return(mean(powerDistribution))
	}
}


est_power_distribution_root<-function(n,alpha=0.05, w=1,k=1, rho=2, error=0.001,dispersionDistribution,countDistribution) {
	#power distribution
	powerDistribution<-NULL
	for (dispersion in unique(dispersionDistribution)) {
		model<-generateModel(lambda0=c(1,5,10),n=n, w=w, rho=rho, phi0=dispersion,alpha=alpha, error=error,showMessage=FALSE)
		countDistributionTable<-table(countDistribution[which(dispersionDistribution==dispersion)])
		powerTable<-NULL
		for (count in as.integer(names(countDistributionTable))) {
			powerTable<-c(powerTable,modelPower(model,n=n,w=w,lambda0=count,rho=rho, phi0=dispersion))
		}
		powerDistribution<-c(powerDistribution,rep(powerTable,countDistributionTable))
	}
	return(powerDistribution)
}

selecteGeneByName<-function(selectedGenes,distributionObject) {
	result<-which(names(distributionObject$pseudo.counts.mean) %in% selectedGenes)
	if (length(result)>0) {
		temp<-setdiff(selectedGenes,names(distributionObject$pseudo.counts.mean)[result])
		if (length(temp)>0) {
			warning(paste0(length(temp)," selectedGenes were not found in distributionObject, discard them for further analysis\n"))
		}
	} else {
		stop("selectedGenes have no overlap with gene IDs in distributionObject. Please check if the ID types are same\n")
	}
	return(result)
}

selecteGeneByPathway<-function(distributionObject,species="hsa",pathway="00010") {
	selectedGenes<-keggLink(species,paste0(species,pathway))
	selectedGenes<-gsub(paste0(species,":"),"",selectedGenes)
	if ("converted.entrezgene" %in% names(distributionObject)) {
		result<-which(distributionObject$converted.entrezgene %in% selectedGenes)
		if (length(result)==0) {
			stop(paste0("Can't find any gene of selected pathway in distributionObject\n"))
		}
	} else {
		stop(paste0("There is no converted.entrezgene in distributionObject, please use est_dispersion to generate it again with convertId set as True\n"))
	}
	return(result)
}

selectDistribution<-function(distributionObject,libSize,repNumber,dispersionDigits,minAveCount,maxAveCount,selectedGenes,pathway,species,countFilterInRawDistribution=TRUE,selectedGeneFilterByCount=FALSE,removedGene0Power=TRUE) {
	distributionInPackage<-data(package="RnaSeqSampleSizeData")$results[,"Item"]

	if (is.character(distributionObject)) { #distributionInPackage
		if (distributionObject %in% distributionInPackage) {
			data(list=distributionObject)
			distributionObject<-get(distributionObject)
		} else {
			stop(paste0("The distributionObject name ",distributionObject," is not in the dataset of RnaSeqSamplSize package"))
		}
	} else {
		if (!is(distributionObject, "DGEList")) {
			stop(paste0("The distributionObject is not a DGEList. Please use est_count_dispersion function to estimate distribution"))
		}
	}

	#if minAveCount < distributionObject min count, make minAveCount as distributionObject min count
	minAveCount<-max(1,minAveCount,min(round(distributionObject$pseudo.counts.mean)))

	#Process libSize and Filter genes by count
	if (missing(libSize)) {
		libSize<-distributionObject$pseudo.lib.size
	}
	if (countFilterInRawDistribution) {
		genesLargerThanMinCount<-which(distributionObject$pseudo.counts.mean>=minAveCount)
		distributionObject$pseudo.counts.mean<-round(distributionObject$pseudo.counts.mean*(libSize/distributionObject$pseudo.lib.size))
	} else {
		distributionObject$pseudo.counts.mean<-round(distributionObject$pseudo.counts.mean*(libSize/distributionObject$pseudo.lib.size))
		genesLargerThanMinCount<-which(distributionObject$pseudo.counts.mean>=minAveCount)
	}
	distributionObject$pseudo.counts.mean[distributionObject$pseudo.counts.mean>=maxAveCount]<-maxAveCount

	#Dispersion distribution
	dispersionDistribution<-round(distributionObject$tagwise.dispersion,dispersionDigits)
	dispersionDistribution[dispersionDistribution==0]<-10^-dispersionDigits

	#selectedGenes or not
	if (!missing(pathway) | !missing(selectedGenes)) {
		if (!missing(pathway)) {
			selectedGenes<-selecteGeneByPathway(distributionObject=distributionObject,species=species,pathway=pathway)
		} else { #!missing(selectedGenes)
			selectedGenes<-selecteGeneByName(selectedGenes,distributionObject)
		}
		if (selectedGeneFilterByCount) { #filter selected genes
			temp<-intersect(selectedGenes,genesLargerThanMinCount)
		} else { #Not filter selected genes, but need keep genes with at least 1 count
			minAveCount<-1
			temp<-selectedGenes[which(distributionObject$pseudo.counts.mean[selectedGenes]>=minAveCount)]
		}
		if (length(temp)==0) {
			stop(paste0("No selectedGenes have average read count larger than selected minAveCount:",minAveCount,"\n"))
		} else if (length(setdiff(selectedGenes,temp)>0)) {
			if (removedGene0Power) {
				warning(paste0(length(setdiff(selectedGenes,temp))," selectedGenes have average read count less than selected minAveCount:",minAveCount,", they will have 0 power in power or sample size estimation\n"))
			} else {
				warning(paste0(length(setdiff(selectedGenes,temp))," selectedGenes have average read count less than selected minAveCount:",minAveCount,", discard them for further analysis\n"))
			}
		}
		repNumber<-length(selectedGenes)
		maxDispersionDistribution<-max(dispersionDistribution[selectedGenes])
		selectedGenes<-temp
	} else {
		selectedGenes<-sample(genesLargerThanMinCount,repNumber)
		maxDispersionDistribution<-max(dispersionDistribution[selectedGenes])
	}

	#return selected genes' distribution
	countDistribution<-distributionObject$pseudo.counts.mean[selectedGenes]
#	countDistribution[countDistribution==0]<-1
	dispersionDistribution<-dispersionDistribution[selectedGenes]
	return(list(selectedCount=countDistribution,selectedDispersion=dispersionDistribution,inputGeneNumber=repNumber,maxDispersionDistribution=maxDispersionDistribution))
}