R/controlPlotsBiSeq.R

Defines functions rnb.plot.num.sites.covg rnb.plot.coverage.thresholds rnb.plot.biseq.coverage.violin rnb.plot.biseq.coverage.hist rnb.plot.biseq.coverage

Documented in rnb.plot.biseq.coverage rnb.plot.biseq.coverage.hist rnb.plot.biseq.coverage.violin rnb.plot.coverage.thresholds rnb.plot.num.sites.covg

########################################################################################################################
## controlPlotsBiSeq.R
## created: 2012-04-04
## creator: Pavlo Lutsik
## ---------------------------------------------------------------------------------------------------------------------
## Quality control plots for the reduced-representation and the whole-genome bisulfite sequencing experiments.
########################################################################################################################

## F U N C T I O N S ###################################################################################################

#' rnb.plot.biseq.coverage
#'
#' Plots the sequencing coverage of the RnBiseqSet object across the genomic coordinate 
#'   
#' @param rnbs.set			RnBiseqSet object
#' @param type				\code{character} singleton. If \code{site} the coverage
#' 							information is plotted for each methylation site. Otherwise
#' 							should be one of the regions returned by \code{rnb.region.types}
#' @param sample			unique sample identifier. In case \code{rnb.getOption("identifiers.column")} 
#' 							is not \code{NULL}, \code{sample} should attain values from the corresponding column, 
#' 							or \code{colnames(meth(rnb.set))} otherwise
#' @param writeToFile 		flag specifying whether the output should be saved as \code{\linkS4class{ReportPlot}}
#' @param numeric.names if \code{TRUE} and \code{writeToFile} is \code{TRUE}substitute the plot options in the plot file name with digits
#' @param covg.lists		if available, the output of \code{\link{rnb.execute.quality}}
#' @param ... 				other arguments to \code{\link{createReportPlot}}
#' 
#' @return plot as an object of type \code{\linkS4class{ReportPlot}} if \code{writeToFile} is \code{TRUE} and of class 
#' 			\code{\link{ggplot}} otherwise.
#' @author Pavlo Lutsik
#' @export
rnb.plot.biseq.coverage<-function(
		rnbs.set, 
		sample, 
		type = "sites", 
		writeToFile = FALSE, 
		numeric.names = FALSE, 
		covg.lists=NULL, 
		...){
	
	if(!inherits(rnbs.set, "RnBiseqSet"))
		stop("Invalid value for rnbs.set: object of class RnBiseqSet expected")
	
	if(!(is.character(type) && type %in% c("sites", rnb.region.types(assembly(rnbs.set)))))
		stop("Invalid value for type: \"sites\" or one of the rnb.region.types() expected")
	
	if (!(is.character(sample) && sample %in% samples(rnbs.set))) {
		stop("Invalid value for sample: character expected")
	}
	assembly <- assembly(rnbs.set)	
	covg.rnbs<-covg(rnbs.set, type)
	covg.rnbs[is.na(covg.rnbs)]<-0
	
		
	if(type=="sites"){
		type<-"CpG"
		map<-sites(rnbs.set)
	}else{
		map<-regions(rnbs.set, type)
	}
	
	sample.id<-match(sample, samples(rnbs.set))
	
	if(max(covg.rnbs[,sample.id])==0){
		
		if(writeToFile) plot.file<-createReportPlot(paste("CoveragePlot", ifelse(numeric.names, sample.id, sample), sep="_"), ...)
			print(rnb.message.plot("All CpG have zero coverage in this sample"))
		if(writeToFile) {
			off(plot.file)
			return(plot.file)
		}else{
			return(invisible(NULL))
		}
	}
	
	if(is.null(covg.lists)){
				
		covg.lists<-lapply(names(rnb.get.chromosomes(assembly=assembly)), function(chr) {
					chr.id<-match(chr, names(rnb.get.chromosomes(assembly=assembly)))
					covg.weight<-rep(0, rnb.annotation.size(assembly=assembly)[chr.id])
					covg.weight[map[map[,2]==chr.id,3]]<-covg.rnbs[map[,2]==chr.id,sample.id]
					covg.weight
				})
	}else if(is.list(covg.lists)){
		
		covg.lists<-lapply(1:length(covg.lists), function(chr.id){
					covg.weight<-covg.lists[[chr.id]]
					covg.weight[map[map[,2]==chr.id,3]]<-covg.rnbs[map[,2]==chr.id,sample.id]
					covg.weight
				})
	}
	names(covg.lists)<-names(rnb.get.chromosomes(assembly=assembly))
	
	cv.templ<-coverage(rnb.get.annotation(type,assembly=assembly), weight=covg.lists)
	
	if(writeToFile) plot.file<-createReportPlot(paste("CoveragePlot", ifelse(numeric.names, sample.id, sample), sep="_"), ...)
		
	names(cv.templ)<-NULL
	plot<-ggbio::autoplot(cv.templ, type = "viewMaxs", stat = "slice", lower = 8, ylab="", ylim=2000, labeller=function(variable,value){names(rnb.get.chromosomes(assembly=assembly))[value]})+
					scale_y_continuous(limits=c(0,2000))
		
	if(writeToFile) {
		print(plot)
		off(plot.file)
		return(plot.file)
	}else{
		return(plot)
	}
}

########################################################################################################################

#' rnb.plot.biseq.coverage.hist
#'
#' Plots the histograms of the coverage 
#'   
#' @param rnbs.set			RnBiseqSet object
#' @param sample			unique sample identifier. In case \code{rnb.getOption("identifiers.column")} 
#' 							is not \code{NULL}, \code{sample} should attain values from the corresponding column, 
#' 							or \code{colnames(meth(rnb.set))} otherwise
#' @param type				\code{character} singleton. If \code{site} the coverage
#' 							information is plotted for each methylation site. Otherwise
#' 							should be one of the regions returned by \code{rnb.region.types}
#' @param writeToFile 		a flag specifying whether the output should be saved as \code{\linkS4class{ReportPlot}}
#' @param numeric.names if \code{TRUE} and \code{writeToFile} is \code{TRUE}substitute the plot options in the plot file name with digits
#' @param covg.max.percentile the maximum percentile of the coverage to be plotted
#' @param ... 				other arguments to \code{\link{createReportPlot}}
#' 
#' @return plot as an object of type \code{\linkS4class{ReportPlot}} if \code{writeToFile} is \code{TRUE} and of class 
#' 			\code{\link{ggplot}} otherwise.
#'
#' @author Pavlo Lutsik
#' @export
rnb.plot.biseq.coverage.hist<-function(
		rnbs.set, 
		sample, 
		type = "sites", 
		writeToFile = FALSE, 
		numeric.names = FALSE, 
		covg.max.percentile=1.0, 
		...){
	
	if(!inherits(rnbs.set, "RnBiseqSet"))
		stop("Invalid value for rnbs.set: object of class RnBiseqSet expected")
	
	if(!(is.character(type) && type %in% c("sites", rnb.region.types(assembly(rnbs.set)))))
		stop("Invalid value for type: \"sites\" or one of the rnb.region.types() expected")
	
	if (!(is.character(sample) && sample %in% samples(rnbs.set))) {
		stop("Invalid value for sample: character expected")
	}
	if (!is.double(covg.max.percentile) ||  covg.max.percentile > 1 || covg.max.percentile < 0){
		stop("Invalid value for covg.max.percentile: number between 0 and 1 expected")
	}
	
	sample.id <- match(sample, samples(rnbs.set))
	covg.rnbs <- covg(rnbs.set, type)
	max.covg  <- max(covg.rnbs, na.rm=TRUE)
	if (covg.max.percentile > 0 && covg.max.percentile < 1){
		max.covg <- quantile(covg.rnbs, probs=c(covg.max.percentile), na.rm=TRUE)[1]
	}
	covg.rnbs<-covg.rnbs[,sample.id]
	covg.rnbs[is.na(covg.rnbs)]<-0
	
	rnb.cleanMem()
	
	if(writeToFile) {
		plot.file<-createReportPlot(paste("CoverageHistogram", ifelse(numeric.names, sample.id, sample), sep="_"), ...)
	}
		
	dframe<-data.frame(Coverage=covg.rnbs)
	#TODO: cut after 95 percentile of coverage
	pp <- ggplot(dframe, aes_string(x = "Coverage")) + coord_cartesian(xlim=c(0,max.covg))+ 
		labs(x = "Coverage", y = "# CpGs") + geom_histogram(aes_string(y = "..count.."), binwidth = 10)
	
	if(writeToFile) {
		print(pp)
		off(plot.file)
		return(plot.file)
	}else{
		return(pp)
	}
	
}
########################################################################################################################

#' rnb.plot.biseq.coverage.violin
#'
#' Plots the violin plots of the coverage distribution
#'   
#' @param rnbs.set			RnBiseqSet object
#' @param samples			unique sample identifiers. In case \code{rnb.getOption("identifiers.column")} 
#' 							is not \code{NULL}, \code{samples} should attain values from the corresponding column, 
#' 							or \code{colnames(meth(rnb.set))} otherwise
#' @param fname				base filename for the files to be plotted. If NULL, the plot will not be written to file
#' @param type				\code{character} singleton. If \code{site} the coverage
#' 							information is plotted for each methylation site. Otherwise
#' 							should be one of the regions returned by \code{rnb.region.types}
#' @param covg.range	    Vector of length 2 specifying the range of coverage to be plotted. if \code{NULL} (default) the entire range will be plotted
#' @param ... 				other arguments to \code{\link{createReportPlot}}
#' 
#' @return plot as an object of type \code{\linkS4class{ReportPlot}} if \code{writeToFile} is \code{TRUE} and of class 
#' 			\code{\link{ggplot}} otherwise.
#'
#' @author Fabian Mueller
#' @export
rnb.plot.biseq.coverage.violin <- function(rnbs.set, samples, fname=NULL, type = "sites", covg.range=NULL, ...){
	
	if(!inherits(rnbs.set, "RnBiseqSet"))
		stop("Invalid value for rnbs.set: object of class RnBiseqSet expected")
	
	if(!(is.character(type) && type %in% c("sites", rnb.region.types(assembly(rnbs.set)))))
		stop("Invalid value for type: \"sites\" or one of the rnb.region.types() expected")
	
	if (!(is.character(samples) && all(samples %in% samples(rnbs.set)))) {
		stop("Invalid value for samples: character expected")
	}
	if (!is.null(covg.range) && length(covg.range)!=2){
		stop("Invalid value for covg.range. vector of length 2 or NULL expected")
	}
	
	sample.id <- match(samples, samples(rnbs.set))
	covg.rnbs <- covg(rnbs.set, type)
	max.covg  <- max(covg.rnbs, na.rm=TRUE)
	covg.rnbs <- covg.rnbs[,sample.id]
	covg.rnbs[is.na(covg.rnbs)]<-0
	
	dframe <- data.frame(Coverage=as.vector(covg.rnbs),
			Sample=rep(samples,times=rep(nrow(covg.rnbs),length(samples))))
	
	if(!is.null(fname)) plot.file <- createReportPlot(fname, ...)
	
	pp <- ggplot(dframe, aes_string(x = "Sample", y = "Coverage")) 
	if (!is.null(covg.range)) {
		pp <- pp + ylim(covg.range[1],covg.range[2])
	}
	pp <- pp + labs(x = "Sample", y = "Coverage") +
		geom_violin(aes_string(fill="Sample"),scale="count") +
		scale_fill_manual(values=rep(rnb.getOption("colors.category"),length.out=length(samples))) +
		coord_flip() + theme(legend.position="none")
	
	if(!is.null(fname)) {
		print(pp)
		off(plot.file)
		return(plot.file)
	}else{
		return(pp)
	}
}

########################################################################################################################

#' rnb.plot.coverage.thresholds
#'
#' Plots the number of remaining CpGs after applying different thresholds for coverage and support.
#'
#' @param rnb.set       Methylation dataset as an object of type \code{\linkS4class{RnBiseqSet}}.
#' @param min.coverages Non-empty \code{integer} vector storing the unique positive cutoff values to be applied for
#'                      minimal coverage. Names, if present, are interpreted as colors that must be used to denote the
#'                      corresponding values.
#' @param fname         File name to save the generated plot to. See the \emph{Details} section for restrictions.
#' @param ...           Additional named parameters related to saving the plot to files. These can include:
#'                      \code{report}, \code{width}, \code{height}, \code{create.pdf}, \code{low.png} and
#'                      \code{high.png}. These parameters are ignored when \code{fname} is \code{NULL} or \code{NA}.
#' @return If \code{fname} is \code{NULL} or \code{NA} (default), the generated plot as an object of type
#'         \code{ggplot2}; otherwise, the initialized and closed \code{\linkS4class{ReportPlot}} object, invisibly.
#' 
#' @details
#' If \code{fname} is specified, this function calls \code{\link{createReportPlot}} to save the plot to PDF and/or PNG
#' files. See \link[=createReportPlot]{its documentation} for information on acceptable file names. Additional
#' parameters - \code{report}, \code{width}, \code{height}, etc. - can also be given. If image width is not specified,
#' it is set to a value between 4.7 and 9.2 (inches), depending on the number of samples in the dataset. The default
#' image height is fixed to 7.2.
#' 
#' @author Yassen Assenov
#' @export
rnb.plot.coverage.thresholds <- function(rnb.set, min.coverages, fname = NA, ...) {

	## Validate parameter values
	if (!inherits(rnb.set, "RnBiseqSet")) {
		stop("invalid value for rnb.set")
	}
	if (is.double(min.coverages) && isTRUE(all(min.coverages == as.integer(min.coverages)))) {
		cnames <- names(min.coverages)
		min.coverages <- as.integer(min.coverages)
		names(min.coverages) <- cnames
		rm(cnames)
	}
	if (!(is.integer(min.coverages) && length(min.coverages) != 0 && all(!is.na(min.coverages))
		  && all(min.coverages > 0))) {
		stop("invalid value for min.coverages")
	}
	if (anyDuplicated(min.coverages)) {
		stop("invalid value for min.coverage; duplicates found")
	}
	min.coverages <- sort(min.coverages)
	if (is.null(fname)) {
		fname <- NA
	} else if (!is.na(fname)) {
		if (!is.character(fname)) {
			stop("invalid value for fname")
		}
	}

	## Define colors to denote min coverage thresholds
	if (is.null(names(min.coverages))) {
		n <- length(min.coverages)
		colors.covered <- rgb(seq(0, 0.5, length.out = n), seq(0, 0.5, length.out = n), seq(0, 1, length.out = n))
		names(colors.covered) <- min.coverages
	} else {
		colors.covered <- names(min.coverages)
		colors.covered[is.na(colors.covered)] <- "#808080"
		names(colors.covered) <- min.coverages
	}

	## Construct data.frame storing the plot information
	matrix.coverages <- covg(rnb.set)
	n <- ncol(matrix.coverages)
	if (n <= 50) {
		s.values <- n:1
		s.indices <- 1:n
		xlabel <- "Support (number of samples)"
	} else {
		s.values <- seq(0, 1, length.out = 51)[-1]
		s.indices <- as.integer(ceiling(quantile(0:n, probs = s.values)))
		s.indices <- n + 1L - s.indices
		xlabel <- "Support (percentile)"
	}
	dframe <- do.call(rbind, lapply(as.integer(min.coverages), function(mc) {
			interrogated <- as.integer(rowSums(matrix.coverages >= mc))
			cpgs <- cumsum(rev(tabulate(interrogated, ncol(matrix.coverages)))) / 10^6
			data.frame("mc" = mc, "s" = s.values, cpgs = cpgs[s.indices])
		}
	))
	dframe$mc <- factor(as.character(dframe$mc), levels = as.character(min.coverages))

	## Generate the plot
	if (n <= 20) {
		img.width <- 4.7
		xbreaks <- 1:n
		xlabels <- as.character(xbreaks)
		xlimits <- 0.5 + c(n, 0)
	} else if (n <= 50) {
		img.width <- 1.7 + 0.15 * n
		xbreaks <- seq(2L, n, by = 2L)
		xlabels <- as.character(xbreaks)
		xlimits <- 0.5 + c(n, 0)
	} else {
		img.width <- 1.7 + 0.15 * 50
		xbreaks <- seq(0, 1, length.out = 26)
		xlabels <- as.character(xbreaks * 100)
		xlimits <- c(1.01, 0.01)
	}
	pp <- ggplot2::ggplot(dframe, aes_string(x = "s", y = "cpgs", color = "mc", group = "mc")) + ggplot2::geom_line() +
		ggplot2::geom_point() + ggplot2::scale_color_manual(values = colors.covered) +
		ggplot2::scale_x_reverse(breaks = xbreaks, limits = xlimits, labels = xlabels, expand = c(0, 0)) +
		ggplot2::labs(x = xlabel, y = "CpGs (million)", color = "Coverage\nthreshold") +
		ggplot2::theme(legend.position = c(1, 0.5), legend.justification = c(0, 0.5)) +
		ggplot2::theme(plot.margin = unit(0.1 + c(0, 1, 0, 0), "in"))
	if (is.na(fname)) {
		return(pp)
	}

	## Save the plot to file(s)
	additional.params <- list(...)
	if (is.null(additional.params$width)) {
		additional.params$width <- img.width
	}
	if (is.null(additional.params$height)) {
		additional.params$height <- 7.2
	}
	additional.params$fname <- fname
	rplot <- do.call(createReportPlot, additional.params)
	print(pp)
	rplot <- off(rplot)

	## Attach the data as an attribute to the plot
	dframe$mc <- as.integer(as.character(dframe$mc))
	colnames(dframe) <- c("Minimal coverage", xlabel, "CpGs (million)")
	attr(rplot, "data") <- dframe
	invisible(rplot)
}


########################################################################################################################

#' rnb.plot.num.sites.covg
#'
#' plot the number of sites vs median and other percentiles of coverage
#'   
#' @param rnbs			RnBiseqSet object
#' @param addSampleNames	should the sample names be added to the plot
#' @param bar.percentiles	the percentiles to be used for the error bars. Must be a vector of length 2 of which the
#' 							first two elements will be used
#' 
#' @return plot as an object of type \code{\link{ggplot}}
#'
#' @author Fabian Mueller
#' @export
rnb.plot.num.sites.covg <- function(rnbs, addSampleNames=(length(samples(rnbs))<100), bar.percentiles=c(0.25,0.75)){
	rnb.require("scales")
	perc.vec <- c(bar.percentiles[1],0.5,bar.percentiles[2])
	df2p <- do.call("rbind",lapply(1:length(samples(rnbs)),FUN=function(j){
		sn <- samples(rnbs)[j]
		cc <- as.vector(covg(rnbs,j=j))
		cc[cc<1] <- NA
		ns <- sum(!is.na(cc))
		quants <- quantile(cc, probs=perc.vec, na.rm=TRUE)
		res <- data.frame(
			sample=sn,
			numSites=ns,
			covgPercLow=quants[1],
			covgMedian=quants[2],
			covgPercUp=quants[3],
			stringsAsFactors=FALSE
		)
		return(res)
	}))
	rownames(df2p) <- samples(rnbs)

	pp <- ggplot(df2p) + aes(x=numSites, y=covgMedian, ymin=covgPercLow, ymax=covgPercUp) + geom_pointrange() +
		  scale_x_continuous(labels = comma) +  ylab(paste("coverage (median,",bar.percentiles[1],"and",bar.percentiles[2],"percentiles)"))
	if (addSampleNames){
		pp <- pp + geom_text(aes(label=sample), hjust=0.5, vjust=-0.5, angle=90, size=2, color="grey50")
	}
	pp
}

Try the RnBeads package in your browser

Any scripts or data that you put into this service are public.

RnBeads documentation built on March 3, 2021, 2 a.m.