Nothing
########################################################################################################################
## RnBiseqSet-class.R
## created: 2012-10-29
## creator: Pavlo Lutsik
## ---------------------------------------------------------------------------------------------------------------------
## RnBiseqSet class definition.
########################################################################################################################
#' RnBiseqSet Class
#'
#' A class for storing the DNA methylation and quality information from bisulfite sequencing experiments
#'
#' @details TBA
#'
#' @section Slots:
#' \describe{
#' \item{\code{status}}{Normalization status.}
#' }
#'
#' @section Methods and Functions:
#' \describe{
#' \item{\code{\link[BiocGenerics]{combine}}}{Combines two datasets.}
#' }
#'
#' @name RnBiseqSet-class
#' @rdname RnBiseqSet-class
#' @author Pavlo Lutsik
#' @exportClass RnBiseqSet
setClass("RnBiseqSet",
representation(status="listOrNULL"),
contains="RnBSet",
prototype(pheno=data.frame(), meth.sites=matrix(nrow=0,ncol=0), covg.sites=NULL, target="CpG", status=NULL),
package = "RnBeads")
## ---------------------------------------------------------------------------------------------------------------------
## CONSTRUCTORS
## ---------------------------------------------------------------------------------------------------------------------
setMethod("initialize", "RnBiseqSet",
function(.Object,
pheno=data.frame(),
sites=matrix(ncol=0,nrow=0),
meth.sites=matrix(ncol=0,nrow=0),
covg.sites=matrix(ncol=0,nrow=0),
regions=list(),
meth.regions=list(),
assembly="hg19",
target="CpG",
status=list(),
inferred.covariates=list()
){
.Object@pheno<-pheno
.Object@sites<-sites
.Object@meth.sites<-meth.sites
.Object@covg.sites<-covg.sites
.Object@regions<-regions
.Object@meth.regions<-meth.regions
.Object@assembly<-assembly
.Object@target<-target
.Object@status<-status
.Object@inferred.covariates <- inferred.covariates
.Object
}
)
#' Wrapper function RnBiseqSet
#'
#' @param pheno phenotypic data.
#' @param sites CpG site definition, as a \code{data.frame}
#' with 3 variables: chromosome (of type
#' \code{character}), position (\code{integer}) and
#' strand (\code{character}, one of \code{"+"}, \code{"-"} or \code{"*"}
#' @param meth summarized methylation calls as a \code{matrix} or \code{ff_matrix}
#' @param covg read coverage information as a \code{matrix} or \code{ff_matrix}
#' @param assembly the genome assembly
#' @param target target DNA methylation features (CpG sites)
#' @param summarize.regions ...
#' @param region.types region annotations for which the methylation data should be summarized
#' @param useff flag specifying whether the ff functionality should be used
#' @param usebigff flag specifying whether the extended ff functionality should be used (large matrix support for ff)
#' @param verbose flag specifying whether the diagnostic messages should be written to the
#' console or to the RnBeads logger, if the latter is initialized
#'
#' @return an object of class RnBiseqSet
#'
#' @name RnBiseqSet
#' @rdname RnBiseqSet-class
#' @aliases initialize,RnBiseqSet-method
#' @export
RnBiseqSet<-function(
pheno,
sites,
meth,
covg = NULL,
assembly="hg19",
target="CpG",
summarize.regions=TRUE,
region.types=rnb.region.types.for.analysis(assembly),
useff=rnb.getOption("disk.dump.big.matrices"),
usebigff=rnb.getOption("disk.dump.bigff"),
verbose=FALSE){
if(missing(pheno)){
stop("argument pheno should be supplied")
}
if(missing(sites)){
#warning("Valid RnBeadSet object was not created: pheno and/or betas missing")
stop("argument meth should be supplied")
}
if(missing(meth)){
#warning("Valid RnBeadSet object was not created: pheno and/or betas missing")
stop("argument meth should be supplied")
}
if(!is.data.frame(pheno)){
stop("invalid value for pheno: should be a data frame")
}
if(!(any(c('data.frame', 'matrix') %in% class(sites)))){
stop("invalid value for sites: should be a data frame or a matrix of type")
}
if(!any(c('matrix', 'ff_matrix', 'BigFfMat') %in% class(meth))){
stop("invalid value for meth: should be a matrix or an ff_matrix")
}
if(!is.null(covg) && !any(c('matrix', 'ff_matrix', 'BigFfMat') %in% class(covg))){
stop("invalid value for covg: should be a matrix or an ff_matrix")
}
if(!is.character(assembly) || length(assembly)!=1L){
stop("invalid value for assembly: should be a character of length one")
}
if(!is.character(target) || length(target)!=1L){
stop("invalid value for target: should be a character of length one")
}
if(!is.logical(summarize.regions) || length(summarize.regions)!=1L){
stop("invalid value for summarize.regions: should be a logical of length one")
}
if(!is.logical(useff) || length(useff)!=1L){
stop("invalid value for useff: should be a logical of length one")
}
if (nrow(sites)!= nrow(meth)){
msg<-c("Inconsistent values for sites and meth")
rnb.error(msg)
}
if(usebigff){
bff.finalizer <- rnb.getOption("disk.dump.bigff.finalizer")
}
if(!is.null(rnb.getOption("identifiers.column"))){
sample.names<-pheno[[rnb.getOption("identifiers.column")]]
}else if(!is.null(colnames(meth))){
sample.names<-colnames(meth)
}else{
sample.names<-as.character(1:ncol(meth))
}
#
# prepare sites definition
#
chroms<-names(rnb.get.chromosomes(assembly=assembly))
if(is.data.frame(sites)){
sites.df<-sites
sites<-matrix(nrow=nrow(sites.df), ncol=3, dimnames=list(NULL, c("chr", "coord", "strand")))
sites[,1L]<-as.integer(match(sites.df[,1L], chroms))
sites[,2L]<-as.integer(sites.df[,2L])
sites[,3L]<-as.integer(factor(sites.df[,3L], levels=c("+","-","*")))
rm(sites.df)
}
num.all.sites<-nrow(sites)
#remove sites with illegal chromosomes
legal.sites <- rowSums(is.na(sites))==0
num.illegal <- sum(!legal.sites)
sites <- sites[legal.sites,,drop=FALSE]
#meth <- meth[legal.sites,,drop=FALSE]
#covg <- covg[legal.sites,,drop=FALSE]
if(verbose && num.illegal > 0) {
rnb.info(c("Removed", num.illegal, "sites with unknown chromosomes"))
}
if(nrow(sites)<1L){
rnb.warning("All sites have been removed, returning NULL")
return(invisible(NULL))
}
#
# match to annotation
#
guess.strand <- all(sites[,3L]==3)
if(verbose && guess.strand) {
rnb.info("Inferring strand information from annotation enabled")
}
# Chromosome ordering fix.
#
#chrdata<-sapply(sort(unique(sites[,1L])), function(chr){
chrdata<-lapply(sort(unique(sites[,1L])), function(chr){
#dump<-sapply(unique(sites[,1L]), function(chr){
chr.subset<-which(sites[,1L]==chr)
# version with proper ordering
#
# chr.subset<-sites[,1L]==chr
if(guess.strand){
site.ranges <- GRanges(rep(chroms[chr], length(chr.subset)),
IRanges(start=sites[chr.subset,2],
width=rep(1,length(chr.subset))),
rep("*",length(chr.subset)))
c.coords.ref <- GenomicRanges::flank(rnb.get.annotation(assembly=assembly)[[chr]],-1)
match <- GenomicRanges::findOverlaps(c.coords.ref, site.ranges, type="start")
} else {
site.ranges<-GRanges(rep(chroms[chr], length(chr.subset)),
IRanges(start=sites[chr.subset,2L],
width=rep(1,length(chr.subset))),
c("+","-","*")[sites[chr.subset,3L]])
# match<-GenomicRanges::findOverlaps(site.ranges, rnb.get.annotation(assembly=assembly)[[chr]], type="start")
# sites[chr.subset[queryHits(match)],2]<<-subjectHits(match)
# sites[chr.subset[-queryHits(match)],2]<<-NA
# return(NULL)
# version with proper ordering
#
match<-GenomicRanges::findOverlaps(rnb.get.annotation(assembly=assembly)[[chr]], site.ranges, type="start")
}
bed.h <- subjectHits(match)
ref.h <- queryHits(match)
multi.hit.ref <- duplicated(ref.h)
if(any(multi.hit.ref)){
rnb.warning(paste("Multiple entries detected in the bed file that match to the same reference C. --> Picking the first one"))
bed.h <- bed.h[!multi.hit.ref]
ref.h <- ref.h[!multi.hit.ref]
}
multi.hit.bed <- duplicated(bed.h)
if(any(multi.hit.bed)){
rnb.warning(paste("Some entries in the bed file match multiple reference Cs. --> Picking the first one"))
bed.h <- bed.h[!multi.hit.bed]
ref.h <- ref.h[!multi.hit.bed]
}
if(length(bed.h)!=length(ref.h)){
stop("Something went wrong while matching the sites, exiting")
}
n.match <- length(bed.h)
#chrmeths<-meth[chr.subset,,drop=FALSE][bed.h,,drop=FALSE]
#chrcovgs<-covg[chr.subset,,drop=FALSE][bed.h,,drop=FALSE]
#indices<-chr.subset[bed.h]
chrsites<-cbind(rep(1L,n.match),rep(chr, n.match), ref.h, chr.subset[bed.h])
#return(list(chrsites,chrmeths,chrcovgs))
chrsites
})
rnb.cleanMem()
#version with proper ordering
#sites<-do.call("rbind", chrdata[1,])
sites<-do.call("rbind", chrdata)
#meth<-do.call("rbind", chrdata[2,])
#covg<-do.call("rbind", chrdata[3,])
valid<-rowSums(is.na(sites))==0
if(nrow(sites)<1L){
rnb.warning("All sites have been removed, returning NULL")
return(invisible(NULL))
}else{
if(verbose){
rnb.status(c("Matched", sum(valid),"of", num.all.sites,"methylation sites to the annotation"))
}
}
if(!is.null(covg)){
if(verbose) rnb.status(c("Checking site coverage")) #TODO: debug info: remove me
zero.covg.sites <- rowSums(covg[sites[,4L],,drop=FALSE], na.rm=TRUE)==0 #TODO: could this be too memory wasteful?
if(verbose && any(zero.covg.sites)) {
rnb.status(c("Removed",length(intersect(which(zero.covg.sites), which(valid))),"of",
sum(valid),"methylation sites because they were not covered in any sample"))
}
valid<-which(valid | zero.covg.sites)
rnb.cleanMem()
if(!useff){
covg[is.na(covg)]<-0L
}else{
# if(verbose) rnb.status(c("Setting zero coverages")) #TODO: debug info: remove me
for(ci in 1:ncol(covg)){
covg[is.na(covg[,ci]),ci]<-0L
}
}
}
rnb.cleanMem()
sites <- sites[valid,]
if(!useff){
meth <- meth[which(legal.sites)[sites[,4L]],,drop=FALSE]
if(!is.null(covg)){
covg <- covg[which(legal.sites)[sites[,4L]],,drop=FALSE]
}
}else{
meth.old <- meth
if(verbose) rnb.status(c("Creating methylation matrix")) #TODO: debug info: remove me
if (usebigff) {
meth <- BigFfMat(row.n=nrow(sites), col.n=ncol(meth.old), row.names=NULL, col.names=sample.names, finalizer=bff.finalizer)
} else {
meth <- ff(NA, dim=c(nrow(sites), ncol(meth.old)), dimnames=list(NULL, sample.names), vmode="double")
}
# if(verbose) rnb.status(c("Filling methylation matrix")) #TODO: debug info: remove me
# meth[,] <- meth.old[which(legal.sites)[sites[,4L]],]
site.inds <- which(legal.sites)[sites[,4L]]
for (j in 1:ncol(meth.old)) meth[,j] <- meth.old[site.inds,j]
rm(meth.old)
rnb.cleanMem()
if(!is.null(covg)){
covg.old <- covg
if(verbose) rnb.status(c("Creating coverage matrix")) #TODO: debug info: remove me
if (usebigff) {
covg <- BigFfMat(row.n=nrow(sites), col.n=ncol(covg.old), row.names=NULL, col.names=sample.names, na.prototype=as.integer(NA), finalizer=bff.finalizer)
} else {
covg <- ff(NA_integer_, dim=c(nrow(sites), ncol(covg.old)), dimnames=list(NULL, sample.names))
}
# if(verbose) rnb.status(c("Filling coverage matrix")) #TODO: debug info: remove me
# covg[,] <- covg.old[which(legal.sites)[sites[,4L]],]
for (j in 1:ncol(covg.old)) covg[,j] <- covg.old[site.inds,j]
rm(covg.old)
rnb.cleanMem()
}
}
sites <- sites[,-4L]
colnames(sites)<-c("context", "chr", "index")
meth.sample.n.valid<-integer()
for(ci in 1:ncol(meth)){
meth.sample.n.valid[ci] <- sum(!is.na(meth[,ci]))
}
if(any(meth.sample.n.valid<1)){
rnb.error(c("The following samples have no valid methylation values:",colnames(meth)[meth.sample.n.valid<1]))
#stop("invalid sample methylation values")
}
# sites<-cbind(rep(1, nrow(sites)), sites)
# sites<-sites[,2:4]
# colnames(sites)<-c("context", "chr", "index")
#be memory efficient by cleaning up
# rownames take a lot of space
# sites <- sites[valid,,drop=FALSE]
# meth <- meth[valid,,drop=FALSE]
# covg <- covg[valid,,drop=FALSE]
rnb.cleanMem()
# if(!is.null(covg)){
# zero.covg.sites <- rowSums(covg, na.rm=TRUE)==0
# if(any(zero.covg.sites)){
# valid <- !zero.covg.sites
# sites <- sites[valid,,drop=FALSE]
# meth <- meth[valid,,drop=FALSE]
# covg <- covg[valid,,drop=FALSE]
# rnb.cleanMem()
#
# if(verbose) {
# rnb.status(c("Removed",sum(zero.covg.sites),"of",
# length(valid),"methylation sites because they were not covered in any sample"))
# }
# }
# }
status <- list(disk.dump=FALSE)
if(useff){
#subsampling for large datasets (ff currently supports only objects of size .Machine$integer.max)
if (prod(dim(meth))>.Machine$integer.max & !usebigff){
sites.allowed <- as.integer(.Machine$integer.max/ncol(meth))
sample.site.inds <- sort(sample.int(nrow(meth),sites.allowed))
msg<-c("Full dataset is too large to be supported by ff. --> downsampling to",sites.allowed,"( of",nrow(meth),") sites")
rnb.warning(msg)
meth <- meth[sample.site.inds,]
if(!is.null(covg)){
covg <- covg[sample.site.inds,]
}
sites <- sites[sample.site.inds,]
}
if("matrix" %in% class(meth)){
if (usebigff){
meth <- BigFfMat(meth, finalizer=bff.finalizer)
} else {
meth <- convert.to.ff.matrix.tmp(meth)
}
}
if(!is.null(covg)){
if("matrix" %in% class(covg)){
if (usebigff){
covg <- BigFfMat(covg, finalizer=bff.finalizer)
} else {
covg <- convert.to.ff.matrix.tmp(covg)
}
}
}
status <- list(disk.dump=TRUE, disk.dump.bigff=usebigff)
}
if(verbose) rnb.status(c("Creating object")) #TODO: debug info: remove me
object<-new("RnBiseqSet",
pheno=pheno,
sites=sites,
meth.sites=meth,
covg.sites=covg,
status=status,
assembly=assembly,
target=target)
if(!rnb.getOption("strand.specific")){
msg<-c("Summarizing","strand","methylation")
if(verbose){
rnb.status(msg)
}
if(is.null(covg)){
aggr<-"mean"
}else{
aggr<-"coverage.weighted"
}
object <- summarize.regions(object, "strands", aggregation=aggr)
rnb.cleanMem()
}
for (region.type in region.types) {
if (region.type %in% rnb.region.types(assembly)) {
msg<-c("Summarizing",region.type,"methylation")
if(verbose){
rnb.status(msg)
}
object <- summarize.regions(object, region.type)
rnb.cleanMem()
}
}
object
}
## ---------------------------------------------------------------------------------------------------------------------
## VALIDITY
## ---------------------------------------------------------------------------------------------------------------------
#
# check.rnb.biseq.set
#
# A routine that checks the validity of a freshly loaded RnBiseqSet object
#
# #param object RnBiseqSet object to test
# #param verbose A flag specifying whether the logger is initialized
#
# #return TRUE if all checks are successful
#
# Pavlo Lutsik
#
check.rnb.biseq.set<-function(object, verbose=TRUE){
if(!inherits(object, "RnBiseqSet")){
if(verbose){
rnb.info("The supplied object is not of class RnBiseqSet. Breaking the check...")
}
return(FALSE)
}else{
if(verbose){
rnb.info("Checking the supplied RnBiseqSet object")
}
valid<-TRUE
}
#check the methylation data
mm <- meth(object)
if(nrow(mm)<1){
if(verbose){
rnb.info("The object contains information for 0 methylation sites")
}
valid<-FALSE
}else{
if(verbose){
rnb.info(sprintf("The object contains information for %d methylation sites",nrow(mm)))
}
}
if(ncol(mm)<1){
if(verbose){
rnb.info("The object contains information for 0 samples")
}
valid<-FALSE
}else{
if(verbose){
rnb.info(sprintf("The object contains information for %d samples",ncol(mm)))
}
}
if(sum(!is.na(mm))<1){
if(verbose){
rnb.info("All methylation values are missing")
}
valid<-FALSE
}else{
if(verbose){
rnb.info(sprintf("The object contains %d missing methylation values",sum(is.na(mm))))
}
}
if(sum(mm[!is.na(mm)]<0 & mm[!is.na(mm)]>1)>0){
if(verbose){
rnb.info("The object contains incorrect methylation values (less than 0 or greater than 1)")
}
valid<-FALSE
}else{
if(verbose){
rnb.info("Methylation values are within the expected range")
}
}
#check the coverage data
if(is.null(object@covg.sites)){
if(verbose){
rnb.info("No coverage information found")
}
valid<-FALSE
return(valid)
}else{
if(verbose){
rnb.info("The object contains coverage information")
}
cvg<-covg(object)
}
if(sum(cvg[!is.na(cvg)]<0)>0){
if(verbose){
rnb.info("The object contains incorrect coverage values (negative values)")
}
valid<-FALSE
}else if(sum(cvg[!is.na(cvg)]>1000)>0.5*length(as.numeric(cvg))){
if(verbose){
rnb.info("The object contains incorrect coverage values (half of the sites have coverage > 1000 )")
}
}else{
if(verbose){
rnb.info("Coverage values are within the expected range")
}
}
return(valid)
}
########################################################################################################################
setMethod("show", "RnBiseqSet",
function(object) {
cat("Object of class RnBiseqSet\n")
cat(sprintf("%8d samples\n", nrow(object@pheno)))
cat(sprintf("%8d methylation sites\n", nrow(object@sites)))
if (!is.null(object@regions)){
cat("Region types:\n")
for (rn in names(object@regions)) {
cat(sprintf("\t%8d regions of type %s\n", nrow(object@regions[[rn]]), rn))
}
}
cat(sprintf("Coverage information is %s\n", ifelse(is.null(object@covg.sites), "absent", "present")))
if(isImputed(object)){
cat("Data set was imputed.\n")
}
}
)
########################################################################################################################
#' as("RnBeadSet", "RnBiseqSet")
#'
#' Convert a \code{\linkS4class{RnBeadSet}} object to a "mock" \code{\linkS4class{RnBiseqSet}} object
#' (used in the combine method)
#'
#' @name coercion-methods
#'
setAs("RnBeadSet", "RnBiseqSet",
function(from, to){
ann<-annotation(from)
site.info<-ann[,c("Chromosome", "Start", "Strand")]
object<-RnBiseqSet(
pheno=pheno(from),
sites=site.info,
meth=meth(from),
covg=NULL,
assembly=from@assembly,
target="CpG",
useff=isTRUE(from@status$disk.dump),
usebigff=isTRUE(from@status$disk.dump.bigff),
verbose=FALSE
)
object
})
########################################################################################################################
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.