# PART THAT DEALS with annotation of CpGs and differential methylation events
##############################################################################
# SECTION 1:
# reading annotation to GRanges
# makes GRanges object from a given bed6 or bed12 file to granges object
##############################################################################
#######################################
# SECTION 1: S3 functions
#######################################
# extracts exons from a bed12 file and puts them into GRanges object
# done in pure R
bed12.to.exons<-function(ref)
{
#colClasses=c("factor","integer","integer","factor","integer","factor","integer","integer","integer","integer","factor","factor")
#ref =read.table(bed.file,header=F,skip=skip,colClasses = colClasses,stringsAsFactors=F) # give colClasses for fast read-in
ref=unique(ref)
# apply strsplit on columns 11 and 12 to extract exon start positions and exon sizes
b.start.size=cbind(as.integer(unlist(strsplit(as.character(ref$V12),"," ))),as.integer(unlist(strsplit(as.character(ref$V11),"," ))))
rep.ref =ref[rep(1:nrow(ref),ref[,10]),] # replicate rows occurs as many as its exon number
#exon.id =unlist(sapply(ref[,10],function(x) 1:x));rep.ref$V5=exon.id
exon.id =unlist( mapply( function(x,y) if(x=="+"){return(1:y)}else{return(y:1)} ,ref[,6],ref[,10] ) );rep.ref$V5=exon.id
rep.ref$V3 =rep.ref$V2+b.start.size[,1]+b.start.size[,2] # calculate exon start and ends
rep.ref$V2 =rep.ref$V2+b.start.size[,1]
#return(rep.ref[,1:6]) if you want to return as a data.frame
# if(is.null(chrom.file))
# {
strands=as.character(rep.ref$V6)
strands[strands=="."]="*"
grange.exons=GenomicRanges::GRanges(seqnames=as.character(rep.ref$V1),
ranges=IRanges(start=rep.ref$V2+1, end=rep.ref$V3),
strand=strands, score=rep.ref$V5,name=rep.ref$V4)
return(grange.exons)
# }
# #read table for chromosome info
# chrom=read.table(chrom.file,header=F)
# chrom=chrom[chrom[,1] %in% rep.ref$V1,1:2]
# seqlengths=chrom[,2];names(seqlengths)=as.character(chrom[,1])
# do adjustments so that trans and exons have the same numerical id
# strands=as.character(rep.ref$V6)
# strands[strands=="."]="*"
# grange.exons=GRanges(seqnames=as.character(rep.ref$V1),
# ranges=IRanges(start=rep.ref$V2+1, end=rep.ref$V3),
# strand=strands, score=rep.ref$V5,name=rep.ref$V4,seqlengths=seqlengths)
# return(grange.exons)
}
# extracts exons from a bed12 file and puts them into GRanges object
# done in pure R
bed12.to.introns<-function(ref)
{
#colClasses=c("factor","integer","integer","factor","integer","factor","integer","integer","integer","integer","factor","factor")
#ref =read.table(bed.file,header=F,skip=skip,colClasses = colClasses,stringsAsFactors=F) # give colClasses for fast read-in
#remove the genes with one exon only (they won't have any introns)
ref=ref[ref[,10]>1,]
ids=paste(ref[,1],ref[,2],ref[,3],ref[,4],sep="")
ref=cbind(ref,id=ids)
ref=unique(ref)
# apply strsplit on columns 11 and 12 to extract exon start positions and exon sizes
b.start.size=cbind(as.integer(unlist(strsplit(as.character(ref$V12),"," ))),as.integer(unlist(strsplit(as.character(ref$V11),"," ))))
rep.ref =ref[rep(1:nrow(ref),ref[,10]),] # replicate rows occurs as many as its exon number
#exon.id =unlist(sapply(ref[,10],function(x) 1:x));rep.ref$V5=exon.id
exon.id =unlist( mapply( function(x,y) if(x=="+"){return(1:y)}else{return(y:1)} ,ref[,6],ref[,10] ) );rep.ref$V5=exon.id
rep.ref$V3 =rep.ref$V2+b.start.size[,1]+b.start.size[,2] # calculate exon start and ends
rep.ref$V2 =rep.ref$V2+b.start.size[,1]
rep.ref =rep.ref[,c(1:6,13)]
# now try to get the exons by cbinding consecutive exons
temp.ref =cbind(rep.ref[1:(nrow(rep.ref)-1),],rep.ref[2:nrow(rep.ref),])
temp.ref =temp.ref[temp.ref[,7]==temp.ref[,14],]
temp.ref[,2]=temp.ref[,3]
temp.ref[,3]=temp.ref[,9]
rep.ref =temp.ref[,1:6]
rep.ref[rep.ref[,6]=="-",5]=rep.ref[rep.ref[,6]=="-",5]-1 # subtract 1 from - strand exon numbers
# if(is.null(chrom.file))
# {
strands=as.character(rep.ref$V6)
strands[strands=="."]="*"
grange.exons=GenomicRanges::GRanges(seqnames=as.character(rep.ref$V1),
ranges=IRanges(start=rep.ref$V2+1, end=rep.ref$V3),
strand=strands, score=rep.ref$V5,name=rep.ref$V4)
return(grange.exons)
# }
# #read table for chromosome info
# chrom=read.table(chrom.file,header=F)
# chrom=chrom[chrom[,1] %in% rep.ref$V1,1:2]
# seqlengths=chrom[,2];names(seqlengths)=as.character(chrom[,1])
# # do adjustments so that trans and exons have the same numerical id
# strands=as.character(rep.ref$V6)
# strands[strands=="."]="*"
# grange.exons=GRanges(seqnames=as.character(rep.ref$V1),
# ranges=IRanges(start=rep.ref$V2+1, end=rep.ref$V3),
# strand=strands, score=rep.ref$V5,name=rep.ref$V4,seqlengths=seqlengths)
# return(grange.exons)
}
# checks the validity of the bed data.frame if it is a legitimate bed columns
check.bed.validity<-function(bed.df,type="none")
{
# does it have at least 3 columns
num.col=(ncol(bed.df)>=3)
# does it have 1st and 2nd column as numbers
col1.2= (is.integer(bed.df[,2]) & is.integer(bed.df[,3]) )
# does it have chr string in 1st column
#chr= sum(grepl("chr",bed.df[,1]) )
if(type=="exons" | type=="introns")
{
#does it have 12>= columns
ex=(ncol(bed.df)>=12)
return(num.col & col1.2 & ex)
}
return(num.col & col1.2 )
}
#######################################
# SECTION 1: S4 functions
#######################################
# convert a data frame read-in from a bed file to a GRanges object
#
# @param bed a data.frame where column order and content resembles a bed file with 12 columns
# @usage convert.bed.df(bed)
# @return \code{\link{GRanges}} object
#
# @note one bed track per file is only accepted, the bed files with multiple tracks will cause en error
#
# @export
# @docType methods
# @rdname convert.bed.df-methods
setGeneric("convert.bed.df",function(bed) standardGeneric("convert.bed.df"))
# @aliases convert.bed.df,data.frame-method
# @rdname convert.bed.df-methods
setMethod("convert.bed.df" ,signature(bed = "data.frame" ),
function(bed){
if(! check.bed.validity(bed)){stop("this is not a valid bed file")}
if(ncol(bed)>=6){
# check if 6th column is really strand
if( sum( unique(bed[,6]) %in% c("+","-",".") ) != length(unique(bed[,6])) ){stop("Strand column of the bed file or data.frame is wrong")}
#convert to granges
strands=as.character(bed$V6)
strands[strands=="."]="*"
grange=GRanges(seqnames=as.character(bed$V1),
ranges=IRanges(start=bed$V2+1, end=bed$V3),
strand=strands, score=bed$V5,name=bed$V4)
}
if(ncol(bed)==5){
grange=GRanges(seqnames=bed$V1,ranges=IRanges(start=bed$V2+1, end=bed$V3),strand=rep("*",nrow(bed)), score=bed$V5,name=bed$V4 )
}
if(ncol(bed)==4){
grange=GRanges(seqnames=bed$V1,ranges=IRanges(start=bed$V2+1, end=bed$V3),strand=rep("*",nrow(bed)),name=bed$V4)
}
if(ncol(bed)==3){
grange=GRanges(seqnames=bed$V1,ranges=IRanges(start=bed$V2+1, end=bed$V3),strand=rep("*",nrow(bed)))
}
return(grange)
})
# convert a data frame read-in from a bed file to a GRanges object for exons
#
# @param bed.df a data.frame where column order and content resembles a bed file with 12 columns
# @usage convert.bed2exons(bed.df)
# @return \code{\link{GRanges}} object
#
# @note one bed track per file is only accepted, the bed files with multiple tracks will cause en error
#
# @export
# @docType methods
# @rdname convert.bed2exons-methods
setGeneric("convert.bed2exons",function(bed.df) standardGeneric("convert.bed2exons"))
# @aliases convert.bed2exons,data.frame-method
# @rdname convert.bed2exons-methods
setMethod("convert.bed2exons" ,signature(bed.df = "data.frame" ),
function(bed.df){
if(! check.bed.validity(bed.df,"exon")){stop("this is not a valid bed file")}
bed12.to.exons(bed.df)
})
# convert a data frame read-in from a bed file to a GRanges object for introns
#
# @param bed.df a data.frame where column order and content resembles a bed file with 12 columns
# @usage convert.bed2introns(bed.df)
# @return \code{\link{GRanges}} object
#
# @note one bed track per file is only accepted, the bed files with multiple tracks will cause en error
#
# @export
# @docType methods
# @rdname convert.bed2introns-methods
setGeneric("convert.bed2introns",function(bed.df) standardGeneric("convert.bed2introns"))
# @aliases convert.bed2introns,data.frame-method
# @rdname convert.bed2introns-methods
setMethod("convert.bed2introns" ,signature(bed.df = "data.frame" ),
function(bed.df){
if(! check.bed.validity(bed.df,"exon")){stop("this is not a valid bed file")}
bed12.to.introns(bed.df)
})
#' read a bed file and convert it to GRanges
#'
#' The function reads a BED file and coverts it to a \code{\link[GenomicRanges]{GRanges}} object
#'
#' @param location location of the file, a character string such as: "/home/user/my.bed"
#' @param remove.unsual if TRUE(default) remove the chromomesomes with unsual names, mainly random chromsomes etc
#'
#' @usage read.bed(location,remove.unsual=TRUE)
#' @return \code{\link{GRanges}} object
#' @examples
#' bed.file=system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit")
#' bed.gr=read.bed(location=bed.file,remove.unsual=TRUE)
#'
#' @note one bed track per file is only accepted, the bed files with multiple tracks will cause an error
#'
#' @export
#' @docType methods
#' @rdname read.bed-methods
setGeneric("read.bed", function(location,remove.unsual=TRUE) standardGeneric("read.bed"))
#' @aliases read.bed,character-method
#' @rdname read.bed-methods
setMethod("read.bed", signature(location = "character"),#remove.unsual="logical" ),
function(location,remove.unsual){
# find out if there is a header, skip 1st line if there is a header
f.line=readLines(con = location, n = 1)
skip=0
if(grepl("^track",f.line))(skip=1)
# read bed6
bed=.readTableFast(location,header=F,skip=skip)
if(remove.unsual){ bed=bed[grep("_", as.character(bed[,1]),invert=T),] }
convert.bed.df(bed)
})
#' Read transcript features from a BED file
#'
#' The function returns a \code{\link[GenomicRanges]{GRangesList}} containing exon, intron, TSS(transcription start site) and promoter locations
#'
#' @param location location of the bed file with 12 or more columns
#' @param remove.unsual remove the chromomesomes with unsual names, mainly random chromsomes etc
#' @param up.flank up-stream from TSS to detect promoter boundaries
#' @param down.flank down-stream from TSS to detect promoter boundaries
#' @param unique.prom get only the unique promoters, promoter boundaries will not have a gene name if you set this option to be TRUE
#' @usage read.transcript.features(location,remove.unsual=TRUE,up.flank=1000,down.flank=1000,unique.prom=TRUE)
#' @return a \code{\link[GenomicRanges]{GRangesList}} containing locations of exon/intron/promoter/TSS
#' @examples
#' gene.obj=read.transcript.features(system.file("extdata", "refseq.hg18.bed.txt", package = "methylKit"))
#'
#' @note one bed track per file is only accepted, the bed files with multiple tracks will cause en error
#'
#' @export
#' @docType methods
#' @rdname read.transcript.features-methods
setGeneric("read.transcript.features", function(location,remove.unsual=TRUE,up.flank=1000,down.flank=1000,unique.prom=TRUE) standardGeneric("read.transcript.features"))
#' @aliases read.transcript.features,character-method
#' @rdname read.transcript.features-methods
setMethod("read.transcript.features", signature(location = "character"),#,remove.unsual="logical",up.flank="numeric",down.flank="numeric",unique.prom="logical" ),
function(location,remove.unsual,up.flank ,down.flank ,unique.prom){
# find out if there is a header, skip 1st line if there is a header
f.line=readLines(con = location, n = 1)
skip=0
if(grepl("^track",f.line))(skip=1)
# read bed6
bed=.readTableFast(location,header=F,skip=skip)
if(remove.unsual){ bed=bed[grep("_", as.character(bed[,1]),invert=T),] }
introns=convert.bed2introns(bed)
exons =convert.bed2exons(bed)
# get the locations of TSSes
tss=bed
# + strand
tss[tss$V6=="+",3]=tss[tss$V6=="+",2]
# - strand
tss[tss$V6=="-",2]=tss[tss$V6=="-",3]
tssg=GRanges(seqnames=as.character(tss$V1),
ranges=IRanges(start=tss$V2, end=tss$V3),
strand=as.character(tss$V6),score=rep(0,nrow(tss)),name=tss$V4)
# get the locations of promoters
# + strand
bed[bed$V6=="+",3]=bed[bed$V6=="+",2]+down.flank
bed[bed$V6=="+",2]=bed[bed$V6=="+",2]-up.flank
# - strand
bed[bed$V6=="-",2]=bed[bed$V6=="-",3]-down.flank
bed[bed$V6=="-",3]=bed[bed$V6=="-",3]+up.flank
if(! unique.prom)
{
prom.df=(bed[,c(1,2,3,4,6)])
prom=GRanges(seqnames=as.character(prom.df$V1),
ranges=IRanges(start=prom.df$V2, end=prom.df$V3),
strand=as.character(prom.df$V6),score=rep(0,nrow(prom.df)),name=prom.df$V4)
}else{
prom.df=unique(bed[,c(1,2,3,6)])
prom=GRanges(seqnames=as.character(prom.df$V1),
ranges=IRanges(start=prom.df$V2, end=prom.df$V3),
strand=as.character(prom.df$V6),score=rep(0,nrow(prom.df)),name=rep(".",nrow(prom.df)) )
}
GRangesList(exons=exons,introns=introns,promoters=prom,TSSes=tssg)
})
#' Get upstream and downstream adjacent regions to a genomic feature
#'
#' The function returns flanking regions on either side of a genomic feature. It is useful for getting flanking regions such as CpG island shores.
#'
#' @param grange \code{\link[GenomicRanges]{GRanges}} object for the feature
#' @param flank number of basepairs for the flanking regions
#' @param clean If set to TRUE, flanks overlapping with other main features will be trimmed, and overlapping flanks will be removed
#' this will remove multiple counts when other features overlap with flanks
#'
#' @usage getFlanks(grange,flank=2000,clean=T)
#' @examples
#'
#' # read the bed file as GRanges object
#' bed.file=system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit")
#' bed.gr=read.bed(location=bed.file,remove.unsual=TRUE)
#'
#' # get flanks on the either side
#' bed.flanks=getFlanks(bed.gr,flank=2000,clean=TRUE)
#'
#' @return \code{\link[GenomicRanges]{GRanges}} object for flanking regions
#' @export
#' @docType methods
#' @rdname getFlanks-methods
setGeneric("getFlanks", function(grange,flank=2000,clean=T) standardGeneric("getFlanks"))
#' @aliases getFlanks,GRanges-method
#' @rdname getFlanks-methods
setMethod("getFlanks", signature(grange= "GRanges"),
function(grange,flank=2000,clean=T){
shores=c( IRanges::flank(grange,flank),IRanges::flank(grange,flank,FALSE) )
if(clean){
shores=IRanges::reduce(IRanges::setdiff(shores, grange)) # ,erge overlapping shores remove CpG coordinates from all shores, das ist so cool!!
}
shores
})
#' a function to read-in genomic features and their upstream and downstream adjecent regions such as CpG islands and their shores
#'
#' @param location for the bed file of the feature
#' @param flank number of basepairs for the flanking regions
#' @param clean If set to TRUE, flanks overlapping with other main features will be trimmed
#' @param remove.unsual remove chromsomes with unsual names random, Un and antyhing with "_" character
#' @param feature.flank.name the names for feature and flank ranges, it should be a character vector of length 2. example: c("CpGi","shores")
#' @usage read.feature.flank(location,remove.unsual=TRUE,flank=2000,clean=TRUE,feature.flank.name=NULL)
#' @return a \code{\link[GenomicRanges]{GenomicRangesList}} contatining one GRanges object for flanks and one for GRanges object for the main feature.
#'
#' @examples
#' # location of the example CpG file
#' my.loc=system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit")
#' cpg.obj=read.feature.flank(location=my.loc,feature.flank.name=c("CpGi","shores"))
#'
#' @export
#' @docType methods
#' @rdname read.feature.flank-methods
setGeneric("read.feature.flank", function(location,remove.unsual=TRUE,flank=2000,clean=TRUE,feature.flank.name=NULL) standardGeneric("read.feature.flank") )
#' @aliases read.feature.flank,character-method
#' @rdname read.feature.flank-methods
setMethod("read.feature.flank", signature(location = "character"),
function(location,remove.unsual,flank ,clean,feature.flank.name){
feat=read.bed(location,remove.unsual)
flanks=getFlanks(feat,flank=flank,clean=clean)
x=GenomicRangesList(features=feat,flanks=flanks)
if(!is.null(feature.flank.name) & length(feature.flank.name)==2)
{
names(x)=feature.flank.name
}
x
})
##############################################################################
# SECTION 2:
# annotate granges objects with annotations that read-in and converted to GRanges objects
##############################################################################
#######################################
# SECTION 2: Define new classes
#######################################
# A set of objects that will hold statistics about feature and annotation overlap
#' An S4 class for overlap of target features with a generic annotation
#'
#' This object is desgined to hold statistics and information about genomic feature overlaps it extends \code{\link{list}} class.
#'
#' @section Slots:\describe{
#' \item{\code{members}}{a matrix showing overlap of target features with annotation genomic features}
#' \item{\code{annotation}}{a named vector of percentages of overlap between feature and annotation}'
#' \item{\code{precedence}}{a named vector of percentages of overlap between feature and annotation}
#' \item{\code{num.annotation}}{a named vector of numbers of overlap between feature and annotation}
#' \item{\code{num.precedence}}{a named vector of numbers of overlap between feature and annotation}
#' \item{\code{no.of.OlapFeat}}{vector}
#' \item{\code{perc.of.OlapFeat}}{vector}
#' }
#' @examples
#' data(methylKit)
#' cpg.obj=read.feature.flank(system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit"),feature.flank.name=c("CpGi","shores"))
#'
#' # the following function returns annotationByFeature object
#' ann=annotate.WithFeature.Flank(methylDiff.obj,cpg.obj$CpGi,cpg.obj$shores,feature.name="CpGi",flank.name="Shores")
#' ann
#'
#' @seealso see \code{\link[methylKit]{annotate.WithFeature.Flank}} and \code{\link[methylKit]{annotate.WithFeature}} on how to create this object.
#' see following functions that operates on this object:
#' \code{\link[methylKit]{getMembers}}, \code{\link[methylKit]{getTargetAnnotationStats}},
#' \code{\link[methylKit]{getFeatsWithTargetsStats}}, \code{\link[methylKit]{plotTargetAnnotation}}
#' @name annotationByFeature-class
#' @aliases annotationByFeature
#' @docType class
#' @rdname annotationByFeature-class
#' @export
setClass("annotationByFeature", representation(members ="matrix",
annotation ="numeric",
precedence ="numeric",
num.annotation ="numeric",
num.precedence="numeric",
no.of.OlapFeat ="numeric",
perc.of.OlapFeat="numeric"))
#' An S4 class for overlap of target features with gene annotation
#'
#' This object is desgined to hold basic statistics and information about genomic feature overlaps. It extends \code{\link[methylKit]{annotationByFeature}} class.
#' The class contains an additional slot for data containing distance to nearest transcription start site (TSS).
#'
#' @section Slots:\describe{
#' \item{\code{members}}{a matrix showing overlap of target features with gene annotation features}
#' \item{\code{annotation}}{a named vector of percentages of overlap between feature and gene annotation}
#' \item{\code{precedence}}{a named vector of percentages of overlap between feature and gene annotation}
#' \item{\code{num.annotation}}{a named vector of numbers of overlap between feature and gene annotation}
#' \item{\code{num.precedence}}{a named vector of numbers of overlap between feature and gene annotation}
#' \item{\code{no.of.OlapFeat}}{a named vector of numbers of overlap between gene annotation and the feature}
#' \item{\code{perc.of.OlapFeat}}{a named vector of percentages of overlap between gene annotation and the feature }
#' \item{\code{dist.to.TSS}}{a data frame showing distances to TSS and gene/TSS names and strand}
#' }
#' @examples
#' data(methylKit)
#' gene.obj=read.transcript.features(system.file("extdata", "refseq.hg18.bed.txt", package = "methylKit"))
#'
#' # the following function returns an annotationByGenicParts object
#' ann=annotate.WithGenicParts(methylDiff.obj, gene.obj)
#'
#' @seealso see \code{\link[methylKit]{annotate.WithGenicParts}} on how to create this object, see following functions that operates on this object
#' \code{\link[methylKit]{getTargetAnnotationStats}}, \code{\link[methylKit]{getMembers}}, \code{\link[methylKit]{getAssociationWithTSS}},
#' \code{\link[methylKit]{getTargetAnnotationStats}}, code{\link[methylKit]{getFeatsWithTargetsStats}},\code{\link[methylKit]{plotTargetAnnotation}}
#'
#' @name annotationByGenicParts-class
#' @aliases annotationByGenicParts
#' @docType class
#' @rdname annotationByGenicParts-class
#' @export
setClass("annotationByGenicParts", representation(dist.to.TSS ="data.frame"),contains="annotationByFeature")
#new.obj=new("annotationByGenicParts",
# members=matrix(c(1,2,3,4)),annotation=c(1,2,0,3,4),
# precedence=c(a=1,b=2,c=0,d=3,e=4),
# num.annotation =c(1,2,0,3,4),
# num.precedence=c(1,2,0,3,4),
# no.of.OlapFeat =c(1,2,0,3,4),
# perc.of.OlapFeat=c(1,2,0,3,4),
# dist.to.TSS =c(1,2,0,3,4) )
#' @rdname show-methods
#' @aliases show,annotationByGenicParts-method
setMethod("show", "annotationByGenicParts", function(object) {
cat("summary of target set annotation with genic parts\n");cat(nrow(object@members));cat(" rows in target set\n--------------\n")
cat("--------------\n")
cat("percentage of target features overlapping with annotation :\n");print(object@annotation);cat("\n\n")
cat("percentage of target features overlapping with annotation (with promoter>exon>intron precedence) :\n"); print(object@precedence) ;cat("\n\n")
cat("percentage of annotation boundaries with feature overlap :\n");print(object@perc.of.OlapFeat);cat("\n\n")
cat("summary of distances to the nearest TSS :\n")
print(summary(abs(object@dist.to.TSS[,2])))
})
#' @rdname show-methods
#' @aliases show,annotationByFeature-method
setMethod("show", "annotationByFeature", function(object) {
cat("summary of target set annotation with feature annotation\n");cat(nrow(object@members));cat(" rows in target set\n--------------\n")
cat("--------------\n")
cat("percentage of target features overlapping with annotation :\n");print(object@annotation);cat("\n\n")
cat("percentage of annotation boundaries with feature overlap :\n");print(object@perc.of.OlapFeat);cat("\n\n")
})
#######################################
# SECTION 2: S3 FUNCTIONS
# these shouldn't be exported
#######################################
annotate.gr.WithGenicParts<-function(gr,prom,exon,intron,strand=F)
{
#require(GenomicRanges)
if( ! strand){strand(gr)="*"}
memb=data.frame(matrix(rep(0,length(gr)*3),ncol=3) ) ;colnames(memb)=c("prom","exon","intron")
memb[countOverlaps(gr,prom)>0,1]=1
memb[countOverlaps(gr,exon)>0,2]=1
memb[countOverlaps(gr,intron)>0,3]=1
annotation=c(promoter =100*sum(memb$prom>0)/nrow(memb) ,
exon =100*sum(memb$exon>0)/nrow(memb) ,
intron =100*sum(memb$intron>0)/nrow(memb) ,
intergenic=100*sum(rowSums(memb)==0)/nrow(memb) )
num.annotation=c( promoter =sum(memb$prom>0) ,
exon =sum(memb$exon>0) ,
intron =sum(memb$intron>0) ,
intergenic=sum(rowSums(memb)==0) )
precedence=c(promoter =100*sum(memb$prom>0)/nrow(memb) ,
exon =100*sum(memb$exon>0 & memb$prom==0)/nrow(memb) ,
intron =100*sum(memb$intron>0 & memb$exon==0 & memb$prom==0)/nrow(memb) ,
intergenic=100*sum(rowSums(memb)==0)/nrow(memb) )
num.precedence=c( promoter =sum(memb$prom>0) ,
exon =sum(memb$exon>0 & memb$prom==0),
intron =sum(memb$intron>0 & memb$exon==0 & memb$prom==0) ,
intergenic=sum(rowSums(memb)==0) )
numberOfOlapFeat=c(promoter=sum(countOverlaps(prom,gr)>0),
exon=sum(countOverlaps(exon,gr)>0),
intron=sum(countOverlaps(intron,gr)>0) )
percOfOlapFeat =100*numberOfOlapFeat/c(length(prom),length(exon),length(intron))
return(list(members=memb,annotation=annotation,precedence=precedence,num.annotation=num.annotation,
num.precedence=num.precedence,
numberOfOlapFeat=numberOfOlapFeat,percOfOlapFeat=percOfOlapFeat) )
}
distance2nearestFeature<-function(g.idh,tss)
{
#require(GenomicRanges)
elementMetadata(g.idh)=DataFrame(elementMetadata(g.idh),orig.row=1:length(g.idh))
id.col =ncol( elementMetadata(g.idh))+5 # get the row number column
tss.id.col=ncol( elementMetadata(g.idh))+5+7 # get the id column for tss in the merged data.frame below
strnd.col=ncol( elementMetadata(g.idh))+5+7-2 # get the id column for tss
met.tss = .nearest.2bed(g.idh, tss) # a merged data.frame is returned by this function
dist.col=ncol(met.tss)
met.tss[met.tss$end<met.tss$start.y & met.tss$strand.y=="+",dist.col] = -1* met.tss[met.tss$end<met.tss$start.y & met.tss$strand.y=="+",dist.col]
met.tss[met.tss$end>met.tss$start.y & met.tss$strand.y=="-",dist.col] = -1* met.tss[met.tss$end>met.tss$start.y & met.tss$strand.y=="-",dist.col]
res=met.tss[order(met.tss[,id.col]),c(id.col,dist.col,tss.id.col,strnd.col)]
names(res)=c("target.row","dist.to.feature" , "feature.name" , "feature.strand")
return(res)
}
# get the nearest features between a subject and query bed file
# get grange objects in bed zormat
# g.bed is GenomicRanges object, biatch!!!
# subject is also GenomicRanges object
.nearest.2bed<-function(g.bed,subject)
{
chrs1=IRanges::levels(seqnames(g.bed))
chrs2=IRanges::levels(seqnames(subject))
chrs=chrs1[chrs1 %in% chrs2]
res.df=NULL
for(i in 1:length(chrs))
{
# find the nearest for this range
ind =nearest(ranges(g.bed[seqnames(g.bed)==chrs[i],]),ranges(subject[seqnames(subject)==chrs[i],]))
sbdf1 =IRanges::as.data.frame(g.bed[seqnames(g.bed)==chrs[i],] )
sbdf2 =IRanges::as.data.frame(subject[seqnames(subject)==chrs[i],] )
sbdf2 =sbdf2[ind,]
names(sbdf2)=paste(names(sbdf2),".y",sep="")
temp =cbind(sbdf1,sbdf2)
res.df=rbind(res.df,temp)
}
res.dist=rep(0,nrow(res.df))
dist1=abs(res.df$start-res.df$end.y)+1
dist2=abs(res.df$start.y-res.df$end)+1
totti=res.df$width + res.df$width.y #total length
res.dist[pmax(dist1,dist2)>totti]=pmin(dist1,dist2)[pmax(dist1, dist2) > totti]
res.df=cbind(res.df,dist=res.dist)
return(res.df)
}
#######################################
# SECTION 2: S4 FUNCTIONS
#######################################
#' Annotate given object with gene annotation
#'
#' The function annotates given genomic feature or methylKit object with gene annotation such as promoter, exon, intron & intergenic.
#' It also gets the distance to nearest TSS (transcription start site) for each genomic feature or methylation event.
#'
#'
#' @param target a \code{\link[methylKit]{methylDiff}} or a \code{\link[GenomicRanges]{GRanges}} object storing chromosome locations to be annotated
#' @param GRangesList.obj A \code{\link[GenomicRanges]{GRangesList}} object containing GRanges object for promoter,exons,introns and TSSes, or simply output of \code{\link[methylKit]{read.transcript.features}} function
#' @param strand If set to TRUE, annotation features and target features will be overlapped based on strand (def:FALSE)
#' @usage annotate.WithGenicParts(target,GRangesList.obj,strand=FALSE)
#' @return \code{\link[methylKit]{annotationByGenicParts}} object
#' @examples
#' data(methylKit)
#' gene.obj=read.transcript.features(system.file("extdata", "refseq.hg18.bed.txt", package = "methylKit"))
#' annotate.WithGenicParts(methylDiff.obj, gene.obj)
#' @seealso
#' \code{\link[methylKit]{getMembers}}, \code{\link[methylKit]{getTargetAnnotationStats}},
#' \code{\link[methylKit]{getFeatsWithTargetsStats}}, \code{\link[methylKit]{getAssociationWithTSS}},
#' \code{\link[methylKit]{plotTargetAnnotation}}
#' @export
#' @docType methods
#' @rdname annotate.WithGenicParts-methods
setGeneric("annotate.WithGenicParts", function(target,GRangesList.obj,strand=FALSE) standardGeneric("annotate.WithGenicParts") )
#' @aliases annotate.WithGenicParts,GRanges,GRangesList-method
#' @rdname annotate.WithGenicParts-methods
setMethod("annotate.WithGenicParts", signature(target= "GRanges",GRangesList.obj="GRangesList"),
function(target,GRangesList.obj,strand){
a.list =annotate.gr.WithGenicParts(target,GRangesList.obj$promoters,GRangesList.obj$exons,GRangesList.obj$introns,strand=strand)
dist2TSS =distance2nearestFeature(target,GRangesList.obj$TSSes)
new("annotationByGenicParts",
members =as.matrix(a.list$members),
annotation =a.list$annotation,
precedence =a.list$precedence,
num.annotation =a.list$num.annotation,
num.precedence=a.list$num.precedence,
no.of.OlapFeat =a.list$numberOfOlapFeat,
perc.of.OlapFeat=a.list$percOfOlapFeat,
dist.to.TSS = dist2TSS )
})
#' @aliases annotate.WithGenicParts,methylDiff,GRangesList-method
#' @rdname annotate.WithGenicParts-methods
setMethod("annotate.WithGenicParts", signature(target = "methylDiff",GRangesList.obj="GRangesList"),
function(target,GRangesList.obj,strand){
gr=as(target,"GRanges")
annotate.WithGenicParts(gr,GRangesList.obj,strand)
})
#' Annotate an object with two sets of genomic features
#'
#' The function annotates given genomic feature or methylKit object with two sets of annotation.
#' It is primarily useful when annotating objects with CpG islands and shores.
#'
#'
#' @param target a \code{\link[methylKit]{methylDiff}} or a \code{\link[GenomicRanges]{GRanges}} object storing chromosome locations to be annotated
#' @param feature a \code{\link[GenomicRanges]{GRanges}} object storing chromosome locations of a feature (can be CpG islands, ChIP-seq peaks, etc)
#' @param flank a \code{\link[GenomicRanges]{GRanges}} object storing chromosome locations of the flanks of the feature
#' @param feature.name string for the name of the feature
#' @param flank.name string for the name of the flanks
#' @param strand If set to TRUE, annotation features and target features will be overlapped based on strand (def:FALSE)
#' @usage annotate.WithFeature.Flank(target,feature,flank,feature.name="feat",flank.name="flank",strand=FALSE)
#' @return returns an \code{\link[methylKit]{annotationByFeature}} object
#' @examples
#' data(methylKit)
#' cpg.obj=read.feature.flank(system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit"),feature.flank.name=c("CpGi","shores"))
#'
#' annotate.WithFeature.Flank(methylDiff.obj,cpg.obj$CpGi,cpg.obj$shores,feature.name="CpGi",flank.name="Shores")
#'
#' @seealso
#' \code{\link[methylKit]{getMembers}}, \code{\link[methylKit]{getTargetAnnotationStats}},
#' \code{\link[methylKit]{getFeatsWithTargetsStats}}, \code{\link[methylKit]{plotTargetAnnotation}}
#'
#' @export
#'
#' @docType methods
#' @rdname annotate.WithFeature.Flank-methods
setGeneric("annotate.WithFeature.Flank", function(target,feature,flank,feature.name="feat",flank.name="flank",strand=FALSE) standardGeneric("annotate.WithFeature.Flank") )
#' @aliases annotate.WithFeature.Flank,GRanges,GRanges,GRanges-method
#' @rdname annotate.WithFeature.Flank-methods
setMethod( "annotate.WithFeature.Flank", signature(target = "GRanges",feature="GRanges",flank="GRanges"),
function(target, feature, flank,feature.name,flank.name,strand){
if( ! strand){strand(target)="*"}
memb=data.frame(matrix(rep(0,length(target)*2),ncol=2) ) ;colnames(memb)=c(feature.name,flank.name)
memb[countOverlaps(target,feature)>0,1]=1
memb[countOverlaps(target,flank)>0,2]=1
annotation=c(100*sum(memb[,1]>0)/nrow(memb) ,
100*sum(memb[,2]>0)/nrow(memb) ,
100*sum(rowSums(memb)==0)/nrow(memb) )
names(annotation)=c(feature.name,flank.name,"other")
num.annotation=c( sum(memb[,1]>0) , sum(memb[,2]>0) , sum(rowSums(memb)==0) )
names(num.annotation)=c(feature.name,flank.name,"other")
precedence=c(100*sum(memb[,1]>0)/nrow(memb) ,
100*sum(memb[,2]>0 & memb[,1]==0)/nrow(memb) ,
100*sum(rowSums(memb)==0)/nrow(memb) )
names(precedence)=c(feature.name,flank.name,"other")
num.precedence=c( sum(memb[,1]>0) , sum(memb[,2]>0 & memb[,1]==0) , sum(rowSums(memb)==0) )
names(num.precedence)=c(feature.name,flank.name,"other")
numberOfOlapFeat=c(sum(countOverlaps(feature,target)>0),
sum(countOverlaps(flank,target)>0) )
names(numberOfOlapFeat)=c(feature.name,flank.name)
percOfOlapFeat =100*numberOfOlapFeat/c(length(feature),length(flank) )
#return(list(members=memb,annotation=annotation,precedence=precedence,
# numberOfOlapFeat=numberOfOlapFeat,percOfOlapFeat=percOfOlapFeat) )
new("annotationByFeature",
members =as.matrix(memb),
annotation =annotation,
precedence =precedence,
num.annotation =num.annotation,
num.precedence=num.precedence,
no.of.OlapFeat =numberOfOlapFeat,
perc.of.OlapFeat=percOfOlapFeat)
})
#' @aliases annotate.WithFeature.Flank,methylDiff,GRanges,GRanges-method
#' @rdname annotate.WithFeature.Flank-methods
setMethod("annotate.WithFeature.Flank", signature(target= "methylDiff",feature="GRanges",flank="GRanges"),
function(target, feature, flank,feature.name,flank.name,strand){
gr=as(target,"GRanges")
annotate.WithFeature.Flank(gr,feature, flank,feature.name,flank.name,strand)
})
#' Annotate object with a set of genomic features
#'
#' The function annotates given genomic feature or methylKit object with a set of annotation.
#' It is primarily useful when annotating objects with simple genomic features, such as enhancer locations.
#
#' @param target a \code{\link[methylKit]{methylDiff}} or a \code{\link[GenomicRanges]{GRanges}} object storing chromosome locations to be annotated
#' @param feature a \code{\link[GenomicRanges]{GRanges}} object storing chromosome locations of a feature (can be CpG islands, ChIP-seq peaks, etc)
#' @param strand If set to TRUE, annotation features and target features will be overlapped based on strand (def:FALSE)
#' @param extend specifiying a positive value will extend the feature on both sides as much as \code{extend}
#' @param feature.name name of the annotation feature. For example: H3K4me1,CpGisland etc.
#' @usage annotate.WithFeature(target,feature,strand=FALSE,extend=0,feature.name="feat1")
#' @examples
#' data(methylKit)
#' cpg.gr=read.bed(system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit"),remove.unsual=TRUE)
#'
#' annotate.WithFeature(methylDiff.obj,cpg.gr,strand=FALSE,extend=0,feature.name="CpGi")
#'
#' @return returns an \code{\link[methylKit]{annotationByFeature}} object
#'
#' @seealso
#' \code{\link[methylKit]{getMembers}}, \code{\link[methylKit]{getTargetAnnotationStats}},
#' \code{\link[methylKit]{getFeatsWithTargetsStats}}, \code{\link[methylKit]{plotTargetAnnotation}}
#'
#' @export
#' @docType methods
#' @rdname annotate.WithFeature-methods
setGeneric("annotate.WithFeature", function(target,feature,strand=FALSE,extend=0,feature.name="feat1") standardGeneric("annotate.WithFeature") )
#' @aliases annotate.WithFeature,GRanges,GRanges-method
#' @rdname annotate.WithFeature-methods
setMethod("annotate.WithFeature", signature(target = "GRanges",feature="GRanges"),
function(target, feature, strand,extend,feature.name){
if( ! strand){strand(target)="*"}
memb=rep(0,length(target))
if(extend>0)
{
start(feature)=start(feature)-extend
end(feature) =end(feature)+extend
}
memb[countOverlaps(target,feature)>0]=1
annotation=c(100*sum(memb>0)/length(memb) ,
100*sum((memb)==0)/length(memb) )
num.annotation=c(sum(memb>0) ,sum((memb)==0) )
names(annotation)=c(feature.name,"other")
numberOfOlapFeat=c(sum(countOverlaps(feature,target)>0))
percOfOlapFeat =100*numberOfOlapFeat/c(length(feature))
new("annotationByFeature",
members =as.matrix(memb),
annotation =annotation,
precedence =annotation,
num.annotation =num.annotation,
num.precedence=num.annotation,
no.of.OlapFeat =numberOfOlapFeat,
perc.of.OlapFeat=percOfOlapFeat)
})
#' @aliases annotate.WithFeature,methylDiff,GRanges-method
#' @rdname annotate.WithFeature-methods
setMethod("annotate.WithFeature", signature(target = "methylDiff",feature="GRanges"),
function(target, feature, strand,extend,feature.name){
gr=as(target,"GRanges")
annotate.WithFeature(gr, feature, strand,extend,feature.name)
})
# ACCESSOR FUNCTIONS
#annotationByFeature
#annotationBygenicparts
#' Get the membership slot of annotationByFeature
#'
#' Membership slot defines the overlap of target features with annotation features as a matrix.
#'
#'
#' @param x an \code{\link[methylKit]{annotationByFeature}} object
#'
#' @return a matrix showing overlap of target features with annotation features. 1 for overlap, 0 for non-overlap.
#' Each row in the matrix corresponds to a genomic feature that is annoted by one of the following functions:
#' \code{\link[methylKit]{annotate.WithFeature}},
#' \code{\link[methylKit]{annotate.WithFeature.Flank}},
#' \code{\link[methylKit]{annotate.WithGenicParts}}
#'
#' @usage getMembers(x)
#' @examples
#' data(methylKit)
#' cpg.obj=read.feature.flank(system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit"),feature.flank.name=c("CpGi","shores"))
#'
#' ann=annotate.WithFeature.Flank(methylDiff.obj,cpg.obj$CpGi,cpg.obj$shores,feature.name="CpGi",flank.name="Shores")
#' mat=getMembers(ann)
#' head(mat)
#'
#' @export
#' @docType methods
#' @rdname getMembers-methods
setGeneric("getMembers", def=function(x) standardGeneric("getMembers"))
#' @aliases getMembers,annotationByFeature-method
#' @rdname getMembers-methods
setMethod("getMembers", signature(x = "annotationByFeature"),
function(x){
return(x@members)
})
#' Get the percentage of target features overlapping with annotation from annotationByFeature
#'
#' This function retrieves percentage/number of target features overlapping with annotation
#'
#' @param x an \code{\link[methylKit]{annotationByFeature}} or an \code{\link[methylKit]{annotationByGenicParts}} object
#' @param percentage TRUE|FALSE. If TRUE percentage of target features will be returned. If FALSE, number of target features will be returned
#' @param precedence TRUE|FALSE. If TRUE there will be a hierachy of annotation features when calculating numbers (with promoter>exon>intron precedence)
#' That means if a feature overlaps with a promoter it will be counted as promoter overlapping only, or if it is overlapping with a an exon but not a promoter,
#' it will be counted as exon overlapping only whether or not it overlaps with an intron.
#'
#' @usage getTargetAnnotationStats(x,percentage=TRUE,precedence=TRUE)
#' @examples
#' data(methylKit)
#' cpg.obj=read.feature.flank(system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit"),feature.flank.name=c("CpGi","shores"))
#'
#' ann=annotate.WithFeature.Flank(methylDiff.obj,cpg.obj$CpGi,cpg.obj$shores,feature.name="CpGi",flank.name="Shores")
#' getTargetAnnotationStats(ann,percentage=TRUE,precedence=TRUE)
#'
#' @return a \code{vector} of percentages or counts showing quantity of target features overlapping with annotation
#'
#' @export
#' @docType methods
#' @rdname getTargetAnnotationStats-methods
setGeneric("getTargetAnnotationStats", def=function(x,percentage=TRUE,precedence=TRUE) standardGeneric("getTargetAnnotationStats"))
#' @rdname getTargetAnnotationStats-methods
#' @aliases getTargetAnnotationStats,annotationByFeature-method
setMethod("getTargetAnnotationStats", signature(x = "annotationByFeature"),
function(x,percentage ,precedence ){
if(percentage){
if(precedence){return(x@precedence)
}else{return(x@annotation)}
}else{
if(precedence){return(x@num.precedence)
}else{return(x@num.annotation)}
}
})
#' Get the percentage/count of annotation features overlapping with target features
#'
#' This function retrieves percentage/number of annotation features overlapping with targets.
#' For example, if \code{annotationByFeature} object is containing statistics of differentially methylated
#' regions overlapping with gene annotation. This function will return number/percentage of introns,exons and promoters
#' overlapping with differentially methylated regions.
#'
#' @param x an \code{\link[methylKit]{annotationByFeature}} or an \code{\link[methylKit]{annotationByGenicParts}} object
#' @param percentage TRUE|FALSE. If TRUE percentage of annotation features will be returned. If FALSE, number of annotation features will be returned
#'
#' @return a vector of percentages or counts showing quantity of annotation features overlapping with target features
#' @usage getFeatsWithTargetsStats(x,percentage=TRUE)
#' @examples
#' data(methylKit)
#' cpg.obj=read.feature.flank(system.file("extdata", "cpgi.hg18.bed.txt", package = "methylKit"),feature.flank.name=c("CpGi","shores"))
#'
#' ann=annotate.WithFeature.Flank(methylDiff.obj,cpg.obj$CpGi,cpg.obj$shores,feature.name="CpGi",flank.name="Shores")
#' getFeatsWithTargetsStats(ann,percentage=TRUE)
#'
#' @export
#' @docType methods
#' @rdname getFeatsWithTargetsStats-methods
setGeneric("getFeatsWithTargetsStats", def=function(x,percentage=TRUE) standardGeneric("getFeatsWithTargetsStats"))
#' @rdname getFeatsWithTargetsStats-methods
#' @aliases getFeatsWithTargetsStats,annotationByFeature-method
setMethod("getFeatsWithTargetsStats", signature(x = "annotationByFeature" ),
function( x,percentage ){
if(percentage){
return(x@perc.of.OlapFeat)
}else{
return(x@no.of.OlapFeat)
}
})
#' Get distance to nearest TSS and gene id from annotationByGenicParts
#'
#' This accessor function gets the nearest TSS, its distance to target feature,strand and name of TSS/gene from annotationByGenicParts object
#' @param x an \code{\link[methylKit]{annotationByGenicParts}} object
#'
#' @return a \code{\link{data.frame}} containing row number of the target features,distance of target to nearest TSS, TSS/Gene name, TSS strand
#' @usage getAssociationWithTSS(x)
#' @examples
#' data(methylKit)
#' gene.obj=read.transcript.features(system.file("extdata", "refseq.hg18.bed.txt", package = "methylKit"))
#' ann=annotate.WithGenicParts(methylDiff.obj, gene.obj)
#' df=getAssociationWithTSS(ann)
#' head(df)
#'
#' @export
#' @docType methods
#' @rdname annotationByGenicParts-methods
setGeneric("getAssociationWithTSS", def=function(x) standardGeneric("getAssociationWithTSS"))
#' @rdname annotationByGenicParts-methods
#' @docType methods
#' @aliases getAssociationWithTSS,annotationByGenicParts-method
setMethod("getAssociationWithTSS", signature(x = "annotationByGenicParts"),
function(x){
return(x@dist.to.TSS)
})
# PLOTTING FUNCTIONS
#' Plot annotation categories from annotationByGenicParts or annotationByFeature
#'
#' This function plots a pie or bar chart for showing percentages of targets annotated by genic parts or other query features.
#' @param x an \code{\link[methylKit]{annotationByFeature}} or an \code{\link[methylKit]{annotationByGenicParts}} object
#' @param precedence TRUE|FALSE. If TRUE there will be a hierachy of annotation features when calculating numbers (with promoter>exon>intron precedence).
#' This option is only valid when x is a \code{annotationByGenicParts} object
#' @param col a vector of colors for piechart or the bar plot
#' @param ... graphical parameters to be passed to \code{pie} or \code{barplot} functions
#'
#' @usage plotTargetAnnotation(x,precedence=TRUE,col=rainbow(length(x@@annotation)), ...)
#' @examples
#' data(methylKit)
#' gene.obj=read.transcript.features(system.file("extdata", "refseq.hg18.bed.txt", package = "methylKit"))
#' ann=annotate.WithGenicParts(methylDiff.obj, gene.obj)
#' plotTargetAnnotation(ann,precedence=FALSE)
#'
#'
#'
#' @return plots a piechart or a barplot for percentage of the target features overlapping with annotation
#'
#' @export
#' @docType methods
#' @rdname plotTargetAnnotation-methods
setGeneric("plotTargetAnnotation", def=function(x,precedence=TRUE,col=rainbow(length(x@annotation)),...) standardGeneric("plotTargetAnnotation"))
#' @rdname plotTargetAnnotation-methods
#' @docType methods
#' @aliases plotTargetAnnotation,annotationByFeature-method
setMethod("plotTargetAnnotation", signature(x = "annotationByFeature"),
function(x,precedence,col,...){
props=getTargetAnnotationStats(x,percentage=TRUE,precedence=precedence)
if(precedence | ( is(x,"annotationByFeature") & !is(x,"annotationByGenicParts")) ){
slice.names=names(props)
#names(props)=paste(names(props),paste(round(props),"%"),sep=" ")
names(props)=paste( paste(round(props),"%"),sep=" ")
pie(props,cex=0.9,col=col,...)
legend("topright",legend=slice.names,fill=col )
}
else{
mids=barplot(props,col=col,...)
text(mids,y=props,labels=paste(paste(round(props),"%",sep="")),pos=1)
}
})
# SECTION 3:
# annotate ML objects with annotations read-in and converted to GRanges objects
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.