#' Load annotation datasets
#'
#' Load dataset files containing genomic regions annotation information from
#' hard drive. `loadAnnotations` calls `readQuery` and `readSubject`
#' to read in genomic regions as `GenomicRanges` objects stored as .RDS / .rds
#' files. Each region needs chromosome, start, end and strand information.
#' A unique ID and a name column must be present in the `GenomicRanges` object
#' metadata. OGRE searches for the query file in your query folder and any
#' number of subject files in your subjects folder. Alternatively, .gff (v2&v3) files
#' in the query or subject folder with attribute columns containing "ID" and "name"
#' information are read in by OGRE.
#' @param OGREDataSet A OGREDataSet.
#' @return A OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' @export
loadAnnotations <- function(OGREDataSet){
OGREDataSet <- readQuery(OGREDataSet)
OGREDataSet <- readSubject(OGREDataSet)
return(OGREDataSet)
}
#' Read query dataset
#'
#' [readQuery()] scanns `queryFolder` for a `GRanges` object stored as .RDS/.rds
#' or .gff .GFF file and attaches it to the OGREDataSet.
#' @importFrom rtracklayer import.gff
#' @param OGREDataSet A OGREDataSet.
#' @return A OGREDataSet.
#' @keywords internal
readQuery=function(OGREDataSet){
assertthat::assert_that(length(list.files(metadata(OGREDataSet)$queryFolder))>0,
msg="Query folder is empty!")
message("Reading query dataset... ")
if(is.null(metadata(OGREDataSet)$queryFolder)){ #if no folder supplied, check default folder
}else{ #read queryFolder
queryPath <- list.files(metadata(OGREDataSet)$queryFolder,full.names = TRUE)
queryName <- tools::file_path_sans_ext(list.files(metadata(OGREDataSet)$queryFolder))
if(grepl("rds",queryPath,ignore.case=TRUE)){
OGREDataSet[[queryName]]<- readRDS(queryPath)
}
else if(grepl("gff",queryPath,ignore.case=TRUE)){ #read in .gff
OGREDataSet[[queryName]] <- rtracklayer::import.gff(queryPath,genome="hg19")%>%
GenomeInfoDb::keepStandardChromosomes("Homo_sapiens",pruning.mode="coarse")
GenomeInfoDb::seqlevelsStyle(OGREDataSet[[queryName]]) <- "Ensembl"
GenomeInfoDb::seqlevels(OGREDataSet[[queryName]])<-
GenomeInfoDb::seqlevels(GenomeInfoDb::Seqinfo(
GenomeInfoDb::extractSeqlevels("Homo_sapiens", "Ensembl"),
genome = "hg19"))
#getting seqlength(Chromosomes) for coverage calculation
OGREDS<-OGREDataSetFromDir(file.path(system.file('extdata', package = 'OGRE'),"query"),
file.path(system.file('extdata', package = 'OGRE'),"subject"))
OGREDS<-loadAnnotations(OGREDS)
GenomeInfoDb::seqlevels(OGREDataSet)<-GenomeInfoDb::sortSeqlevels(
GenomeInfoDb::seqlevels(OGREDataSet))
GenomeInfoDb::seqlengths(OGREDataSet[[queryName]])<-
GenomeInfoDb::seqlengths(OGREDS[[1]])
}
}
assertthat::assert_that(c("ID")%in%names(mcols(OGREDataSet[[queryName]])),
msg="Query must contain ID column.")
assertthat::assert_that(!any(duplicated(OGREDataSet[[queryName]]$ID)),
msg="ID column must be unique.")
assertthat::assert_that(length(OGREDataSet[[queryName]])!=0,
msg=paste0("Dataset has no ranges: ",queryName))
return(OGREDataSet)
}
#' Read subject datasets
#'
#' [readSubject()] scanns `SubjectFolder` for `GRanges` objects stored as .RDS/.rds
#' or .gff .GFF files and attaches them to the OGREDataSet.
#' @param OGREDataSet A OGREDataSet.
#' @return A OGREDataSet.
#' @keywords internal
readSubject=function(OGREDataSet){
assertthat::assert_that(length(list.files(metadata(OGREDataSet)$subjectFolder))>0,
msg="Subject folder is empty!")
message("Reading subject datasets... ")
#read SubjectFolder
subjectPath <- list.files(metadata(OGREDataSet)$subjectFolder,full.names = TRUE)
subjectName <- tools::file_path_sans_ext(list.files(metadata(OGREDataSet)$subjectFolder))
OGREDataSet <- c(OGREDataSet,mapply(function(x,y){
if(grepl("rds",y,ignore.case=TRUE)){
tmp=readRDS(y)
}
else if(grepl("gff",y,ignore.case=TRUE)){ #read in .gff
tmp <- rtracklayer::import.gff(y)%>%
GenomeInfoDb::keepStandardChromosomes("Homo_sapiens",pruning.mode="coarse")
GenomeInfoDb::seqlevelsStyle(tmp) <- "Ensembl"
GenomeInfoDb::seqlevels(tmp)<-
GenomeInfoDb::seqlevels(GenomeInfoDb::Seqinfo(
GenomeInfoDb::extractSeqlevels("Homo_sapiens", "Ensembl"),
genome = "hg19"))
OGREDS=OGREDataSetFromDir(file.path(system.file('extdata', package = 'OGRE'),"query"),
file.path(system.file('extdata', package = 'OGRE'),"subject"))
#getting seqlength(Chromosomes) for coverage calculation
OGREDS<-loadAnnotations(OGREDS)
GenomeInfoDb::seqlevels(tmp)<-
GenomeInfoDb::sortSeqlevels(GenomeInfoDb::seqlevels(tmp))
GenomeInfoDb::seqlengths(tmp)<-GenomeInfoDb::seqlengths(OGREDS[[1]])}
assertthat::assert_that(c("ID")%in%names(mcols(tmp)),msg="Subject must contain ID column.")
assertthat::assert_that(!any(duplicated(tmp$ID)),msg="ID column must be unique.")
assertthat::assert_that(length(tmp)!=0,msg=paste0("Dataset has no ranges: ",x))
tmp
},x=subjectName,y=subjectPath))
return(OGREDataSet)
}
#' Read dataset(s) from folder
#'
#' [readDataSetFromFolder()] scanns queryFolder and subjectFolder for either
#' .RDS/.rds or .CSV/.csv files and adds them to a OGREDataSet. Each region
#' needs chromosome, start, end and strand information. (tabular file columns
#' must be named accordingly). A unique ID and a name column must be present in
#' the `GenomicRanges` object's metatdata and tabular file.
#' @param OGREDataSet A OGREDataSet.
#' @param type \code{character} and one of query/subject.
#' @return A OGREDataSet.
#' @export
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- readDataSetFromFolder(myOGRE,type="query")
#' myOGRE <- readDataSetFromFolder(myOGRE,type="subject")
readDataSetFromFolder=function(OGREDataSet,type){
message("Reading dataset(s)... ")
if(type=="query"){
files <- list.files(metadata(OGREDataSet)$queryFolder)
assertthat::assert_that(length(files)>0,msg="Folder is empty!")
assertthat::assert_that(grep("rds|RDS|csv|CSV",files)>0,msg="Wrong file type!")
queryPath <- list.files(metadata(OGREDataSet)$queryFolder,full.names = TRUE)
queryName <- tools::file_path_sans_ext(list.files(metadata(OGREDataSet)$queryFolder))
if(grep("rds|RDS",files)>0){
OGREDataSet[[queryName]]<- readRDS(queryPath)}
else{
OGREDataSet[[queryName]]<- makeGRangesFromDataFrame(fread(queryPath),
keep.extra.columns = TRUE)
}
assertthat::assert_that(c("ID")%in%names(mcols(OGREDataSet[[queryName]])),
msg="Query must contain ID column.")
assertthat::assert_that(!any(duplicated(OGREDataSet[[queryName]]$ID)),
msg="ID column must be unique.")
assertthat::assert_that(length(OGREDataSet[[queryName]])!=0,
msg=paste0("Dataset has no ranges: ",queryName))
}
else if(type=="subject"){
files <- list.files(metadata(OGREDataSet)$subjectFolder)
assertthat::assert_that(length(files)>0,msg="Folder is empty!")
assertthat::assert_that(length(grep("rds|RDS|csv|CSV",files))==length(files),
msg="Wrong file type!")
subjectPath <- list.files(metadata(OGREDataSet)$subjectFolder,full.names = TRUE)
subjectName <- tools::file_path_sans_ext(list.files(metadata(OGREDataSet)$subjectFolder))
if(length(grep("rds|RDS",files))==length(files)){
OGREDataSet <- c(OGREDataSet,mapply(function(x,y){
tmp=readRDS(y)
assertthat::assert_that(c("ID")%in%names(mcols(tmp)),msg="Subject must contain ID column.")
assertthat::assert_that(!any(duplicated(tmp$ID)),msg="ID column must be unique.")
assertthat::assert_that(length(tmp)!=0,msg=paste0("Dataset has no ranges: ",x))
tmp
},x=subjectName,y=subjectPath))
}else{
OGREDataSet <- c(OGREDataSet,mapply(function(x,y){
tmp <-makeGRangesFromDataFrame(fread(y),keep.extra.columns = TRUE)
assertthat::assert_that(c("ID")%in%names(mcols(tmp)),msg="Subject must contain ID column.")
assertthat::assert_that(!any(duplicated(tmp$ID)),msg="ID column must be unique.")
assertthat::assert_that(length(tmp)!=0,msg=paste0("Dataset has no ranges: ",x))
tmp
},x=subjectName,y=subjectPath))
}
}
return(OGREDataSet)
}
#' Find overlaps
#'
#' Finds all overlaps between query and subject(s) and stores each hit (overlap)
#' in data table `detailDT`. Data table `sumDT` shows all
#' overlaps of a certain subject type for all query elements. By default also
#' partially overlaps are reported. Overlap calculation is done using
#' \code{GenomicRanges::findOverlaps()} implementation.
#'
#' @param OGREDataSet A OGREDataSet.
#' @param selfHits \code{logical} if FALSE(default) ignores self hits of
#' identical regions (with identical IDs) within datasets.
#' @param ignoreStrand \code{logical} If TRUE (default) two regions with
#' overlapping locations on different strands are considered an overlap hit.
#' @param ... Additional parameters, see \code{GenomicRanges::findOverlaps()}
#' @return OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' myOGRE <- fOverlaps(myOGRE)
#' @export
fOverlaps <- function(OGREDataSet,selfHits=FALSE,ignoreStrand=TRUE,...){
queryID <- subjID <- subjType <- . <- NULL
detailDT <- data.table() #data table to store all overlaps for query vs all subjects
metadata(OGREDataSet)$subjectNames <- names(
OGREDataSet[seq(2,length(OGREDataSet))])
for(subj in metadata(OGREDataSet)$subjectNames){
ol <- findOverlaps(OGREDataSet[[1]],OGREDataSet[[subj]],ignore.strand=ignoreStrand,...)
if(length(ol)==0){ #if no overlap hits skip to next subj
message("No overlap found for: ",subj)
next
}
else{
overlapWidth <- width(pintersect(OGREDataSet[[1]][queryHits(ol)],
OGREDataSet[[subj]][subjectHits(ol)],ignore.strand=ignoreStrand))
#overlap ratio with reference to query.(overLen=2,queryLen=6, ratio=0.33)
#https://support.bioconductor.org/p/72656/
overlapRatio <- overlapWidth / width(OGREDataSet[[1]][queryHits(ol)])
ol <- data.table(q=queryHits(ol),s=mcols(OGREDataSet[[subj]])[subjectHits(ol),"ID"])
detailDT <- rbind(detailDT,data.table(
queryID=mcols(OGREDataSet[[1]])[ol$q,"ID"],
queryType=names(OGREDataSet)[1],
subjID=ol$s,
subjType=subj,
queryChr=as.character(seqnames(OGREDataSet[[names(OGREDataSet)[1]]]))[ol$q],
queryStart=start(OGREDataSet[[names(OGREDataSet)[1]]])[ol$q],
queryEnd=end(OGREDataSet[[names(OGREDataSet)[1]]])[ol$q],
queryStrand=as.character(strand(OGREDataSet[[names(OGREDataSet)[1]]]))[ol$q],
subjChr=as.character(seqnames(OGREDataSet[[subj]]))
[match(ol$s,mcols(OGREDataSet[[subj]])[,"ID"])],
subjStart=start(OGREDataSet[[subj]])
[match(ol$s,mcols(OGREDataSet[[subj]])[,"ID"])],
subjEnd=end(OGREDataSet[[subj]])
[match(ol$s,mcols(OGREDataSet[[subj]])[,"ID"])],
subjStrand=as.character(strand(OGREDataSet[[subj]]))
[match(ol$s,mcols(OGREDataSet[[subj]])[,"ID"])],
overlapWidth=overlapWidth,
overlapRatio=overlapRatio
))
if(isFALSE(selfHits)){ #remove self hits
detailDT <- detailDT[!(queryID==subjID)]
}
}
}
metadata(OGREDataSet)$sumDT <- detailDT[,
list(subjID=paste(subjID,collapse = " ")),by=.(queryID,subjType)]
assert_that(nrow(metadata(OGREDataSet)$sumDT)!=0,
msg="No overlaps found for any subjects!")
metadata(OGREDataSet)$detailDT <- detailDT
metadata(OGREDataSet)$quickDT <- detailDT[,c("queryID","subjType")]
metadata(OGREDataSet)$quickDT <- data.table::dcast(data=metadata(OGREDataSet)$quickDT,
queryID ~ subjType,value.var = "subjType",fun=length)
#add and set query elements without hits to 0
withoutHits <- setdiff(mcols(OGREDataSet[[1]])$ID,metadata(OGREDataSet)$quickDT$queryID)
metadata(OGREDataSet)$quickDT <- rbind(metadata(OGREDataSet)$quickDT,
data.table(queryID=withoutHits),fill=TRUE)
metadata(OGREDataSet)$quickDT[is.na(metadata(OGREDataSet)$quickDT)] <- 0
#set order to match other tables
metadata(OGREDataSet)$quickDT <- metadata(OGREDataSet)$quickDT[
match(mcols(OGREDataSet[[1]])$ID,metadata(OGREDataSet)$quickDT$queryID),]
return(OGREDataSet)
}
#' Generate summary plot
#'
#' `sumPlot()` calculates key numbers i.e.
#' (total number of overlaps, number of overlaps per subject...) to help with an
#' exploratory data evaluation and displays them in an informative barplot.
#' @importFrom AnnotationHub query
#' @param OGREDataSet A OGREDataSet.
#' @return OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' myOGRE <- fOverlaps(myOGRE)
#' myOGRE <- sumPlot(myOGRE)
#' @export
sumPlot <- function(OGREDataSet){
assert_that(!is.null(metadata(OGREDataSet)$sumDT),
msg="No data to generate sumPlot!")
xtext <- lab <-queryLog<- NULL
#summary barplot
query_N <- length(unique(mcols(OGREDataSet[[1]])$ID))
query_ol <- length(unique(metadata(OGREDataSet)$sumDT$queryID))
query_No_ol <- query_N-query_ol
query_full <- sum(table(metadata(OGREDataSet)$sumDT$queryID)>=length(OGREDataSet)-1)
dtbarplot<-data.table(x=c("query_N","query_w_minOne_overlap",
"genes_w_overlap_allSubjTypes"),query=c(query_N,query_ol,query_full))
dtbarplot$x<-factor(x = dtbarplot$x,levels = c("query_N","query_w_minOne_overlap",
"genes_w_overlap_allSubjTypes"))
dtbarplot$lab<-c(paste0("Total number of submitted queries: ",query_N," (100%)"),
paste0("Queries with >=1 subject overlaps: ",query_ol," (",
round(query_ol/query_N*100),"%)"),
paste0("Queries with overlaps in all subject types: ",
query_full," (",round(query_full/query_N*100),"%)"))
dtbarplot$col<-"steelblue2"
temp<-vapply(metadata(OGREDataSet)$subjectNames,function(x){
length(unique(metadata(OGREDataSet)$detailDT[
metadata(OGREDataSet)$detailDT$subjType==x,][["queryID"]]))},integer(1))
temp<-data.table(x=names(temp),
Subjects=temp,
lab=paste0("Queries with ",names(temp),": ",temp," (",round(temp/query_N*100),"%)"),
col="steelblue3")
subjPerQuery<-round(vapply(metadata(OGREDataSet)$subjectNames,function(x){
sum(metadata(OGREDataSet)$detailDT$subjType==x)},numeric(1))/query_N,digits=1)
subjPerQuery<-data.table(x=names(subjPerQuery),Subjects=subjPerQuery,
lab=paste0("Average ",names(subjPerQuery)," per query: ",subjPerQuery),col="steelblue4")
dtbarplotDetailed<-rbind(dtbarplot,temp,subjPerQuery, use.names=FALSE)
dtbarplotDetailed$sequence<-seq(dim(dtbarplotDetailed)[1],1)
dtbarplotDetailed$xtext<-dtbarplotDetailed$sequence+0.55
metadata(OGREDataSet)$barplot_summary <-
ggplot(dtbarplotDetailed, aes(x = sequence, y =query,fill=col)) +
geom_bar(stat = "identity", width = 0.3) +
scale_fill_manual(guide="none",values = c("steelblue2" = "steelblue2",
"steelblue3" = "steelblue3",
"steelblue4" = "steelblue4"))+
coord_flip() +labs(x = "", y = "N") +theme_bw() + theme_classic() +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
geom_text(aes(x = xtext, y = 0, label = lab),size=7,hjust = 0, vjust = 1)+
guides(size = "none")+theme(axis.text = element_text(size = 14))
ggsave(plot=metadata(OGREDataSet)$barplot_summary,
path=metadata(OGREDataSet)$outputFolder,width = 30,
filename="Barplot_Summary.png",height = 20,dpi = 400,units = "cm")
dtbarplotDetailed$queryLog<-log2(dtbarplotDetailed$query+1)#add +1 to avoid log2(1)=0 and log2(0)=-inf
metadata(OGREDataSet)$barplot_summary_dt <- dtbarplotDetailed
metadata(OGREDataSet)$barplot_summary_log2 <-
ggplot(dtbarplotDetailed, aes(x = sequence, y =queryLog,fill=col)) +
geom_bar(stat = "identity", width = 0.3) +
scale_fill_manual(guide="none",values = c("steelblue2" = "steelblue2",
"steelblue3" = "steelblue3",
"steelblue4" = "steelblue4"))+
coord_flip() +labs(x = "", y = "Log2(N)") +theme_bw() + theme_classic() +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
geom_text(aes(x = xtext, y = 0, label = lab),size=7,hjust = 0, vjust = 1)+
guides(size = "none")+theme(axis.text = element_text(size = 14))
ggsave(plot=metadata(OGREDataSet)$barplot_summary_log,
path=metadata(OGREDataSet)$outputFolder,width = 30,
filename="Barplot_Summary_log.png",height = 20,dpi = 400,units = "cm")
return(OGREDataSet)
}
#' Generate Gviz plot
#'
#'`gvizPlot` generates a plot around one or many given query elements with all
#'overlapping subject hits. In addition, each generated plot can be stored in
#'the `gvizPlots` folder get or set by \code{gvizPlotsFolder}. A maximum of 25
#'elements can be plotted per track.
#' @importFrom grDevices dev.off pdf
#' @importFrom stats setNames
#' @rawNamespace import(Gviz, except = tags)
#' @param OGREDataSet A OGREDataSet.
#' @param query A character vector of one or many query elements ID's (i.e. Gene ID's).
#' @param gvizPlotsFolder A character pointing to the plot(s) output directory.
#' If not supplied a folder is automatically generated and can be accessed by
#' \code{metatdata(OGREDataSet)$gvizPlotsFolder}.
#' @param showPlot \code{logical} If \code{FALSE}(default) plots are only saved
#' to `gvizPlotsFolder`. If \code{TRUE}
#' plots are additionally send to the plotting window.
#' @param trackRegionLabels A labeled character vector that defines the type of
#' label that is displayed for query and subject elements during plotting.
#' Vector values represent the type of label and vector labels define the type
#' of subject element. In the following example
#' \code{setNames(c("ID","name"),c("genes","CGI"))}
#' Value "ID" and label "genes" would annotate your genes with IDs taken from the
#' ID column of your dataset. Datasets not defined in this vector are
#' plotted without track labels.
#' @param trackShapes A labeled character vector that defines the type of
#' shape in which every dataset's elements are displayed.
#' Vector values represent the type of shape and vector labels define the type
#' of subject element. In the following example
#' \code{setNames(c("fixedArrow","box"),c("genes","CGI"))}
#' Value "fixedArrow" and label "genes" would display your genes in fixedArrow
#' and CGI as box shape. Possible values:
#' (box, arrow, fixedArrow, ellipse, and smallArrow) Default="fixedArrow"
#' @param extendPlot \code{int vector} Integer vector of length two that extends
#' the plot window to the left or right by adding the first value to query start
#' and the second value to query end coordinates(bp). e.g. \code{c(-1000,1000)}
#' zooms out, \code{c(1000,-1000)} zooms in and \code{c(-1000,0)} shifts the plot
#' window to the left.
#' @param nElements \code{integer} Number of elements that are displayed
#' in each track (Default=25). High n.elements can lead to overplotting. Use
#' \code{nElements=FALSE} to display all elements.
#' @return OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' myOGRE <- fOverlaps(myOGRE)
#' myOGRE <- gvizPlot(myOGRE,query="ENSG00000142168")
#' @export
gvizPlot <- function(OGREDataSet,query,
gvizPlotsFolder = metadata(OGREDataSet)$gvizPlotsFolder,
trackRegionLabels =setNames(rep("ID",length(OGREDataSet)),names(OGREDataSet)),
trackShapes=setNames(rep("fixedArrow",length(OGREDataSet)),names(OGREDataSet)),
showPlot=FALSE,extendPlot=c(-300,300),nElements=25){
queryID <- subjType <- NULL
for(q in query){
message(paste0("Plotting query: ",q))
#Subject tracks
GvizSubjTracks<-lapply(metadata(OGREDataSet)$subjectNames,function(x){
tmp<-metadata(OGREDataSet)$detailDT[subjType==x & queryID==q]
if(is.numeric(nElements&dim(tmp)[1]>nElements)){tmp<-tmp[sample
(seq_len(dim(tmp)[1]),size = nElements),]}
regionLabels<-mcols(OGREDataSet[[x]])[[trackRegionLabels[x]]][match(
tmp$subjID,mcols(OGREDataSet[[x]])[["ID"]])]
if(dim(tmp)[1]!=0){# if subjectType overlaps with query create track
if(is.null(regionLabels)){
AnnotationTrack(start=tmp$subjStart,end = tmp$subjEnd,chromosome=tmp$subjChr,
name=x,id = regionLabels,fontsize.title=24,shape=trackShapes[x],
featureAnnotation=NULL,fontcolor.title="black",fontcolor="black",
fontcolor.group="black",fontcolor.item="black",rotation.item=20,
arrowHeadWidth=30,strand=tmp$subjStrand)
}else{
AnnotationTrack(start=tmp$subjStart,end = tmp$subjEnd,chromosome=tmp$subjChr,
name=x,id = regionLabels,fontsize.title=24,shape=trackShapes[x],
featureAnnotation="id",fontcolor.title="black",fontcolor="black",
fontcolor.group="black",fontcolor.item="black",rotation.item=20,
arrowHeadWidth=30,strand=tmp$subjStrand)
}
}else{
AnnotationTrack(GRanges(),name = x,fontcolor.title="black",fontsize.title=24)
}
})
#Gviz helper tracks
chr<-as.character(seqnames(OGREDataSet[[1]])[mcols(OGREDataSet[[1]])$ID==q])
itrack <-IdeogramTrack(genome="hg38",chromosome=chr)#ideogram track (chromosome bands etc)
gtrack<-GenomeAxisTrack()
queryGR<-OGREDataSet[[1]][mcols(OGREDataSet[[1]])$ID==q]
from <- start(queryGR)+extendPlot[1] #add 300bp left and righ as plotWindow
to <- end(queryGR)+extendPlot[2]
#Gviz query track
regionLabels<-mcols(queryGR)[[trackRegionLabels[1]]]
queryTrack<-AnnotationTrack(range=queryGR,name=names(OGREDataSet)[1],fill="red",
arrowHeadWidth=30,shape=trackShapes[1],featureAnnotation="id",
id=regionLabels,fontsize.group=20,fontsize.title=24,
fontcolor.title="black")#,stacking = "dense"
allTracks<-c(itrack,gtrack,queryTrack,GvizSubjTracks)#;names(temp)=make.unique(names(temp))
#Gviz plotting
pdf(file.path(gvizPlotsFolder,paste0(q,".pdf")),width = 30/2.54,height = 20/2.54)
plotTracks(allTracks,showOverplotting=TRUE,from = from, to = to,title.width=6,
rotation.title=0,background.title="gray92",just.group="above")
dev.off()
if(showPlot){
plotTracks(allTracks,showOverplotting=TRUE,from = from, to = to,title.width=6,
rotation.title=0,background.title="gray92",just.group="above")
}
}
return(OGREDataSet)
}
#' Plot histogram
#'
#' Plots overlap histograms of all subject datasets and stores them as a list,
#' that can be accessed by \code{metadata(myOGRE)$hist}
#' @importFrom ggplot2 ggplot
#' @param OGREDataSet An OGREDataSet
#' @param plot0 plot0=FALSE(default) plots a histogram of all dataset elements
#' with overlaps, excluding elements without overlaps. plot0=FALSE also includes
#' elements without overlaps.
#' @return OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' myOGRE <- fOverlaps(myOGRE)
#' myOGRE <- plotHist(myOGRE)
#' metadata(myOGRE)$hist
#' @export
plotHist <- function(OGREDataSet,plot0=FALSE){
for(i in colnames(metadata(OGREDataSet)$quickDT[,-c(1)])){
dt <- metadata(OGREDataSet)$quickDT
if(isFALSE(plot0))dt <- dt[dt[[i]]!=0,]
p <- ggplot(dt, aes(x=.data[[i]])) +
geom_histogram(fill="steelblue2", position="dodge")+
geom_vline(xintercept=stats::median(dt[[i]]),
linetype="dashed",size=1.5)+
scale_x_continuous(trans = 'log10')+
theme(legend.position="top")+theme_classic()+
theme(text = element_text(size = 20))+
xlab(paste(i,"number per",names(OGREDataSet)[1]))+ylab("Frequency")
metadata(OGREDataSet)$hist[[i]] <- p
}
return(OGREDataSet)
}
.datatable.aware=TRUE #to use "[" without importing whole data.table
#' Coverage plot
#'
#' Generates coverage plots of all subject datasets and stores them as a list,
#' that can be accessed by \code{metadata(OGREDataSet)$covPlot}
#' @import ggplot2
#' @rawNamespace import(data.table, except = c(shift,second,first))
#' @importFrom tidyr %>%
#' @param OGREDataSet An OGREDataSet
#' @param datasets \code{character vector} of subject dataset names. Default:
#' Generates a coverage plots for all subjects
#' @param nbin Number of bins
#' @return OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' myOGRE <- fOverlaps(myOGRE)
#' myOGRE <- covPlot(myOGRE)
#' metadata(myOGRE)$covPlot
#' @export
covPlot <- function(OGREDataSet,
datasets=names(OGREDataSet)[seq(2,length(OGREDataSet))],
nbin=100){
message("Generating coverage plot(s), this might take a while...")
subjType<-queryID<-bins<-rCov<-ID<-x<-Coverage<-NULL
#filter out queries<nbin
#regions that are at least nbin nucleotides long
message("Excluding regions with nucleotides<nbin")
regions <-mcols(OGREDataSet[[1]][width(OGREDataSet[[1]])>=nbin])$ID
#& have overlaps
regions <- regions[regions%in%metadata(OGREDataSet)$detailDT$queryID]
for(d in datasets){
regions <- regions[regions%in%metadata(OGREDataSet)$detailDT[subjType==d][["queryID"]]]
cov <- GenomicRanges::coverage(OGREDataSet[[d]])
covDT <- sapply(unique(regions),function(r){
cor <- metadata(OGREDataSet)$detailDT[queryID==r&subjType==d,]
dt <- data.table(rCov=cov[[cor$queryChr[1]]][seq(cor$queryStart[1],cor$queryEnd[1])]%>%as.numeric())
dt[,bins:=ggplot2::cut_interval(seq(1,length(rCov)), n = nbin)]#set bins)
dt[,sum(rCov),by=bins][["V1"]] #bin overlaps are summed up
})
covDT <- as.data.table(t(covDT),keep.rownames = "ID")
covDT[,strand:=as.vector(strand(OGREDataSet[[1]]))[match(ID,mcols(OGREDataSet[[1]])$ID)]]
covDT_for <- subset(covDT[strand=="+",],select=-c(strand,ID))
covDT_rev <- subset(covDT[strand=="-",],select=-c(strand,ID))
covDT_rev <- rev(covDT_rev) #reverse minus strand
covDT_both <- subset(covDT[strand=="*",],select=-c(strand,ID))
covDT <- rbind(covDT_for,covDT_rev,covDT_both)
covDT <- colSums(covDT) #column summarization for now
#covDT <- colSums(covDT_for) #add support for forward/reverse plots
#covDT <- colSums(covDT_rev)
dt <- data.table(x=seq(1,length(covDT)),Coverage=covDT)
p <- ggplot(dt, aes(x=x, y=Coverage)) + xlab("Region bins")+theme_classic()+
geom_smooth(se=FALSE)+ylab("Number of overlaps")+scale_y_continuous()
metadata(OGREDataSet)$covPlot[[d]] <- (list(plot=p,data=dt))
}
return(OGREDataSet)
}
#' Calculates min/max/average overlap
#'
#' Calculates min/max/average overlap for all datasets using \code{summary()}.
#' Results can be accessed by \code{metadata(OGREDataSet)$summaryDT} which is a
#' \code{list()} of two \code{data.table} objects. The first one includes
#' elements without any overlap at all and the second provides summary numbers
#' for all elements that have at least one overlap.
#' @param OGREDataSet An OGREDataSet
#' @return OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' myOGRE <- fOverlaps(myOGRE)
#' myOGRE <- summarizeOverlap(myOGRE)
#' metadata(myOGRE)$summaryDT
#' @export
summarizeOverlap <- function(OGREDataSet){
dt <- metadata(OGREDataSet)$quickDT[,-c(1)]
metadata(OGREDataSet)$summaryDT[["includes0"]] <- do.call(cbind, lapply(dt, summary))
dt[dt==0] <- NA
metadata(OGREDataSet)$summaryDT[["excludes0"]] <- do.call(cbind, lapply(dt, summary))
return(OGREDataSet)
}
#' List predefined datasets
#'
#' Use `listPredefinedDataSets()` to receive a vector of names for predefined
#' datasets that can be aquired from AnnotationHub that are already correctly
#' parsed and formatted. Each of the listed names can be used as input for
#' `addDataSetFromHub()`. Currently supported:
#' \itemize{
#' \item protCodingGenes - Protein coding genes from HG19 (GRCh37) Ensembl
#' For additional information use:
#' `getInfoOnIds(AnnotationHub(), "AH10684")`
#' \item CGI - CpG islands from HG19 UCSC
#' For additional information use:
#' `getInfoOnIds(AnnotationHub(), "AH5086")`
#' \item SNP - Common Single Nucleotide Polymorphism from HG19 UCSC
#' For additional information use:
#' `getInfoOnIds(AnnotationHub(), "AH5105")`
#' \item TFBS - Transcription Factor Binding Sites conserved from HG19 UCSC
#' For additional information use:
#' `getInfoOnIds(AnnotationHub(), "AH5090")`
#' \item Promoters - Promoter and flanking regions from HG19 Ensembl (Note:
#' This annotation is currently not included in AnnotationHub and is therefore
#' downloaded from Ensembl's ftp site)
#' }
#' @return \code{character} vector.
#' @examples
#' listPredefinedDataSets()
#' @export
listPredefinedDataSets <- function(){
return(c("protCodingGenes","CGI","SNP","TFBS","Promoters"))
}
#' Add dataSet from AnnotationHub
#'
#' AnnotationHub offers a wide range of annotated datasets which can be manually
#' aquired but need some parsing to work with OGRE as detailed in vignette
#' section "Load datasets from AnnotationHub".
#' For convienence `addDataSetFromHub()` adds one of the predefined human
#' dataSets of `listPredefinedDataSets()` to a OGREDataSet.Those are taken from
#' AnnotationHub and are ready to use for OGRE. Additional information on
#' dataSets can be found here \code{\link{listPredefinedDataSets}}.
#' @rawNamespace import(data.table, except = c(shift,second,first))
#' @importFrom assertthat assert_that
#' @importFrom AnnotationHub AnnotationHub
#' @importFrom GenomeInfoDb keepStandardChromosomes seqlevelsStyle dropSeqlevels
#' @importFrom rtracklayer import.gff
#' @param OGREDataSet OGREDataSet
#' @param dataSet \code{character} Name of one predefined dataSets to add as
#' query or subject to a OGREDataSet. Possible dataSets can be show with
#' `listPredefinedDataSets()`.
#' @param type Type of dataSet, must be either query or subject. If query the
#' dataSet will be added as query and at the first position of OGREDataSet.
#' @return OGREDataSet.
#' @examples
#' myOGRE <- OGREDataSet()
#' myOGRE <- addDataSetFromHub(myOGRE,"protCodingGenes","query")
#' @export
addDataSetFromHub <- function(OGREDataSet,dataSet,type){
assertthat::assert_that(type%in%c("query","subject"),
msg="Parameter type must be query or subject.")
assertthat::assert_that(dataSet%in%listPredefinedDataSets(),
msg=paste("Parameter dataSet type must be among:",
paste(listPredefinedDataSets(),collapse = " ")))
name<-ID<-NULL
if(is.null(metadata(OGREDataSet)$aH)){aH <- AnnotationHub()}
switch(dataSet,
protCodingGenes={x <- aH[["AH10684"]]
x <- GenomeInfoDb::keepStandardChromosomes(x,"Homo_sapiens",
pruning.mode="coarse")
x <- x[mcols(x)$type=="gene"&mcols(x)$gene_biotype=="protein_coding"]
mcols(x) <-mcols(x)[,c("gene_id","gene_name")]
colnames(mcols(x)) <- c("ID","name")
},
CGI={x <- aH[["AH5086"]]
x <- GenomeInfoDb::keepStandardChromosomes(x,"Homo_sapiens",
pruning.mode="coarse")
GenomeInfoDb::seqlevelsStyle(x) <- "Ensembl"
mcols(x) <- data.frame(ID=seq(1,length(x)),name=mcols(x)$name)
},
SNP={x <- aH[["AH5105"]]
x <- GenomeInfoDb::keepStandardChromosomes(x,"Homo_sapiens",
pruning.mode="coarse")
GenomeInfoDb::seqlevelsStyle(x) <- "Ensembl"
mcols(x) <-data.frame(ID=make.unique(mcols(x)$name),name=mcols(x)$name)
},
TFBS={x <- aH[["AH5090"]]
x <- GenomeInfoDb::keepStandardChromosomes(x,"Homo_sapiens",
pruning.mode="coarse")
GenomeInfoDb::seqlevelsStyle(x) <- "Ensembl"
mcols(x)$name <- gsub("V\\$","",mcols(x)$name)
mcols(x)$ID <- mcols(x)$name
mcols(x)$ID <- make.unique(gsub("_.*","",mcols(x)$ID))
},
Promoters={
x <- as.data.table(rtracklayer::import.gff("http://ftp.ensembl.org/pub/grch37/current/regulation/homo_sapiens/homo_sapiens.GRCh37.Regulatory_Build.regulatory_features.20201218.gff.gz"))
x <- x[type%in%c("promoter","promoter_flanking_region"),c("seqnames","start","end","ID")]
x[,name:=sub(".*:", "", ID)]
x <- makeGRangesFromDataFrame(x,keep.extra.columns = TRUE)
x <- GenomeInfoDb::keepStandardChromosomes(x,"Homo_sapiens",
pruning.mode="coarse")
GenomeInfoDb::seqlevelsStyle(x) <- "Ensembl"
}
)
#MT removed for compatibility with hg19,MT(hg19) differs from MT(GRCH37) see:
#https://gatk.broadinstitute.org/hc/en-us/articles/360035890711?id=23390
x <- GenomeInfoDb::dropSeqlevels(x,"MT","coarse")
genome(x) <- "hg19"
mData <- metadata(OGREDataSet)
if(dataSet%in%names(OGREDataSet)){
dataSet <- tail(make.unique(c(names(OGREDataSet),dataSet)),n=1)
warning("Renamed datasets cause of duplicated labels")
}
if(type=="query"){ #add dataSet as query or subject
OGREDataSet <- c(GRangesList(x),OGREDataSet) #add x at first position
names(OGREDataSet)[1] <- dataSet
metadata(OGREDataSet) <- mData
}else{
OGREDataSet[[dataSet]] <- x
metadata(OGREDataSet) <- mData
if(length(OGREDataSet)>1){ #update subject names
metadata(OGREDataSet)$subjectNames<-names(
OGREDataSet[seq(2,length(OGREDataSet))])
}}
message(paste0("Added ",dataSet))
return(OGREDataSet)
}
#' Add GenomicRanges
#'
#' Add a GenomicRanges dataset to OGREDataSet
#' @importFrom assertthat assert_that
#' @param OGREDataSet An OGREDataSet
#' @param dataSet A GRanges object. Each region needs chromosome, start, end and
#' strand information. A unique ID and a name column must be present in the
#' `GenomicRanges` object metadata. Avoid different chromosome naming
#' conventions i.e. (chr1, CHR1, 1, I) among all datasets
#' @param type Type of dataSet, must be either query or subject. If query the
#' dataSet will be added as query and at the first position of OGREDataSet.
#' @param label A \code{character} that will label your GRanges object. If
#' not supplied, the label will be guessed from the dataset parameter.
#' @return OGREDataSet.
#' @examples
#' myOGRE <- OGREDataSet()
#' myGRanges <- makeExampleGRanges()
#' myOGRE <- addGRanges(myOGRE,myGRanges,"query")
#' @export
addGRanges <- function(OGREDataSet,dataSet,type,label=NULL){
assertthat::assert_that(type%in%c("query","subject"),
msg="Parameter type must be query or subject.")
assertthat::assert_that(is(dataSet,"GRanges"),
msg="dataSet class must be GRanges.")
assertthat::assert_that(c("ID")%in%names(mcols(dataSet)),
msg="DataSet must contain ID column.")
assertthat::assert_that(!any(duplicated(dataSet$ID)),
msg="ID column must be unique.")
assertthat::assert_that(length(dataSet)!=0,
msg="DataSet has no ranges.")
mData <- metadata(OGREDataSet)
if(is.null(label))label <- deparse(substitute(dataSet)) #if label not supplied
if(label%in%names(OGREDataSet)){
label <- tail(make.unique(c(names(OGREDataSet()),label)),n=1)
warning("Renamed datasets cause of duplicated labels")
}
if(type=="query"){
OGREDataSet <- c(GRangesList(dataSet),OGREDataSet)
names(OGREDataSet)[1] <- label
metadata(OGREDataSet) <- mData
}else{
OGREDataSet[[label]] <- dataSet
metadata(OGREDataSet) <- mData
if(length(OGREDataSet)>1){ #update subject names
metadata(OGREDataSet)$subjectNames <-names(OGREDataSet
[seq(2,length(OGREDataSet))])
}
}
return(OGREDataSet)
}
#' Make a example OGRE dataset
#'
#' `makeExampleOGREDataSet` generates a example OGREDataSet from dataset files
#' stored in OGRE's extdata directory.
#' @return OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' @export
makeExampleOGREDataSet <- function()
{
myQueryFolder <- file.path(system.file('extdata', package = 'OGRE'),"query")
mySubjectFolder <- file.path(system.file('extdata', package = 'OGRE'),"subject")
myOGRE <- OGREDataSetFromDir(queryFolder=myQueryFolder,subjectFolder=mySubjectFolder)
return(myOGRE)
}
#' Make an example GRanges dataset
#'
#' `makeExampleGRanges` generates an example GRanges dataset.
#' @importFrom GenomicRanges GRanges
#' @importFrom IRanges IRanges
#' @return OGREDataSet.
#' @examples
#' myGRanges <- makeExampleGRanges()
#' @export
makeExampleGRanges <- function()
{
myGRanges <- GRanges(Rle(c("2", "2", "1", "3"), c(1, 3, 2, 4)),
IRanges(seq_len(10), width=seq(10,1), names=head(letters, 10)),
Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
ID=seq_len(10), name=paste0("gene",seq_len(10)))
return(myGRanges)
}
#' Subset a GRanges object
#'
#' Subsets a GRanges object with reference to it's ID column using a ID vector.
#' @importFrom GenomicRanges GRanges
#' @param OGREDataSet An OGREDataSet
#' @param IDs \code{character vector} with IDs used to subset the GRanges object
#' defined in \code{name}
#' @param name \code{character} Name of the GRanges object for subsetting. One of
#' the GRanges objects in a \code{OGREDataSet}
#' @return OGREDataSet.
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' myOGRE <- subsetGRanges(myOGRE,c("ENSG00000142168","ENSG00000256715"),"genes")
#' @export
subsetGRanges <- function(OGREDataSet,IDs,name)
{
OGREDataSet[[name]] <- OGREDataSet[[name]][mcols(OGREDataSet[[name]])$ID%in%IDs,]
return(OGREDataSet)
}
#' Extend a GRanges object
#'
#' Extend(shrink) ranges of a GRanges object.
#' @importFrom GenomicRanges GRanges
#' @param OGREDataSet An OGREDataSet
#' @param name \code{character} Name of the GRanges object for extending
#' @param upstream \code{int} (positive or negative number)
#' @param downstream \code{int} (positive or negative number)
#' @return OGREDataSet
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' #extend range by shifting start 100 bp in upstream direction
#' myOGRE <- extendGRanges(myOGRE,"genes",upstream=100)
#' #shrinking range by shifting end 100 bp in upstream direction
#' myOGRE <- extendGRanges(myOGRE,"genes",downstream=-100)
#' #shrinking range by shifting from both sides to the center
#' myOGRE <- extendGRanges(myOGRE,"genes",upstream=-10,downstream=-10)
#' @export
extendGRanges <- function(OGREDataSet,name, upstream=0, downstream=0)
#taken from https://support.bioconductor.org/p/78652/
{
if (any(strand(OGREDataSet[[name]]) == "*"))
warning("'*' ranges were treated as '+'")
on_plus <- strand(OGREDataSet[[name]]) == "+" | strand(OGREDataSet[[name]]) == "*"
new_start <- start(OGREDataSet[[name]]) - ifelse(on_plus, upstream, downstream)
new_end <- end(OGREDataSet[[name]]) + ifelse(on_plus, downstream, upstream)
ranges(OGREDataSet[[name]]) <- IRanges(new_start, new_end)
OGREDataSet[[name]] <- trim(OGREDataSet[[name]])
return(OGREDataSet)
}
#' Extract promoter
#'
#' A wrapper of \code{GenomicRanges::promoters()} to extract promoter regions of
#' a GRanges object stored in a OGREDataSet
#' @importFrom GenomicRanges GRanges
#' @param OGREDataSet An OGREDataSet
#' @param name \code{character} Name of the GRanges object
#' @param upstream \code{int} (positive) upstream=2000(default)
#' @param downstream \code{int} (positive) downstream=200(default)
#' @return OGREDataSet
#' @examples
#' myOGRE <- makeExampleOGREDataSet()
#' myOGRE <- loadAnnotations(myOGRE)
#' myOGRE <- extractPromoters(myOGRE,"genes", upstream=2000, downstream=200)
#' @export
extractPromoters <- function(OGREDataSet,name, upstream=2000, downstream=200)
#taken from https://support.bioconductor.org/p/78652/
{
if (any(strand(OGREDataSet[[name]]) == "*"))
warning("'*' ranges were treated as '+'")
on_plus <- strand(OGREDataSet[[name]]) == "+" | strand(OGREDataSet[[name]]) == "*"
new_start <- start(OGREDataSet[[name]]) - ifelse(on_plus, upstream, downstream)
new_end <- end(OGREDataSet[[name]]) + ifelse(on_plus, downstream, upstream)
ranges(OGREDataSet[[name]]) <- IRanges(new_start, new_end)
OGREDataSet[[name]] <- trim(OGREDataSet[[name]])
return(OGREDataSet)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.