#' Predict MicroExons in a Plant Genome
#'
#' MEPmod searches microexon-tags in 45 conserved microexon clusters in plants
#' using gapped Position Weight Matrix (PWM). The only input file is plant
#' genomic sequences or a plant genome. The sequences on both plus and
#' minus strands will be scanned.
#'
#' @param genome the path(s) to the fasta file(s) or a 'DNAStringSet' object.
#' Any contig sequence < 10 kb will be excluded.
#' @param min.score a character string containing a percentage specifying the
#' minimum score of each exon block (e.g. \code{"80\%"}). This parameter will
#' pass to \link[Biostrings]{matchPWM} from \bold{Biostrings} package.
#' @param clusters an integer vector between 1 to 45 specifying microexon-tag
#' clusters to search (default: all the 45 clusters).
#' @param include.intronLoss \code{TRUE} or \code{FALSE}. If \code{TRUE},
#' the microexons with any side of flanking intron loss will also be returned.
#' @param span the maximum spanning region of the microexon-tag (default: 20 kb).
#' @param min.intron minimum intron size
#' @param max.intron maximum intron size
#' @param cores number of cores to use. This parameter will pass to
#' \link[parallel]{mclapply} as *mc.cores* from \bold{parallel} package.
#'
#' @return A GRanges of microexons with 8 metadata columns:
#' \itemize{
#' \item NT, 108 bp DNA sequence of microexon-tag (72 bp for cluster 1 and 2).
#' \item AA, 36 aa protein sequence translated from NT (24 aa for cluster 1 and 2).
#' \item region, the span region of microexon-tag.
#' \item block.starts, the genomic coordinates of block start positions
#' (exon, intron, exon, ...).
#' \item block.sizes, block sizes (exon, intron, exon, ...).
#' \item score, match score of the microexon-tag.
#' Large score indicate high confidence.
#' \item isMicroExon, if the microexon has any flanking intron loss, FALSE;
#' otherwise, TRUE.
#' \item cluster, which cluster this microexon-tag belongs to.
#' }
#'
#' @importFrom methods is
#' @importFrom parallel mclapply
#' @import IRanges
#' @import GenomicRanges
#' @import Biostrings
#'
#' @export MEPmod
#'
#' @examples
#' suppressPackageStartupMessages(library(BSgenome))
#' if (!requireNamespace("BSgenome.Athaliana.TAIR.TAIR9", quietly = TRUE))
#' BiocManager::install("BSgenome.Athaliana.TAIR.TAIR9")
#' library(BSgenome.Athaliana.TAIR.TAIR9)
#'
#' Chr1<-BSgenome.Athaliana.TAIR.TAIR9[["Chr1"]]
#' Chr1<-DNAStringSet(Chr1)
#' names(Chr1)<-'Chr1'
#' x<-subseq(Chr1, 1, 1e6)
#' x
#'
#' mep<-MEPmod(genome=x, cores=1)
#'
#' mep
#' # GRanges object with 2 ranges and 8 metadata columns:
#' # seqnames ranges strand | NT AA
#' # <Rle> <IRanges> <Rle> | <DNAStringSet> <AAStringSet>
#' # [1] Chr1 456757-456761 + | TTTGATGCTA...ACTGTCTGAC FDARTAWSQC...AFGAVESLSD
#' # [2] Chr1 601547-601560 - | AAAGTTGAAT...ACATTCAGAT KVEFKDNEWK...RLDCLLKHSD
#' # region block.starts block.sizes score isMicroExon cluster
#' # <IRanges> <IntegerList> <IntegerList> <numeric> <logical> <integer>
#' # [1] 456595-456893 456595,456647,456757,... 52,110,5,... 0.9800 TRUE 8
#' # [2] 601327-601700 601653,601561,601547,... 48,92,14,... 0.9241 TRUE 35
#' # -------
#' # seqinfo: 1 sequence from an unspecified genome; no seqlengths
#'
###
MEPmod<-function(genome, min.score='80%', clusters=seq_len(45),
include.intronLoss=TRUE,span=20000, min.intron=20,
max.intron=10000, cores=4) {
MEPdata<-MEPmodeler::MEPdata
cat(paste(t0<-Sys.time(),'..... started run\n'))
cat(paste(Sys.time(),'..... loading genome\n'))
if (!is(genome, 'DNAStringSet')) {
if (!is.character(genome)) {
stop("'genome' must be the path(s) to the fasta file(s) or
a 'DNAStringSet' class" )
}
genome<-readDNAStringSet(genome)
}
names(genome)<-sub("^(\\S+)\\s+.*","\\1",names(genome)) # remove description
genome<-genome[width(genome)>=10000]
prior.params=letterFrequency(genome[order(-width(genome))][[1]],
DNA_BASES,as.prob = TRUE)
cat(paste(Sys.time(),'..... finished loading genome\n'))
res<-GRanges()
for(i in clusters) {
cat(sprintf('%s ..... Cluster %d (size: %d; phase: %d; motif: %s)\n',
Sys.time(),i,MEPdata$cluster$size[i],MEPdata$cluster$phase[i],
MEPdata$cluster$motif[i]))
res_i<-mapPWM(cons=MEPdata$matrix[[i]],exons=MEPdata$blocks[[i]],genome,
focus=MEPdata$cluster$me_order[i],min.score,
include.intronLoss,span, min.intron, max.intron,
prior.params=prior.params,cores = cores,check.params = FALSE)
if (length(res_i)==0) next
res_i$block.starts<-do.call(c,apply(res_i$block.starts, 1, IntegerList))
res_i$block.sizes<-do.call(c,apply(res_i$block.sizes, 1, IntegerList))
res_i$cluster<-i
res<-suppressWarnings(append(res,res_i))
}
cat(paste(t1<-Sys.time(),'..... finished successfully\n'))
cat(sprintf('### total microexon-tags found:\t%d\n', length(res)))
cat(sprintf('### total time used:\t%s mins\n',
round(difftime(t1,t0,units = 'mins'),2)))
res
}
#####
mapPWM<-function(cons,exons,genome,focus=2, min.score='80%',include.intronLoss=TRUE,
span=20000, min.intron=20, max.intron=10000,
prior.params=letterFrequency(genome[[1]],DNA_BASES,as.prob = TRUE),
cores=1, check.params=TRUE){
n<-length(exons)
if (n==3) {
return(mapPWM.0(cons,exons,genome,min.score,include.intronLoss, span,
min.intron, max.intron, prior.params, cores))
}
if (check.params) {
if (n<3) stop("'exons' must be a numeric vector of length >= 3")
if (sum(exons)!=ncol(cons))
stop ("sum(exons)==ncol(cons) is not TRUE")
if (!is(genome,'DNAStringSet')) stop("'genome' must be a DNAStringSet object")
if (length(focus)!=1 & focus[1]<=1 & focus[1]>=n)
stop("'focus' must be a numeric in (0,length(exons))!")
}
plus_intron1<-sprintf('^(GT[AG].{%d,%d}[CT]AG)%s$',min.intron,max.intron,
ifelse(include.intronLoss,"?",""))
plus_intron2<-sprintf('^(G[TC].{%d,%d}AG)%s$',min.intron,max.intron,
ifelse(include.intronLoss,"?",""))
minus_intron1<-sprintf('^(CT[AG].{%d,%d}[CT]AC)%s$',min.intron,max.intron,
ifelse(include.intronLoss,"?",""))
minus_intron2<-sprintf('^(CT.{%d,%d}[GA]C)%s$',min.intron,max.intron,
ifelse(include.intronLoss,"?",""))
pwm<-PWM(cons,prior.params=prior.params)
for (i in seq(n)) {
index<-seq.int(exons[i]) + cumsum(c(0,exons))[i]
assign(paste0('pwm',i),PWM(cons[,index,drop=FALSE],prior.params=prior.params))
}
plus<-mclapply(genome, function(x) {
# find matches for the firest and the last exon
e1<-suppressWarnings(matchPWM(get('pwm1'),x,min.score))@ranges
en<-suppressWarnings(matchPWM(get(paste0('pwm',n)),x,min.score))@ranges
# find regions spanning the first exon and the last exon
ov<-findOverlapPairs(e1,en,maxgap=span)
ov<-ov[start(ov@second)-end(ov@first)>=sum(exons[-c(1,n)])]
if (length(ov)==0) return(IRanges())
e1<-ov@first; en<-ov@second
intervals<-pgap(e1,en)
mcols(intervals)$id<-seq(along.with = intervals)
# screen the intervals containing all the other exons
for(i in seq(2,n-1)) {
mi<-suppressWarnings(matchPWM(get(paste0('pwm',i)),x,min.score))@ranges
ov<-findOverlapPairs(mi,intervals,type='within')
if (length(ov)==0) return(IRanges())
intervals<-unique(ov@second)
assign(paste0('m',i),unique(ov@first))
}
e1<-e1[mcols(intervals)$id]
en<-en[mcols(intervals)$id]
# check the order of the exons and introns
me<-lapply(seq(along.with = intervals), function(i) {
nt<-x[e1[i]] # fist exon
block<-data.frame(start(e1[i]))
ir<-intervals[i]
for (j in seq(2,n-1)) {
ov<-findOverlapPairs(get(paste0('m',j)),ir,type='within')
if (length(ov)==0) return(IRanges())
l<-grepl(plus_intron1,Views(x,flank(
ov@first,start(ov@first)-start(ov@second))),perl=TRUE)
if (!any(l)) l<-grepl(plus_intron2,Views(x,flank(
ov@first,start(ov@first)-start(ov@second))),perl=TRUE)
if (!any(l)) return(IRanges())
ov<-ov[l]
nt<-paste0(nt,Views(x,ov@first))
block<-data.frame(data.frame(lapply(block, rep, length(ov))),
rep(start(ov@first),each=nrow(block)))
ir<-pgap(ov@first,rep(en[i],length(ov)))
}
r<-grepl(plus_intron1,Views(x,ir),perl=TRUE)
if (!any(r)) r<-grepl(plus_intron2,Views(x,ir),perl=TRUE)
if (!any(r)) return(IRanges())
nt<-DNAStringSet(paste0(nt[r],Views(x,en[i])))
block<-as.matrix(data.frame(block[r,,drop=FALSE],start(en)[i]))
dimnames(block)<-NULL
ir<-IRanges(block[,focus],width=exons[focus],
NT=nt,AA=Biostrings::translate(nt,if.fuzzy.codon='solve'))
mcols(ir)$region<-IRanges(start(e1[i]),end(en[i]))
mcols(ir)$block.starts<-t(apply(block,1,function(xx) {
head(sort(c(xx,xx+exons)),-1)
}))
mcols(ir)$block.sizes<-t(apply(block,1,function(xx) {
diff(sort(c(xx,xx+exons)))
}))
return(ir)
})
me<-do.call(c,me[elementNROWS(me)>0])
me
},mc.cores = cores)
plus<-unlist(IRangesList(plus[elementNROWS(plus)>0]))
if (length(plus)==0) plus<-GRanges() else plus<-GRanges(names(plus),plus,'+')
minus<-mclapply(genome, function(x) {
# find matches for the firest and the last exon
e1<-suppressWarnings(matchPWM(reverseComplement(get('pwm1')),x,min.score))@ranges
en<-suppressWarnings(matchPWM(reverseComplement(
get(paste0('pwm',n))),x,min.score))@ranges
# find regions spanning the first exon and the last exon
ov<-findOverlapPairs(e1,en,maxgap=span)
ov<-ov[start(ov@first)-end(ov@second)>=sum(exons[-c(1,n)])]
if (length(ov)==0) return(IRanges())
e1<-ov@first; en<-ov@second
intervals<-pgap(e1,en)
mcols(intervals)$id<-seq(along.with = intervals)
# screen the intervals containing all the other exons
for(i in seq(2,n-1)) {
mi<-suppressWarnings(matchPWM(reverseComplement(
get(paste0('pwm',i))),x,min.score))@ranges
ov<-findOverlapPairs(mi,intervals,type='within')
if (length(ov)==0) return(IRanges())
intervals<-unique(ov@second)
assign(paste0('m',i),unique(ov@first))
}
e1<-e1[mcols(intervals)$id]
en<-en[mcols(intervals)$id]
# check the order of the exons and introns
me<-lapply(seq(along.with = intervals), function(i) {
nt<-x[e1[i]] # fist exon
block<-data.frame(start(e1[i]))
ir<-intervals[i]
for (j in seq(2,n-1)) {
ov<-findOverlapPairs(get(paste0('m',j)),ir,type='within')
if (length(ov)==0) return(IRanges())
r<-grepl(minus_intron1,Views(x,flank(
ov@first,end(ov@second)-end(ov@first),start = FALSE)),
perl=TRUE)
if (!any(r)) r<-grepl(minus_intron2,Views(x,flank(
ov@first,end(ov@second)-end(ov@first),start = FALSE)),
perl=TRUE)
if (!any(r)) return(IRanges())
ov<-ov[r]
nt<-paste0(Views(x,ov@first),nt)
block<-data.frame(data.frame(lapply(block, rep, length(ov))),
rep(start(ov@first),each=nrow(block)))
ir<-pgap(ov@first,rep(en[i],length(ov)))
}
l<-grepl(minus_intron1,Views(x,ir),perl=TRUE)
if (!any(l)) l<-grepl(minus_intron2,Views(x,ir),perl=TRUE)
if (!any(l)) return(IRanges())
nt<-reverseComplement(DNAStringSet(paste0(Views(x,en[i]),nt[l])))
block<-as.matrix(data.frame(block[l,,drop=FALSE],start(en)[i]))
dimnames(block)<-NULL
ir<-IRanges(block[,focus],width=exons[focus],NT=nt,
AA=Biostrings::translate(nt,if.fuzzy.codon='solve'))
mcols(ir)$region<-IRanges(start(en[i]),end(e1[i]))
mcols(ir)$block.starts<-t(apply(block,1,function(xx) {
sort(c(xx,xx+exons),decreasing = TRUE)[-1]
}))
mcols(ir)$block.sizes<-t(apply(block,1,function(xx) {
0L-diff(sort(c(xx,xx+exons),decreasing = TRUE))
}))
return(ir)
})
me<-do.call(c,me[elementNROWS(me)>0])
me
},mc.cores = cores)
minus<-unlist(IRangesList(minus[elementNROWS(minus)>0]))
if (length(minus)==0) minus<-GRanges() else minus<-GRanges(names(minus),minus,'-')
res<-suppressWarnings(append(plus,minus))
res<-res[grep("\\*",res$AA,invert = TRUE)]
if(length(res)>0) {
res$score<-round(vapply(res$NT, function(x) PWMscoreStartingAt(pwm,x), 1.0),4)
cluster<-reduce(GRanges(seqnames(res),res$region,strand(res)),with.revmap=TRUE)$revmap
if(any(elementNROWS(cluster)>1)) {
res<-res[unlist(cluster[lapply(extractList(res$score,cluster), which.max)])]
}
res$isMicroExon<-res$block.sizes[,2*(focus-1)]!=0 & res$block.sizes[,2*focus]!=0
}
cat ("---Number of microexon-tags found:\t", length(res),'\n')
unname(res)
}
###
mapPWM.0<-function(cons,exons,genome,min.score="80%",include.intronLoss=TRUE,
span=20000, min.intron=20, max.intron=10000,
prior.params=letterFrequency(genome[[1]],DNA_BASES,as.prob = TRUE),
cores=1){
pwm<-PWM(cons,prior.params=prior.params)
pwm1<-PWM(cons[,seq.int(exons[1]),drop=FALSE],prior.params=prior.params)
pwm2<-PWM(cons[,seq.int(exons[2])+exons[1],drop=FALSE],prior.params=prior.params)
pwm3<-PWM(cons[,seq.int(exons[3])+sum(exons[1:2]),drop=FALSE],prior.params=prior.params)
plus_intron1<-sprintf('^(GT[AG].{%d,%d}[CT]AG)%s$',min.intron,max.intron,
ifelse(include.intronLoss,"?",""))
plus_intron2<-sprintf('^(G[TC].{%d,%d}AG)%s$',min.intron,max.intron,
ifelse(include.intronLoss,"?",""))
minus_intron1<-sprintf('^(CT[AG].{%d,%d}[CT]AC)%s$',min.intron,max.intron,
ifelse(include.intronLoss,"?",""))
minus_intron2<-sprintf('^(CT.{%d,%d}[GA]C)%s$',min.intron,max.intron,
ifelse(include.intronLoss,"?",""))
plus<-mclapply(genome, function(x) {
e1<-suppressWarnings(matchPWM(pwm1,x,min.score))
e3<-suppressWarnings(matchPWM(pwm3,x,min.score))
ov<-findOverlapPairs(e1,e3,maxgap=span)
ov<-ov[start(ov@second)>=end(ov@first)+exons[2]]
if (length(ov)==0) return(IRanges())
e1<-ov@first;e3<-ov@second
intervals<-pgap(e1,e3)
me<-lapply(seq(along.with = intervals), function(i) {
xx<-Views(x,intervals[i])
m<-suppressWarnings(matchPWM(pwm2,xx,min.score))
if(length(m)==0) return(IRanges())
l<-grepl(plus_intron1,flank(m,start(m)-start(xx)),perl=TRUE)
if (!any(l)) l<-grepl(plus_intron2,flank(m,start(m)-start(xx)),perl=TRUE)
r<-grepl(plus_intron1,flank(m,end(xx)-end(m),start=FALSE),perl=TRUE)
if (!any(r)) r<-grepl(plus_intron2,flank(m,end(xx)-end(m),start=FALSE),perl=TRUE)
m<-m[l&r]
if(length(m)==0) return(IRanges())
e2<-m@ranges
mcols(e2)$NT<-DNAStringSet(paste0(e1[i],m,e3[i]))
mcols(e2)$AA<-Biostrings::translate(mcols(e2)$NT,if.fuzzy.codon='solve')
mcols(e2)$region<-IRanges(start(e1[i]),end(e3[i]))
mcols(e2)$block.starts<-as.matrix(data.frame(
start(e1)[i],end(e1)[i]+1,
start(e2),end(e2)+1,start(e3)[i],fix.empty.names=FALSE))
mcols(e2)$block.sizes<-as.matrix(data.frame(
exons[1],distance(e1[i],e2),
exons[2],distance(e2,e3[i]),exons[3],fix.empty.names=FALSE))
return(e2)
})
me<-do.call(c,me[elementNROWS(me)>0])
me
},mc.cores = cores)
plus<-unlist(IRangesList(plus[elementNROWS(plus)>0]))
if (length(plus)==0) plus<-GRanges() else plus<-GRanges(names(plus),plus,'+')
minus<-mclapply(genome, function(x) {
e1<-suppressWarnings(matchPWM(reverseComplement(pwm1),x,min.score))
e3<-suppressWarnings(matchPWM(reverseComplement(pwm3),x,min.score))
ov<-findOverlapPairs(e1,e3,maxgap=span)
ov<-ov[start(ov@first)>=end(ov@second)+exons[2]]
if (length(ov)==0) return(IRanges())
e1<-ov@first;e3<-ov@second
intervals<-pgap(e1,e3)
me<-lapply(seq(along.with = intervals), function(i) {
xx<-Views(x,intervals[i])
m<-suppressWarnings(matchPWM(reverseComplement(pwm2),xx,min.score))
if(length(m)==0) return(IRanges())
l<-grepl(minus_intron1,flank(m,start(m)-start(xx)),perl=TRUE)
if (!any(l)) l<-grepl(minus_intron2,flank(m,start(m)-start(xx)),perl=TRUE)
r<-grepl(minus_intron1,flank(m,end(xx)-end(m),start=FALSE),perl=TRUE)
if (!any(r)) r<-grepl(minus_intron2,flank(m,end(xx)-end(m),start=FALSE),perl=TRUE)
m<-m[l&r]
if(length(m)==0) return(IRanges())
e2<-m@ranges
mcols(e2)$NT<-reverseComplement(DNAStringSet(paste0(e3[i],m,e1[i])))
mcols(e2)$AA<-Biostrings::translate(mcols(e2)$NT,if.fuzzy.codon='solve')
mcols(e2)$region<-IRanges(start(e3[i]),end(e1[i]))
mcols(e2)$block.starts<-as.matrix(data.frame(
start(e1)[i],end(e2)+1,start(e2),end(e3)[i]+1,start(e3)[i],
fix.empty.names=FALSE))
mcols(e2)$block.sizes<-as.matrix(data.frame(
exons[1],distance(e1[i],e2),exons[2],distance(e2,e3[i]),
exons[3],fix.empty.names=FALSE))
return(e2)
})
me<-do.call(c,me[elementNROWS(me)>0])
me
},mc.cores = cores)
minus<-unlist(IRangesList(minus[elementNROWS(minus)>0]))
if (length(minus)==0) minus<-GRanges() else minus<-GRanges(names(minus),minus,'-')
res<-suppressWarnings(append(plus,minus))
res<-res[grep("\\*",res$AA,invert = TRUE)]
if(length(res)>0) {
res$score<-round(vapply(res$NT, function(x) PWMscoreStartingAt(pwm,x), 1.0),4)
cluster<-reduce(GRanges(seqnames(res),res$region,strand(res)),with.revmap=TRUE)$revmap
if(any(elementNROWS(cluster)>1)) {
res<-res[unlist(cluster[lapply(extractList(res$score,cluster), which.max)])]
}
res$isMicroExon<-res$block.sizes[,2]!=0 & res$block.sizes[,4]!=0
}
cat ("---Number of microexon-tags found:\t", length(res),'\n')
unname(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.