Nothing
##################################################
# re-implement HSMM, one more slot to handle distance array
##################################################
biomvRhsmm<-function(x, maxk=NULL, maxbp=NULL, J=3, xPos=NULL, xRange=NULL, usePos='start', emis.type='norm', com.emis=FALSE, xAnno=NULL, soj.type='gamma', q.alpha=0.05, r.var=0.75, useMC=TRUE, cMethod='F-B', maxit=1, maxgap=Inf, tol=1e-06, grp=NULL, cluster.m=NULL, avg.m='median', prior.m = 'cluster', trim=0, na.rm=TRUE){
## input checking
# lock.transition / lock.d, lock transition and sojourn #fixme
# est.method=c('viterbi', 'smooth')
message('Checking input ...')
if (!is.numeric(x) && !is.matrix(x) && !is(x, "GRanges"))
stop("'x' must be a numeric vector or matrix or a GRanges object.")
if(is(x, "GRanges")) {
xid<-names(values(x))
xRange<-x
mcols(xRange)<-NULL
x<-as.matrix(values(x))
if( any(sapply(as.character(unique(seqnames(xRange))), function(s) length(unique(strand(xRange)[seqnames(xRange)==s])))!=1) )
stop('For some sequence, there are data appear on both strands !')
} else if(length(dim(x))==2){
xid<-colnames(x)
x<-as.matrix(x)
} else {
message('No dim attributes, coercing x to a matrix with 1 column.')
x <- matrix(as.numeric(x), ncol=1)
xid<-NULL
}
# check the case when Rle in x mcols
if(!is.null(xRange) & length(xRange) != nrow(x)){
nr<-length(xRange)
spRle<-TRUE
if(nr > .Machine$integer.max) stop("Length of input is longer than .Machine$integer.max")
} else {
nr<-nrow(x)
spRle<-FALSE
}
nc<-ncol(x)
if(is.null(xid)){
xid<-paste('S', seq_len(nc), sep='')
colnames(x)<-xid
}
if (is.null(cMethod) || !(cMethod %in% c('F-B', 'Viterbi')))
stop("'cMethod' must be specified, must be either 'F-B' or 'Viterbi'!")
if (is.null(prior.m) || !(prior.m %in% c('quantile', 'cluster')))
stop("'prior.m' must be specified, must be either 'quantile' or 'cluster'!")
if (is.null(emis.type) || !(emis.type %in% c('norm', 'mvnorm', 'pois', 'nbinom', 'mvt', 't')))
stop("'emis.type' must be specified, must be one of 'norm', 'mvnorm', 'pois', 'nbinom', 'mvt', 't'!")
if (prior.m == 'cluster'){
if(length(find.package('cluster', quiet=T))==0) {
warning("'cluster' is not found, fall back to quantile method!!!")
prior.m <- 'quantile'
} else {
require(cluster)
}
}
## some checking on xPos and xRange, xRange exist then xPos derived from xRange,
if(!is.null(xRange) && (is(xRange, "GRanges") || is(xRange, "IRanges")) && !is.null(usePos) && length(xRange)==nr && usePos %in% c('start', 'end', 'mid')){
if(usePos=='start'){
xPos<-start(xRange)
} else if(usePos=='end'){
xPos<-end(xRange)
} else {
xPos<-(start(xRange)+end(xRange))/2
}
} else {
# no valid xRange, set it to null
message('no valid xRange and usePos found, check if you have specified xRange / usePos.')
xRange<- NULL
}
if (is.null(xPos) || !is.numeric(xPos) || length(xPos)!=nr){
message("No valid positional information found. Re-check if you have specified any xPos / xRange.")
xPos<-NULL
}
if (!is.null(maxbp) && (!is.numeric(maxbp) || (length(maxbp) != 1) || (maxbp <= 1) || ( !is.null(xPos) && maxbp > max(xPos,na.rm=na.rm)-min(xPos, na.rm=na.rm))))
stop(sprintf("'maxbp' must be a single integer between 2 and the maximum length of the region if xPos is available!"))
# check grp setting, cluster if needed, otherwise treat as one group
if(!is.null(grp)) grp<-as.character(grp)
grp<-preClustGrp(x, grp=grp, cluster.m=cluster.m) #?todo there could be problem if spRle and need clustering
# initial sojourn setup unify parameter input / density input, using extra distance, non-integer value can give a dtype value
if(!is.null(xAnno) && !is.null(soj.type) && soj.type %in% c('gamma', 'pois', 'nbinom') && (is(xAnno, "TxDb") || is(xAnno, "GRanges") || is(xAnno, "GRangesList") || is.list(xAnno))){
# this is only used when the xAnno object contains appropriate annotation information which could be used as prior for the sojourn dist in the new HSMM model
# if xAnno is also present, then J will be estimated from xAnno, and pop a warning, ## this only make sense if difference exist in the distribution of sojourn of states.
# a further list object allow direct custom input for initial sojourn dist parameters., e.g. list(lambda=c(10, 50, 1000))
soj<-sojournAnno(xAnno, soj.type=soj.type)
J<-soj$J
message('Estimated state number from xAnno: J = ', J)
#now, if there is xAnno, then J, maxbp could be inferred, and if xPos exists, then maxk could also be inferred.
if(is.null(maxbp)){
# estimating a reasonable number for maxbp
maxbp<- switch(soj.type,
nbinom = ceiling(median(soj$mu)),
pois = ceiling(median(soj$lambda)),
gamma = ceiling(median(soj$shape*soj$scale)),
stop("Invalid argument value for 'soj.type'! ")
)
}
# soj<-append(soj, maxbp=maxbp)
} else if(is.numeric(J) && J>1){
message('xAnno is not present or not supported, try to use maxbp/maxk in the uniform prior for the sojourn distribution.')
# J is ok
if(is.null(xPos)){
# no position as well, in turn means no xRange nor multiple seq,
# init if J and maxk are ok
if (!is.null(maxk) && is.numeric(maxk) && (length(maxk) == 1) && (maxk > 1) && (maxk < nr)) {
message('maxbp and xPos are not present or not valid, using maxk for the sojourn distribution.')
soj<-list(type = soj.type, J=J, maxk=maxk)
} else {
stop(sprintf("'maxk' must be a single integer between 2 and the number of rows of 'x': %d.", nr))
}
} else if(!is.null(maxbp) && maxbp > 1){
# has position and good maxbp, will init it latter, maxk will be estimated there
soj<-list(J=J, maxbp=maxbp, type = soj.type)
} else if (!is.null(maxk) && is.numeric(maxk) && (length(maxk) == 1) && (maxk > 1) && (maxk < nr)) {
#has pos, but no good maxbp
warning('Has positions but no maxbp, using maxk for the sojourn distribution !!!')
soj<-list(type = soj.type, J=J, maxk=maxk)
} else {
stop(sprintf("Both maxk and maxbp are not available!"))
}
} else {
# no good J
stop("J must be specified or estimated from xAnno !!")
}
# so far, soj is a list object, depending on which case
# case1, soj param J, maxbp from xAnno
# case2, no pos, has input maxk and J and d, ready for initSojDd # may reinitialize maxk later
# case3, has input xPos and maxbp
# check mv vs iterative
if (emis.type=='mvnorm' || emis.type=='mvt' ){
iterative<-FALSE
} else {
iterative<-TRUE
}
## build xRange if not a GRanges for the returning object
if(is.null(xRange) || !is(xRange, "GRanges")){
if(!is.null(xRange) && is(xRange, "IRanges")){
xRange<-GRanges(seqnames='sampleseq', xRange)
} else if(!is.null(xPos)){
xRange<-GRanges(seqnames='sampleseq', IRanges(start=xPos, width=1))
} else {
xRange<-GRanges(seqnames='sampleseq', IRanges(start=seq_len(nr), width=1))
}
}
# get seqnames status
seqs<-unique(as.character(seqnames(xRange))) # speed gain for large no. of contig
### pre calc emis if common prior for all seq, done outside of seqname loop
message("Estimating common emission prior ...")
if(com.emis){
if(iterative){
if(spRle){
emis<-lapply(seq_len(ncol(x)), function(c) estEmis(x[,c][[1]], J=J, emis.type=emis.type, prior.m=prior.m, q.alpha=q.alpha, r.var=r.var))
} else {
emis<-lapply(seq_len(ncol(x)), function(c) estEmis(x[,c], J=J, emis.type=emis.type, prior.m=prior.m, q.alpha=q.alpha, r.var=r.var))
}
} else {
if(spRle){
stop('Rle like structure is not currently supported for mvnorm / mvt!')
} else {
emis<-lapply(unique(grp), function(g) estEmis(x[,grp==g], J=J, emis.type=emis.type, prior.m=prior.m, q.alpha=q.alpha, r.var=r.var))
}
names(emis)<-unique(grp)
}
}
#make it parallel
if(useMC & length(find.package('parallel', quiet=T))==0) {
warning("'parallel' is not found, use normal 'lapply' function!!!")
useMC<-FALSE
mylapply<-lapply
} else if (useMC){
require(parallel)
mylapply<-mclapply
} else {
mylapply<-lapply
}
# we have more than one seq to batch
mcres<-mylapply(seq_along(seqs), function(s) {
r<-which(as.character(seqnames(xRange)) == seqs[s])
if(length(r)<2){
warning('Region too short for seq ', s, ' skipped!')
if(iterative){
runout<- sapply(seq_len(nc), function(c) list(NA))
} else {
runout<- sapply(unique(grp), function(g) list(NA))
}
} else {
runout<-list()
# prep soj for the c loop, since there are multiple seq, which also means there must be xpos and maxbp
message(sprintf("Preparing sojourn prior for seq '%s' ...", seqs[s]))
if(is.null(soj$maxk)){
# either has soj parameter, or has pos and maxbp for unif
ssoj<-append(soj, initDposV(xPos[r], maxbp))
if(is.null(ssoj$fttypes)){
# dont't have soj param
ssoj<-append(ssoj, list(d=unifMJ(ssoj$maxk*length(r), J)))
}
} else {
# maxk and unif d, no maxbp or xAnno, using the min of input maxk and length of the seq
ssoj<-append(soj, list(d=unifMJ(min(soj$maxk,length(r)), J)))
ssoj$maxk<-min(soj$maxk,length(r))
}
ssoj <- initSojDd(ssoj)
for(g in unique(grp)){
gi<-grp==g
if(iterative){
for(ci in which(gi)){
message(sprintf("Building HSMM for seq '%s' in column '%s' ...", seqs[s], ci))
if(com.emis) {
semis<-emis[[ci]]
}else {
if(spRle){
semis<-estEmis(x[,ci][[1]][r], J=J, prior.m=prior.m, emis.type=emis.type, q.alpha=q.alpha, r.var=r.var)
} else {
semis<-estEmis(x[r,ci], J=J, prior.m=prior.m, emis.type=emis.type, q.alpha=q.alpha, r.var=r.var)
}
}
if(spRle){
grunout<-tryCatch(hsmmRun(x[,ci][[1]][r], xid[ci], xRange[r], ssoj, semis, cMethod, maxit, maxgap, tol, avg.m=avg.m, trim=trim, na.rm=na.rm, com.emis=com.emis), error=function(e){ return(e) })
} else {
grunout<-tryCatch(hsmmRun(x[r,ci], xid[ci], xRange[r], ssoj, semis, cMethod, maxit, maxgap, tol, avg.m=avg.m, trim=trim, na.rm=na.rm, com.emis=com.emis), error=function(e){ return(e) })
}
if(length(grep('error', class(grunout), ignore.case=T))!=0){
warning('Model failed for seq ', s, ' and column ', c, '.\n', grunout)
runout<-append(runout, list(NA))
} else {
runout<-append(runout, list(grunout) )
}
}
} else {
message(sprintf("Building HSMM for group '%s' ...", g))
if(com.emis) {
semis<-emis[[g]]
}else {
if(spRle){
stop('Rle like structure is not currently supported for mvnorm / mvt!')
} else {
semis<-estEmis(x[r,gi], J=J, prior.m=prior.m, emis.type=emis.type, q.alpha=q.alpha, r.var=r.var)
}
}
grunout<-tryCatch(hsmmRun(x[r,gi], xid[gi], xRange[r], ssoj, semis, cMethod, maxit, maxgap, tol, avg.m=avg.m, trim=trim, na.rm=na.rm, com.emis=com.emis) , error=function(e){ return(e) })
if(length(grep('error', class(grunout), ignore.case=T))!=0){
warning('Model failed for seq ', s, ' and group ', g, '.\n', grunout)
runout<-append(runout, list(NA))
} else {
runout<-append(runout, list(grunout))
}
}
} # end for g
} # end if
return(runout)
})
message("Building HSMM complete, preparing output ...")
res<-do.call(c, lapply(seq_along(seqs), function(s){
do.call(c, lapply(seq_len(ifelse(iterative, nc, length(unique(grp)))), function(g){
if(!is.na(mcres[[s]][[g]][1])){
return(mcres[[s]][[g]]$res)
} else {
return(GRanges())
}
}))
}))
seqr<-table(seqnames(xRange))
ssp<-do.call(rbind, lapply(seq_along(seqs), function(s){
do.call(cbind, lapply(seq_len(ifelse(iterative, nc, length(unique(grp)))), function(g){
if(!is.na(mcres[[s]][[g]][1])){
return(DataFrame(s=mcres[[s]][[g]]$yhat, sp=mcres[[s]][[g]]$yp))
} else {
return(DataFrame(s=Rle(NA, seqr[seqs[s]]), sp=Rle(NA, seqr[seqs[s]])))
}
}))
}))
emis.par<-do.call(rbind, lapply(seq_along(seqs), function(s){
do.call(c, lapply(seq_len(ifelse(iterative, nc, length(unique(grp)))), function(g){
if(!is.na(mcres[[s]][[g]][1])){
return(list(mcres[[s]][[g]]$emispar))
} else {
return(list(NA))
}
}))
}))
rownames(emis.par)<-seqs
colnames(emis.par)<-if(iterative) xid else unique(grp)
soj.par<-do.call(rbind, lapply(seq_along(seqs), function(s){
do.call(c, lapply(seq_len(ifelse(iterative, nc, length(unique(grp)))), function(g){
if(!is.na(mcres[[s]][[g]][1])){
return(list(mcres[[s]][[g]]$sojpar))
} else {
return(list(NA))
}
}))
}))
rownames(soj.par)<-seqs
colnames(soj.par)<-if(iterative) xid else unique(grp)
# setup input data and state to xRange for returning
values(xRange)<-DataFrame(do.call(DataFrame, lapply(seq_len(nc), function(c) x[,c])), ssp, row.names = NULL)
colnames(mcols(xRange)) <- c(xid, paste(rep(c('S', 'SP'), ifelse(iterative, nc, length(unique(grp)))), rep(if(iterative) xid else unique(grp), each=2), sep='.'))
new("biomvRCNS",
x = xRange, res = res,
param=list(J=J, maxk=maxk, maxbp=maxbp, maxgap=maxgap, soj.type=soj.type, emis.type=emis.type, q.alpha=q.alpha, r.var=r.var, iterative=iterative, cMethod=cMethod, maxit=maxit, tol=tol, grp=grp, cluster.m=cluster.m, avg.m=avg.m, prior.m=prior.m, trim=trim, na.rm=na.rm, soj.par=soj.par, emis.par=emis.par)
)
}
hsmmRun<-function(x, xid='sampleid', xRange, soj, emis, cMethod='F-B', maxit=1, maxgap=Inf, tol= 1e-6, avg.m='median', trim=0, na.rm=TRUE, com.emis=FALSE){
# now x should be a matrix, when emis.type=mvt or mvnorm, ncol>1
if(is.null(dim(x))) x<-matrix(as.vector(x))
colnames(x)<-xid
nr<-nrow(x)
J<-soj$J
maxk<-soj$maxk
s<-unique(as.character(seqnames(xRange)))
# create default uniform initial probability
init<-rep(1/J, J) # start with uniform
# create default uniform transition probability
trans <- matrix(1/(J-1), nrow = J, ncol=J)
diag(trans)<-0
#estimation of most likely state sequence
#define likelihood
ll <- rep(NA,maxit)
# start MM iteration
for(it in 1:maxit) {
message(sprintf("[ hsmmRun ] seq '%s' column '%s' iteration: %d", s, paste(xid, collapse='+'), it))
# reestimationg of emmision
emis<-initEmis(emis=emis, x=x)
B = .C("backward", a=as.double(trans), pi=as.double(init), b=as.double(emis$p), d=as.double(soj$d), D=as.double(soj$D),
maxk=as.integer(maxk), DL=as.integer(nrow(soj$d)), T=as.integer(nr), J=as.integer(J),
eta = double(nrow(soj$d)*J), L=double(nr*J), N=double(nr), ahat=double(J*J), pihat=double(J),
F=double(nr*J), G=double(nr*J), L1 = double(nr*J), si=double(nr*J), PACKAGE='biomvRCNS')
#update initial prob PI, transition >=0 check
# init<-B$pihat
init<- abs(B$pihat)/sum(abs(B$pihat))
trans <- matrix(B$ahat,ncol=J)
trans[trans<0] <- 0
#check B$L
if(all(is.nan(B$L))) {
stop("Sojourn distribution does not work well, NaN in B$L ")
}
#update emission according to the new estimated distribution parameters using B$L
if(!com.emis){
emis<-initEmis(emis=emis, x=x, B=B)
}
# update sojourn dD, using B$eta
soj<-initSojDd(soj=soj, B=B)
# log-likelihood for this it, using B$N
ll[it]<-sum(log(B$N))
if( it>1 && abs(ll[it]-ll[it-1]) < tol) {
break()
}
} # end for maxit
BL<-matrix(B$L,ncol=J)
BL<-t(apply(BL, 1, function(x) abs(x)/sum(abs(x))))
# switch cMethod
if (cMethod=='Viterbi'){
emis<-initEmis(emis=emis, x=x)
logtrans<-log(trans)
logtrans[logtrans==-Inf] <- -.Machine$double.xmax
loginit<-log(init)
loginit[loginit==-Inf] <- -.Machine$double.xmax
logd = log(soj$d)
logd[logd==-Inf] <- -.Machine$double.xmax
logD = log(soj$D)
logD[logD==-Inf] <- -.Machine$double.xmax
logb<-log(emis$p)
logb[logb==-Inf] <- -.Machine$double.xmax
V = .C("logviterbi", a=as.double(logtrans), pi=as.double(loginit), b=as.double(logb), d=as.double(logd), D=as.double(logD),
maxk=as.integer(maxk), DL=as.integer(nrow(soj$d)), T=as.integer(nr), J=as.integer(J),
alpha = double(nr*J), shat=integer(nr), si=double(nr*J), opt=integer(nr*J), ops=integer(nr*J), PACKAGE='biomvRCNS')
yhat<-V$shat+1
} else if (cMethod=='F-B'){
## assign states and split if necessary.
yhat<-apply(BL, 1, which.max)
}
yp<-Rle(as.vector(BL)[(yhat-1)*nr+1:nr])
if(!is.null(soj$fttypes)){
yhat<-soj$fttypes[yhat]
}
yhat<-Rle(as.character(yhat))
# setup this new res gr
Ilist<-lapply(unique(yhat), function(j) do.call(cbind, splitFarNeighbouryhat(yhat, xRange=xRange, maxgap=maxgap, state=j)))
names(Ilist)<-unique(yhat)
res<- do.call('c',
lapply(unique(yhat), function(j)
GRanges(seqnames=as.character(seqnames(xRange)[1]),
IRanges(start=rep(start(xRange)[Ilist[[j]][,'IS']], length(xid)), end=rep(end(xRange)[Ilist[[j]][,'IE']], length(xid))),
strand=rep(strand(xRange)[Ilist[[j]][,'IS']], length(xid)),
SAMPLE=Rle(xid, rep(nrow(Ilist[[j]]), length(xid))),
STATE=Rle(as.character(j), nrow(Ilist[[j]])*length(xid)),
AVG=Rle(as.numeric(sapply(xid, function(s) apply(Ilist[[j]], 1, function(r) apply(as.matrix(x[r[1]:r[2],s]), 2, avgFunc, avg.m=avg.m, trim=trim, na.rm=na.rm)))))
)
)
)
seqlevels(res)<-seqlevels(xRange)
# return the soj. and emis par, maybe also the estimated state emis$p
emispar2ret<-c('mu', 'var', 'size', 'df')
ei<-names(emis) %in% emispar2ret
sojpar2ret<-c('lambda', 'shift', 'mu', 'size', 'scale', 'shape')
si<-names(soj) %in% sojpar2ret
return(list(yhat=yhat, res=res, yp=yp, emispar=emis[ei], sojpar=soj[si]))
}
##################################################
# HSMM helper functions
##################################################
##################################################
# create lists of uniform prior
##################################################
unifMJ<-function(M,J, ints=NULL){
# not finished create unif with supplied ints
if(is.null(ints)){
ret<-do.call(cbind, lapply(1:J, function(j) dunif(1:M, 0, M)))
} else if(min(ints)<1 | max(ints)> M) {
stop('All supplied intervals should be in the range of [1, M]')
} else {
# ints, a dataframe marks the starts and ends for each interval
ret<-do.call(cbind, lapply(1:J, function(j) do.call(c, lapply(1:nrow(ints), function(i) dunif(1:M, ints[i,1], ints[i,2])))))
}
ret
}
##################################################
# estimate sojourn distribution from annotation using gamma
##################################################
sojournAnno<-function(xAnno, soj.type= 'gamma', pbdist=NULL){
# xAnno has to be a Grange / rangedata obj
# check if xAnno class, txdb or dataframe or rangedata ...
# there is also the possibility of proposing an empirical number for the states.
# must ensure there are at least 2 for each state ? todo
if(is(xAnno, "TxDb")) {
if(length(find.package('GenomicFeatures', quiet=T))==0) {
stop("'GenomicFeatures' is not found !!!")
} else {
require(GenomicFeatures)
}
J<-3
fttypes<-c('intergenic', 'intron', 'exon')
#3 feature type, exon, intron, intergenic
transc <- transcripts(xAnno) # this give you all cds ranges ungrouped
intergenic<-gaps(transc)
# gaps() will by default produce extra * ranges and full range for empty chr
# https://stat.ethz.ch/pipermail/bioconductor/2013-May/052976.html
intergenic<-intergenic[strand(intergenic)!='*']
intergenic<-intergenic[which(width(intergenic) != seqlengths(intergenic)[as.character(seqnames(intergenic))])]
exon <- exons(xAnno) # this give you all exon ranges ungroupped
intron<- unlist(intronsByTranscript(xAnno))
ftdist<-list(intergenic=width(intergenic), intron=width(intron), exon=width(exon))
} else if(is(xAnno, "GRanges")){
# then the first column of the elementMetadata must be a character vector marking the type of features.
# need a way to sort
fts<-values(xAnno)[,1]
fttypes<- unique(fts)
J<-length(fttypes)
ftdist<-lapply(1:J, function(j) width(xAnno[fts==fttypes[j]]))
} else if (is(xAnno, "GRangesList")){
ng<-length(xAnno)
## the assumptions are number of ft types could be different from different list entry, group wise analysis, which of coz could be wrapped using foreach(ng) single group approach
fts<-lapply(1:ng, function(g) values(xAnno[[g]])[,1])
fttypesL<- lapply(fts, function(x) unique(x[order(x)]))
J<-length(unique(unlist(fts)))
ftdist<-lapply(1:ng, function(g) lapply(1:length(fttypesL[[g]]), function(j) width(xAnno[[g]][fts[[g]]==fttypesL[[g]][j]])))
fttypes<-unique(unlist(fttypesL))
fttypes<-fttypes[order(fttypes)]
ftdist<-lapply(fttypes, function(t) unlist(lapply(1:ng, function(g) ftdist[[g]][[match(t, fttypesL[[g]])]])))
} else if (is.list(xAnno)){
# checking if input list has valid specs.
paramID<- names(xAnno)
#check name fall into the right pool
paramIDok<- switch(soj.type,
nbinom = all( paramID %in% c('mu', 'size', 'shift')),
pois = all( paramID %in% c('lambda', 'shift')),
gamma = all( paramID %in% c('scale', 'shape')),
stop("Invalid argument value for 'soj.type'! ")
)
#check length of vectors match
J<-unique(sapply(xAnno, length))
if(length(J)!=1) {
stop("Length of vectors in xAnno are not equal, can't get valid state number J!")
}
soj<-append(xAnno, list(type = soj.type, J=J))
return(soj)
}
# should switch here between different soj.type
soj<-list(type = soj.type, fttypes=fttypes, J=J)
if(soj.type=='gamma'){
shape<-numeric()
scale<-numeric()
for(j in 1:J){
param<-gammaFit(ftdist[[j]])
if(! is.null(pbdist)){
# if distance between points are even
param['scale'] <- param['scale'] / pbdist
}
shape<-c(shape, param['shape'])
scale<-c(scale, param['scale'])
}
soj<-append(soj, list(shape=unname(shape), scale=unname(scale)) )
} else if (soj.type == 'nbinom'){
size<-numeric()
mu<-numeric()
shift<-numeric()
for(j in 1:J){
param<-nbinomFit(ftdist[[j]])
if(! is.null(pbdist)){
# if distance between points are even
param['mu'] <- param['mu'] / pbdist
}
size<-c(size, param['size'])
mu<-c(mu, param['mu'])
shift<-c(shift, param['shift'])
}
soj<-append(soj, list(size=unname(size), mu=unname(mu), shift=unname(shift)) )
} else if (soj.type == 'pois'){
lambda<-numeric()
shift<-numeric()
for(j in 1:J){
param<-poisFit(ftdist[[j]])
if(! is.null(pbdist)){
# if distance between points are even
param['lambda'] <- param['lambda'] / pbdist
}
lambda<-c(lambda, param['lambda'])
shift<-c(shift, param['shift'])
}
soj<-append(soj, list(lambda=unname(lambda), shift=unname(shift)) )
}
#return soj object
return(soj)
}
initDposV<-function(xpos, maxbp){
# for each position, find the maxk
nr<-length(xpos)
maxbpidx<-sapply(1:nr, function(i) max(which(xpos[i]+maxbp > xpos))) # >= will cause a NA at the end of each position,
# find the maxk idx
maxk<-max(maxbpidx - seq_len(nr))+1
# initialize the maxk position list for each position
## option 2, a TM * J matrix
dposV<-c(sapply(1:nr, function(t) xpos[t:(t+maxk-1)]-xpos[t]))+1 # dposV[(t-1)*maxk+u]
# sub > maxbp and NA
dposV[which(dposV>maxbp)]<-NA
return(list(dposV=dposV, maxk=maxk))
}
initSojDd <- function(soj, B=NULL) {
# take the initial soj dist parameter / or sample of density
if(! soj$type %in% c('nparam', 'gamma', 'pois', 'nbinom')) stop("invalid sojourn type found in soj$type !")
# parameter initialisation
J<-soj$J
maxk<-soj$maxk
if(is.null(soj$dposV)){
dposV<-1:maxk
} else {
## here d for all positions should be aggregated within each J
dposV<-soj$dposV
}
idx<- !is.na(dposV)
dposV[!idx]<- .Machine$integer.max
nb <- length(dposV)/maxk
if(soj$type == "gamma") {
if(!is.null(B)){
# then this is an update run
soj$d <- matrix(B$eta+.Machine$double.eps,ncol=J)
soj$shape <- soj$scale <- numeric(J)
for(j in 1:J) {
param <- gammaFit(dposV[idx],wt=soj$d[idx,j])
soj$shape[j] <- param['shape']
soj$scale[j] <- param['scale']
}
}
if(!is.null(soj$shape) && !is.null(soj$scale) && length(soj$shape)==J && length(soj$scale)==J) {
# for update with para estimated from B, or initial sojourn using param
soj$d<-sapply(1:J, function(j) dgamma(dposV, shape=soj$shape[j], scale=soj$scale[j]))
} # else assume soj$d exist.
} else if (soj$type == "pois") {
if(!is.null(B)){
# then this is an update run, re-estimation of dist params
soj$d <- matrix(B$eta+.Machine$double.eps,ncol=J)
soj$shift <- soj$lambda <- numeric(J)
ftidx<-apply(soj$d, 2, function(xc) xc >.Machine$double.eps & idx)
maxshift<- sapply(
lapply(1:J, function(j) which(ftidx[,j]) %% maxk ),
function(ic) min(dposV[seq(from=min(ic[ic>0]), to=nb*maxk, by=maxk)])
)
# cat(maxshift, '\n')
for(j in 1:J) {
param <- poisFit(dposV[ftidx[,j]], wt=soj$d[ftidx[,j],j], maxshift=maxshift[j])
soj$lambda[j] <- param['lambda']
soj$shift[j] <- param['shift']
}
}
if(!is.null(soj$shift) && !is.null(soj$lambda) && length(soj$lambda)==J && length(soj$shift)==J) {
# for update with para estimated from B, or initial sojourn using param
soj$d<-sapply(1:J, function(j) dpois(dposV-soj$shift[j], lambda=soj$lambda[j]))
} # else assume soj$d exist.
} else if (soj$type == "nbinom") {
if(!is.null(B)){
# then this is a update run, re-estimation of dist params
soj$d <- matrix(B$eta+.Machine$double.eps,ncol=J)
soj$shift <- soj$size <- soj$mu <- numeric(J)
ftidx<-apply(soj$d, 2, function(xc) xc >.Machine$double.eps & idx)
maxshift<- sapply(
lapply(1:J, function(j) which(ftidx[,j]) %% maxk ),
function(ic) min(dposV[seq(from=min(ic[ic>0]), to=nb*maxk, by=maxk)])
)
for(j in 1:J) {
param <- nbinomFit(dposV[ftidx[,j]],wt=soj$d[ftidx[,j],j], maxshift=maxshift[j])
soj$size[j] <- param['size']
soj$mu[j] <- param['mu']
soj$shift[j] <- param['shift']
}
}
if(!is.null(soj$shift) && !is.null(soj$size) && !is.null(soj$mu) && length(soj$mu)==J && length(soj$shift)==J && length(soj$size)==J) {
# for update with para estimated from B, or initial sojourn using param
soj$d<-sapply(1:J, function(j) dnbinom(dposV-soj$shift[j],size=soj$size[j],mu=soj$mu[j]) )
} # else assume soj$d exist.
} else if (soj$type == "nparam") {
if(!is.null(B)){
soj$d <- matrix(B$eta+.Machine$double.eps, ncol=J)
} # else assume soj$d exist.
}
soj$d<-sapply(1:J, function(j) sapply(1:nb, function(t) soj$d[((t-1)*maxk+1):(t*maxk),j]/sum(soj$d[((t-1)*maxk+1):(t*maxk),j], na.rm=T)))
soj$d[is.na(soj$d)]<-0 # NaN enters when all 0 in the den when normalizing
# add D slot
soj$D <- sapply(1:J, function(j) sapply(1:nb, function(t) rev(cumsum(rev(soj$d[((t-1)*maxk+1):(t*maxk),j])))))
return(soj)
}
##################################################
# to estimate emission par
##################################################
estEmis<-function(x, J=3, prior.m='quantile', emis.type='norm', q.alpha=0.05, r.var=0.75){
if(is.null(dim(x))) x<-matrix(as.vector(x))
emis<-list(type=emis.type)
if (prior.m == 'cluster'){
xclust<-clara(x, J)
if(ncol(x)>1){
if(emis$type == 'norm' || emis$type== 'mvnorm' || emis$type == 'mvt') {
emis$var<-lapply(order(xclust$medoids[,1]), function(j) cov(x[xclust$clustering==j,]))
}
emis$mu<-lapply(order(xclust$medoids[,1]), function(j) xclust$medoids[j,])
} else {
if(emis$type == 'norm' || emis$type== 'mvnorm' || emis$type == 'mvt') {
emis$var<-sapply(order(xclust$medoids), function(j) var(x[xclust$clustering==j,]))
emis$var[is.na(emis$var)]<-mean(emis$var[!is.na(emis$var)])
}
emis$mu<-as.numeric(xclust$medoids)[order(xclust$medoids)]
}
}
# old quantile method
if(prior.m == 'quantile'){
emis$mu <- estEmisMu(x, J, q.alpha=q.alpha)
if(emis$type == 'norm' || emis$type== 'mvnorm' || emis$type == 'mvt') {
emis$var <- estEmisVar(x, J, r.var=r.var)
}
}
if (emis$type == 'nbinom'){
emis$size <- rep(nbinomCLLDD(x)$par[1], J) # common prior
}
if (emis$type == 't' || emis$type == 'mvt'){
emis$df <- rep(1, J) # common prior
}
return(emis)
}
##################################################
# to estimate segment wise mean vector/list
##################################################
estEmisMu<- function(x, J, q.alpha=0.05, na.rm=TRUE){
nc<-ncol(x)
if(J >1){
if(is.null(nc) || nc == 1){
# univariate
ret<-as.numeric(quantile(x, seq(from=q.alpha, to=(1-q.alpha), length.out=J), na.rm=na.rm))
} else {
# multiple
muv<-t(sapply(1:nc, function(i) as.numeric(quantile(x[,i], seq(from=q.alpha, to=(1-q.alpha), length.out=J), na.rm=na.rm))))
ret<-lapply(apply(muv, 2, list),unlist)
}
} else {
ret<-mean(x, na.rm=na.rm)
}
return(ret)
}
##################################################
# to estimate segment wise variance vector / covariance matrix list
##################################################
estEmisVar<-function(x, J=3, na.rm=TRUE, r.var=0.75){
# r.var is the expected ratio of variance for state 1 and J versus any intermediate states
# a value larger than 1 tend to give more extreme states; a value smaller than 1 will decrease the probability of having extreme state, pushing it to the center.
nc<-ncol(x)
f.var<-rep(ifelse(r.var>=1, 1, r.var), J)
if(J%%2 == 1) {
f.var[(J+1)/2]<-ifelse(r.var>=1, 1/r.var, 1)
} else if( J>2 ){
f.var[(J/2):(J/2+1)]<-ifelse(r.var>=1, 1/r.var, 1)
}
if(J >1){
if(is.null(nc) || nc == 1){
# univariate
ret<-var(x, na.rm=na.rm)*f.var
} else {
# multiple
if(na.rm) na.rm<-'complete.obs'
ret<-lapply(1:J, function(j) cov(x, use=na.rm)*f.var[j])
}
} else {
ret<-var(as.numeric(x), na.rm=na.rm)
}
return(ret)
}
##################################################
# initialize and update emission probability
##################################################
initEmis<-function(emis, x, B=NULL){
if(is.null(B)){
# then this is for p initialization
J<-length(emis$mu)
if(emis$type == 'mvnorm') {
emis$p <- sapply(1:J, function(j) dmvnorm(x, mean = emis$mu[[j]], sigma = emis$var[[j]])) # here sigma requires cov mat
} else if (emis$type == 'pois'){
emis$p <-sapply(1:J, function(j) dpois(x, lambda=emis$mu[j]))
} else if (emis$type == 'norm'){
emis$p <- sapply(1:J, function(j) dnorm(x,mean=emis$mu[j], sd=sqrt(emis$var[j]))) # here is sd
} else if(emis$type == 'nbinom'){
emis$p <- sapply(1:J, function(j) dnbinom(x,size=emis$size[j], mu=emis$mu[j]))
} else if(emis$type == 'mvt') {
emis$p <- sapply(1:J, function(j) dmvt(x, df=emis$df[[j]], delta = emis$mu[[j]], sigma = cov2cor(emis$var[[j]]), log=FALSE)) # here sigma requires cor mat
} else if(emis$type == 't') {
emis$p <- sapply(1:J, function(j) dt(x, df=emis$df[j], ncp = emis$mu[j]))
}
emis$p<-emis$p/rowSums(emis$p) # normalized
} else {
# then this is for the re-estimation of emis param
J<-B$J
BL<-matrix(B$L,ncol=J)
BL<-t(apply(BL, 1, function(x) abs(x)/sum(abs(x))))
if(emis$type == 'mvnorm') {
isa <- !apply(is.na(x),1,any) # Find rows with NA's (cov.wt does not like them)
tmp <- apply(BL, 2, function(cv) cov.wt(x[isa, ], cv[isa])[c('cov', 'center')]) # x is already a matrix
emis$mu <- lapply(tmp, function(l) unname(l[['center']]))
emis$var <- lapply(tmp, function(l) unname(l[['cov']]))
} else if (emis$type == 'pois'){
isa <- !is.na(x)
emis$mu <- apply(BL, 2, function(cv) weighted.mean(matrix(x[isa]), cv[isa]))
} else if (emis$type == 'norm'){
isa <- !is.na(x)
tmp <- apply(BL, 2, function(cv) unlist(cov.wt(matrix(x[isa]), cv[isa])[c('cov', 'center')]))
emis$mu <- tmp['center',]
emis$var <- tmp['cov', ]
} else if(emis$type == 'nbinom'){
isa <- !is.na(x)
tmp <- apply(BL, 2, function(cv) nbinomFit((x[isa]), cv[isa]))
emis$mu <- tmp['mu',]
emis$size <- tmp['size', ]
} else if(emis$type == 'mvt') {
isa <- !apply(is.na(x),1,any)
tmp <- apply(BL, 2, function(cv) tmvtfFit((x[isa, ]), cv[isa]))
emis$mu <- lapply(tmp, function(l) l[['mu']])
emis$df <- lapply(tmp, function(l) l[['df']])
emis$var <- lapply(tmp, function(l) l[['var']])
} else if(emis$type == 't') {
isa <- !is.na(x)
tmp <- apply(BL, 2, function(cv) tmvtfFit((x[isa]), cv[isa]))
emis$mu <- sapply(tmp, function(l) l[['mu']])
emis$df <- sapply(tmp, function(l) l[['df']])
}
}
return(emis)
}
splitFarNeighbouryhat<-function(yhatrle, xPos=NULL, xRange=NULL, maxgap=Inf, state=NULL){
# yhatrle<-Rle(yhat)
rv<-runValue(yhatrle)
rl<-runLength(yhatrle)
ri<-which(rv==as.character(state))
intStart<-sapply(ri, function(z) sum(rl[seq_len(z-1)])+1)
intEnd<-sapply(ri, function(z) sum(rl[seq_len(z)]))
if(length(intStart)>0 && !is.null(maxgap) && !is.null(xPos) || !is.null(xRange)){
tmp<-splitFarNeighbour(intStart=intStart, intEnd=intEnd, xPos=xPos, xRange=xRange, maxgap=maxgap)
intStart<-tmp$IS
intEnd<-tmp$IE
}
return(list(IS=intStart, IE=intEnd))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.