#' @importFrom gUtils streduce si2gr seg2gr rrbind ra.overlaps ra.duplicated parse.gr hg_seqlengths grl.unlist grl.pivot grl.in grl.eval grl.bind grbind gr2dt gr.val gr.tile.map gr.tile
#' @importFrom gUtils gr.stripstrand gr.sum gr.string gr.start gr.end gr.simplify gr.setdiff gr.sample gr.reduce gr.rand gr.quantile gr.nochr
#' @importFrom gUtils gr.match gr.in gr.flipstrand gr.fix gr.findoverlaps gr.duplicated gr.dist gr.disjoin gr.breaks dt2gr "%^%" "%Q%" "%&%" "%$%"
#' @importFrom gGnome gG balance
#' @importFrom GenomicRanges GRanges GRangesList values split match setdiff
#' @importFrom gTrack gTrack
#' @importFrom igraph graph induced.subgraph V E graph.adjacency clusters
#' @importFrom optparse make_option OptionParser parse_args print_help
#' @importFrom data.table data.table as.data.table setnames setkeyv fread fwrite setkey
#' @importFrom Matrix which rowSums colSums Matrix sparseMatrix t diag
#' @importFrom parallel mclapply
#' @importFrom gplots col2hex
#' @importFrom graphics plot abline hist title
#' @importFrom grDevices col2rgb dev.off pdf png rgb
#' @importFrom stats C aggregate dist loess median ppois predict runif setNames hclust cutree acf glm ks.test lag quantile
#' @importFrom utils read.delim write.table
#' @importFrom sequenza segment.breaks baf.model.fit get.ci
#' @importFrom rtracklayer import
#' @importFrom GenomeInfoDb seqlengths
#' @importFrom DNAcopy CNA segment smooth.CNA
#' @importFrom methods as is
#' @useDynLib JaBbA
## appease R CMD CHECK misunderstanding of data.table syntax by declaring these global variables
low.count=high.count=seg=chromosome=alpha_high=alpha_low=beta_high=beta_low=predict.var=dup=psid=.N=es=res=esid=pid=ub=lb=y=es.s.ix=km=es.t.ix=adjusted.ratio=ref.count.t=alt.count.t=depth.normal=depth.tumor=good.reads=zygosity.normal=Bf=ALT=alt.count.n=bad=both.na=chr.a=chr.b=cn=eid=FILTER=force.in=FORMAT=from=from.cn=from.remain=from1=from2=GENO=grl.ix=gstr=i=id=ID=INFO=is.ref=j=label=mean.a=mean.b=mstr=nbins=nothing=oppo=ord=out=QUAL=ra=ra1.ix=ra2.ix=ref.count.n=ref.frac.n=reid=str1=str2=subid=this.cn=to=to.cn=to.remain=to1=to2=type=V1=var=first=last=sl=leix=lcn=NULL
.onLoad <- function(libname, pkgname) {
op <- options()
op.JaBbA <- list(
JaBbA.verbose = TRUE,
JaBbA.reference = "hg19"
## devtools.path = "~/R-dev",
## devtools.install.args = "",
## devtools.name = "Your name goes here",
## devtools.desc.author = "First Last <first.last@example.com> [aut, cre]",
## devtools.desc.license = "What license is it under?",
## devtools.desc.suggests = NULL,
## devtools.desc = list()
)
toset <- !(names(op.JaBbA) %in% names(op))
if(any(toset)) options(op.JaBbA[toset])
invisible()
}
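## users can override these defaults in their session, e.g.
## options(JaBbA.reference = "hg38", JaBbA.verbose = FALSE)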
#' @name JaBbA
#' @title JaBbA
#' @description
#' Main function to invoke junction balance analysis (JaBbA). Two input arguments are required: junctions and coverage. The complete result is the gGraph saved as "jabba.simple.gg.rds" in the output directory.
#'
#' @param junctions rearrangement junctions (i.e. breakpoint pairs with orientations). Supports BEDPE, BND VCF formats, Junction objects defined in gGnome, and GRangesList objects. If providing a GRangesList, a breakpoint's strand must be "+" if the junction fuses the side with larger coordinates and "-" if it fuses the side with smaller coordinates
#' @param coverage high-density read coverage data of constant-width genomic bins. Supports BED, BigWig, delimited text formats, and GRanges object
#'
#' @param juncs.uf supplement junctions in the same format as \code{junctions}
#' @param blacklist.junctions rearrangement junctions to be excluded from consideration
#' @param whitelist.junctions rearrangement junctions to be forced to be incorporated
#' @param geno logical whether the input junctions have a GENO field in their metadata. If so, the name argument is matched to the corresponding sample; junctions positive in that sample become tier 2, the others tier 3.
#' @param indel character, decision to "exclude" or "include" small (< min.nbins * coverage bin width) isolated INDEL-like events in the model. Default NULL, do nothing.
#' @param cfield character, junction confidence meta data field in ra
#' @param tfield character, tier confidence meta data field in ra. tiers are 1 = must use, 2 = may use, 3 = use only in iteration>1 if near loose end. Default "tier".
#' @param reiterate integer scalar specifying how many extra re-iterations are allowed, to rescue lower-confidence junctions near loose ends. Default 0. This requires junctions to be tiered via a metadata field tfield.
#' @param rescue.window integer window size in bp within which to look for lower confidence junctions. Default 1000.
#' @param rescue.all logical whether to rescue junctions around all loose ends regardless of their quality annotation; if FALSE only loose ends that pass filtering (requires filter_loose = TRUE) are used. Default TRUE.
#' @param nudge.balanced logical whether to attempt to add a small incentive for chains of quasi-reciprocal junctions.
#' @param thresh.balanced numeric maximum distance between a pair of reciprocal junctions. Default 500.
#' ## TODO think whether this is the reason balanced junction pairs tend to have too many copies
#' @param edgenudge numeric hyper-parameter of how much to nudge or reward aberrant junction incorporation. Default 0.1 (should be several orders of magnitude lower than average 1/sd on individual segments), a nonzero value encourages incorporation of perfectly balanced rearrangements which would be equivalently optimal with 0 copies or more copies.
#' @param strict logical flag specifying whether to only include junctions that exactly overlap segs
#' @param all.in logical whether to include all junctions (except tier 3 INDEL-like events) at once from the first iteration
#'
#' @param field name of the metadata column of coverage that contains the data. Default "ratio" (coverage ratio between tumor and normal). If using dryclean, usually it is "foreground".
#' @param seg optional path to existing segmentation, if missing then will segment coverage using DNAcopy with standard settings
#' @param nseg optional path to normal seg file with $cn meta data field
#' @param hets optional path to hets.file which is tab delimited text file with fields seqnames, start, end, alt.count.t, ref.count.t, alt.count.n, ref.count.n
#' @param purity cellularity value of the sample
#' @param ploidy ploidy value of the sample (segment length weighted copy number)
#' @param pp.method character, one of "ppgrid" or "sequenza", used to infer purity and ploidy when not both given. Default "sequenza".
#' @param min.nbins integer minimum number of bins of coverage of a segment
#' @param max.na numeric between (0, 1), any vertex with more than this proportion missing coverage data is ignored
#' @param blacklist.coverage GRanges marking regions of the genome where coverage is unreliable
#' @param cn.signif numeric (0, 1), significance level of CN change point when seg is not given (the larger the more sensitive),
#' `alpha` parameter in DNAcopy::segment, default 1E-5
#'
#' @param slack.penalty numeric penalty to put on every loose end (or copy number if loose.penalty.mode is "linear"). Default 100.
#' @param loose.penalty.mode either \code{"linear"} or \code{"boolean"}, for penalizing each copy or each count of a loose end
#' @param tilim integer time limit (in seconds) for the MIP solver on each subgraph. Default 6000.
#' @param max.threads maximum thread number CPLEX/Gurobi is allowed to use
#' @param max.mem maximum memory CPLEX/Gurobi is allowed to use
#' @param epgap relative optimality gap tolerated by the solver
#' @param use.gurobi logical flag specifying whether to use gurobi (if TRUE) instead of CPLEX (if FALSE). Default FALSE.
#' @param outdir out directory to dump into, default `./`
#' @param name sample name
#' @param mc.cores integer how many cores to use to fork subgraphs generation. Default 1. CAUTION as more cores will multiply the number of max threads used by CPLEX.
#' @param overwrite logical flag whether to overwrite existing output directory contents or just continue with existing files.
#' @param verbose logical whether to print out the most verbose version of progress log
#' @param init jabba object (list) or path to .rds file containing previous jabba object which to use to initialize solution, this object needs to have the identical aberrant junctions as the current jabba object (but may have different segments and loose ends, i.e. is from a previous iteration)
#' @param dyn.tuning logical whether to let JaBbA dynamically tune the convergence criteria, default TRUE
#' @param lp (logical) whether to run as linear program, default FALSE
#' @param ism (logical) whether to add ISM constraints. Default FALSE. Only used if lp = TRUE
#' @param fix.thres (numeric) freeze the CN of large nodes with cost penalty exceeding this multiple of lambda. default -1 (no nodes are fixed)
#' @param min.bins (numeric) minimum number of bins needed for a valid segment CN estimate (default 1)
#' @param filter_loose (logical) run loose end annotation? (default FALSE)
#' @param QCout (logical) whether to write QC statistics (via QCStats) to the output directory (default FALSE)
#' @param drop.chr (logical) Drops chr from chromosome names. (default TRUE)
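#'
#' @examples
#' \dontrun{
#' ## minimal sketch of a typical invocation; the input file names below are
#' ## hypothetical placeholders, not files shipped with this package
#' gg = JaBbA(junctions = "junctions.vcf", ## BND VCF, BEDPE, or GRangesList rds
#'            coverage = "coverage.rds",   ## binned coverage GRanges
#'            field = "ratio",             ## metadata column holding the signal
#'            outdir = "jabba_out",
#'            name = "tumor")
#' ## gg is the gGraph also written to jabba_out/jabba.simple.gg.rds
#' }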
#' @export
JaBbA = function(## Two required inputs
junctions,
coverage,
## options about junctions
juncs.uf = NULL,
blacklist.junctions = NULL,
whitelist.junctions = NULL,
geno = FALSE,
indel = NULL,
cfield = NULL,
tfield = "tier",
reiterate = 0,
rescue.window = 1e3,
rescue.all = TRUE,
nudge.balanced = FALSE,
thresh.balanced = 500,
edgenudge = 0.1,
strict = FALSE,
all.in = FALSE,
## options about coverage
field = 'ratio',
seg = NULL,
max.na = -1,
blacklist.coverage = NULL,
nseg = NULL,
hets = NULL,
purity = NA,
ploidy = NA,
pp.method = "sequenza",
min.nbins = 5,
cn.signif = 1e-5,
## options about optimization
slack.penalty = 1e2,
loose.penalty.mode = "boolean",
tilim = 6000,
max.threads = 12,
max.mem = 16,
epgap = 1e-4,
use.gurobi = FALSE,
## options about general pipeline
outdir = './',
name = 'tumor',
mc.cores = 1,
overwrite = FALSE,
verbose = TRUE,
init = NULL,
dyn.tuning = TRUE,
lp = FALSE,
ism = FALSE,
fix.thres = -1,
min.bins = 1,
filter_loose = FALSE,
QCout=FALSE,
drop.chr = TRUE)
{
system(paste('mkdir -p', outdir))
jmessage('Starting analysis in ', outdir <- normalizePath(outdir))
cdir = normalizePath(getwd())
setwd(outdir)
if (overwrite)
jmessage('Overwriting previous analysis contents')
ra = junctions
reiterate = 1 + as.integer(reiterate)
## check optimizer choice is appropriate
if (use.gurobi) {
gurobi.dir = Sys.getenv("GUROBI_HOME")
if (!nzchar(gurobi.dir)){
jerror("GUROBI_HOME environment variable is not defined. Please install Gurobi or set this variable to the correct installation directory.")
} else {
if (!requireNamespace("gurobi", quietly = TRUE)) {
jerror("Gurobi R library not found, attempting to install
from package binary in $GUROBI_HOME/R...")
gurobi_pattern <- "^gurobi_"
gurobi_r_package_path <- list.files(
file.path(Sys.getenv("GUROBI_HOME"), "R"),
pattern = gurobi_pattern, full.names = TRUE
)[1]
if (!is.na(gurobi_r_package_path) && !is.null(gurobi_r_package_path)) {
install.packages(gurobi_r_package_path, repos = NULL)
}
} else {
library(gurobi)
if (!requireNamespace("gurobi", quietly = TRUE)) {
jerror("Gurobi installation was found but the gurobi R
package could not be imported.")
}
}
}
} else {
cplex.dir = Sys.getenv("CPLEX_DIR")
if (!nzchar(cplex.dir)){
jerror("CPLEX_DIR environment variable not found.")
} else if (!file.exists(paste0(cplex.dir, "/cplex"))) {
jerror("${CPLEX_DIR}/cplex not found.")
} else if (!file.exists(paste0(cplex.dir, "/cplex/include")) ||
!file.exists(paste0(cplex.dir, "/cplex/lib"))){
jerror("Neither of ${CPLEX_DIR}/cplex/[(include)|(lib)] exist.")
} else {
library(gGnome)
gGnome:::testOptimizationFunction()
}
}
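## e.g. (hypothetical installation paths) the solver checks above expect one of
##   export GUROBI_HOME=/opt/gurobi/linux64
##   export CPLEX_DIR=/opt/ibm/ILOG/CPLEX_Studio
## to be set in the shell environment before R is started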
## if (is.character(ra))
## {
## if (!file.exists(ra))
## {
## jerror(paste('Junction path', ra, 'does not exist'))
## }
ra.all = read.junctions(ra, geno = geno, chr.convert = drop.chr) ## GRL
if (is.null(ra.all)){
jwarning("no junction file is given, will be running JaBbA without junctions!")
ra.all = GRangesList()
}
## } else if (is.null(ra)) {
## ra.all = GRangesList()
## } else {
## ra.all = ra
## }
if (inherits(ra.all, "list") && all(sapply(ra.all, inherits, "GRangesList"))){
## this is a multisample junction input
ids = names(ra.all)
match.nm = which(grepl(name, ids))
if (length(match.nm)==1){
ra.all = ra.all[[match.nm]]
} else if (length(match.nm)==0){
jerror("There's no junction matching this sample name: ", name)
} else {
jwarning("there are more than one sample id in the junciton input ",
ids[match.nm],
" match this run id ",
name,
" we are only using the first one.")
ra.all = ra.all[[match.nm[1]]]
}
}
if (!inherits(ra.all, "GRangesList")){
jerror("The given input `ra` is not valid.")
}
## temporary filter for any negative coords
## bad.bp = unname(grl.unlist(ra.all)) %Q% (start<0)
## if (length(bad.bp)>0){
## jmessage("Warning!! ", length(bad.bp), " breakpoints in ",
## length(bad.ix <- unique(bad.bp$grl.ix)), " junctions ",
## "have negative coordinates, discard.")
## ra.all = ra.all[setdiff(seq_along(ra.all), bad.ix)]
## }
if (verbose)
{
jmessage("Read in ", length(ra.all), " total input junctions")
}
## xtYao Tuesday, Jun 19, 2018 04:52:17 PM
## Only when tier exists or unfiltered junctions provided, do we do the iterations
## if unfiltered set is given first parse it
ra.uf = NULL
if (!is.null(juncs.uf) && file.exists(juncs.uf) && file.info(juncs.uf)$size > 0){
jmessage("Loading supplementary junctions")
ra.uf = read.junctions(juncs.uf, geno=FALSE)
}
if (is.null(tfield)){
tfield = 'tier'
}
## if no tier field in junctions, set all of them to 2
if (!(tfield %in% names(values(ra.all))))
{
jwarning("Tier field ", tfield, " missing: giving every junction the same tier, i.e. all have the potential to be incorporated")
values(ra.all)[, tfield] = rep(2, length.out = length(ra.all))
}
if (!is.null(ra.uf)){
## merge ra.all with ra.uf
## junctions from ra.all will always have tier 2
## junctions from ra.uf will always have tier 3
## at this point
## ra.all.uf = ra.merge(ra.all, ra.uf, pad=0, ind=TRUE) ## hard merge
ra.all.uf = ra.merge(ra.all, ra.uf, pad=1000, ind=TRUE) ## soft merge
## those match a record in junction, will be assigned to the tier in junction
if (tfield %in% names(values(ra.all.uf))) {
values(ra.all.uf)[, tfield][which(!is.na(values(ra.all.uf)$seen.by.ra1))] =
values(ra.all)[, tfield][values(ra.all.uf)$seen.by.ra1]
values(ra.all.uf)[, tfield][which(is.na(values(ra.all.uf)$seen.by.ra1))] = 3
} else {
values(ra.all.uf)[, tfield] = 3
}
## mark any NA tier junctions as tier 3
values(ra.all.uf)[, tfield][which(is.na(values(ra.all.uf)[, tfield]))] = 3
## the rest will be tier 3
ra.all = ra.all.uf
## FIXME: ra.merge still gives duplicates
## FIXME: substitute these ra.xx functions with the Junction class in gGnome
## extra dedup
## ndup = which(!ra.duplicated(ra.all.uf))
## ra.all = ra.all.uf[ndup]
}
if (!is.null(blacklist.junctions) &&
file.exists(blacklist.junctions) &&
file.info(blacklist.junctions)$size > 0){
blacklist.junctions = read.junctions(blacklist.junctions)
if (length(blacklist.junctions)>0){
black.ix = which(gUtils::grl.in(ra.all, blacklist.junctions))
if (length(black.ix)>0){
jmessage("Removing ", length(black.ix), " junctions matched the blacklist")
ra.all = ra.all[setdiff(seq_along(ra.all), black.ix)]
}
}
}
if (!is.null(whitelist.junctions) && file.exists(whitelist.junctions) && file.info(whitelist.junctions)$size > 0){
whitelist.junctions = read.junctions(whitelist.junctions)
if (length(whitelist.junctions)>0){
white.ix = which(gUtils::grl.in(ra.all, whitelist.junctions))
if (length(white.ix)>0){
jmessage("Forcing incorporation of ", length(white.ix), " junctions matched the whitelist")
values(ra.all)[white.ix, tfield] = 1
}
}
}
if (length(ra.all)>0){
## final sanity check before running
if (!all(unique(values(ra.all)[, tfield]) %in% 1:3)){
jerror('Tiers in tfield can only have values 1, 2, or 3')
}
jmessage("In the input, There are ", sum(values(ra.all)[, tfield]==1), " tier 1 junctions; ",
sum(values(ra.all)[, tfield]==2), " tier 2 junctions; ",
sum(values(ra.all)[, tfield]==3), " tier 3 junctions.")
}
## in all.in mode we start with all of the tiers in the first round
## and then in each of the following iterations keep the ones incorporated
## plus the ones that didn't get used but fall inside the rescue range of a loose end
if (all.in & length(ra.all)>0){
t3 = values(ra.all)[, tfield]==3
if (any(t3)){
## save every t3 except small indel
t3.indel = which.indel(ra.all[which(t3)])
t3.non.indel = which(t3)[setdiff(seq_along(which(t3)), t3.indel)]
values(ra.all)[t3.non.indel, tfield] = 2
jmessage('All-in mode: ', length(t3.non.indel),
' tier 3 junctions being included yielding ',
sum(values(ra.all)[, tfield]==2), ' total junctions\n')
}
## and then bump t2 to t1
t2 = values(ra.all)[, tfield]==2
if (any(t2)){
values(ra.all)[t2, tfield] = 1
jmessage("All-in mode: ", length(t2),
" tier 2 junctions forced into the model")
}
}
## if we are iterating more than once
if (reiterate>1){
## important: rescue.all should always be TRUE if not running filter_loose
if ((!rescue.all) & (!filter_loose)) {
jmessage("Resetting rescue.all to TRUE as filter_loose is FALSE")
rescue.all = TRUE
}
continue = TRUE
this.iter = 1;
values(ra.all)$id = seq_along(ra.all)
saveRDS(ra.all, paste(outdir, '/junctions.all.rds', sep = ''))
last.ra = ra.all[values(ra.all)[, tfield]<3]
jmessage('Starting JaBbA iterative with ', length(last.ra), ' junctions')
jmessage('Will progressively add junctions within ', rescue.window, 'bp of a loose end in prev iter')
jmessage('Iterating for max ', reiterate, ' iterations or until convergence (i.e. no new junctions added)')
while (continue) {
gc()
this.iter.dir = paste(outdir, '/iteration', this.iter, sep = '')
system(paste('mkdir -p', this.iter.dir))
jmessage('Starting iteration ', this.iter, ' in ', this.iter.dir, ' using ', length(last.ra), ' junctions')
if (this.iter>1){
kag1 = readRDS(paste0(outdir, '/iteration1/karyograph.rds'))
ploidy = kag1$ploidy
purity = kag1$purity
jmessage("Using ploidy ", ploidy,
" and purity ", purity,
" consistent with the initial iteration")
if (lp) {
jmessage("Using segments from JaBbA output of previous iteration")
loose.ends.fn = file.path(outdir,
paste0("iteration", this.iter - 1),
"loose.end.stats.rds")
seg.fn = file.path(outdir, paste0("iteration", this.iter - 1), "jabba.simple.rds")
seg = readRDS(seg.fn)$segstats[, c()]
seg = seg %Q% (strand(seg) == "+")
seg = gr.stripstrand(seg)
}
}
this.ra.file = paste(this.iter.dir, '/junctions.rds', sep = '')
saveRDS(last.ra, this.ra.file)
jab = jabba_stub(
junctions = this.ra.file,
seg = seg,
coverage = coverage,
blacklist.coverage = blacklist.coverage,
hets = hets,
nseg = nseg,
cfield = cfield,
tfield = tfield,
nudge.balanced = as.logical(nudge.balanced),
outdir = this.iter.dir,
mc.cores = as.numeric(mc.cores),
max.threads = as.numeric(max.threads),
max.mem = as.numeric(max.mem),
max.na = max.na,
edgenudge = as.numeric(edgenudge),
tilim = as.numeric(tilim),
strict = strict,
name = name,
use.gurobi = as.logical(use.gurobi),
field = field,
epgap = epgap,
## subsample = subsample,
slack.penalty = as.numeric(slack.penalty),
loose.penalty.mode = loose.penalty.mode,
mipstart = init,
ploidy = as.numeric(ploidy),
purity = as.numeric(purity),
pp.method = pp.method,
indel = indel,
min.nbins = min.nbins,
overwrite = as.logical(overwrite),
verbose = as.numeric(verbose),
dyn.tuning = dyn.tuning,
geno = geno,
cn.signif = cn.signif,
lp = lp,
ism = ism,
fix.thres = fix.thres,
min.bins = min.bins,
drop.chr = drop.chr,
filter_loose = filter_loose)
gc()
jab = readRDS(paste(this.iter.dir, '/jabba.simple.rds', sep = ''))
jabr = readRDS(paste(this.iter.dir, '/jabba.raw.rds', sep = ''))
le = gr.stripstrand(jab$segstats %Q% (loose==TRUE & strand=="+"))
if (length(le)==0){
jmessage("No more loose ends to resolve, terminating.")
break
} else if (!rescue.all){
le = le %Q% which(passed==TRUE)
if (length(le)==0){
jmessage("No more plausible loose ends, terminating")
break
}
} else {
jmessage("Rescuing all ", length(le), " loose ends, regardless of confidence.")
}
## determine orientation of loose ends
le.right = le %&% gr.start(jab$segstats %Q% (loose==FALSE))
strand(le.right) = "+"
le.left = le %&% gr.end(jab$segstats %Q% (loose==FALSE))
strand(le.left) = "-"
le = grbind(le.right, le.left)
## Annotate ra.all
all.input = readRDS(paste0(outdir, "/junctions.all.rds"))
all.ov = ra.overlaps(all.input, jab$junctions, pad=0, arr.ind=TRUE)
if (ncol(all.ov)==2){
all.ov = data.table(data.frame(all.ov))
all.ov[, this.cn := values(jab$junctions)$cn[ra2.ix]]
values(all.input)[, paste0("iteration", this.iter, ".cn")] =
all.ov[, setNames(this.cn, ra1.ix)][as.character(seq_along(all.input))]
} else {
values(all.input)[, paste0("iteration", this.iter, ".cn")] = NA
}
saveRDS(all.input, paste0(outdir, "/junctions.all.rds"))
## junction rescue
## junctions that were incorporated (CN > 0) stay in; junctions within
## rescue.window bp of a loose end get another chance,
## but not around loose ends overlapping an exorbitant number of junctions
le.keep = which((le %N% (stack(ra.all) + rescue.window)) < 6)
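## e.g. with the default rescue.window = 1000, a junction excluded in this
## iteration whose breakpoint lies within 1kb of a current loose end is
## re-admitted into the next iteration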
tokeep = which(values(jab$junctions)$cn>0)
new.ra.id = unique(c(
values(jab$junctions)$id[tokeep],
## near a loose ends, got another chance
values(ra.all)$id[which(grl.in(ra.all,
le[le.keep] + rescue.window,
some = T,
ignore.strand = FALSE))],
## tier 2 or higher must stay for all iterations
values(ra.all)$id[which(values(ra.all)$tier==2)]
))
if (tfield %in% colnames(ra.all)){
high.tier.id = values(ra.all)$id[which(as.numeric(values(ra.all)[, tfield])<3)]
new.ra.id = union(new.ra.id, high.tier.id)
}
new.ra = ra.all[which(values(ra.all)$id %in% new.ra.id)]
## new.ra = ra.all[union(values(last.ra)$id,
## values(ra.all)$id[grl.in(ra.all, le + rescue.window, some = T)])]
## new.junc.id = setdiff(new.ra.id, values(jab$junctions)$id[which(values(jab$junctions)$cn>0)])
new.junc.id = setdiff(new.ra.id, values(jab$junctions)$id)
## num.new.junc = length(setdiff(values(new.ra)$id, values(last.ra)$id)==0)
num.new.junc = length(new.junc.id)
jcn = rep(0, nrow(jab$ab.edges))
abix = rowSums(is.na(rbind(jab$ab.edges[, 1:2, 1])))==0
if (any(abix)){
jcn[abix] = jab$adj[rbind(jab$ab.edges[abix, 1:2, 1])]
}
num.used.junc = length(which(jcn>0))
t3 = values(new.ra)[, tfield]==3
jmessage('Adding ', num.new.junc,
' new junctions, including ', sum(t3),
' tier 3 junctions, yielding ', num.used.junc,
' used junctions and ', length(new.ra), ' total junctions\n')
if (any(t3)){
values(new.ra)[t3, tfield] = 2
}
if (num.new.junc==0 | this.iter >= reiterate)
continue = FALSE
else {
last.ra = new.ra
this.iter = this.iter + 1
}
## keep using the initial purity ploidy values
pp1 = readRDS(paste0(
outdir,
'/iteration1/karyograph.rds.ppgrid.solutions.rds'))
purity = pp1$purity[1]
ploidy = pp1$ploidy[1]
seg = readRDS(paste0(outdir,'/iteration1/seg.rds')) ## read from the first iteration
if (verbose)
{
jmessage("Setting mipstart to previous iteration's jabba graph")
}
init = jab
if (verbose)
{
jmessage('Using purity ', round(purity,2), ' and ploidy ', round(ploidy,2), ' across ', length(seg), ' segments used in iteration 1')
}
}
this.iter.dir = paste(outdir, '/iteration', this.iter, sep = '')
system(sprintf('cp %s/* %s', this.iter.dir, outdir))
jab = readRDS(paste0(outdir, "/jabba.simple.gg.rds"))
jmessage('Done Iterating')
} else {
## if all.in, convert all tier 3 to tier 2
## if (tfield %in% colnames(values(ra.all))){
## t3 = (values(ra.all)[, tfield] == 3)
## if (all.in & length(ra.all)>0){
## if (any(t3)){
## ## save every t3 except small indel
## t3.indel = which.indel(ra.all[which(t3)])
## t3.non.indel = which(t3)[setdiff(seq_along(which(t3)), t3.indel)]
## values(ra.all)[t3.non.indel, tfield] = 2
## t3 = values(ra.all)[, tfield] == 3
## }
## }
## ## if not all.in, only use t2 or t1
## ra.all = ra.all[setdiff(seq_along(ra.all), which(t3))]
## }
jab = jabba_stub(
junctions = ra.all,
seg = seg,
coverage = coverage,
blacklist.coverage = blacklist.coverage,
hets = hets,
nseg = nseg,
cfield = cfield,
tfield = tfield,
nudge.balanced = as.logical(nudge.balanced),
outdir = outdir,
mc.cores = as.numeric(mc.cores),
max.threads = as.numeric(max.threads),
max.mem = as.numeric(max.mem),
max.na = max.na,
edgenudge = as.numeric(edgenudge),
tilim = as.numeric(tilim),
strict = strict,
epgap = epgap,
name = name,
use.gurobi = as.logical(use.gurobi),
field = field,
## subsample = subsample,
slack.penalty = as.numeric(slack.penalty),
mipstart = init,
ploidy = as.numeric(ploidy),
purity = as.numeric(purity),
pp.method = pp.method,
indel = indel,
min.nbins = min.nbins,
loose.penalty.mode = loose.penalty.mode,
overwrite = as.logical(overwrite),
verbose = as.numeric(verbose),
dyn.tuning = dyn.tuning,
geno = geno,
cn.signif = cn.signif,
lp = lp,
ism = ism,
fix.thres = fix.thres,
min.bins = min.bins,
filter_loose = filter_loose,
drop.chr = drop.chr)
}
if(QCout){
QCStats(inputDT=data.table(pair=name,inputdir=outdir),outdir=outdir)
}
setwd(cdir)
# exit with error if epgap did not converge
sol_epgap = unique(jab$nodes$gr$epgap)
if (any(sol_epgap > epgap)) {
stop(paste("JaBbA did not converge. Try increasing the tilim. Final epgap:", max(sol_epgap)))
}
return(jab)
}
#' @name jabba_stub
#' @rdname internal
#' @title jabba_stub
#' @description
#' Internal function to run single iteration of JaBbA
#'
#' Generates the following files in the output directory:
#'
#' karyograph.rds --- file of unpopulated karyograph as an RDS file of a list object storing the output of karyograph
#'
#'
#' jabba.rds --- file storing JaBbA object
#'
#' jabba.simple.rds --- file storing JaBbA object simplified so that segments containing all unpopulated aberrant junctions are merged
#'
#' jabba.raw.rds --- storing raw jbaMIP solution, this may be useful for debugging and QC
#'
#' jabba.png, jabba.simple.png --- gTrack images of the above reconstructions
#'
#' jabba.seg.txt --- tsv file with jabba.simple solution segments
#'
#' jabba.seg.rds --- GRanges rds with jabba.simple solution segments
#'
#' jabba.adj.txt --- tsv file with edges (i.e. node pairs) of adjacency matrix populated with inferred copy numbers and node ids indexing segments in jabba.seg.txt
#'
#' jabba.vcf, jabba.simple.vcf --- BND-style vcf output of junctions in JaBbA output populated with rearrangement and interval copy numbers
#'
#' jabba.cnv.vcf, jabba.simple.cnv.vcf --- copy number style VCF showing jabba copy number output
#'
#'
#' @param junctions GRangesList of junctions (i.e. bp pairs with strands oriented AWAY from break) OR path to junction VCF file (BND format), dRanger txt file or rds of GRangesList
#' @param coverage GRanges of coverage OR path to cov file, rds of GRanges or .wig / .bed file of (normalized, GC corrected) fragment density
#' @param field field of coverage GRanges to use as fragment density signal (only relevant if coverage is GRanges rds file)
#' @param seg optional path to existing segmentation, if missing then we will segment coverage using DNAcopy with standard settings
#' @param cfield character, junction confidence meta data field in ra
#' @param tfield character, tier confidence meta data field in ra
#' @param outdir out directory to dump into, default ./
#' @param nseg optional path to normal seg file with $cn meta data field
#' @param hets optional path to hets.file which is tab delimited text file with fields seqnames, start, end, alt.count.t, ref.count.t, alt.count.n, ref.count.n
#' @param name prefix for sample name to be output to seg file
#' @param mc.cores number of cores to use (default 1)
#' @param use.gurobi logical flag whether to use gurobi vs CPLEX
#' @param nseg path to data.frame or GRanges rds of normal seg file with coordinates and $cn data field specifying germline integer copy number
#' ## @param subsample numeric between 0 and 1 specifying how much to sub-sample high confidence coverage data
#' @param tilim timeout for jbaMIP computation (default 1200 seconds)
#' @param edgenudge numeric hyper-parameter of how much to nudge or reward aberrant junction incorporation, default 0.1 (should be several orders of magnitude lower than average 1/sd on individual segments), a nonzero value encourages incorporation of perfectly balanced rearrangements which would be equivalently optimal with 0 copies or more copies.
#' @param slack.penalty penalty to put on every loose.end copy, should be calibrated with respect to 1/(k*sd)^2 for each segment, i.e. that we are comfortable with junction balance constraints introducing k copy number deviation from a segments MLE copy number assignment (the assignment in the absence of junction balance constraints)
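#' ## illustrative calibration only: to tolerate k = 2 copies of deviation on a
#' ## segment whose residual sd is 0.1, one would set slack.penalty near
#' ## 1/(2 * 0.1)^2 = 25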
#' @param init jabba object (list) or path to .rds file containing previous jabba object which to use to initialize solution, this object needs to have the identical aberrant junctions as the current jabba object (but may have different segments and loose ends, i.e. is from a previous iteration)
#' @param overwrite flag whether to overwrite existing output directory contents or just continue with existing files.
#' @param lp whether to run as linear program (default FALSE)
#' @param ism logical whether to add ISM constraints default FALSE
#' @param fix.thres (numeric) multiple of lambda above which to fix nodes
#' @param min.bins (numeric) min number of coverage bins for a valid CN estimate
#' @param filter_loose (logical) run loose end analysis?
#' @param drop.chr (logical) Drops chr from chromosome names. (default TRUE)
jabba_stub = function(junctions, # path to junction VCF file, dRanger txt file or rds of GRangesList of junctions (with strands oriented pointing AWAY from breakpoint)
coverage, # path to cov file, rds of GRanges
blacklist.coverage = NULL,
seg = NULL, # path to seg file, rds of GRanges
cfield = NULL, # character, junction confidence meta data field in ra
tfield = NULL, # character, tier confidence meta data field in ra
nudge.balanced = FALSE, ## if TRUE nudge chains of balanced (or quasi balanced junctions)
thresh.balanced = 500, ## threshold for balanced junctions
outdir = './', # out directory to dump into
nseg = NULL, # path to normal seg file with $cn meta data field
hets = NULL, # path to hets.file which is tab delimited text file with fields seqnames, start, end, alt.count.t, ref.count.t, alt.count.n, ref.count.n
name = 'tumor', ## prefix for sample name to be output to seg file
mc.cores = 1, # default 1
max.threads = 12,
max.mem = 16,
max.na = -1,
purity = NA,
ploidy = NA,
pp.method = "sequenza",
strict = FALSE,
epgap = 1e-4,
mipstart = NULL,
field = 'ratio', ## character, meta data field of the coverage object containing the numeric coverage signal
## subsample = NULL, ## numeric scalar between 0 and 1, how much to subsample coverage per segment
tilim = 1200, ## timeout for MIP portion
## mem = 16, ## max memory for MIP portion
init = NULL, ## previous JaBbA object to use as a solution
edgenudge = 0.1, ## hyper-parameter of how much to "nudge" or reward edge use, will be combined with cfield information if provided
slack.penalty = 1e2, ## nll penalty for each loose end copy
use.gurobi = FALSE,
loose.penalty.mode = "boolean",
indel = "exclude", ## default should be nothing
min.nbins = 5, ## minimum bins per segment (default 5; can be lowered when bins are wide)
overwrite = F, ## whether to overwrite existing output in outdir
lp = FALSE, ## whether to run as linear program
ism = FALSE, ## add ISM constraints (only used if lp = TRUE)
fix.thres = -1,
min.bins = 5,
verbose = TRUE,
dyn.tuning = TRUE,
geno = FALSE,
filter_loose = FALSE,
outlier.thresh = 0.9999,
cn.signif = 1e-5,
drop.chr = TRUE)
{
kag.file = paste(outdir, 'karyograph.rds', sep = '/')
hets.gr.rds.file = paste(outdir, 'hets.gr.rds', sep = '/')
junctions.txt.file = paste(outdir, 'junctions.txt', sep = '/')
junctions.rds.file = paste(outdir, 'junctions.rds', sep = '/')
jabba.raw.rds.file = paste(outdir, 'jabba.raw.rds', sep = '/')
jabba.rds.file = paste(outdir, 'jabba.rds', sep = '/')
jabba.json.file = paste(outdir, 'jabba.json', sep = '/')
jabba.gg.rds.file = paste(outdir, 'jabba.gg.rds', sep = '/')
jabba.vcf.file = paste(outdir, 'jabba.vcf', sep = '/')
jabba.cnv.vcf.file = paste(outdir, 'jabba.cnv.vcf', sep = '/')
jabba.simple.rds.file = paste(outdir, 'jabba.simple.rds', sep = '/')
jabba.simple.gg.rds.file = paste(outdir, 'jabba.simple.gg.rds', sep = '/')
jabba.simple.vcf.file = paste(outdir, 'jabba.simple.vcf', sep = '/')
jabba.simple.cnv.vcf.file = paste(outdir, 'jabba.simple.cnv.vcf', sep = '/')
purity.ploidy.txt.file = paste(outdir, 'purity.ploidy.txt', sep = '/')
jabba.png.file = paste(outdir, 'jabba.png', sep = '/')
jabba.simple.png.file = paste(outdir, 'jabba.simple.png', sep = '/')
seg.tab.file = paste(outdir, 'jabba.seg', sep = '/')
seg.gr.file = paste(outdir, 'jabba.seg.rds', sep = '/')
seg.adj.file = paste(outdir, 'jabba.adj.txt', sep = '/')
le.class.file.rds = paste(outdir, 'loose.end.stats.rds', sep = '/')
nozzle.file = paste(outdir, 'nozzle', sep = '/')
if (is.character(coverage))
{
if (!file.exists(coverage))
{
jerror(paste('Coverage path ', coverage, 'does not exist'))
}
if (!file.size(coverage))
{
jerror(paste('Coverage file ', coverage, 'is empty'))
}
if (grepl('\\.rds$', coverage))
{
coverage = readRDS(coverage)
}
else if (grepl('((\\.txt)|(\\.tsv)|(\\.csv))(.gz|.xz|.bz|.bz2){0,}$', coverage))
{
tmp = fread(coverage)
coverage = try(dt2gr(tmp))
if (inherits(coverage, "try-error")){
jerror("Input coverage data cannot be converted to genomic intervals.")
}
## coverage = dt2gr(tmp, seqlengths = tmp[, max(end), by = seqnames][, structure(V1, names = seqnames)])
}
else
{
jmessage('Importing coverage from UCSC format')
coverage = rtracklayer::import(coverage)
if ("score" %in% names(values(coverage))) {
values(coverage)[, field] = values(coverage)[, "score"]
} else {
names(values(coverage)) = field ## reset name of coverage field
}
coverage = gr.fix(coverage)
}
}
if (!inherits(coverage, 'GRanges'))
coverage = dt2gr(coverage)
## the most frequent width in a sample of coverage points
binwidth = as.numeric(names(sort(
table(width(sample(coverage, 1000, replace=TRUE))), decreasing = TRUE
)[1]))
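## e.g. with constant 200bp bins this yields binwidth = 200 even if a few
## partial bins at chromosome ends are shorter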
if (verbose)
{
jmessage(paste(
"Read in",
prettyNum(length(coverage), big.mark = ','),
paste0(binwidth, "bp bins of coverage data across"),
length(unique(seqnames(coverage))), 'chromosomes'))
}
if (!(field %in% names(values(coverage))))
{
new.field = names(values(coverage))[1]
jwarning(paste0('Field ',
field,
' not found in coverage GRanges metadata so using ',
new.field,
' instead'))
field = new.field
}
## filter out the data in blacklist.coverage if any
if (!is.null(blacklist.coverage)){
if (is.character(blacklist.coverage)){
if (file.exists(blacklist.coverage)){
if (grepl("rds$", blacklist.coverage)){
blacklist.coverage = readRDS(blacklist.coverage)
} else if (grepl("[(txt)|(tsv)|(csv)]$", blacklist.coverage)){
blacklist.coverage = try(gUtils::dt2gr(data.table::fread(blacklist.coverage)))
} else if (grepl("bed$", blacklist.coverage)){
blacklist.coverage = rtracklayer::import.bed(blacklist.coverage)
}
}
} else if (inherits(blacklist.coverage, "data.frame")){
blacklist.coverage = try(gUtils::dt2gr(data.table(blacklist.coverage)))
}
## if at this point we have a GRanges, then proceed
if (inherits(blacklist.coverage, "GRanges")){
bad.ix = which(coverage %^% blacklist.coverage)
if (length(bad.ix)>0){
values(coverage)[bad.ix, field] = NA
}
} else {
jwarning("'--blacklist.coverage' cannot be parsed into a GRanges, please check")
}
}
## if (!is.null(outlier.thresh)) {
## field.thresh = quantile(values(coverage)[[field]], probs = outlier.thresh, na.rm = TRUE)
## bad.ix = which(values(coverage)[[field]] > field.thresh)
## values(coverage)[bad.ix, field] = NA
## }
seg.fn = paste0(outdir, '/seg.rds')
if (!overwrite & file.exists(seg.fn))
{
if (verbose)
{
jmessage('Using previous segmentation found in jabba directory')
}
seg = readRDS(seg.fn)
} else {
if (is.null(seg) || (is.character(seg) && (!file.exists(seg) || !file.size(seg))))
{
if (verbose)
{
jmessage('No segmentation provided, so performing segmentation using CBS')
}
set.seed(42)
binw = median(sample(width(coverage), 30), na.rm=T)
vals = as.double(values(coverage)[, field])
## if `vals` contain minute values that are basically deemed as zero in R
## we add a tiny little bit of non-zero value to all of them before segmenting
if (length(zero.ix <- which(vals==0))>0){
tiny.val = .Machine$double.eps
vals = vals + tiny.val
jmessage(
length(zero.ix),
" coverage data points have zero value, adding a tiny value ",
tiny.val, " to prevent log error.")
}
new.sl = GenomeInfoDb::seqlengths(coverage)
ix = which(!is.na(vals))
cna = DNAcopy::CNA(
log(vals[ix]),
as.character(seqnames(coverage))[ix], start(coverage)[ix],
data.type = 'logratio')
## Addy at some point suggested 1e-5 to prevent way too many unnecessary segmentations at 200bp resolution
segs = DNAcopy::segment(DNAcopy::smooth.CNA(cna),
alpha = cn.signif, ## old at 1e-5
verbose = FALSE)
if (verbose)
{
jmessage('Segmentation finished')
}
seg = gUtils::seg2gr(segs$out, new.sl) ## remove seqlengths that have not been segmented
seg = gr.fix(seg, GenomeInfoDb::seqlengths(coverage), drop = T)
names(seg) = NULL
## Filter out small gaps in CBS output (<=1e3)
## gap.seg = IRanges::gaps(seg)[which(as.character(strand(seg))=="*" & width(seg) > (5 * binw))]
## if (length(gap.seg)>0){
## bps = c(gr.start(seg)[, c()], gr.start(gap.seg)[, c()])
## } else {
## bps = gr.start(seg)[, c()]
## }
## ## keep the breakpoints of the big enough gaps (>10*binwidth), they may contain bad regions
## new.segs = gUtils::gr.stripstrand(gUtils::gr.breaks(bps, gUtils::si2gr(seqlengths(bps))))[, c()]
## names(new.segs) = NULL
## seg = gUtils::gr.fix(new.segs, GenomeInfoDb::seqlengths(coverage), drop = T)
## if (verbose)
## {
## jmessage(length(seg), ' segments produced')
## }
}
else
{
if (is.character(seg))
{
if (grepl('\\.rds$', seg))
{
seg = readRDS(seg)
} else if (grepl("\\.txt(.gz)?$", seg)){
seg = dt2gr(fread(seg))
}
else
{
jmessage('Importing seg from UCSC format')
seg = rtracklayer::import(seg)
## field = 'score';
}
}
}
}
if (!inherits(seg, 'GRanges'))
seg = dt2gr(seg, GenomeInfoDb::seqlengths(coverage))
if (!is.null(hets))
{
if (is.character(hets))
{
if (!file.exists(hets))
{
jwarning(sprintf('hets file "%s" not found, ignoring hets\n', hets))
hets = NULL
}
}
}
ra = junctions
if (inherits(ra, "character")) {
ra = read.junctions(junctions, geno = geno, chr.convert = drop.chr)
} else if (!inherits(ra, "GRangesList")){
jerror("`ra` must be GRangesList here")
}
jmessage(paste("Loaded", length(ra), "junctions from the input."))
if (strict)
{
ends = c(gr.start(seg),
gr.end(seg))
nog = length(ra)
ra = ra[grl.in(ra, ends, logical = FALSE) == 2]
jmessage('Applying strict junction filtering of junctions to only those that land at segment ends')
jmessage('Leaving ', length(ra), ' junctions from an initial set of ', nog)
}
## determine binwidth
## times the factor of minimum number of bins for a segment to be considered
max.indel.size = min.nbins * binwidth
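## e.g. with min.nbins = 5 and 200bp coverage bins, junctions spanning
## less than 1kb count as INDEL-like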
## know what junctions to include and exclude before karyograph_stub
ab.force = NULL
ab.exclude = NULL
if (!is.null(tfield))
{
if (tfield %in% names(values(ra)))
{
## default forcing tier 1
ab.force = which(gsub('tier', '', as.character(values(ra)[, tfield]))=='1')
## when the switch is on:
## small DUP/DEL junctions in tier 2, bump them up to tier 1
tier2.ix = which(gsub('tier', '', as.character(values(ra)[, tfield]))=='2')
if (length(tier2.ix)>0){
## max size hardcoded for now
which.like.indel = tier2.ix[
which.indel(ra[tier2.ix], max.size = max.indel.size)]
} else {
which.like.indel = numeric(0)
}
if (verbose)
{
jmessage('Found tier field enforcing >=1 CN at ', length(ab.force), ' junctions')
}
ab.exclude = which(gsub('tier', '', as.character(values(ra)[, tfield]))=='3')
if (verbose)
{
jmessage('Removing ', length(ab.exclude), ' tier 3 junctions')
}
}
} else { ## no tfield given, assume everything is tier 2
## if (indel){
which.like.indel = which.indel(ra, max.size = max.indel.size)
## ab.force = union(ab.force, which.like.indel)
## }
}
if (exists("which.like.indel")){
values(ra)$like.indel = is.element(seq_along(ra), which.like.indel)
jmessage(length(which.like.indel), ' INDEL-like isolated events')
}
## depending on the value of "indel", include or exclude indels or do nothing
if (!is.null(indel)){
if (indel=="include"){
ab.force = union(ab.force, which.like.indel)
if (verbose){
jmessage(length(which.like.indel), ' INDEL-like isolated junctions mandatorily INCLUDED in the final model')
}
} else if (indel=="exclude"){
ab.exclude = union(ab.exclude, which.like.indel)
if (verbose){
jmessage(length(which.like.indel), ' INDEL-like isolated junctions mandatorily EXCLUDED in the final model')
}
}
} else {
jwarning("doing nothing special to the small INDEL-like isolated junctions")
}
## Dropping chr across the sample
if (drop.chr){
jmessage(drop.chr , " for drop.chr, dropping chr in chromosome names.")
seg = gr.nochr(seg)
coverage = gr.nochr(coverage)
ra = gr.nochr(ra)
}else{
jmessage(drop.chr , " for drop.chr, continuing on with chr names as is.")
}
## clean up the seqlevels before moving on
seg.sl = seqlengths(seg)
cov.sl = seqlengths(coverage)
ra.sl = seqlengths(ra)
union.sn = union(union(names(seg.sl), names(cov.sl)), names(ra.sl))
union.sl = data.table(
seqnames = union.sn,
seg.sl = seg.sl[union.sn],
cov.sl = cov.sl[union.sn],
ra.sl = ra.sl[union.sn])
## keep the largest length per seqname so we don't lose any data;
## it is safer for a given data type to fall short of the stated genome length than to exceed it
union.sl = union.sl[
, sl := pmax(seg.sl, cov.sl, ra.sl, na.rm = T)][
, setNames(sl, seqnames)]
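## e.g. if seg records chr1 at 249,250,000bp but coverage records it at
## 249,250,621bp, the union keeps the longer 249,250,621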
new.seg = seg[which(is.element(as.character(seqnames(seg)), names(union.sl)))]
if (length(new.seg)<length(seg)){
jmessage(length(seg)-length(new.seg), " segments are discarded because they fall out of the ref genome.")
}
seg = new.seg
seqlevels(seg) = seqlevels(seg)[which(is.element(seqlevels(seg), names(union.sl)))]
new.coverage = coverage[which(is.element(as.character(seqnames(coverage)), names(union.sl)))]
seqlevels(coverage) = seqlevels(coverage)[
which(is.element(seqlevels(coverage), names(union.sl)))]
if (length(new.coverage)<length(coverage)){
jmessage(length(coverage)-length(new.coverage), " coverage points are discarded because they fall out of the ref genome.")
}
coverage = new.coverage
tmp = grl.unlist(ra)
tmp.md = values(ra)
nms = names(tmp)
names(tmp) = NULL
good.tmp = which(as.character(seqnames(tmp)) %in% names(union.sl))
tmp = tmp[good.tmp]
names(tmp) = nms[good.tmp]
ra = split(tmp[, c()], tmp$grl.ix)
values(ra) = tmp.md[as.numeric(names(ra)),]
## in case some junction lost one breakpoint
intact.ix = which(elementNROWS(ra)==2)
new.ra = ra[intact.ix]
if (length(new.ra)<length(ra)){
jmessage(length(ra)-length(new.ra), " rearrangements are discarded because they fall out of the ref genome.")
## at this point, there's got to be at least 1 coverage point to start the program
if (length(coverage)==0){
jerror("Empty coverage data. Please check if their reference chromsome name match the other inputs.")
}
}
ra = new.ra
seqlevels(ra) = seqlevels(ra)[which(is.element(seqlevels(ra), names(union.sl)))]
jmessage("Conform the reference sequence length of: seg, coverage, and ra, to be: \n",
paste0("\t", names(union.sl), ":", union.sl, collapse = "\n"))
## extend the end and start of the segmentation to the chromosome endpoints
bp.dt = as.data.table(seg)[as.character(seqnames) %in% names(union.sl)]
bp.dt[, first := (start == min(.SD$start, na.rm = TRUE)), by = seqnames]
bp.dt[, last := (end == max(.SD$end, na.rm = TRUE)), by = seqnames]
bp.dt[first == TRUE, start := 1]
bp.dt[last == TRUE, end := union.sl[seqnames]]
## keep only valid junctions after this seqlength normalization process
bp.dt = bp.dt[end > start,]
## convert back to GRanges
seg = dt2gr(bp.dt[, .(seqnames, start, end, strand = "*")], seqlengths = union.sl)
## xtYao #' Wednesday, Jun 02, 2021 02:43:35 PM
## Move the gap filling step after the correction of seqlengths
## zchoo Friday, Apr 23, 2021 10:39:19 AM
## make gap filtering a general preprocessing step
## filter small gaps between segments spanning fewer than five bins
binw = median(sample(width(coverage), 30), na.rm = TRUE)
all.gaps = IRanges::gaps(seg)
gap.seg = all.gaps[which(as.character(strand(all.gaps)) == "*" & width(all.gaps) > 5 * binw)]
if (verbose) {
n.gaps = sum(width(all.gaps) > 0, na.rm = TRUE)
jmessage("Number of gaps with nonzero width: ", n.gaps)
jmessage("Number of segments before gap filtering: ", n.gaps + length(seg))
}
if (length(gap.seg)>0){
bps = c(gr.start(seg)[, c()], gr.start(gap.seg)[, c()])
} else {
bps = gr.start(seg)[, c()]
}
## create new segments from the breakpoints of the old segments plus big gaps
new.segs = gUtils::gr.stripstrand(gUtils::gr.breaks(bps, gUtils::si2gr(seqlengths(bps))))[, c()]
names(new.segs) = NULL
seg = gUtils::gr.fix(new.segs, union.sl, drop = T)
if (verbose)
{
jmessage(length(seg), ' segments produced after gap filtering')
}
saveRDS(seg, seg.fn)
if (overwrite | !file.exists(kag.file)){
if (verbose) jmessage("creating karyograph")
karyograph_stub(seg,
coverage,
ra = ra,
out.file = kag.file,
nseg.file = nseg,
field = field,
purity = purity,
ploidy = ploidy,
pp.method = pp.method,
## subsample = subsample,
het.file = hets,
verbose = verbose,
ab.exclude = ab.exclude,
ab.force = ab.force,
max.na = max.na,
lp = lp)
} else {
jwarning("Skipping over karyograph creation because file already exists and overwrite = FALSE")
}
kag = readRDS(kag.file)
ab.force = kag$ab.force
ab.exclude = integer(0)
if (!is.null(cfield))
{
if (cfield %in% names(values(kag$junctions)))
{
val = values(kag$junctions)[, cfield]
val[is.na(val)] = 0
edgenudge = val * edgenudge
}
}
gc()
juncs = kag$junctions ## already removed ab.exclude!!!
bpss = grl.unlist(juncs)
if (nudge.balanced) {
balanced.jix = c()
if (length(juncs)>0) {
jmessage("Brand new function for reciprocal junctions calling.")
balanced.jix = unlist(reciprocal.cycles(juncs, thresh = thresh.balanced, mc.cores = mc.cores, verbose = verbose>1))
dp.jix = which(gUtils::ra.duplicated(juncs, pad=1500))
balanced.jix = setdiff(balanced.jix, dp.jix)
}
## only adds edge nudge to the balanced junctions
edgenudge = edgenudge * as.numeric(seq_along(juncs) %in% balanced.jix)
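## e.g. with edgenudge = 0.1 and 3 balanced junctions among 10 total, the
## resulting nudge vector is 0.1 at those 3 indices and 0 elsewhere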
} else { ## nudge everything ..
if (length(edgenudge)==1) edgenudge = rep(edgenudge, length(juncs))
if (length(juncs)>0){ ## hot fix for preventing nudging of NA segments
bps.cov = gr.val(bpss, coverage, val = field)
na.jix = unique(bps.cov$grl.ix[is.na(values(bps.cov)[[field]])])
if (length(na.jix)>0){
## if (verbose)
## {
## jmessage("Cancel edge nudge for ", length(na.jix), " junctions.")
## }
edgenudge[na.jix] = 0
}
}
}
## some edges should be excluded:
## completely "dark" reference contigs
nothing.contig = gr2dt(kag$segstats)[, list(nothing = all(is.na(mean))), by=seqnames][nothing==TRUE, seqnames]
## both breakpoints in NA regions
if (length(juncs)>0){
junc.dt = data.table(data.frame(values(juncs)))
junc.dt[, ":="(from = NULL, to = NULL)]
junc.dt = cbind(junc.dt,
data.table(matrix(kag$ab.edges[,,1],
nrow=nrow(kag$ab.edges),
dimnames=dimnames(kag$ab.edges)[1:2])))
junc.dt[!is.na(from) & !is.na(to), ":="(mean.a=kag$segstats$mean[from],
mean.b=kag$segstats$mean[to],
chr.a = as.character(seqnames(kag$segstats[from])),
chr.b = as.character(seqnames(kag$segstats[to])))]
junc.dt[, both.na := is.na(mean.a) & is.na(mean.b)]
both.na.ix = junc.dt[, which(both.na==TRUE)] ## both breakpoint in NA
no.man.land = junc.dt[, which(chr.a %in% nothing.contig | chr.b %in% nothing.contig)]
## either breakpoint in a contig that's completely NA
## excluding those whose both bp in NA regions or mapped to completely NA contigs
ab.exclude = union(ab.exclude, union(both.na.ix, no.man.land))
## ab.exclude = union(ab.exclude, both.na.ix)
ab.force = setdiff(ab.force, ab.exclude)
edgenudge[ab.exclude] = 0
## furthermore, some extra edges should not be nudged
either.na.ix = junc.dt[, which(both.na==FALSE & (is.na(mean.a) | is.na(mean.b)))]
edgenudge[either.na.ix] = 0
if (verbose){
jmessage("Excluding ", length(both.na.ix), " aberrant junctions whose both breakpoints are in NA coverage regions")
jmessage("Cancel nudge for ", length(either.na.ix), " aberrant junctions where one of the 2 breakpoint is in NA coverage regions")
}
}
if (verbose){
jmessage("In sum, we are forcing ", length(ab.force),
" junctions, excluding ", length(ab.exclude),
" junctions, and nudging ", sum(edgenudge>0), " junctions")
}
## save the final indices of selected edges
saveRDS(ab.exclude, paste0(outdir, "/ab.exclude.rds"))
saveRDS(ab.force, paste0(outdir, "/ab.force.rds"))
saveRDS(edgenudge, paste0(outdir, "/edge.nudge.rds"))
if (!is.null(init))
{
if (is.character(init))
init = readRDS(init)
}
if (overwrite | !file.exists(jabba.raw.rds.file))
{
ramip_stub(kag.file,
jabba.raw.rds.file,
mc.cores = mc.cores,
max.threads = max.threads,
mem = max.mem,
tilim = tilim,
edge.nudge = edgenudge,
use.gurobi = use.gurobi,
ab.force = ab.force,
ab.exclude = ab.exclude, ## we now exclude things during karyograph_stub
## ab.exclude = integer(0),
init = init,
verbose = verbose,
purity.min = purity,
mipstart = mipstart,
epgap = epgap,
purity.max = purity,
ploidy.min = ploidy,
ploidy.max = ploidy,
slack.prior = 1/slack.penalty,
loose.penalty.mode = loose.penalty.mode,
dyn.tuning = dyn.tuning,
lp = lp,
ism = ism,
tfield = tfield,
fix.thres = fix.thres,
min.bins = min.bins,
customparams = !use.gurobi)
}
kag = readRDS(kag.file)
jab = readRDS(jabba.raw.rds.file)
if (overwrite | !file.exists(jabba.rds.file))
{
jabd = JaBbA.digest(jab, kag)
}
else
{
jabd = readRDS(jabba.rds.file)
}
jabd$purity = jab$purity
jabd$ploidy = jab$ploidy
if (overwrite | !file.exists(jabba.simple.rds.file))
{
if (verbose)
{
jmessage('simplifying segments in JaBbA graph and removing all copy 0 aberrant edges, dumping to jabba.simple.rds')
}
jabd.simple = JaBbA.digest(jab, kag, keep.all = F) ## simplified
}
else {
if (verbose)
{
jmessage('loading previously computed simplified JaBbA graph from jabba.simple.rds')
}
jabd.simple = readRDS(jabba.simple.rds.file)
}
jabd.simple$purity = jab$purity
jabd.simple$ploidy = jab$ploidy
junctions = kag$junctions
values(junctions)$cn = jab$adj[rbind(kag$ab.edges[, 1:2, 1])]
jabd.simple$junctions = jabd$junctions = jab$junctions = junctions
jab$ab.edges = kag$ab.edges
seg.out = cbind(
sample = name,
gr2dt(jabd$segstats)[
loose==FALSE & strand=="+",
.(chr = seqnames,
start, end, width, cn)])
names(seg.out)[1:4] = c('track.name', 'chrom', 'start', 'end')
## seg.out$seg.id = 1:nrow(seg.out)
## cols = c('track.name', 'chrom', 'start', 'end', 'cn', 'seg.id')
## seg.out = seg.out[, c(cols, setdiff(names(seg.out), cols))]
write.tab(seg.out, seg.tab.file)
jabd$segstats$seg.id = seq_along(jabd$segstats)
if (verbose)
{
jmessage('Checking for hets')
}
if (!is.null(hets))
if (file.exists(hets.gr.rds.file))
tryCatch(
{
jmessage('Loading hets')
hets.gr = readRDS(hets.gr.rds.file)
jmessage('Computing alleles for jabd ')
jabd = c(jabd, jabba.alleles(jabd, hets.gr, verbose = TRUE, uncoupled=TRUE)[c('asegstats', 'aadj', 'agtrack')])
jmessage('Computing alleles for jabd simple ')
jabd.simple = c(jabd.simple, jabba.alleles(jabd.simple, hets.gr, verbose = TRUE, uncoupled=TRUE)[c('asegstats', 'aadj', 'agtrack')])
jmessage('Done computing alleles'); file.remove(hets.gr.rds.file)
},
error = function(e) print("Jabba allelic generation failed"))
## annotate convergence status
## jabd.simple$segstats = jabd.simple$segstats %$% jab$segstats[, c("cl")]
## jabd.simple$segstats = gr.val(jabd.simple$segstats, jab$segstats[, c("epgap")], mean = FALSE, FUN = max, na.rm = TRUE, verbose = TRUE)
jabd.simple$segstats = jabd.simple$segstats %$% jab$segstats[, c("cl", "epgap")]
## opti = readRDS(paste0(outdir, "/opt.report.rds"))
## sapply(strsplit(head(jabd.simple$segstats$cl), ","), as.numeric)
jab$segstats = gr.fix(jab$segstats)
jabd$segstats = gr.fix(jabd$segstats)
jabd.simple$segstats = gr.fix(jabd.simple$segstats)
## dependency function: dflm
.dflm = function(x, last = FALSE, nm = '')
{
if (is.null(x))
out = data.frame(name = nm, method = as.character(NA), p = as.numeric(NA), estimate = as.numeric(NA), ci.lower = as.numeric(NA), ci.upper = as.numeric(NA), effect = as.character(NA))
else if (any(c('lm', 'betareg') %in% class(x)))
{
coef = as.data.frame(summary(x)$coefficients)
colnames(coef) = c('estimate', 'se', 'stat', 'p')
if (last)
coef = coef[nrow(coef), ]
coef$ci.lower = coef$estimate - 1.96*coef$se
coef$ci.upper = coef$estimate + 1.96*coef$se
if (!is.null(summary(x)$family))
{
fam = summary(x)$family$family
if (summary(x)$family$link %in% c('log', 'logit'))
{
coef$estimate = exp(coef$estimate)
coef$ci.upper= exp(coef$ci.upper)
coef$ci.lower= exp(coef$ci.lower)
}
}
else
fam = 'Unknown'
if (!last)
nm = paste(nm, rownames(coef))
out = data.frame(name = nm, method = fam, p = signif(coef$p, 3), estimate = coef$estimate, ci.lower = coef$ci.lower, ci.upper = coef$ci.upper, effect = paste(signif(coef$estimate, 3), ' [', signif(coef$ci.lower,3),'-', signif(coef$ci.upper, 3), ']', sep = ''))
}
else
{
ci.lower = ifelse(is.null(x$conf.int[1]), NA, x$conf.int[1])
ci.upper = ifelse(is.null(x$conf.int[2]), NA, x$conf.int[2])
out = data.table(name = nm,
method = x$method,
p = signif(x$p.value, 3),
estimate = ifelse(is.null(x$estimate), NA, x$estimate),
ci.lower,
ci.upper)
## FIXME: some model doesn't have `estimate` field
if (!is.null(x$estimate)){
out[, effect := paste0(signif(x$estimate, 3),
' [', signif(x$conf.int[1],3),
'-', signif(x$conf.int[2], 3), ']')]
} else {
out[, effect := "error"]
}
}
out$effect = as.character(out$effect)
out$name = as.character(out$name)
out$method = as.character(out$method)
rownames(out) = NULL
return(as.data.table(out))
}
if (filter_loose) {
jmessage("Starting loose end annotation")
## start building the model
## gather loose ends from sample
gg = gG(jabba = jabd)
ll = gr2dt(gr.start(gg$nodes[!is.na(cn) & loose.cn.left>0]$gr))[, ":="(lcn = loose.cn.left, strand = "+")]
lr = gr2dt(gr.end(gg$nodes[!is.na(cn) & loose.cn.right>0]$gr))[, ":="(lcn = loose.cn.right, strand = "-")]
## TODO
## arbitrarily defined based on Julie's paper and prelim TGCT 1kb dryclean data
if (binwidth<1000){
PTHRESH = 3.4e-7
} else {
PTHRESH = 2e-6
}
if ((nrow(ll)+nrow(lr))>0){
l = rbind(ll, lr)[, ":="(sample = name)] ## FIXME
l[, leix := 1:.N]
l = dt2gr(l)
le.class = filter.loose(gg, cov = coverage, field = field, l = l, PTHRESH = PTHRESH, verbose = TRUE, max.epgap = epgap)
saveRDS(le.class, le.class.file.rds)
n.le = dt2gr(le.class)
jabd.simple$segstats =
grbind(
jabd.simple$segstats %Q% (loose==FALSE),
jabd.simple$segstats %Q% (loose==TRUE) %$% n.le[, c("passed", "true.pos")])
jabd$segstats =
grbind(
jabd$segstats %Q% (loose==FALSE),
jabd$segstats %Q% (loose==TRUE) %$% n.le[, c("passed", "true.pos")])
jmessage("Loose end quality annotated")
}
} else {
jmessage("Skipping loose end annotation")
}
if (overwrite | !file.exists(jabba.simple.rds.file))
{
jmessage("Saving results")
saveRDS(jabd$segstats, seg.gr.file)
saveRDS(jab, jabba.raw.rds.file)
saveRDS(jabd, jabba.rds.file)
saveRDS(jabd.simple, jabba.simple.rds.file)
purity.ploidy.dt = data.table(purity = jab$purity,
ploidy = jab$ploidy)
fwrite(purity.ploidy.dt, purity.ploidy.txt.file, sep = '\t')
jab.gg = gGnome::gGraph$new(jab = jabd)
tmp.jabd.simple = jabd.simple
values(tmp.jabd.simple$junctions)$cn = NULL
jab.simple.gg = gGnome::gGraph$new(jab = tmp.jabd.simple)
saveRDS(jab.simple.gg, jabba.simple.gg.rds.file)
saveRDS(jab.gg, jabba.gg.rds.file)
}
tryCatch(
{
if (overwrite | !file.exists(jabba.vcf.file)) jabba2vcf(jabd, jabba.vcf.file);
if (overwrite | !file.exists(jabba.cnv.vcf.file)) jabba2vcf(jabd, jabba.cnv.vcf.file, cnv = TRUE)
if (overwrite | !file.exists(jabba.simple.vcf.file)) jabba2vcf(jabd.simple, jabba.simple.vcf.file)
if (overwrite | !file.exists(jabba.simple.cnv.vcf.file)) jabba2vcf(jabd.simple, jabba.simple.cnv.vcf.file, cnv = TRUE)
}, error = function(e) print("Jabba VCF generation failed"))
if (nrow(jabd$edges)>0){
seg.adj = cbind(data.frame(sample = rep(name, nrow(jabd$edges))), jabd$edges[, c('from', 'to', 'cn', 'type')])
write.tab(seg.adj, seg.adj.file)
}
values(kag$junctions)$cn.jabba = jab$adj[rbind(jab$ab.edges[, 1:2, 1])]
if (length(kag$junctions)>0)
{
tmp = grl.pivot(kag$junctions)
names(tmp[[1]]) = seq_along(kag$junctions)
names(tmp[[2]]) = seq_along(kag$junctions)
ra1 = as.data.frame(tmp[[1]])
ra2 = as.data.frame(tmp[[2]])
names(ra1) = paste('bp1_', names(ra1), sep = '')
names(ra2) = paste('bp2_', names(ra2), sep = '')
junc.txt = as.data.frame(values(kag$junctions))
write.tab(cbind(ra1, ra2, junc.txt), junctions.txt.file)
}
else
writeLines(c("\t"), junctions.txt.file)
## all junctions are based on what's left in the karyograph
saveRDS(kag$junctions, junctions.rds.file)
tmp.cov = sample(coverage, pmin(length(coverage), 5e5))
tmp.cov = gr.fix(tmp.cov, jabd$segstats)
# add ncn values
tmp.cov = tmp.cov %$% kag$segstats[,'ncn']
# transform using rel2abs
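# rel2abs converts the relative coverage signal in `field` to absolute copy
# number given the fitted purity and ploidy, using 'ncn' as the germline
# (normal) copy number at each bin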
tmp.cov$cn = rel2abs(tmp.cov, purity = jab$purity, ploidy = jab$ploidy,
field = field, field.ncn = 'ncn')
y1 = pmax(5, max(jabd$segstats$cn)*1.1)
jabd$gtrack$y1 = y1
jabd.simple$gtrack$y1 = y1
td.cov = gTrack(tmp.cov, y.field = 'cn', col = alpha('black', 0.2), name = 'Cov', y1 = (y1 + jab$gamma)/jab$beta)
if (verbose)
{
jmessage('Generating figures')
}
if (overwrite | !file.exists(jabba.png.file))
{
if (is.character(tryCatch(png(jabba.png.file, width = 2000, height = 1000), error = function(e) 'bla')))
pdf(gsub('png$', 'pdf', jabba.png.file), width = 10, height = 10)
jun = jabd$junctions
values(jun)$col = ifelse(values(jun)$cn>0, 'red', alpha('gray', 0.2))
if (is.null(jabd$agtrack)){
plotted = tryCatch(plot(c(td.cov, jabd$gtrack), links = jun), error = function(e) return(1))
} else {
plotted = tryCatch(plot(c(jabd$agtrack, td.cov, jabd$gtrack), links = jun), error = function(e) return(1))
}
if (!is.null(plotted)){
if (verbose){
jmessage("Something wrong with plotting JaBbA results. Please try it later.")
}
}
dev.off()
}
if (overwrite | !file.exists(jabba.simple.png.file))
{
jun = jabd.simple$junctions
values(jun)$col = ifelse(values(jun)$cn>0, 'red', alpha('gray', 0.2))
if (is.character(tryCatch(png(jabba.simple.png.file, width = 2000, height = 1000), error = function(e) 'bla')))
pdf(gsub("png$", "pdf", jabba.simple.png.file), width = 20, height = 10)
## if (is.null(jabd.simple$agtrack))
## plot(c(td.cov, jabd.simple$gtrack), links = jun)
## else
## plot(c(jabd.simple$agtrack, td.cov, jabd.simple$gtrack), links = jun)
if (is.null(jabd.simple$agtrack)){
plotted = tryCatch(plot(c(td.cov, jabd.simple$gtrack), links = jun),
error = function(e) return(1))
} else {
plotted = tryCatch(plot(c(jabd.simple$agtrack, td.cov, jabd.simple$gtrack), links = jun),
error = function(e) return(1))
}
if (!is.null(plotted)){
if (verbose){
jmessage("Something wrong with plotting JaBbA simplified results. Please try it later.")
}
}
dev.off()
}
## annotate loose ends
jmessage('Done .. job output in: ', normalizePath(outdir))
return(readRDS(jabba.simple.gg.rds.file))
}
###############################
## NOT EXPORTED AND INTERNAL TO JABBA
##
##############################
#' karyograph
#' @name karyograph
#' @rdname internal
karyograph_stub = function(seg.file, ## path to rds file of initial genome partition (values on segments are ignored)
cov.file, ## path to rds file GRanges of coverage with meta data field "field"
nseg.file = NULL, ## rds file of GRanges providing integer copy numbers for normal segments in the genome
het.file = NULL,
ra = NULL,
junction.file = NULL,
out.file,
pp.method = "sequenza",
ra.file = NULL,
verbose = FALSE,
force.seqlengths = NULL,
purity = NA,
ploidy = NA,
field = 'ratio',
mc.cores = 1,
max.chunk = 1e8,
max.na = -1,
## subsample = NULL,
ab.exclude = NULL,
ab.force = NULL,
lp = FALSE){
loose.ends = GRanges()
if (!is.null(ra)){
this.ra = ra
} else
{
if (!is.null(junction.file))
{
these.junctions = read.delim(junction.file, stringsAsFactors = FALSE)
if (ncol(these.junctions)<=1) ## wrong separator, retry as comma-delimited
these.junctions = read.delim(junction.file, sep = ',', stringsAsFactors = FALSE)
if (!is.null(these.junctions$strand1) & !is.null(these.junctions$strand2))
{
## looks like snowman input: flip breakends so that they point away from the junction
these.junctions$str1 = ifelse(these.junctions$strand1 == '+', '-', '+')
these.junctions$str2 = ifelse(these.junctions$strand2 == '+', '-', '+')
}
these.junctions$chr1 = gsub('23', 'X', gsub('24', 'Y', these.junctions$chr1))
these.junctions$chr2 = gsub('23', 'X', gsub('24', 'Y', these.junctions$chr2))
this.ra = read.junctions(these.junctions, seqlengths = hg_seqlengths())
}
else if (grepl('(\\.bedpe)|(\\.vcf$)|(\\.vcf\\.gz$)', ra.file))
{
tmp.ra = read.junctions(ra.file, seqlengths = hg_seqlengths(), get.loose = T, geno = geno)
if (length(tmp.ra)==0){
this.ra = gr.fix(GRangesList(), hg_seqlengths())
loose.ends = GRanges(seqlengths = hg_seqlengths())
} else {
this.ra = tmp.ra$junctions
loose.ends = tmp.ra$loose.ends
}
}
else
this.ra = readRDS(ra.file)
}
ab.exclude = intersect(seq_along(this.ra), ab.exclude)
ab.force = setdiff(intersect(seq_along(this.ra), ab.force), ab.exclude)
remaining = setdiff(seq_along(this.ra), ab.exclude)
this.ra = this.ra[remaining]
ab.force = which(remaining %in% ab.force)
## if we don't have normal segments then the coverage file is our source of truth for seqlengths
if (is.character(cov.file))
{
if (grepl('\\.rds$', cov.file))
this.cov = readRDS(cov.file)
else
{
this.cov = rtracklayer::import(cov.file)
if ("score" %in% names(values(this.cov))) {
values(this.cov)[, field] = values(this.cov)[, "score"]
} else {
names(values(this.cov)) = field
}
## field = 'score';
}
}
else {
this.cov = cov.file
}
## now make sure we have the "best" seqlengths
.fixsl = function(sl, gr) {sl[seqlevels(gr)] = pmax(GenomeInfoDb::seqlengths(gr), sl[seqlevels(gr)]); return(sl)}
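## e.g. .fixsl(c('1' = 100), GRanges('1', IRanges(1, 150), seqlengths = c('1' = 150)))
## returns c('1' = 150): the per-chromosome max of the two seqlength sets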
if (is.null(force.seqlengths)){
sl = .fixsl(GenomeInfoDb::seqlengths(this.ra), this.cov)
} else {
sl = .fixsl(force.seqlengths, this.cov)
}
if (!is.null(nseg.file))
{
if (is.character(nseg.file))
{
if (file.exists(nseg.file))
{
stopifnot(file.size(nseg.file)>0)
if (grepl('\\.rds$', nseg.file, ignore.case = TRUE))
{
nseg = readRDS(nseg.file)
}
else if (grepl('(\\.txt$)|(\\.tsv$)|(\\.csv$)', nseg.file))
{
nseg = dt2gr(fread(nseg.file))
}
else
{
nseg = rtracklayer::import(nseg.file)
}
} else {
jmessage('Did not find nseg file! Ignoring it.')
nseg = nseg.file = NULL
}
}
else {
nseg = nseg.file
}
if (!is.null(nseg)){
sl = .fixsl(sl, nseg)
}
}
## make sure all sl's are equiv
if (is.character(seg.file)){
if (file.exists(seg.file))
{
if (grepl('\\.rds$', seg.file, ignore.case = TRUE))
{
this.seg = readRDS(seg.file)
}
else if (grepl('(\\.txt$)|(\\.tsv$)|(\\.csv$)', seg.file))
{
this.seg = dt2gr(fread(seg.file))
}
else
{
this.seg = rtracklayer::import(seg.file)
}
}
this.seg = gr.fix(this.seg, sl, drop = T)[, c()]
} else {
this.seg = seg.file
}
if (length(loose.ends)>0)
{
if (verbose)
{
jmessage('Adding loose ends from vcf file to seg file')
}
this.seg = grbind(this.seg, gr.fix(loose.ends, sl, drop = T))
}
this.ra = gr.fix(this.ra, sl, drop = T)
## DONE: add segmentation to isolate the NA runs
## there was a lot of collateral damage because of bad segmentation
## na.runs = streduce(
## this.cov[which(is.na(values(this.cov)[, field]))], 1e4
## ) %Q% (width>1e5)
## this.kag.old = karyograph(this.ra, this.seg)
## this.kag = karyograph(this.ra, grbind(this.seg, na.runs))
this.kag = karyograph(this.ra, this.seg)
if (length(this.kag$tile)>5e4){
jmessage("WARNING: big karyograph > 50000 nodes, may take longer to finish.")
}
## NA junctions thrown out
## if (length(this.kag$junctions)>0){
## na.cov = this.cov[which(is.na(values(this.cov)[, field]))]
## abe = data.table(as.data.frame(this.kag$ab.edges[,1:2,1, drop=F]))
## colnames(abe) = c("from", "to")
## incident.nodes = abe[, c(from, to)]
## incident.gr = this.kag$tile[incident.nodes]
## incident.gr$nafrac = incident.gr %O% na.cov
## ov.na.runs = which(incident.gr %^% na.runs)
## ov.na.bins = which(incident.gr$nafrac > 0.2)
## na.ix = incident.nodes[union(ov.na.runs, ov.na.bins)]
## ab.exclude = union(ab.exclude,
## abe[, which(from %in% na.ix | to %in% na.ix)])
## ab.exclude = setdiff(ab.exclude, ab.force)
## if (length(ab.exclude)>0){
## new.jix = setdiff(setNames(seq_along(this.ra), seq_along(this.ra)), ab.exclude)
## ab.force = which(new.jix %in% ab.force)
## this.ra = this.ra[new.jix]
## this.kag = karyograph(this.ra, grbind(this.seg, na.runs))
## if(verbose){
## jmessage("Filtered out ", length(ab.exclude), " junctions in NA enriched regions")
## }
## }
## }
if (is.null(nseg.file)){
warning('No normal copy number values supplied so defaulting to 2 for all segments.')
this.kag$segstats$ncn = 2
}
hets.gr = NULL
if (!is.null(het.file))
{
if (is.character(het.file))
{
if (grepl(".rds$", het.file)){
hets = readRDS(het.file)
} else {
hets = fread(het.file)
}
}
else
hets = het.file
if (!is.data.table(hets))
hets = as.data.table(hets)
if (verbose)
{
jmessage('loaded hets')
}
if (inherits(hets, "data.frame")){
if (!is.null(hets$alt.count.n) & !is.null(hets$ref.count.n)){
## old format, apply het filter ourselves
hets$ref.frac.n = hets$alt.count.n / (hets$alt.count.n + hets$ref.count.n)
hets.gr = dt2gr(hets[pmin(ref.frac.n, 1-ref.frac.n) > 0.2 & (ref.count.n + alt.count.n)>=2, ])
hets.gr$alt = hets.gr$alt.count.t
hets.gr$ref = hets.gr$ref.count.t
} else if (grepl("chrom", colnames(hets)[1], ignore.case = T) &
grepl("(pos)?|(start)?", colnames(hets)[2], ignore.case = T) &
any(grepl("af$", colnames(hets), ignore.case = T))){
## PCAWG BAF format: Chromosome Pos BAF
colnames(hets)[1:2] = c("chr", "pos")
baf.col = grep("af$", colnames(hets),
value= TRUE, ignore.case = T)[1]
baf = hets[, baf.col, with = FALSE][[1]]
hets[, ":="(ref = 1 - baf, alt = baf)]
hets.gr = dt2gr(hets)
} else {## new, standard format, with $alt and $ref field
hets.gr = dt2gr(hets)
if (all(c("alt", "ref") %in% colnames(hets))){
jmessage("Valid hets already")
## hets.gr$alt.count.t = hets.gr$alt
## hets.gr$ref.count.t = hets.gr$ref
} else if (all(c("alt.count.t", "ref.count.t") %in% colnames(hets))){
hets.gr$alt = hets.gr$alt.count.t
hets.gr$ref = hets.gr$ref.count.t
hets.gr = hets.gr %Q% (!is.na(alt)) %Q% (!is.na(ref))
} else {
jmessage("hets is not in valid format, ignore")
hets.gr = NULL
}
}
} else if (inherits(hets, "GRanges")){
if (all(c("alt.count.t", "ref.count.t") %in% colnames(values(hets)))){
hets.gr = hets
hets.gr$alt = hets$alt.count.t
hets.gr$ref = hets$ref.count.t
hets.gr = hets.gr %Q% (!is.na(alt)) %Q% (!is.na(ref))
}
} else {
jmessage("hets is neither data.table nor GRanges, ignore.")
}
if (!is.null(hets.gr)) hets.gr = hets.gr[which(hets.gr %^% this.kag$tile)]
if (!is.null(hets.gr) & length(hets.gr)>0){
## save hets object for later
saveRDS(hets.gr, paste(dirname(out.file), 'hets.gr.rds', sep = '/'))
} else {
if (verbose){
jmessage("None of the provided (if any) germline heterozygosity site overlap the segments, ignore hets.")
}
}
}
if (length(hets.gr)>0){
## we have usable hets: pass them as the allelic signal (asignal) to segstats
this.kag$segstats = segstats(this.kag$tile,
this.cov,
field = field,
prior_weight = 1,
max.chunk = max.chunk,
## subsample = subsample,
asignal = hets.gr,
afields = c('ref', 'alt'),
mc.cores = mc.cores,
verbose = verbose,
max.na = max.na,
lp = lp)
} else {
this.kag$segstats = segstats(this.kag$tile,
this.cov,
field = field,
prior_weight = 1,
max.chunk = max.chunk,
## subsample = subsample,
mc.cores = mc.cores,
verbose = verbose,
max.na = max.na,
lp = lp)
}
this.kag$segstats$ncn = 2
if (!is.null(nseg.file)){
if (is.null(nseg$cn)){
warning('Normal seg file does not have "cn" metadata field. Using the default of 2!')
this.kag$segstats$ncn = 2
} else {
## check if ncn has chr prefix that is inconsistent with coverage and seg
this.kag$segstats$ncn = round(gr.val(this.kag$segstats, nseg, 'cn')$cn)
this.kag$segstats$mean[is.na(this.kag$segstats$ncn)] = NA ## remove segments for which we have no normal copy number
}
}
## filter ra here
## ## 6/15 temp fix for sd on short segments, which we overestimate for now
## cov.thresh = pmin(1e5, median(width(this.cov)))
## # jmessage('!!!!!!!!!!! cov.thresh for fix.sd is', cov.thresh, '\n')
## fix.sd = width(this.kag$segstats)<(3*cov.thresh)
## # this.kag$segstats$mean[make.na] = NA
## this.kag$segstats$sd[fix.sd] = sqrt(this.kag$segstats$mean[fix.sd])
## # if (is.character(tryCatch(png(paste(out.file, '.ppgrid.png', sep = ''), height = 500, width = 500), error = function(e) 'bla')))
## ss.tmp = this.kag$segstats[width(this.kag$segstats)>1e4, ] ## don't use ultra short segments
ss.tmp = this.kag$segstats %Q% (nbins>10) ## don't use ultra short segments
purity = as.numeric(purity)
ploidy = as.numeric(ploidy)
if (length(purity)==1 && length(ploidy)==1 && !is.na(purity) && !is.na(ploidy)) ## purity and ploidy are completely set
{
pp = data.table(purity = purity, ploidy = ploidy)
} else {
if (grepl(pp.method, "sequenza")){
use.sequenza = TRUE
use.ppurple = FALSE
use.ppgrid = FALSE
}
## temporarily deprecate Ppurple
# else if (grepl(pp.method, "ppurple")){
# use.ppurple = TRUE
# use.sequenza = FALSE
# use.ppgrid = FALSE
# }
else if (grepl(pp.method, "ppgrid")){
use.ppgrid = TRUE
use.ppurple = FALSE
use.sequenza = FALSE
} else {
use.sequenza = TRUE
use.ppurple = FALSE
use.ppgrid = FALSE
if (verbose){
jmessage("Cannot recognize the choice of purity-ploidy estimation method. Try the default 'Sequenza'.")
}
}
## only allow ppurple when hets.gr is absent
if (!exists("hets.gr")){
hets.gr = NULL
}
if (is.null(hets.gr)){
use.ppgrid = TRUE
use.ppurple = use.sequenza = FALSE
} else if (length(hets.gr)==0){
use.ppgrid = TRUE
use.ppurple = use.sequenza = FALSE
} else if (!all(c("ref.count.t",
"alt.count.t",
"ref.count.n",
"alt.count.n") %in% colnames(values(hets.gr)))){
use.sequenza = FALSE
use.ppurple = FALSE ## temp deprecate Ppurple
use.ppgrid = !use.ppurple
}
# if (use.ppurple)
# {
# jmessage("Using Ppurple to estimate purity ploidy")
# if (is.na(purity))
# {
# purity = seq(0, 1, 0.1)
# }
# if (is.na(ploidy))
# {
# ploidy = seq(1, 6, 0.2)
# }
# this.cov$y = values(this.cov)[, field]
# if (verbose)
# {
# jmessage('Computing purity and ploidy with Ppurple')
# }
# max.chunk = 1e3
# numchunks = ceiling(length(ss.tmp)/max.chunk)
# if (numchunks>length(purity)*length(ploidy)){
# pp = Ppurple::ppurple(cov = this.cov, hets = hets.gr, seg = ss.tmp,
# purities = purity, ploidies = ploidy,
# verbose = verbose,
# mc.cores = mc.cores,
# ## numchunks = numchunks,
# ignore.sex = TRUE)
# } else {
# pp = Ppurple::ppurple(cov = this.cov, hets = hets.gr, seg = ss.tmp,
# purities = purity, ploidies = ploidy,
# verbose = verbose,
# mc.cores = mc.cores,
# numchunks = numchunks,
# ignore.sex = TRUE)
# }
# } else
if (use.sequenza) {
jmessage("Using Sequenza to estimate purity ploidy")
if (is.na(purity))
{
purity = seq(0, 1, 0.01)
}
if (is.na(ploidy))
{
ploidy = seq(1, 6, 0.1)
}
## read in the segmentation and heterozygosity site read counts
sqz.seg = gr2dt(ss.tmp)[strand=="+"]
setnames(sqz.seg,
old = c("seqnames", "start", "end"),
new = c("chrom", "start.pos", "end.pos"))
sites = gr2dt(hets.gr)
## prepare input file to run w/ segment.breaks
sites[, adjusted.ratio := ((ref.count.t + alt.count.t) / (ref.count.n + alt.count.n))]
sites[, depth.normal := (ref.count.n + alt.count.n)]
sites[, depth.tumor := (ref.count.t + alt.count.t)]
sites[, good.reads := 2]
sites[, zygosity.normal := "het"]
sites[, alt.frac.t := alt.count.t / (ref.count.t + alt.count.t)]
setnames(sites,
old = c("seqnames", "start", "alt.frac.t"),
new = c("chromosome", "position", "Bf"))
sites = sites[which(Bf <= 0.5)] # They only include BAF w/ values 0-0.5
if (exists("nseg") && !is.null(nseg)){
## only running w/ diploid autosomes
## to avoid situations like HCC1143BL
good.chr = union(as.character(seqnames(nseg %Q% (cn==2))), "X")
sites = sites[which(chromosome %in% good.chr)]
} else {
## only running w/ chr1-22 and X
sites = sites[which(chromosome %in% seqlevels(ss.tmp))]
}
sites[, Af := 1-Bf]
## zchoo Wednesday, Apr 21, 2021 02:02:54 PM
## re-factor sites chromosome column to exclude Y as empty factor level
common.chr = intersect(as.character(sites$chromosome), as.character(sqz.seg$chrom))
sites = sites[as.character(chromosome) %in% common.chr,]
sqz.seg = sqz.seg[as.character(chrom) %in% common.chr,]
sites[, chromosome := factor(as.character(chromosome), levels = common.chr)]
sqz.seg[, chrom := factor(as.character(chrom), levels = common.chr)]
sqz.seg[, chromosome := chrom]
## xtYao Tuesday, Nov 26, 2019 04:43:57 PM: new sequenza expectation, we should freeze their code at this version
seg.s1 = sequenza::segment.breaks(sites, breaks = sqz.seg, weighted.mean = FALSE)
## twalradt Thursday, Apr 26, 2018 02:58:23 PM They wrote '10e6' in their documentation
seg.filtered = seg.s1[(seg.s1$end.pos - seg.s1$start.pos) > 1e6, ]
## get the genome wide mean of the normalized depth ratio:
weights.seg <- 150 + round((seg.filtered$end.pos -
seg.filtered$start.pos) / 1e6, 0)
avg.depth.ratio <- mean(sites$adjusted.ratio) # mean(gc.stats$adj[,2])
if (verbose){
jmessage("Starting BAF model fit")
}
## run the BAF model fit
CP = sequenza::baf.model.fit(Bf = seg.filtered$Bf,
depth.ratio = seg.filtered$depth.ratio,
weight.ratio = weights.seg,
weight.Bf = weights.seg,
avg.depth.ratio = avg.depth.ratio,
cellularity = purity,
ploidy = ploidy)
confint = sequenza::get.ci(CP)
pp = data.table(ploidy = confint$max.ploidy,
purity = confint$max.cellularity)
} else if (use.ppgrid){
jmessage("Using ppgrid to estimate purity ploidy")
pdf(paste(out.file, '.ppgrid.pdf', sep = ''), height = 10, width = 10)
if (!is.null(het.file))
{
pp = ppgrid(ss.tmp,
verbose = verbose,
plot = F,
mc.cores = mc.cores,
purity.min = ifelse(is.na(purity[1]), 0, purity[1]),
purity.max = ifelse(is.na(purity[length(purity)]), 1, purity[length(purity)]),
ploidy.min = ifelse(is.na(ploidy[1]), 1.2, ploidy[1]),
ploidy.max = ifelse(is.na(ploidy[length(ploidy)]), 6, ploidy[length(ploidy)]),
allelic = TRUE)
} else {
pp = ppgrid(ss.tmp,
verbose = verbose,
plot = F,
mc.cores = mc.cores,
purity.min = ifelse(is.na(purity[1]), 0, purity[1]),
purity.max = ifelse(is.na(purity[length(purity)]), 1, purity[length(purity)]),
ploidy.min = ifelse(is.na(ploidy[1]), 1.2, ploidy[1]),
ploidy.max = ifelse(is.na(ploidy[length(ploidy)]), 6, ploidy[length(ploidy)]),
allelic = FALSE)
}
} else {
jerror("Need purity ploidy estimates to start JaBbA.")
}
}
mu = this.kag$segstats$mean
mu[is.infinite(mu)] = NA
w = as.numeric(width(this.kag$segstats))
w[is.na(mu)] = NA
sw = sum(w, na.rm = T)
ncn = this.kag$segstats$ncn
ploidy_normal = sum(w * ncn, na.rm = T) / sw ## this will be = 2 if ncn is trivially 2
mutl = sum(mu * w, na.rm = T)
pp$beta = ((1-pp$purity)*ploidy_normal + pp$purity*pp$ploidy) * sw / (pp$purity * mutl)
pp$gamma = 2*(1-pp$purity)/pp$purity
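## sanity check on the affine fit (with purity a, gamma = 2(1-a)/a): the
## width-weighted mean signal mubar = mutl/sw satisfies
##   beta * mubar - (ploidy_normal/2) * gamma = ploidy
## i.e. cn = beta * y - (ncn/2) * gamma maps raw intensity y to absolute CN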
saveRDS(pp, paste(out.file, '.ppgrid.solutions.rds', sep = '')) ## save alternate solutions
this.kag$purity = pp[1,]$purity
this.kag$ploidy = pp[1,]$ploidy
this.kag$beta = pp[1,]$beta
this.kag$gamma = pp[1,]$gamma
## cnmle is the copy number before rounding
this.kag$segstats$cnmle = rel2abs(this.kag$segstats,
purity = this.kag$purity,
ploidy = this.kag$ploidy,
field = 'mean')
this.kag$segstats$cn = pmax(round(this.kag$segstats$cnmle), 0)
## this.kag$ab.exclude = ab.exclude
this.kag$ab.force = ab.force
saveRDS(this.kag, out.file) ## DONE
## TODO make these plots more helpful to the users
if (is.character(tryCatch(png(paste(out.file, '.ppfit.png', sep = ''), height = 1000, width = 1000), error = function(e) 'bla')))
pdf(paste(out.file, '.ppfit.pdf', sep = ''), height = 10, width = 10)
tmp.kag = this.kag
if (length(tmp.kag$segstats)<10)
warning('Number of segments used for purity-ploidy estimation is extremely low .. check coverage data')
.plot_ppfit(tmp.kag)
dev.off()
if (verbose)
{
jmessage('Built gGraph with ', length(this.kag$tile), ' nodes, ', sum(this.kag$adj!=0), ' edges, purity ', round(this.kag$purity,2), ', and ploidy ', round(this.kag$ploidy,2))
}
y1 = 10
if (is.character(tryCatch(png(paste(out.file, '.inputdata.png', sep = ''), height = 1000, width = 1000), error = function(e) 'bla'))){
pdf(paste(out.file, '.inputdata.pdf', sep = ''), height = 10, width = 10)
}
plot(c(gTrack(gr.fix(sample(this.cov, pmin(length(this.cov), 5e4)), this.kag$segstats), y.field = field, col = alpha('black', 0.3)),
gTrack(this.kag$segstats, y.field = 'mean', angle = 0, col = 'gray10', border = alpha('black', 0.2))), links = this.kag$junctions, y1 = y1)
dev.off()
}
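## Hypothetical invocation sketch (file paths and values are illustrative):
## karyograph_stub(seg.file = 'seg.rds',        ## initial partition (GRanges rds)
##                 cov.file = 'cov.rds',        ## binned coverage with a 'ratio' field
##                 ra.file = 'junctions.vcf',   ## BND vcf / bedpe / rds of junctions
##                 out.file = 'karyograph.rds',
##                 field = 'ratio', verbose = TRUE)
## The output rds holds the karyograph list ($tile, $adj, $segstats, $purity,
## $ploidy, $beta, $gamma) consumed downstream by ramip_stub / jbaLP.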
## diagnostic function used by karyograph_stub
#' @name .plot_ppfit
#' @rdname internal
.plot_ppfit = function(kag, xlim = c(-Inf, Inf))
{
tmp = kag$segstats ## only plot seg that we haven't fixed SD for and that have normal cn 1, to minimize confusion
dupval = sort(table(tmp$mean), decreasing = TRUE)[1]
if (!is.na(dupval))
if (dupval>5)
tmp = tmp[-which(as.character(tmp$mean) == names(dupval))]
if (length(tmp)==0)
return()
## sampling random loci to plot not segments
segsamp = pmin(sample(tmp$mean, 1e6, replace = T, prob = width(tmp)), xlim[2])
hist(
pmax(xlim[1], pmin(xlim[2], segsamp)),
1000, xlab = 'Segment intensity',
main = sprintf('Purity: %s Ploidy: %s Beta: %s Gamma: %s', kag$purity, kag$ploidy, round(kag$beta,2), round(kag$gamma,2)),
xlim = c(pmax(0, xlim[1]), pmin(xlim[2], max(segsamp, na.rm = T))))
abline(v = 1/kag$beta*(0:1000) + kag$gamma/kag$beta, col = alpha('red', 0.3), lty = c(4, rep(2, 1000)))
}
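## Usage sketch (assumes a karyograph list as saved by karyograph_stub):
## kag = readRDS('karyograph.rds')
## .plot_ppfit(kag, xlim = c(0, 5))
## The red vertical lines mark the expected raw intensity of each integer
## copy state, i.e. mean = (cn + gamma)/beta under the fitted purity/ploidy.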
#' @name ramip_stub
#' @rdname internal
#' @noRd
ramip_stub = function(kag.file,
out.file,
mc.cores = 1,
## max.threads = Inf,
max.threads = 12,
mem = 16,
tilim = 1200,
slack.prior = 0.001,
gamma = NA,
beta = NA,
customparams = FALSE,
purity.min = NA, purity.max = NA,
ploidy.min = NA, ploidy.max = NA,
init = NULL,
mipstart = NULL,
use.gurobi = FALSE,
epgap = 1e-4,
verbose = FALSE,
edge.nudge = 0,
ab.force = NULL,
ab.exclude = NULL,
loose.penalty.mode = "boolean",
dyn.tuning = TRUE,
debug.ix = NULL,
lp = FALSE,
ism = FALSE,
tfield = NULL,
fix.thres = -1,
min.bins = 5)
{
outdir = normalizePath(dirname(kag.file))
this.kag = readRDS(kag.file)
## if (is.null(this.kag$gamma) | is.null(this.kag$beta))
## {
## pp = ppgrid(this.kag$segstats, verbose = verbose, plot = T, purity.min = purity.min, purity.max = purity.max, ploidy.min = ploidy.min, ploidy.max = ploidy.max)
## this.kag$beta = pp[1,]$beta
## this.kag$gamma = pp[1,]$gamma
## }
if (!is.na(gamma))
{
if (verbose)
{
jmessage(sprintf('Overriding gamma with %s\n', gamma))
}
this.kag$gamma = gamma
}
if (!is.na(beta))
{
if (verbose)
{
jmessage(sprintf('Overriding beta with %s\n', beta))
}
this.kag$beta = beta
}
if (customparams)
{
if (verbose) jmessage("number of specified max.threads: ", max.threads)
MAX.THREADS = Sys.getenv("LSB_DJOB_NUMPROC")
if (nchar(MAX.THREADS) == 0)
MAX.THREADS = Inf
else
MAX.THREADS = as.numeric(MAX.THREADS)
max.threads = min(max.threads, MAX.THREADS)
if (is.infinite(max.threads))
max.threads = 0
if (verbose) jmessage("number of specified max.threads after processing: ", max.threads)
param.file = paste(out.file, '.prm', sep = '')
.cplex_customparams(param.file, max.threads, treememlim = mem * 1024,
workingmemlim = floor((mem * 0.6) * 1024))
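## (CPLEX limits are in MB: the full memory budget goes to the branch-and-bound
## tree, and ~60% of it to working memory)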
if (verbose) {
jmessage("param.file contents: ")
system2('cat', normalizePath(param.file))
}
Sys.setenv(ILOG_CPLEX_PARAMETER_FILE = normalizePath(param.file))
if (verbose)
{
jmessage('Creating ILOG CPLEX PARAMETER FILE in ', Sys.getenv('ILOG_CPLEX_PARAMETER_FILE'))
}
}
adj.nudge = this.kag$adj*0;
nnaix = which(rowSums(is.na(this.kag$ab.edges[,1:2, 1, drop = FALSE]))==0)
nna.abe = this.kag$ab.edges[nnaix, , 1, drop = FALSE]
## adj.nudge[] = 1*edge.nudge[nnaix] ## if edge.nudge is length ab.edges, then corresponding edges will be nudged
adj.nudge[nna.abe[,1:2,1]] = 1*edge.nudge
adj.lb = NULL
if (!is.null(ab.force))
{
if (verbose)
{
if (length(ab.force)>0)
{
jmessage(paste('Enforcing lower bounds on aberrant junctions:', paste(ab.force, collapse = ',')))
}
}
adj.lb = this.kag$adj*0
adj.lb[rbind(this.kag$ab.edges[ab.force, ,1])[, 1:2, drop = FALSE]] = 1
adj.lb[rbind(this.kag$ab.edges[ab.force, ,2])[, 1:2, drop = FALSE]] = 1
saveRDS(adj.lb, paste0(outdir, "/adj.lb.rds"))
}
if (!is.null(ab.exclude))
{
if (verbose)
{
if (length(ab.exclude)>0)
{
jmessage(paste('Excluding aberrant junctions:', paste(ab.exclude, collapse = ',')))
}
}
nnaix = which(rowSums(is.na(this.kag$ab.edges[,1:2,1, drop = FALSE]))==0)
ab.exclude = intersect(ab.exclude, nnaix)
adj.ub = this.kag$adj*0
adj.ub[rbind(this.kag$ab.edges[ab.exclude, ,1])[, 1:2, drop = FALSE]] = 0.1
adj.ub[rbind(this.kag$ab.edges[ab.exclude, ,2])[, 1:2, drop = FALSE]] = 0.1
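## a fractional upper bound of 0.1 pins these (integer) edge copy numbers
## to 0 while keeping the matrix entries structurally nonzero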
saveRDS(adj.ub, paste0(outdir, "/adj.ub.rds"))
} else {
adj.ub = NULL
}
## if mipstart is not given, construct the naive solution
## if mipstart is given (a gGnome or JaBbA object),
## here we create an mipstart "adj" matrix for the new graph
## by looking up the junctions in the current graph in the old object
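## schematically, an edge is keyed by the stranded 1bp end of its source
## segment pasted to the 1bp start of its sink (e.g. "1:1000-1000+ 1:1001-1001+"),
## so identical keys in the old and new graphs let us transfer edge CNs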
if (is.null(mipstart)){
if (file.exists(paste0(outdir, "/mipstart.rds"))){
jmessage("Using existing mipstart in the current directory")
mipstart = readRDS(paste0(outdir, "/mipstart.rds"))
} else {
jmessage("Adjusting the kag (naive solution) as mipstart (initial solution).")
ndt = gr2dt(this.kag$segstats)[, ":="(cnmle = cn)][, id := seq_along(this.kag$segstats)]
setkey(ndt, "id")
adj = this.kag$ab.adj ## logical mat
es = data.table(Matrix::which(adj!=0, arr.ind=T))
if (nrow(es)>0){
es[, ":="(elb = adj.lb[cbind(row, col)],
eub = adj.ub[cbind(row, col)])]
es[, eub := ifelse(eub!=0, 0, Inf)]
cn.out.lb = es[, .(out.lb = sum(elb)), keyby=row]
cn.in.lb = es[, .(in.lb = sum(elb)), keyby=col]
cn.out.ub = es[, .(out.ub = sum(eub)), keyby=row]
cn.in.ub = es[, .(in.ub = sum(eub)), keyby=col]
## edge lb and ub
ndt = merge(ndt, cn.out.lb, all.x = TRUE, by.x = "id", by.y = "row")
ndt = merge(ndt, cn.in.lb, all.x = TRUE, by.x = "id", by.y = "col")
ndt = merge(ndt, cn.out.ub, all.x = TRUE, by.x = "id", by.y = "row")
ndt = merge(ndt, cn.in.ub, all.x = TRUE, by.x = "id", by.y = "col")
pl = this.kag$ploidy
## make the best segment CN solution
ndt[, cn := cnmle]
ndt[is.na(cn), cn := ceiling(pl)]
ndt[, ":="(cn.lb = pmax(in.lb, out.lb),
cn.ub = pmin(in.ub, out.ub))]
## if any violation
## ndt[cn > cn.ub, cn := cn.ub]
ndt[cn < cn.lb, cn := cn.lb]
if (ndt[, any(is.na(cn) | cn<cn.lb, na.rm=T)]){
jerror("Infeasible bounds!!")
}
## construct the adj
es[, ":="(so.cn = ndt[.(row), cn],
si.cn = ndt[.(col), cn])]
es[, ":="(cn = pmax(elb, pmin(eub, pmin(so.cn, si.cn, na.rm=T), na.rm=T), na.rm=T))]
es[is.na(cn), cn := 0] ## there shouldn't be any, though
mipstart = list(segstats = this.kag$segstats,
adj = sparseMatrix(es$row, es$col, x = es$cn,
dims = dim(this.kag$adj)))
mipstart$segstats$cn = ndt[, cn] ## use my new cn
saveRDS(mipstart, paste0(outdir, "/mipstart.rds"))
} else {
mipstart = NULL
}
}
}
if (!is.null(mipstart))
{
if (verbose)
jmessage('Applying mipstarts from previous jabba solution')
## for mipstart graph
if ("loose" %in% colnames(values(mipstart$segstats))){
not.loose = which(mipstart$segstats$loose==FALSE)
mgre = suppressWarnings(
gr.end(mipstart$segstats[not.loose], 1, ignore.strand = FALSE)
)
mgrs = suppressWarnings(
gr.start(mipstart$segstats[not.loose], 1, ignore.strand = FALSE)
)
mij = Matrix::which(mipstart$adj[not.loose, not.loose, drop=FALSE] != 0,
arr.ind = TRUE)
} else {
mgre = suppressWarnings(gr.end(mipstart$segstats,1, ignore.strand = FALSE))
mgrs = suppressWarnings(gr.start(mipstart$segstats,1, ignore.strand = FALSE))
mij = Matrix::which(mipstart$adj!=0, arr.ind = TRUE)
}
mgend = gr.string(mgre)
mgstart = gr.string(mgrs)
mijs = data.table(mstr = paste(mgend[mij[,1]], mgstart[mij[,2]]),
cn = mipstart$adj[mij])
setkey(mijs, mstr)
## for new (i.e. this) graph
gre = suppressWarnings(gr.end(this.kag$segstats, 1, ignore.strand = FALSE))
grs = suppressWarnings(gr.start(this.kag$segstats, 1, ignore.strand = FALSE))
gend = gr.string(gre)
gstart = gr.string(grs)
ij = Matrix::which(this.kag$adj!=0, arr.ind = TRUE)
ijs = data.table(i = ij[,1],
j = ij[,2], gstr = paste(gend[ij[,1]], gstart[ij[,2]]),
mipstart = as.numeric(NA))
ijs$is.ref = GenomicRanges::shift(gre[ij[,1]], ifelse(as.logical(strand(gre)[ij[,1]]=="+"), 1, -1)) == grs[ij[,2]]
setkey(ijs, gstr)
mcn = mipstart$segstats$cn[gr.match(this.kag$segstats, mipstart$segstats)]
## for all remaining ref edges just pick the cn as the cn of the segment that it matches in mipstart
ijs[gstr %in% mijs$mstr, mipstart := mijs[list(gstr), cn]]
ijs[is.na(mipstart) & is.ref==TRUE, mipstart := pmin(mcn[i], mcn[j])]
ijs[is.na(mipstart), mipstart := 0] ## all remaining are 0
mipstart = sparseMatrix(ijs$i, ijs$j, x = ijs$mipstart,
dims = dim(this.kag$adj))
}
if (lp) {
ra.sol = jbaLP(kag.file = kag.file,
verbose = verbose,
tilim = tilim,
epgap = epgap,
lambda = 1/slack.prior,
ism = ism,
tfield = tfield,
max.mem = mem,
min.bins = min.bins,
fix.thres = fix.thres,
use.gurobi = use.gurobi)
} else {
ra.sol = jbaMIP(this.kag$adj,
this.kag$segstats,
beta = this.kag$beta,
gamma = this.kag$gamma,
tilim = tilim,
slack.prior = slack.prior,
mipemphasis = 0,
mipstart = mipstart, ## make mipstart if not provided
adj.lb = adj.lb,
epgap = epgap,
adj.ub = adj.ub,
use.gurobi = use.gurobi,
mc.cores = mc.cores,
adj.nudge = adj.nudge,
outdir = outdir,
cn.ub = rep(500, length(this.kag$segstats)),
use.L0 = loose.penalty.mode == 'boolean',
verbose = verbose,
dyn.tuning = dyn.tuning,
debug.ix = debug.ix)
}
saveRDS(ra.sol, out.file)
## add optimization status logging for LP
if (lp) {
opt.report = data.table(status = ra.sol$status,
obj = ra.sol$obj,
epgap = ra.sol$epgap)
} else {
opt.report =
do.call(`rbind`,
lapply(seq_along(ra.sol$sols),
function(cl){
x = ra.sol$sols[[cl]]
if (inherits(x$nll.cn, "Matrix") |
inherits(x$nll.cn, "matrix")){
nll.cn = x$nll.cn[1, 1]
} else {
nll.cn = NA
}
width.tot = sum(width(x$segstats %Q% (strand=="+"))/1e6)
data.table(cl = cl,
obj = ifelse(is.null(x$obj), NA, x$obj),
width.tot = width.tot,
status = ifelse(is.null(x$status), NA, x$status),
nll.cn = nll.cn,
nll.opt = x$nll.opt,
gap.cn = x$gap.cn,
epgap = ifelse(is.null(x$epgap), NA, x$epgap),
converge = ifelse(is.null(x$converge), NA, x$converge))
}))
}
saveRDS(opt.report, paste0(outdir, "/opt.report.rds"))
if (verbose){
jmessage("Recording convergence status of subgraphs")
}
## #' keh2019 Tuesday, Jan 11, 2022, Week 02, 01:40:04 PM commenting out
## if (customparams)
## {
## system(paste('rm', param.file))
## Sys.setenv(ILOG_CPLEX_PARAMETER_FILE='')
## }
}
##############################
#' @name segstats
#' @title segstats
#' @rdname internal
#' segstats is a step in the JaBbA pipeline
#'
#' @details
#' computes posterior means and sds for a target tiling GRanges of segments (target)
#' target must be a non-overlapping gapless strandless or two stranded tiling of the genome (eg output of gr.tile)
#' if two stranded, then every stranded interval must have a mirror image interval included
#'
#' given a GRanges of signals using value field "field" of signal GRanges
#' assuming that the signals inside each interval in "target" are independent samples from a
#' normal distribution of unknown mean and variance
#'
#' will also compute means and standard deviations from a gamma posterior of
#' the "high" and "low" alleles for a GRanges "asignal" representing allelic signal ref count and tot count
#' across a set of locations (i.e. modeling the counts as a poisson random variable)
#' Fields are specified by afields
#'
#' outputs target GRanges with "$mean" and "$sd" fields populated
#'
#' @param target GRanges of segments on which segstats will be computed
#' @param signal GRanges of coverage from which samples will be taken
#' @param field field of "signal" GRanges from which coverage signal will be pulled
#' @param asignal optional GRanges of width-1 biallelic SNP allele counts across the genome
#' @param afields length 2 character vector meta data fields of asignal GRanges that will be used to get allele counts (default is ref.count, alt.count)
#' ## @param subsample number between 0 and 1 with which to subsample per segment for coverage (useful for superdense coverage eg 50 bases to avoid correlations between samples due to read overlap)
#' @param mc.cores number of cores to run on (default 1)
#' @param lp (logical) if running LP use binstats-style loess smoothing
###########################################
segstats = function(target,
signal = NULL,
field = 'signal',
asignal = NULL, ## granges corresponding to width 1 snp allele counts across the genome
afields = c('ref.count', 'alt.count'), ## length 2 character vector specifying the two allele fields of asignal
prior_weight = 1,
prior_mean = NA, # if NA will compute prior "empirically"
prior_alpha = NA, # priors for inverse gamma for variance inference
prior_beta = NA,
max.chunk = 1e8,
max.slice = 2e4,
max.na = -1,
na.thresh = 0.2,
verbose = FALSE,
## subsample = NULL, ## number between 0 and 1 to subsample per segment for coverage (useful for dense coverage)
mc.cores = 1,
nsamp_prior = 1e3, ## number of data samples to estimate alpha / beta prior value
ksamp_prior = 100, ## size of data samples to estimate alpha / beta prior values
lp = FALSE
)
{
if (!is.null(asignal))
{
if (all(afields %in% names(values(asignal))))
{
asignal$low.count = pmin(values(asignal)[, afields[1]], values(asignal)[, afields[2]])
asignal$high.count = pmax(values(asignal)[, afields[1]], values(asignal)[, afields[2]])
asignal$ix = gr.match(asignal, target, max.slice = max.slice)
aprior_alpha = 1 #mean(c(asignal$low.count, signal$high.count), na.rm = T)
aprior_beta = 1
.postalpha = function(x)
aprior_alpha + sum(x, na.rm = T)
.postbeta = function(x)
aprior_beta + sum(!is.na(x))
## asignal.df = as.data.frame(asignal)
## alpha_high = vaggregate(high.count ~ ix, asignal.df, .postalpha)[as.character(seq_along(target))]
## beta_high = vaggregate(high.count ~ ix, asignal.df, .postbeta)[as.character(seq_along(target))]
## alpha_low = vaggregate(low.count ~ ix, asignal.df, .postalpha)[as.character(seq_along(target))]
## beta_low = vaggregate(low.count ~ ix, asignal.df, .postbeta)[as.character(seq_along(target))]
asignal.dt = gr2dt(asignal)
asignal.dt[,
":="(alpha_high = .postalpha(high.count),
beta_high = .postbeta(high.count),
alpha_low = .postalpha(low.count),
beta_low = .postbeta(low.count)),
by=ix]
asignal.dt = asignal.dt[!duplicated(ix), ]
setkey(asignal.dt, ix)
target$mean_high = asignal.dt[list(seq_along(target)), alpha_high / beta_high]
target$sd_high = asignal.dt[list(seq_along(target)), sqrt(alpha_high / (beta_high)^2)]
target$mean_low = asignal.dt[list(seq_along(target)), alpha_low / beta_low]
target$sd_low = asignal.dt[list(seq_along(target)), sqrt(alpha_low / (beta_low)^2)]
}
else
jerror('One or more of the afields ', paste(afields, collapse = ', '), ' not found as meta data columns of asignal')
}
if (!is.null(signal))
{
if (!(field %in% names(values(signal))))
jerror('Field not found in signal GRanges')
binwidth = as.numeric(names(sort(
table(width(sample(signal, 1000, replace=TRUE))), decreasing = TRUE
)[1]))
utarget = unique(gr.stripstrand(target)) ## strand-agnostic
if (is.null(names(utarget))){
names(utarget) = as.character(seq_along(utarget))
}
## start mapping signal to segments
map = gr2dt(gr.findoverlaps(utarget, signal))
map[, target.name := names(utarget)[query.id]]
map[, target.width := width(utarget)[query.id]]
setkey(map, "target.name")
mapped = unique(map[, target.name])
## these are the segments without an overlapping coverage point
unmapped = setdiff(names(utarget), mapped)
map = map[names(utarget)]
## map the value of field
map[, val := values(signal)[, field][subject.id]]
## xtYao ## Monday, Feb 15, 2021 11:10:32 PM
## Here explicitly set the infinite coverage values to NA
## Otherwise they will make the raw.var NAN
map[is.infinite(val), val := NA_real_]
## target$raw.sd = target$sd
## map = gr.tile.map(utarget, signal, verbose = T, mc.cores = mc.cores)
## sample mean and sample var
## sample.mean = sapply(vall, mean, na.rm = TRUE)
.geom.mean = function(x, na.rm = TRUE){
exp(mean(log(x[!is.infinite(log(x))]), na.rm=na.rm))
}
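## e.g. .geom.mean(c(1, 4, 16)) == 4; zeros drop out since log(0) = -Inf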
## several types of means were tried (see commented code); the geometric mean is used below
## sample.art.mean = sapply(vall, mean, na.rm = TRUE)
## sample.median = sapply(vall, median, na.rm = TRUE)
map[, raw.mean := .geom.mean(val), by = target.name]
map[, raw.var := var(val, na.rm = TRUE), by = target.name] ## na.rm = TRUE!!
## summarize valid bins per node
map[, good.bin := !is.na(val)]
## valid.signal.ix = which(!is.na(values(signal)[, field]))
## utarget$nbins = utarget %N% signal[valid.signal.ix]
## target$nbins = sapply(vall, function(x) sum(!is.na(x)))[
## as.character(abs(as.numeric(names(target))))
## ]
## utarget$nbins.tot = pmax(ceiling(width(utarget)/binwidth), utarget$nbins)
## target$nbins.tot = sapply(map, length)[as.character(abs(as.numeric(names(target))))]
## utarget$nbins.nafrac = 1 - utarget$nbins/utarget$nbins.tot
## utarget$wbins.nafrac = 1 - do.call(gUtils::`%o%`, list(utarget, signal[valid.signal.ix]))/width(utarget)
## map[is.na(map)] = NA_real_
## sample.geom.mean = sapply(vall, .geom.mean, na.rm = TRUE) ## change to geometric mean???
## sample.trim.mean = sapply(vall,
## function(x){
## if (sum(!is.na(x))>20){
## return(mean(x, trim=0.05, na.rm = TRUE))
## } else {
## return(mean(x, na.rm = TRUE))
## }
## })
## DEBUGGING: replace arithmetic mean with geometric mean
## sample.mean = sample.art.mean
## sample.mean = sample.geom.mean
## sample.var = sapply(vall, var, na.rm = TRUE) ## computing sample variance for each segment
ix = map[!is.na(raw.mean) & !is.na(raw.var), unique(target.name)]
if (length(ix)>0){
target.mdat = map[
,.(raw.mean = raw.mean[1],
raw.var = raw.var[1],
target.name = target.name[1],
nbins = sum(good.bin),
nbins.tot = .N,
nbins.nafrac = 1 - sum(good.bin)/.N,
wbins.nafrac = 1 - sum(width[which(good.bin==TRUE)], na.rm = TRUE)/target.width[1],
wbins.ok = sum(width[which(good.bin==TRUE)] / 1e3, na.rm = TRUE)), ## width of bins with coverage in kbp
## wbins.nafrac = 1 - sum(width[which(good.bin==TRUE)])/sum(width)),
keyby = target.name]
values(utarget) = cbind(
values(utarget),
target.mdat[names(utarget), .(raw.mean, raw.var,
nbins,
nbins.tot,
nbins.nafrac,
wbins.nafrac,
wbins.ok)]
)
## target$mean[ix] = sample.mean[ix]
## target$var[ix] = sample.var[ix]
} else {
## jmessage("Abort: No valid coverage present anywhere!")
jerror("No valid coverage present anywhere!")
}
utarget$mean = utarget$raw.mean
## val = values(signal)[, field]
## val[is.infinite(val)] = NA
## ## val[which(signal$good.prop<0.9)] = NA
## vall = lapply(map, function(x) val[x])
## vall = vall[match(gr.stripstrand(target), utarget)]
## final clean up
## target$art.mean = sample.art.mean
## target$median = sample.median
## target$geom.mean = sample.geom.mean
## target$trim.mean = sample.trim.mean
## target$raw.mean = target$mean
## target$raw.var = target$var
## map = gr.tile.map(utarget, signal, verbose = T, mc.cores = mc.cores)
## val = values(signal)[, field]
## val[is.infinite(val)] = NA
## vall = lapply(map, function(x) val[x])
## vall = vall[match(gr.stripstrand(target), utarget)]
## ## sample mean and sample var
## sample.mean = sapply(vall, mean, na.rm = TRUE)
## sample.var = sapply(vall, var, na.rm = TRUE) ## computing sample variance for each segment
## ix = !is.na(sample.mean) & !is.na(sample.var)
## target$mean = NA;
## if (any(ix)){
## target$mean[ix] = sample.mean[ix]
## target$var[ix] = sample.var[ix]
## } else {
## jmessage("Abort: No valid coverage present anywhere!")
## jerror("No valid coverage present anywhere!")
## }
## target$nbins = sapply(vall, function(x) sum(!is.na(x)))[
## as.character(abs(as.numeric(names(target))))
## ]
## target$nbins.tot = sapply(map, length)[as.character(abs(as.numeric(names(target))))]
## target$nbins.nafrac = 1-target$nbins/target$nbins.tot
## ## final clean up
## target$raw.mean = target$mean
## target$good.prop = (target+1e5) %O% good.bin
utarget$bad = FALSE
## if the user didn't give the max.na, we infer it
if (!is.numeric(max.na) || !between(max.na, 0, 1)){
if (verbose){
jmessage("No `max.na` argument found, inferring it for you now...")
}
## gather the values of nafrac
## colnames(values(target))
## nafrac = gr2dt(utarget[which(!duplicated(gr.stripstrand(target[, c()])))])[
## , .(seqnames, start, end, tile.id = 1:.N, nbins.nafrac)]
nafrac = utarget[mapped]$wbins.nafrac
if (var(nafrac)>0){
## dat = nafrac[!is.na(nbins.nafrac), cbind(nbins.nafrac)]
dat = cbind(nafrac)
## rownames(dat) = nafrac[!is.na(nbins.nafrac), tile.id]
## rownames(dat) = names(utarget)
rownames(dat) = mapped
km2 = stats::kmeans(dat, centers = 2)
## tell which cluster is good (low NA fraction) vs bad
good = which.min(km2$centers)
## threshold at the midpoint between the good cluster's max and the bad cluster's min
max.na = mean(c(max(dat[which(km2$cluster==good)]),
min(dat[which(km2$cluster!=good)])))
## TODO: max.na cannot end up equal to 0!!
if (verbose){
jmessage("The suggested `max.na` is at ", max.na)
}
} else {
max.na = 0
if (verbose){
jmessage("WARNING: your coverage input has no NAs, allowing all of the data...")
}
}
}
## FIXME: sometimes we'd throw away 1-bin not bad nodes because its variance is NA
if (length(bad.nodes <- which(((utarget$wbins.nafrac >= max.na) | (is.na(utarget$wbins.nafrac))) &
utarget$wbins.ok < 20))>0)
{
utarget$max.na = max.na ## what about really small segs in a good "environment"
utarget$bad[bad.nodes] = TRUE
utarget$mean[bad.nodes] = NA_real_
## target$sd[bad.nodes] = NA
if (verbose)
{
na.wid = sum(width(utarget[mapped] %Q% which(bad==TRUE)))/1e6
jmessage("Definining coverage good quality nodes as >=", (1 - max.na)*100, "% bases covered by non-NA and non-Inf values in +/-100KB region")
jmessage("Hard setting ", na.wid,
" Mb of the genome to NA that didn't pass our quality threshold")
if (na.wid > (sum(as.double(seqlengths(utarget[mapped])/1e6))/2)){
jmessage("WARNING: more than half of the mapped reference genome is set to NA, and ignored by JaBbA!!")
}
}
}
## loess var estimation
## i.e. we fit loess function to map segment mean to variance across the sample
## the assumption is that such a function exists
## target$nbins = sapply(map, length)[as.character(abs(as.numeric(names(target))))]
MINBIN = 1 ## enough data so variance~mean function can be estimated
tmp = data.table(var = utarget$raw.var,
mean = utarget$mean,
nbins = utarget$nbins,
bad = utarget$bad)[var>0 & nbins>MINBIN & !bad & !is.na(mean) & !is.na(var), ]
## if (lp) {
## browser()
## bins.gr = gr.tile(signal, 5e5)
## bins.gr$id = 1:length(bins.gr)
## bins.dt = as.data.table(signal %$% bins.gr) %>% setnames(field, "cn")
## tmp = bins.dt[, .(mean = mean(cn, na.rm = TRUE),
## var = var(cn, na.rm = TRUE),
## nbins = sum(!is.na(cn), na.rm = TRUE))][nbins > MINBIN & var > 0,]
## }
## xtYao ## Thursday, Feb 18, 2021 02:07:22 PM
## To prevent extreme outlying variances, limit training data to variances between the 0.05 and 0.95 quantiles
middle.mean = tmp[, which(between(mean, quantile(mean, 0.05, na.rm = TRUE), quantile(mean, 0.95, na.rm = TRUE)))]
middle.var = tmp[, which(between(var, quantile(var, 0.05, na.rm = TRUE), quantile(var, 0.95, na.rm = TRUE)))]
if (verbose)
{
jmessage('Using loess to fit mean to variance relationship in segments with greater than ', MINBIN, ' bins')
}
if (nrow(tmp)<10)
{
warning(sprintf('Could not find enough (>=10) segments with more than %s bins for modeling mean to variance relationship in data. Data might be hypersegmented.', MINBIN))
}
## overdispersion correction
##lmd = tmp[, lm(var ~ mean)]
## loe = tmp[, loess(var ~ mean, weights = nbins, span = 2)]
## loe = tmp[, loess(var ~ mean, weights = nbins, span = 5)]
## No don't, this is stupid
## xtYao ## Wednesday, Feb 17, 2021 02:56:29 PM
## Switch to "surface='direct'" for LOESS as it extrapolates
## Also, tune up the span parameter to reduce overfitting
## loe = tmp[, loess(var ~ mean, weights = nbins, span = 2, control = loess.control(surface = "direct"))]
## tmp[, predict.var := predict.lm(lmd, newdata = data.table(mean))]
## tmp[, predict.var := predict(loe, newdata = mean)]
## tmp[, predict.var := predict(loe2, newdata = mean)]
## tmp = tmp[order(mean)]
## ppdf(print(
## ggplot(tmp) +
## geom_point(aes(x = mean, y = var)) +
## geom_line(aes(x = mean, y = predict.var), col = "red") +
## geom_line(aes(x = mean, y = predict.var2), col = "purple") +
## theme_pub()
## ))
## inferring segment specific variance using loess fit of mean to variance per node
## using loess fit as the prior sample var
## get the bayesian point estimator (expectation of posterior distribution)
## with conjugate prior of scaled inverse chi-sq
## min allowable var
loe.middle.i = tmp[intersect(middle.var, middle.mean), loess(var ~ mean, weights = nbins, span = 5)]
loe = loe.middle.i
utarget$loess.var = predict(loe, utarget$mean)
## hyperparameter neu, same unit as sample size,
## the larger the more weight is put on prior
## neu = median(target$nbins, na.rm=T)## neu = 5 ## let's start with this
neu = utarget$nbins
## neu = 434 ## let's start with this
neu.post = utarget$nbins + neu
utarget$tau.sq.post = (neu * utarget$loess.var + (utarget$nbins - 1) * utarget$raw.var)/ (neu + utarget$nbins)
utarget$post.var = try(neu.post * utarget$tau.sq.post / (neu.post - 2))
utarget$var = utarget$post.var
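## worked form: with neu = nbins = n, tau.sq.post reduces to
##   (loess.var + (1 - 1/n) * raw.var) / 2
## and post.var = 2n * tau.sq.post / (2n - 2), i.e. roughly the average of
## the loess prior and the sample variance, slightly inflated for small n
## (the expectation of the scaled inverse chi-squared posterior)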
min.var = min(tmp$var, na.rm = TRUE)
## max.var = min(tmp$var, na.rm = TRUE)
## xtYao ## Tuesday, Feb 16, 2021 03:31:04 PM
## There could be good small segments with a valid mean without a var
## fill them in with just the LOESS prediction
miss.var = which(is.na(utarget$var) & !is.na(utarget$mean))
utarget$var[miss.var] = utarget$loess.var[miss.var]
## plot the prediction
## tdt = gr2dt(target)
## tdt[, more_than_20bins := ifelse(nbins>20, ">20bins", "<=20bins")]
## tdt[, more_than_5bins := ifelse(nbins>5, ">5bins", "<=5bins")]
## ppdf(print(
## tdt[order(mean)] %>%
## ggplot() +
## geom_point(aes(x = mean, y = raw.var, size = nbins), color = "grey75", alpha = 0.5) +
## geom_line(aes(x = mean, y = loess.var), color = "salmon", lwd = 2) +
## geom_hline(yintercept = 0, lty = "dashed") +
## theme_pub(20) +
## facet_wrap(~ more_than_5bins, nrow = 1, scales = "free")
## ), width = 12)
## wtf = target %Q% which(strand=="+" & raw.var>8000)
## clean up NA values which are below or above the domain of the loess function which maps mean -> variance
## basically assign all means below the left domain bound of the function the variance of the left domain bound
## and analogously for all means above the right domain bound
na.var = is.na(utarget$var)
rrm = range(utarget$mean[!na.var])
rrv = pmax(predict(loe, rrm), min.var)
utarget$var[utarget$mean<=rrm[1]] = rrv[1]
utarget$var[utarget$mean>=rrm[2]] = rrv[2]
## no negative var!
utarget$var[utarget$var<=0] = rrv[1]
## if running LP, update loess.var as well
## if (lp) {
## jmessage("Running LP mode, filling in loess.var")
## utarget$loess.var[utarget$mean<=rrm[1]] = rrv[1]
## utarget$loess.var[utarget$mean>=rrm[2]] = rrv[2]
## utarget$loess.var[utarget$var<=0] = rrv[1]
## }
## fill in loess.var generally
utarget$loess.var[utarget$mean<=rrm[1]] = rrv[1]
utarget$loess.var[utarget$mean>=rrm[2]] = rrv[2]
#' zchoo Wednesday, Sep 22, 2021 10:05:59 AM
## check specifically for loess.var < 0
utarget$loess.var[utarget$loess.var<=0] = rrv[1]
pdf("var.mean.loess.pdf")
## all points training
## loe = tmp[, loess(var ~ mean, weights = nbins, span = 5)]
## plot(x = tmp$mean, y = tmp$var, pch = 19, cex = 0.5)
## lines(x = sort(tmp$mean), y = predict(loe, sort(tmp$mean)), col = "red")
## middle means only
## loe.middle.mean = tmp[middle.mean, loess(var ~ mean, weights = nbins, span = 5)]
## plot(x = tmp$mean, y = tmp$var, pch = 19, cex = 0.5, xlim = tmp[, quantile(mean, c(0.05, 0.95))])
## lines(x = sort(tmp[, mean]), y = predict(loe.middle.mean, sort(tmp[, mean])), col = "orange")
## middle vars only
## loe.middle.var = tmp[middle.var, loess(var ~ mean, weights = nbins, span = 5)]
## plot(x = tmp$mean, y = tmp$var, pch = 19, cex = 0.5, xlim = tmp[, quantile(mean, c(0.05, 0.95))])
## lines(x = sort(tmp[, var]), y = predict(loe.middle.var, sort(tmp[, mean])), col = "salmon")
## union
## loe.middle.u = tmp[union(middle.var, middle.mean), loess(var ~ mean, weights = nbins, span = 5)]
## plot(x = tmp$mean, y = tmp$var, pch = 19, cex = 0.5, xlim = tmp[, quantile(mean, c(0.05, 0.95))])
## lines(x = sort(tmp[, var]), y = predict(loe.middle.u, sort(tmp[, mean])), col = "salmon")
## intersect
## pcols = .get_density(tmp$mean, tmp$var)
## cr = colorRamp(c("#f7f7f7", "#2166ac"))
plot(x = tmp$mean, y = tmp$var, pch = 19, cex = 0.5, xlim = tmp[, quantile(mean, c(0.05, 0.95))], ylim = tmp[, quantile(var, c(0.05, 0.95))]## ,
## col = col2hex(t(cr(pcols)))
)
lines(x = sort(tmp[, mean]), y = predict(loe.middle.i, sort(tmp[, mean])), col = "salmon")
plot(x = tmp[nbins>100]$mean, y = tmp[nbins>100]$var, pch = 19, cex = 0.5, xlim = tmp[, quantile(mean, c(0.05, 0.95))], ylim = tmp[, quantile(var, c(0.05, 0.95))])
lines(x = sort(tmp[nbins>100, mean]), y = predict(loe.middle.i, sort(tmp[nbins>100, mean])), col = "salmon")
plot(utarget$raw.var, utarget$var)
dev.off()
## xtYao ## Tuesday, Feb 16, 2021 09:53:50 AM
## CHECK: no good node should have NA var
if (any(is.na(utarget$var) & !is.na(utarget$mean))){
jerror("Some segments with valid mean do not have a variance.")
}
if (any(utarget$var<=0, na.rm = TRUE)){
jerror("Some segments have non-positive variance.")
}
## computing sd / sem for each utarget
utarget$sd = sqrt((2*utarget$var)/utarget$nbins)
var.ratio = max(utarget$var,na.rm = TRUE)/min(utarget$var, na.rm = TRUE)
if ((var.ratio)>1e7)
{
warning('Ratio of highest and lowest segment variances exceeds 1e7. This could result from very noisy bin data and/or extreme hypersegmentation. Downstream optimization results may be unstable.')
}
## browser()
## ## debug
## library(ggplot2)
## tdt = gr2dt(utarget)
## tdt[, in.tmp := raw.var>0 & nbins>MINBIN & !bad]
## tdt = tdt[order(raw.mean)]
## ppdf({
## print(
## tdt %>%
## ggplot(aes(x = raw.mean, y = raw.var)) +
## geom_point(aes(size = nbins, color = in.tmp, shape = bad)) +
## geom_line(aes(x = raw.mean, y = loess.var), color = "red") +
## scale_color_viridis(alpha = 0.5, discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
## geom_vline(xintercept = rrm[1], lty = "dashed") +
## geom_vline(xintercept = rrm[2], lty = "dashed") +
## geom_hline(yintercept = rrv[1], lty = "dashed") +
## geom_hline(yintercept = rrv[2], lty = "dashed") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.05)], lty = "dotted") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.95)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.05)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.95)], lty = "dotted") +
## scale_x_continuous(trans = "log10") +
## scale_y_continuous(trans = "log10") +
## theme_pub()
## )
## print(
## tdt[(in.tmp)] %>%
## ggplot(aes(x = raw.mean, y = raw.var)) +
## geom_point(aes(size = nbins)) +
## geom_line(aes(x = raw.mean, y = loess.var), color = "red") +
## geom_vline(xintercept = rrm[1], lty = "dashed") +
## geom_vline(xintercept = rrm[2], lty = "dashed") +
## geom_hline(yintercept = rrv[1], lty = "dashed") +
## geom_hline(yintercept = rrv[2], lty = "dashed") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.05)], lty = "dotted") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.95)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.05)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.95)], lty = "dotted") +
## ## scale_color_viridis(alpha = 0.5, discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
## scale_x_continuous(trans = "log10") +
## scale_y_continuous(trans = "log10") +
## theme_pub()
## )
## print(
## tdt %>%
## ggplot(aes(x = raw.mean, y = raw.var)) +
## geom_point(aes(size = nbins, color = in.tmp, shape = bad)) +
## geom_line(aes(x = raw.mean, y = loess.var), color = "red") +
## scale_color_viridis(alpha = 0.5, discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
## geom_vline(xintercept = rrm[1], lty = "dashed") +
## geom_vline(xintercept = rrm[2], lty = "dashed") +
## geom_hline(yintercept = rrv[1], lty = "dashed") +
## geom_hline(yintercept = rrv[2], lty = "dashed") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.05)], lty = "dotted") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.95)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.05)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.95)], lty = "dotted") +
## ## scale_x_continuous(trans = "log10") +
## ## scale_y_continuous(trans = "log10") +
## theme_pub()
## )
## print(
## tdt[(in.tmp)] %>%
## ggplot(aes(x = raw.mean, y = raw.var)) +
## geom_point(aes(size = nbins)) +
## geom_line(aes(x = raw.mean, y = loess.var), color = "red") +
## geom_vline(xintercept = rrm[1], lty = "dashed") +
## geom_vline(xintercept = rrm[2], lty = "dashed") +
## geom_hline(yintercept = rrv[1], lty = "dashed") +
## geom_hline(yintercept = rrv[2], lty = "dashed") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.05)], lty = "dotted") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.95)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.05)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.95)], lty = "dotted") +
## ## scale_color_viridis(alpha = 0.5, discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
## ## scale_x_continuous(trans = "log10") +
## ## scale_y_continuous(trans = "log10") +
## theme_pub()
## )
## print(
## tdt[raw.mean>25 & nbins>2] %>%
## ggplot(aes(x = raw.mean, y = raw.var)) +
## geom_point(aes(size = nbins)) +
## geom_line(aes(x = raw.mean, y = loess.var), color = "red") +
## geom_vline(xintercept = rrm[1], lty = "dashed") +
## geom_vline(xintercept = rrm[2], lty = "dashed") +
## geom_hline(yintercept = rrv[1], lty = "dashed") +
## geom_hline(yintercept = rrv[2], lty = "dashed") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.05)], lty = "dotted") +
## geom_vline(xintercept = tmp[, quantile(mean, 0.95)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.05)], lty = "dotted") +
## geom_hline(yintercept = tmp[, quantile(var, 0.95)], lty = "dotted") +
## ## scale_color_viridis(alpha = 0.5, discrete = TRUE, begin = 0.2, end = 0.8, option = "magma") +
## ## scale_x_continuous(trans = "log10") +
## ## scale_y_continuous(trans = "log10") +
## theme_pub()
## )
## print(
## tdt %>%
## ggplot(aes(x = raw.var, y = var)) +
## geom_point(aes(size = nbins, color = in.tmp)) +
## scale_x_continuous(trans = "log10") +
## scale_y_continuous(trans = "log10") +
## theme_pub()
## )
## })
## finally copy all metadata from utarget to target
values(target) = values(utarget)[gr.match(target, utarget), ]
}
return(target)
}
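## A minimal usage sketch with simulated inputs ('signal' is the default
## field name; all other values are illustrative):
## target = gr.tile(parse.gr('1:1-1000000'), 1e4)   ## 100 segments
## signal = gr.tile(parse.gr('1:1-1000000'), 200)   ## 200bp coverage bins
## signal$signal = rnorm(length(signal), mean = 10, sd = 2)
## ss = segstats(target, signal, field = 'signal')
## head(ss$mean); head(ss$sd)                       ## posterior mean / sem per segment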
#' @name jmessage
#' @rdname internal
jmessage = function(..., pre = 'JaBbA')
message(pre, ' ', paste0(as.character(Sys.time()), ': '), ...)
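## e.g. jmessage('solving') prints "JaBbA 2024-01-01 12:00:00: solving" (timestamp illustrative)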
#' @name jwarning
#' @rdname internal
jwarning = function(..., pre = 'JaBbA', call. = FALSE)
warning(paste0(pre, ' ', paste0(as.character(Sys.time()), ': '), ...), call. = call.)
#' @name jerror
#' @rdname internal
jerror = function(..., pre = 'JaBbA', call. = TRUE)
stop(paste0(pre, ' ', paste0(as.character(Sys.time()), ': '), ...), call. = call.)
#' @name jbaLP
#' @title jbaLP
#'
#' @details
#'
#' LP analog of jbaMIP
#'
#' @param kag.file (character) path to karyograph
#' @param gg.file (character) path to gGraph
#' @param kag (karyograph object) karyograph (list)
#' @param gg (gGraph object) gGraph
#' @param cn.field (character) column in karyograph with CN guess, default cnmle
#' @param var.field (character) column in karyograph with node variance estimate, default loess.var
#' @param bins.field (character) column in karyograph containing number of bins, default nbins
#' @param tfield (character) column in junction metadata containing junction tier (default tier)
#' @param min.var (numeric) min allowable variance, default 1
#' @param min.bins (numeric) min allowable bins default 1
#' @param lambda (numeric) slack penalty, default 100
#' @param L0 (logical) default TRUE
#' @param M (numeric) max copy number, default 1e3
#' @param verbose (numeric) 0 (silent), 1 (progress messages), 2 (also print MIP), default 2
#' @param tilim (numeric) default 1e3
#' @param ism (logical) whether to add infinite site assumption constraints. default TRUE
#' @param epgap (numeric) default 1e-3
#' @param max.mem (numeric) maximum memory in GB
#' @param fix.thres (numeric) multiple of lambda above which to fix nodes
#' @param return.type (character) either "gGraph" or "karyograph"
#' @param require.convergence (logical) warn if not converged? default FALSE
#' @param max.epgap.thresh (numeric) above this value, all node and edge CNs are NA (default 0.5)
#' @param nodefileind (numeric) one of 0, 1, 2, 3 (for storing CPLEX tree node files), default 3
#' @param use.gurobi (logical) use gurobi? default FALSE
#'
#' @return
#' karyograph with modified segstats/adj. Adds fields epgap, cl, ecn.in, ecn.out, eslack.in, eslack.out to $segstats and edge CNs to $adj
#'
#' @author Marcin Imielinski, Zi-Ning Choo
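#' @examples
#' \dontrun{
#' ## minimal sketch, assuming 'karyograph.rds' was produced upstream by
#' ## karyograph_stub (the path and parameter values are illustrative)
#' sol = jbaLP(kag.file = "karyograph.rds",
#'             lambda = 100, tilim = 1200, epgap = 1e-4, verbose = 1)
#' sol$segstats$cn  ## balanced integer copy numbers per node
#' }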
jbaLP = function(kag.file = NULL,
gg.file = NULL,
kag = NULL,
gg = NULL,
cn.field = "cnmle",
var.field = "loess.var",
bins.field = "nbins",
tfield = "tier",
min.var = 1,
min.bins = 1,
lambda = 100,
L0 = TRUE,
M = 1e3,
verbose = 2,
tilim = 1e3,
ism = TRUE,
epgap = 1e-3,
max.mem = 16,
fix.thres = -1,
round.thresh = 0.25,
return.type = "karyograph",
require.convergence = FALSE,
max.epgap.thresh = 0.5,
nodefileind = 3,
use.gurobi = FALSE)
{
if (is.null(kag.file) & is.null(kag) & is.null(gg.file) & is.null(gg)) {
stop("one of kag, kag.file, gg.file, gg must be supplied")
}
if (!is.null(kag.file) & !is.null(kag)) {
warning("both kag.file and kag supplied. using kag.")
}
if (!is.null(kag)) {
if (verbose) {
message("using supplied karyograph")
}
kag.gg = gG(jabba = kag)
beta = kag$beta
} else {
if (!is.null(kag.file) && file.exists(kag.file)) {
if (verbose) {
message("reading karyograph from file")
}
kag = readRDS(kag.file)
kag.gg = gG(jabba = kag)
beta = kag$beta
} else if (!is.null(gg)) {
if (verbose) {
message("using supplied gGraph")
}
kag.gg = gg$copy
beta = gg$meta$beta
} else if (!is.null(gg.file) && file.exists(gg.file)) {
if (verbose) {
message("reading gGraph from provided file")
}
kag.gg = readRDS(file = gg.file)
beta = kag.gg$meta$beta
} else {
stop("kag.file does not exist and kag not supplied")
}
}
if (verbose) {
message("Marking nodes with cn contained in column: ", cn.field)
}
if (is.null(values(kag.gg$nodes$gr)[[cn.field]])) {
stop("karyograph must have field specified in cn.field")
}
kag.gg$nodes$mark(cn = values(kag.gg$nodes$gr)[[cn.field]])
if (verbose) {
message("Computing node weights using variance contained in column: ", var.field)
}
if (is.null(values(kag.gg$nodes$gr)[[var.field]]) || is.null(values(kag.gg$nodes$gr)[[bins.field]])) {
warning("karyograph missing var.field and/or bins.field. setting weights to node widths")
wts = width(kag.gg$nodes$gr)
} else {
## process variances
vars = values(kag.gg$nodes$gr)[[var.field]]
if (is.null(beta)) {
warning("no $beta provided in karyograph/gg metadata")
} else if (is.na(beta)) {
warning("NA value for $beta provided in karyograph/gg metadata")
} else {
vars = beta * beta * vars
}
## make sure there are no negative variances and that variance is at least CN
## this is because there are a larger number of points with low CN and few high CN points
## resulting in low estimated variance for high CN nodes even if raw variance was high
vars = pmax(pmax(vars, kag.gg$nodes$dt$cn), min.var)
## vars = pmax(vars, min.var) ## make sure that all variances are at least min.var
sd = sqrt(vars)
## process bins
bins = values(kag.gg$nodes$gr)[[bins.field]]
bins = ifelse(bins < min.bins, NA, bins)
## compute node weights
wts = bins / (sd * sqrt(2)) ## for consistency with Laplace distribution
wts = ifelse(is.infinite(wts) | is.na(wts) | wts < 0, NA, wts)
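## e.g. a node with 100 bins and scaled variance 4 (sd = 2) gets weight 100/(2*sqrt(2)) ~ 35.4,
## which (under the Laplace error model above) scales that node's absolute-deviation fit term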
}
kag.gg$nodes$mark(weight = wts)
## check for edge rewards
if (!is.null(kag.gg$edges$dt$reward)) {
if (verbose) {
message("Checking edge rewards...")
}
erewards = kag.gg$edges$dt[, reward]
reward.ix = which(erewards != 0)
if (verbose) {
message("Number of nonzero rewards: ", length(reward.ix))
}
erewards[reward.ix] = lambda / 8 ## set to lambda? or half lambda?
kag.gg$edges$mark(reward = erewards)
} else {
if (verbose) {
message("Rewards not supplied on edges!")
}
}
## no edge CNs
kag.gg$edges$mark(cn = NULL)
kag.gg$nodes[abs(cn) > M]$mark(cn = NA, weight = NA)
## add lower bounds depending on ALT junction tier
if (tfield %in% colnames(kag.gg$edges$dt)) {
lbs = ifelse(kag.gg$edges$dt[, ..tfield] == 1, 1, 0)
kag.gg$edges$mark(lb = lbs)
}
if (verbose) {
message("Starting LP balance on gGraph with...")
message("Number of nodes: ", length(kag.gg$nodes))
message("Number of edges: ", length(kag.gg$edges))
}
## check for duplicate breakpoints in karyograph junctions
dup.junctions = detect_duplicate_breakpoints(kag.gg$junctions, tfield = tfield, verbose = verbose)
## reset ISM if there are duplicate breakpoints
if (length(dup.junctions)) {
if (verbose) {
message("Number of overlapping Tier 1 junctions: ", length(dup.junctions))
}
if (ism) {
warning("ISM set to TRUE with duplicate Tier 1 junctions, resetting to FALSE to avoid infeasibility")
ism = FALSE
}
} else {
if (verbose) {
message("No duplicate Tier 1 junctions detected")
}
}
## check for heavy nodes to fix
if (fix.thres > 0) {
if (fix.thres < 4) {
warning("Small value for fix.thres selected. Resetting to 4, the minimum recommended value")
fix.thres = 4
}
if (verbose) {
message("Checking for heavy nodes to fix")
}
penalty.dt = kag.gg$nodes$dt[, .(node.id, cn, weight)]
## compute the difference between 'optimal' CN and 'next-best' CN
penalty.dt[, penalty := weight * abs(1 - 2 * (cn - floor(cn)))]
## compute the 'best' CN - this is just CNMLE
penalty.dt[, best.cn := round(cn)]
## get node ids and CNs of nodes with penalty > lambda * fix.thres
penalty.dt[, fixed := ((penalty > fix.thres * lambda) & (best.cn >= 0))]
penalty.dt[fixed == TRUE, lb := best.cn]
penalty.dt[fixed == TRUE, ub := best.cn]
penalty.dt[, new.cn := cn]
penalty.dt[fixed == TRUE, new.cn := best.cn]
penalty.dt[, new.weight := weight]
penalty.dt[fixed == TRUE, new.weight := NA]
if (verbose) {
message("Number of fixed heavy nodes: ", penalty.dt[fixed == TRUE, .N])
}
kag.gg$nodes$mark(cn = penalty.dt$new.cn,
unfixed.cn = penalty.dt$cn,
weight = penalty.dt$new.weight,
unfixed.weight = penalty.dt$weight,
fixed = penalty.dt$fixed)
nfix = penalty.dt[fixed == TRUE, node.id]
} else {
nfix = NULL
}
if (verbose) {
message("Grabbing available memory...")
}
gc.dat = gc()
mem.mb = sum(gc.dat[, 2])
tm = (max.mem * 1e3 - mem.mb) - 1e3 ## 1 gb buffer - better is to call in balance
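## e.g. with max.mem = 16 and 2000 Mb currently in use: tm = 16000 - 2000 - 1000 = 13000 Mb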
if (verbose) {
message("Currently used: ", mem.mb, " Mb")
message("Allowed: ", max.mem * 1e3, " Mb")
}
if (tm <= 0) {
stop("Not enough memory to continue")
}
if (verbose) {
message("Treemem: ", tm, " Mb")
}
res = balance(kag.gg,
debug = TRUE,
lambda = lambda,
L0 = L0,
verbose = verbose,
tilim = tilim,
epgap = epgap,
lp = TRUE,
ism = ism,
trelim = tm, ## max.mem * 1e3,
nfix = nfix,
nodefileind = nodefileind,
use.gurobi = use.gurobi)
bal.gg = res$gg
sol = res$sol
if (return.type == "gGraph") {
return(bal.gg)
}
## check for convergence
if (require.convergence) {
if (verbose) {
message("Checking for convergence to epgap below: ", epgap)
}
if (sol$epgap > epgap) {
warning("Optimization did not converge! Reached epgap: ", sol$epgap)
}
if (sol$epgap > max.epgap.thresh) {
warning("Very high epgap, marking CN as NA")
bal.gg$nodes$mark(cn = NA)
bal.gg$edges$mark(cn = NA)
}
}
## just replace things in the outputs
## this can create weird errors if the order of kag and bal.gg isn't the same
out = copy(kag)
new.segstats = bal.gg$gr
nnodes = length(new.segstats)
new.segstats$cl = 1 ## everything same cluster
new.segstats$epgap = sol$epgap ## add epgap from genome-side opt
new.segstats$status = sol$status ## solution status to node metadata
new.segstats$obj = bal.gg$meta$obj ## objective
## weighted adjacency
adj = sparseMatrix(i = bal.gg$sedgesdt$from, j = bal.gg$sedgesdt$to,
x = bal.gg$sedgesdt$cn, dims = c(nnodes, nnodes))
## add the necessary columns
new.segstats$ecn.in = Matrix::colSums(adj, na.rm = TRUE)
new.segstats$ecn.out = Matrix::rowSums(adj, na.rm = TRUE)
new.segstats$eslack.in = new.segstats$cn - new.segstats$ecn.in
new.segstats$eslack.out = new.segstats$cn - new.segstats$ecn.out
## NA the telomeric segments?
qtips = gr.end(si2gr(seqlengths(bal.gg$nodes))) ## location of q arm tips
term.in = c(which(start(bal.gg$nodes$gr) == 1), ## beginning of chromosome
-which(bal.gg$nodes$gr %^% qtips)) ## flip side of chromosome end
term.out = -term.in ## out is reciprocal of in
telo.in = which(new.segstats$snode.id %in% term.in)
telo.out = which(new.segstats$snode.id %in% term.out)
new.segstats$eslack.in[telo.in] = NA
new.segstats$eslack.out[telo.out] = NA
out$adj = adj
## add metadata
out$segstats = new.segstats
out$status = sol$status
out$epgap = sol$epgap
out$obj = bal.gg$meta$obj
return(out)
}
#' @name detect_duplicate_breakpoints
#' @title detect_duplicate_breakpoints
#'
#' @details
#'
#' Identifies tier 1 junctions sharing breakpoints
#' These will cause MIP to be infeasible if ISM = TRUE
#'
#' @param juncs (Junction) junction object
#' @param tfield (character) tier field default 'tier'
#' @param verbose (logical) default FALSE
#'
#' @return Junction containing junctions with overlapping breakpoints
#'
#' if tfield is not in junction metadata or all are tier 2 then empty junctions are returned
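#'
#' @examples
#' \dontrun{
#' ## minimal sketch: flag tier 1 ALT junctions that share an exact breakpoint,
#' ## then disable the infinite sites constraint if any are found
#' dups = detect_duplicate_breakpoints(kag.gg$junctions, tfield = "tier", verbose = TRUE)
#' if (length(dups)) ism = FALSE
#' }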
detect_duplicate_breakpoints = function(juncs, tfield = "tier", verbose = FALSE) {
if (!tfield %in% colnames(juncs$dt)) {
warning("tfield missing from metadata")
return (jJ())
}
if (all(is.na(juncs$dt[, ..tfield]))) {
warning("all entries in tfield NA")
return (jJ())
}
if (!any(juncs$dt[, ..tfield] == 1, na.rm = TRUE)) {
if (verbose) {
message("detected no tier 1 junctions")
}
return (jJ())
}
if (all(juncs$dt$type != "ALT")) {
if (verbose) {
message("no ALT junctions detected!")
}
return (jJ())
}
cols = c(tfield, "seqnames", "start", "end", "strand", "type", "edge.id")
juncs.dt = as.data.table(stack(juncs$grl))[, ..cols]
## select just tier 1 alt edges
t1.dt = juncs.dt[which(juncs.dt[, ..tfield]==1),][type == "ALT"]
if (nrow(t1.dt)==0) {
return (jJ())
}
## add breakpoint and n.unique annotations
t1.dt[, bp := paste0(seqnames, ":", start, "-", end, strand)]
t1.dt[, n.unique := length(unique(edge.id)), by = bp]
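## e.g. two distinct edge.ids both anchored at breakpoint "1:1000-1000+" yield n.unique = 2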
## check if no conflicts
if (all(t1.dt$n.unique <= 1)) {
return (jJ())
}
## identify conflicts
conflict.dt = t1.dt[n.unique > 1,]
return(juncs[edge.id %in% conflict.dt$edge.id])
}
#' @name jbaMIP
#' @title jbaMIP
#' @rdname internal
#'
#' @details
#' primary "heavy" lifting task of JaBbA. Sets up optimization problem given an input graph and segstats input
#' and sends to CPLEX via RCplex
#'
#' combines edge-conservation constraints from karyograph (n x n adjacency matrix connecting n genomic intervals) with segment abundance data
#' (segstats - length n GRanges object with mean and sd fields corresponding to posterior means and sd's on
#' the relative "concentration" of each interval) to infer
#'
#' (1) interval and edge absolute copy numbers on the karyograph
#' (2) purity and ploidy
#' (3) slack edges (if any needed)
#'
#' basically solves ABSOLUTE problem (fitting integer grid to continuous segment intensities)
#' while enforcing edge-conservation constraints.
#'
#' Most important optional parameters include
#' (1) cn.sd (expected deviation of absolute copy number from ploidy)
#' (2) ploidy.min and ploidy.max --> useful for probing alternate solutions, but can be generously set
#' (3) adj.lb --> enforces minimal edge absolute copy number, eg to force aberrant adjacency use
#' (4) edge.slack - logical variable to determine whether or not to allow penalized relaxation of edge conservation constraints
#' (5) nsolutions - number of alternate solutions
#'
#' @param adj n x n adjacency matrix interpreted as binary (this is the $adj output of karyograph)
#' @param segstats n x 1 GRanges object with "mean" and "sd" value fields
#' @param beta numeric guess for beta (i.e. from ppgrid)
#' @param gamma numeric guess for gamma (i.e. from ppgrid)
#' @param slack.prior 1/slack.prior = penalty for each additional copy number of each slack edge. The higher slack.prior, the more slack we allow in the reconstruction; it should be intuitively calibrated to the expected "incompleteness" of the reconstruction. 1/slack.prior should be calibrated with respect to 1/(k*sd)^2 for each segment, so that we are comfortable with junction balance constraints introducing k copies of deviation from a segment's MLE copy number assignment (the assignment in the absence of junction balance constraints)
#' @param field.ncn this field takes into account normal copy number in relative to absolute conversion
#' @param adj.lb nxn matrix of lower bounds on particular copy numbers - this is used to force certain junctions into the graph
#' @param adj.nudge nxn adjacency matrix of "nudge" rewards on individual junctions; NOTE: the maximum absolute value in this matrix is used to scale the baseline edge penalty
#'
#' @return
#' output is a Rcplex solution or list of Rcplex solution with additional fields, each Rcplex solution is a list and the additional fields
#' added by jbaMIP are
#' $adj input n x n adjacency matrix populated with integer copy numbers
#' $segstats input segstats vector populated with meta data fields $cn, $ecn.in, $ecn.out, $edges.in, $edges.out, $eslack.in, $eslack.out
#' $purity purity value associated with relative-absolute affine copy number conversion for this solution
#' $ploidy ploidy value associated with relative-absolute affine copy number conversion for this solution
#'
#'
#' Additional fields for qc / technical debugging:
#' $nll.cn negative log likelihood corresponding to the CN fit in this solution
#' $nll.opt negative log likelihood corresponding to the MLE CN without junction constraints
#' $residual = value of residual between copy solution and MLE fit without junction constraints
#' $beta beta value associated with relative-absolute affine copy number conversion for this solution
#' $gamma gamma value associated with relative-absolute affine copy number conversion for this solution
#' $gap.cn total gap between MLE fit without junction constraints and JaBbA fit
#' $ploidy.constraints input ploidy constraints
#' $beta.constraints input beta constraints
#' $slack.prior input slack.prior
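#'
#' @examples
#' \dontrun{
#' ## minimal sketch, assuming kag is a karyograph list with $adj and $segstats,
#' ## and beta / gamma are purity-ploidy guesses (e.g. from ppgrid); values illustrative
#' sol = jbaMIP(adj = kag$adj, segstats = kag$segstats, beta = 1.1, gamma = 0.2,
#'              slack.prior = 0.001, tilim = 1200, verbose = TRUE)
#' }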
############################################
jbaMIP = function(adj, # binary n x n adjacency matrix ($adj output of karyograph)
segstats, # n x 1 GRanges object with "mean" and "sd" value fields
mipstart = NULL, ## sparse adjacency matrix of mipstarts (0 = NA, 0+eps + 0, k>=1 = k)
########### optional args
beta, # beta guess
gamma, # gamma guess
field.ncn = 'ncn', # will use this field to take into account normal copy number in transformation of relative to integer copy number
tilim = 20, mipemphasis = 0, epgap = 1e-4, # MIP params
ploidy.normal = NULL, ## usually inferred from ncn field but can be entered for subgraph analysis
partition = T, ## whether to partition the problem into MIP subproblems depending on the relationships of the segment standard deviation and the value of the slack.prior
cn.fix = rep(NA, length(segstats)), ## vector of NA's and (integer) values to which to "fix" copy states, only non NA's are incorporated
cn.lb = cn.fix,
cn.ub = cn.fix,
loose.ends = c(), ## integer vector specifies indices of "loose ends", slack won't be penalized at these vertices
adj.lb = 0*adj, # lower bounds for adjacency matrix
adj.ub = NULL,
adj.nudge = 0*adj, # linear objective function coefficients for edges (only which(adj!=0) components considered)
na.node.nudge = TRUE,
use.L0 = FALSE,
use.gurobi = FALSE, # otherwise will use cplex
nsolutions = 1,
verbose = F,
debug = F,
outdir = NULL,
mc.cores = 1, ## only matters if partition = T
slack.prior = 1,
tuning = FALSE, ## whether to invoke CPLEX auto parameter tuning
dyn.tuning = TRUE,
debug.ix = c(96, 21, 621, 1122, 179, 28, 363, 180, 56, 239, 333),
... # passed to optimizer
)
{
if (length(segstats) != nrow(adj))
jerror('length(segstats) != nrow(adj)')
if (is.null(adj.lb))
adj.lb = 0*adj
## save the naive solutions
segstats$kag.cn = segstats$cn
## wrapper that calls jbaMIP recursively on subgraphs after "fixing"
if (partition)
{
## transform means from data space into copy number space
m = rel2abs(segstats, gamma = gamma, beta = beta, field = 'mean', field.ncn = field.ncn)
## transform sds from data space into copy number space (only need to multiply by beta)
segstats$sd = segstats$sd * beta
cnmle = round(m) ## MLE estimate for CN
residual.min = ((m-cnmle)/(segstats$sd))^2
residual.other =
apply(cbind(
(m-cnmle-1)/segstats$sd,
(m-cnmle+1)/segstats$sd
)^2,
1, min)
## penalty for moving to closest adjacent copy state
residual.diff = residual.other - residual.min
## we fix nodes for which the penalty for moving to non (locally) optimal copy state
## is greater than k / slack.prior penalty (where k is some copy difference
## since each node has 4 loose ends
## fix = as.integer(which(residual.diff>(4/slack.prior) &
## cnmle >= 0))
fix = as.integer(which(residual.diff>(2/slack.prior) &
cnmle >= 0))
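## e.g. with slack.prior = 0.001, a node is fixed when moving to the adjacent copy state
## would cost more than 2/0.001 = 2000 in squared-residual units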
## save the fixing threshold
segstats$m = m
segstats$cnmle = cnmle
segstats$residual.min = residual.min
segstats$residual.other = residual.other
segstats$residual.diff = residual.diff
## If we have too few fixed nodes, we will have too few subgraphs to optimize,
## each bigger and harder to solve
if (verbose)
{
jmessage('Fixing ', length(fix), ' nodes that are unmovable by slack ')
}
##
## now we will create a graph of unfixed nodes and fixed node "halves"
## i.e. we split each fixed node to a node that is receiving edges
## and a node that is sending edges
##
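## e.g. a fixed node f is split into "to f" (receives edges) and "from f" (sends edges),
## cutting every path through f so its two sides may land in different subproblems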
unfix = as.numeric(setdiff(seq_along(segstats), fix))
tmp.adj = adj
if (!is.null(adj.ub)){
tmp.ix = Matrix::which(adj.ub != 0, arr.ind=TRUE)
tmp.adj[tmp.ix] = 0
}
G = graph(as.numeric(t(Matrix::which(tmp.adj != 0, arr.ind=T))), n = length(segstats), directed = T)
V(G)$name = as.numeric(V(G)) ## seq_along(V(G)) ## igraph vertex naming is a mystery
if (length(fix)>0)
G.unfix = induced.subgraph(G, unfix) + vertices(c(paste('from', fix), paste('to', fix)))
else
G.unfix = induced.subgraph(G, unfix)
if (length(fix)>0 & length(unfix)>0)
node.map = structure(c(unfix, fix, fix),
names = c(as.character(unfix),
paste('from', fix),
paste('to', fix)))
else if (length(fix)>0)
node.map = structure(c(fix, fix), names = c(paste('from', fix), paste('to', fix)))
else
node.map = structure(c(unfix), names = c(as.character(unfix)))
## add nodes representing the "receiving" and "sending" side of fixed nodes
if (length(fix)>0 & length(unfix)>0)
{
tofix = Matrix::which(adj[unfix, fix]!=0, arr.ind = T)
fromfix = Matrix::which(adj[fix, unfix]!=0, arr.ind = T)
}
else
{
tofix = c()
fromfix = c()
}
if (length(fix)>0)
fixtofix = Matrix::which(adj[fix, fix]!=0, arr.ind = T)
else
fixtofix = c()
if (length(tofix)>0)
e.tofix = edges(as.vector(rbind(unfix[tofix[,1]], paste('to', fix[tofix[,2]]))))
else
e.tofix = edges()
if (length(fromfix)>0)
e.fromfix = edges(rbind(paste('from', fix[fromfix[,1]]), unfix[fromfix[,2]]))
else
e.fromfix = edges()
if (length(fixtofix)>0)
e.fixtofix = edges(rbind(paste('from', fix[fixtofix[,1]]), paste('to', fix[fixtofix[,2]])))
else
e.fixtofix = edges()
## add edges to graph from fixed to unfixed, unfixed to fix, and fixed to fixed node sides
G.unfix = G.unfix + e.tofix + e.fromfix + e.fixtofix
## find connected components in these graphs
cl = igraph::clusters(G.unfix, 'weak')
cll = split(V(G.unfix)$name, cl$membership) ## keep augmented graph names, use node.map later
## combine components with their reverse complement components
## (only intervals that have a (fixed node free) path from their positive to their negative strand
## will be part of the same component .. all other intervals will be separated from their
## reverse complement. However, in the MIP we always optimize
## over both strands, and thus must merge components with their reverse complement
pos.ix = which( as.logical( strand(segstats)=='+') )
neg.ix = which( as.logical( strand(segstats)=='-') )
## maps segments and reverse complements
seg.map = c(seq_along(pos.ix), suppressWarnings(pos.ix[match(segstats[neg.ix], gr.flipstrand(segstats[pos.ix]))]))
cll.m = sapply(cll, function(x) paste(sort(seg.map[node.map[x]]), collapse = ' '))
dup.ix = match(cll.m, unique(cll.m))
# cll = lapply(split(seq_along(dup.ix), dup.ix), function(x) sort(unique(do.call('c', cll[x]))))
cll = lapply(split(seq_along(dup.ix), dup.ix), function(x) c(cll[[x[1]]], cll[[x[2]]]))
ord.ix = order(-sapply(cll, length))
cll = cll[ord.ix]
if (verbose)
{
jmessage('Partitioned graph into ', length(cll), ' connected components with the size of the highest 10 components being:\n',
paste(sapply(cll[1:min(10, length(cll))], length), collapse = ','), '')
}
cn.fix = ifelse(seq_along(segstats) %in% fix, cnmle, NA)
## force "non lazy" evaluation of args in order to avoid weird R ghosts (WTF) downstream in do.call
args = as.list(match.call())[-1]
args = structure(lapply(names(args), function(x) eval(parse(text = x))), names = names(args))
if (is.null(ploidy.normal))
{
if (field.ncn %in% names(values(segstats)))
{
args$ploidy.normal = as.data.table(segstats)[, sum(ncn*as.numeric(width), na.rm = TRUE)/sum(ncn*0+1*as.numeric(width), na.rm = TRUE)]
}
}
sols = parallel::mclapply(seq_along(cll), function(k, args)
{
ix = node.map[cll[[k]]] ## indices in the original graph
uix = unique(ix)
fr.ix = grepl('from', cll[[k]])
to.ix = grepl('to', cll[[k]])
## we want to make sure that fixed nodes that straddle
## two clusters will only have the "correct"
## half included in this run
fronly.ix = setdiff(ix[fr.ix], ix[to.ix])
toonly.ix = setdiff(ix[to.ix], ix[fr.ix])
## now we want to make sure that fronly.ix don't have incoming edges
tmp.adj = adj[uix, uix, drop = F]
if (length(fronly.ix)>0)
tmp.adj[, as.character(as.integer(fronly.ix))] = 0
## and toonly.ix don't have outgoing edges
if (length(toonly.ix)>0)
tmp.adj[as.character(as.integer(toonly.ix)), ] = 0
args$adj = tmp.adj
if (!is.null(mipstart))
args$mipstart = mipstart[uix, uix]
args$adj.nudge = adj.nudge[uix, uix, drop = F]
args$na.node.nudge = na.node.nudge
args$adj.lb = adj.lb[uix, uix, drop = F]
if (!is.null(adj.ub)){
args$adj.ub = adj.ub[uix, uix, drop = F] ## xt added 5/4
}
args$segstats = segstats[uix]
args$cn.fix = cn.fix[uix]
args$cn.lb = cn.lb[uix]
args$cn.ub = cn.ub[uix]
args$partition = F
args$nsolutions = 1
if (k<=6){
saveRDS(args, paste0(outdir, "/.args.", k,".rds"))
}
if (!is.null(debug.ix)){
if (k %in% debug.ix){
saveRDS(args, paste0(outdir, "/.args.", k,".rds"))
}
}
if (verbose)
jmessage('Junction balancing subgraph ', k, ' of ',
length(cll), ' which has ', length(uix), ' nodes comprising ',
round(sum(as.numeric(width(segstats[uix])))/2/1e6, 2), ' MB and ',
length(unique(seqnames((segstats[uix])))),
' chromosomes, including chrs ',
paste(names(sort(-table(as.character(seqnames((segstats[uix])))))[1:min(4,length(unique(seqnames((segstats[uix])))))]), collapse = ', '))
if (dyn.tuning){
##
## New tilim, epgap interplay
## the given tilim is tilim.short
## tilim.short = args$tilim
tilim.long = args$tilim
tilim.short = pmax(tilim.long/10, 10)
## tilim.long = 9 * tilim.short ## total time not exceeding 10 times user-defined tilim
## epgap.low = args$epgap
epgap.high = args$epgap
epgap.low = pmax(epgap.high/100, 1e-4)
## epgap.high = pmin(10 * epgap.low, 0.3) ## permissive bound for hard problems
this.args = args
this.args$tilim = tilim.short
this.args$epgap = epgap.low
this.args$tuning = FALSE
if (verbose){
jmessage("Starting initial run for subgraph ", k,
", aiming epgap at ", this.args$epgap,
", within time limit of ", this.args$tilim)
}
out = do.call('jbaMIP', this.args)
if (!is.na(out$status)){
if (out$status %in% c(101, 102)){
out$converge = 1
jmessage("Subgraph ", k, " converged quickly.")
} else {
## prolong the tilim to tilim.long
jmessage("Subgraph ", k, " needs prolonged running.")
if (out$epgap > epgap.high){
## this.args$tuning = TRUE
## ## Harder prob, try if
## this.args$tilim = tilim.long
## this.args$epgap = epgap.high
## ## this.args$segstats = out$segstats
## this.args$mipstart = out$adj
## if (verbose){
## jmessage("Starting prolonged run with tuning for subgraph ", k,
## ", aiming epgap at ", this.args$epgap,
## ", within time limit of ", this.args$tilim)
## }
## run a round of L1 mode
jmessage("Using half the time limit on L1 mode optimization")
this.args$tilim = tilim.long/2
this.args$epgap = epgap.high
this.args$mipstart = out$adj
this.args$use.L0 = FALSE
out = do.call('jbaMIP', this.args)
if (k==1){
saveRDS(out, paste0(outdir,"/.tmp.sol.1.rds"))
}
## then run a round of L0
this.args$use.L0 = TRUE
this.args$mipstart = out$adj
out = do.call('jbaMIP', this.args)
## converge value:
if (out$status %in% c(101, 102)){
jmessage("Subgraph ", k, " roughly converged after prolonged session.")
out$converge = 3
} else {
jmessage("Subgraph ", k, " VERYHARD.")
out$converge = 4
}
} else {
## this.args$tilim = tilim.long
## this.args$mipstart = out$adj
## out = do.call('jbaMIP', this.args)
out$converge = 2
jmessage("Subgraph ", k, " roughly converged.")
}
}
} else {
jmessage("Subgraph ", k, " has no data to optimize.")
}
} else {
out = do.call('jbaMIP', args)
}
if (k<=6){
saveRDS(out, paste0(outdir, "/.sol.", k,".rds"))
}
if (!is.null(debug.ix)){
if (k %in% debug.ix){
saveRDS(out, paste0(outdir, "/.sol.", k,".rds"))
}
}
gc() ## garbage collect .. not sure why this needs to be done
return(out)
}, args, mc.cores = mc.cores, mc.preschedule = FALSE)
## saveRDS(sols, "raw.sols.rds")
out = list()
## scalar fields --> length(cluster) vector
for (f in c('residual', 'nll.cn', 'nll.opt', 'gap.cn', 'slack.prior')){
out[[paste('component', f, sep = '')]] = sapply(sols, function(x) x[[f]])
}
## length 2 fields --> length(cluster) x 2 matrix
for (f in c('ploidy.constraints', 'beta.constraints')){
out[[paste('component', f, sep = '')]] = do.call('rbind', lapply(sols, function(x) x[[f]]))
}
## adjacency matrix
out$adj = 0 * adj
for (i in seq_along(sols))
{
ix1 = as.numeric(rownames(sols[[i]]$adj))
out$adj[ix1, ix1] = out$adj[ix1, ix1] + sols[[i]]$adj
}
## segstats
sol.ix = lapply(sols, function(x) as.numeric(rownames(x$adj)))
out$segstats = do.call('grbind', lapply(sols, function(x) x$segstats))[match(seq_along(segstats), unlist(sol.ix))]
## annotate segstats to keep track of "fixed" nodes
out$segstats$fixed = seq_along(out$segstats) %in% fix
out$segstats$cn.fix = cn.fix
out$segstats$cl = NA
out$segstats$id = seq_along(out$segstats)
## keep track of which clusters segments originated from
sol.ixul = munlist(sol.ix)
tmp = vaggregate(sol.ixul[,1], by = list(sol.ixul[,3]), FUN = paste, collapse = ',')
out$segstats$cl = NA
out$segstats$cl[as.numeric(names(tmp))] = tmp
out$segstats$epgap = NA
out$segstats$epgap[sol.ixul[,3]] = sapply(sols, '[[', "epgap")[sol.ixul[,1]]
out$purity = 2/(2+gamma)
v = out$segstats$cn; w = as.numeric(width(out$segstats))
out$ploidy = sum((v*w)[!is.na(v)]) / sum(w[!is.na(v)])
out$beta = beta;
out$gamma = gamma;
target.less = Matrix::rowSums(adj, na.rm = T)==0
source.less = Matrix::colSums(adj, na.rm = T)==0
out$segstats$eslack.out[!target.less] = out$segstats$cn[!target.less] - Matrix::rowSums(out$adj)[!target.less]
out$segstats$eslack.in[!source.less] = out$segstats$cn[!source.less] - Matrix::colSums(out$adj)[!source.less]
out$segstats$ecn.out = Matrix::rowSums(out$adj)
out$segstats$ecn.in = Matrix::colSums(out$adj)
out$segstats$edges.in = sapply(seq_along(out$segstats),
function(x) {ix = Matrix::which(adj[,x]!=0); paste(ix, '(', out$adj[ix,x], ')', '->', sep = '', collapse = ',')})
out$segstats$edges.out = sapply(seq_along(out$segstats),
function(x) {ix = Matrix::which(adj[x, ]!=0); paste('->', ix, '(', out$adj[x,ix], ')', sep = '', collapse = ',')})
ncn = rep(2, length(segstats))
if (!is.null(field.ncn))
if (field.ncn %in% names(values(segstats)))
ncn = values(segstats)[, field.ncn]
nnix = !is.na(out$segstats$mean) & !is.na(out$segstats$sd) & !is.na(out$segstats$cn)
## new obj allowing variable normal copy number
out$obj = 1/4*sum(((out$segstats$cn[nnix] + ncn[nnix]/2*out$gamma - out$beta*out$segstats$mean[nnix])/out$segstats$sd[nnix])^2) +
1/slack.prior * (sum(out$segstats$eslack.in + out$segstats$eslack.out, na.rm = T)) ## 1/4 because our original objective is 1/2 for pos strand intervals only
out$nll.cn = (1/2*sum(((out$segstats$cn[nnix] + out$gamma - out$beta*out$segstats$mean[nnix])/out$segstats$sd[nnix])^2))
out$nll.opt = (1/2*sum(((cnmle[nnix] + out$gamma - out$beta*out$segstats$mean[nnix])/out$segstats$sd[nnix])^2))
out$gap.cn = as.numeric(1 - out$nll.opt / out$nll.cn)
out$sols = sols
return(out)
}
## take into account (variable) normal cn
segstats$ncn = rep(2, length(segstats))
if (!is.null(field.ncn))
if (field.ncn %in% names(values(segstats)))
segstats$ncn = values(segstats)[, field.ncn]
sid = .sid(segstats)
names(segstats) = sid
edges = Matrix::which(adj!=0, arr.ind = T)
if (nrow(edges)>0)
{
rownames(edges) = .esid(edges, sid)
}
##
## Setting up MIP variables (tracked in varmeta data.table)
##
## varmeta will keep track of all variables
## (i.e. interval, edge, source.slack, target.slack, residual, source.slack.indicator, target.slack.indicator)
## id = actual column index in the final matrix
## pid = integer specifying parent of variable, which is either the row of the edges matrix (for edges) or the index of the parent interval in segstats (for everything else)
## (all the code below assumes that pid in varmeta are in order from 1 to .N and non missing
## for each variable of a given type e.g. varmeta[type == 'residual', identical(pid, 1:.N)])
## psid = parent signed id, so that both strands of the same edges / segstats parent,
## have the same abs(psid), as.character(psid) also indexes names of the respective edges / segstats object
if (length(fix.ix <- which(!is.na(cn.fix)))>0){
cn.lb[fix.ix] = cn.fix[fix.ix]
cn.ub[fix.ix] = cn.fix[fix.ix]
}
varmeta = .varmeta(segstats,
edges,
adj.lb = adj.lb,
adj.ub = adj.ub,
cn.lb = cn.lb,
cn.ub = cn.ub,
gamma = gamma,
beta = beta,
use.L0 = use.L0)
##
## Set up MIP constraints (tracked in consmeta)
## each constraint has a unique label and we track its sense
## and right hand side, and store its formula
constraints = .constraints(varmeta,
segstats,
edges,
ploidy.normal = ploidy.normal,
use.L0 = use.L0)
if (is.null(constraints))
{
## if constraints are NULL, then
## there are no segments with non NA mean, so we return NA solution
sol = list();
sol$residual = NA;
sol$beta = beta;
sol$gamma = gamma;
sol$purity = NA;
sol$ploidy = NA;
sol$adj = adj*NA;
sol$nll.cn = NA;
sol$nll.opt = NA;
sol$gap.cn = NA;
sol$segstats = segstats[, c('mean', 'sd')];
sol$segstats$cn = NA;
sol$segstats$ecn.in = NA;
sol$segstats$ecn.out = NA;
segstats$ncn = NA;
sol$segstats$edges.out = sol$segstats$edges.in = rep('', length(segstats));
sol$segstats$eslack.in = NA;
sol$segstats$eslack.out = NA;
sol$slack.prior = slack.prior;
sol$status = NA;
sol$epgap = NA;
sol$converge = NA
return(sol)
}
## pull constraints data.table and Amat constraints matrix
consmeta = constraints$consmeta
Amat = constraints$Amat
##
## set up objective function
##
## quadratic portion of objective function
## a.k.a. "noise penalty"
Qobj = Zero = sparseMatrix(1, 1, x = 0, dims = c(nrow(varmeta), nrow(varmeta)))
s.ix = varmeta[type == 'residual' & dup == FALSE, id]
noisep = (1/segstats[varmeta[s.ix, pid]]$sd)^2
noisep = ifelse(is.infinite(noisep), NA, noisep)
## remove any infinite noise penalty, eg if sd = 0
noisep = ifelse(is.na(noisep), 0, noisep) ## set all NA noise penalty segments to 0
Qobj[cbind(s.ix, s.ix)] = noisep
## linear portion of objective function
## a.k.a. "slack penalty"
cvec = Zero[,1]
if (use.L0)
{
if (verbose)
{
jmessage('Applying L0 slack penalty')
}
slack.ix = varmeta[type %in% c('source.slack.indicator', 'target.slack.indicator') & !dup, id]
cvec[slack.ix] = 1/slack.prior
} else {
if (verbose)
{
jmessage('Applying L1 slack penalty')
}
slack.ix = varmeta[type %in% c('source.slack', 'target.slack') & !dup, id]
cvec[slack.ix] = 1/slack.prior
}
## let any specified "loose ends" have unpenalized slack
if (length(loose.ends)>0)
{
es.s.ix = varmeta[type == 'source.slack', id] ## slack variable ids in pid (segstats) order
es.t.ix = varmeta[type == 'target.slack', id]
cvec[c(es.s.ix[loose.ends], es.t.ix[loose.ends])] = 0
}
if (verbose>1)
{
jmessage(sprintf('Total mass on cn portion of objective function: %s. Total mass on edge slack: %s',
sum(Qobj[cbind(s.ix, s.ix)]),
sum(cvec[slack.ix])))
}
if (nrow(edges)>0)
{
## Future TODO: weigh the edges in objective functions
## what is the conversion from supporting reads to copy number space?
en = max(abs(adj.nudge), na.rm=T)
if (!is.na(en)){
if (abs(en)>0){
e.penalty = abs(en * 1.1)
} else {
e.penalty = 0.01
}
} else {
e.penalty = 0.01
}
e.ix = varmeta[type == 'edge', id]
e.pix = varmeta[type == 'edge', pid]
cvec[e.ix] = e.penalty-adj.nudge[edges[e.pix, , drop = FALSE]] ### reward each edge use in proportion to position in edge nudge
}
if (!is.null(mipstart))
{
varmeta$mipstart = .mipstart(mipstart, segstats, edges, varmeta, consmeta, Amat, beta, gamma, use.L0)
}
## cap astronomical Qobj values so that CPLEX / gurobi does not freak out about large near-infinite numbers
## astronomical = more than CPLEX.INFIN (1e9) times the lowest nonzero diagonal value
qr = range(setdiff(Matrix::diag(Qobj), 0))
CPLEX.INFIN = 1e9
Qobj[cbind(1:nrow(Qobj), 1:nrow(Qobj))] =
pmin(CPLEX.INFIN*qr[1], Qobj[cbind(1:nrow(Qobj), 1:nrow(Qobj))])
## run MIP
if (use.gurobi) # translate into gurobi
{
if (verbose)
{
jmessage('Running gurobi!')
}
model = list()
model$A = Amat
model$rhs = consmeta$b;
model$sense = c('E'='=', 'G'='>=', 'L'='<=')[consmeta$sense]
model$Q = Qobj;
model$obj = cvec;
model$lb = varmeta$lb;
model$ub = varmeta$ub;
model$vtype = varmeta$vtype;
model$modelsense = 'min';
if (!is.null(varmeta$mipstart))
model$start = varmeta$mipstart
else
{
model$start = rep(NA, nrow(varmeta));
mu_hat = as.vector(round(((segstats$mean-gamma)/beta)));
model$start[varmeta[type == 'interval', id]] = mu_hat;
model$start[varmeta[type == 'residual', id]] = segstats$mean-(mu_hat*beta+gamma);
model$start[varmeta[label == 'gamma', id]] = varmeta[label == 'gamma', lb]
model$start[varmeta[label == 'beta', id]] = varmeta[label == 'beta', lb]
model$start[is.infinite(model$start)] = NA;
}
sol = gurobi::gurobi(model, params = c(list(TimeLimit=tilim), list(...)));
sol$xopt = sol$x;
}
else
{
control = c(list(...), list(trace = ifelse(verbose>=2, 1, 0), tilim = tilim, epgap = epgap, mipemphasis = mipemphasis))
if (!is.null(mipstart)) ## apply mipstart if provided
control$mipstart = varmeta$mipstart
if (verbose)
jmessage('Running CPLEX with relative optimality gap threshold ', epgap)
if (tuning){
tuning.control = control
## add a few more controls to the parameter tuning function
tuning.control$tuning.display = 2L
tuning.control$tuning.rep = 3L
tuning.control$tuning.tilim = 30
sol = Rcplex2(cvec = cvec,
Amat = Amat,
bvec = consmeta$b,
sense = consmeta$sense,
Qmat = Qobj,
lb = varmeta$lb,
ub = varmeta$ub,
n = nsolutions,
objsense = "min",
vtype = varmeta$vtype,
control = tuning.control,
tuning = TRUE)
} else {
sol = Rcplex2(cvec = cvec,
Amat = Amat,
bvec = consmeta$b,
sense = consmeta$sense,
Qmat = Qobj,
lb = varmeta$lb,
ub = varmeta$ub,
n = nsolutions,
objsense = "min",
vtype = varmeta$vtype,
control = control,
tuning = FALSE)
}
}
if (is.null(sol$xopt))
sol.l = sol
else
sol.l = list(sol);
adj = as(adj, 'sparseMatrix');
mu = segstats$mean
sd = segstats$sd
segstats = segstats[, c()]
segstats$mean = mu
segstats$sd = sd
sol.l = lapply(sol.l, function(sol)
{
v.ix = varmeta[type == 'interval', id]
e.ix = varmeta[type == 'edge', id]
s.ix = varmeta[type == 'residual', id]
es.t.ix = varmeta[type == 'target.slack', id]
es.s.ix = varmeta[type == 'source.slack', id]
beta.ix = varmeta[label == 'beta', id]
gamma.ix = varmeta[label == 'gamma', id]
vcn = round(sol$xopt[v.ix])
ecn = round(sol$xopt[e.ix])
sol$residual = round(sol$xopt[s.ix])
sol$beta = sol$xopt[beta.ix]
sol$gamma = sol$xopt[gamma.ix]
sol$purity = 2/(2+sol$gamma)
sol$ploidy = (vcn%*%width(segstats))/sum(as.numeric(width(segstats)))
sol$adj = adj*0;
sol$nll.cn = ((sol$xopt[s.ix]%*%Qobj[s.ix, s.ix])%*%sol$xopt[s.ix])[1,1]
if (sum(!is.na(segstats$mean))>0)
sol$nll.opt = pp.nll(segstats[!is.na(segstats$mean)], gamma = sol$gamma, beta = sol$beta, field = 'mean', field.ncn = field.ncn)$NLL
else
sol$nll.opt = NA
## supposed to measure how far the optimum is from the naive MLE fit
sol$gap.cn = as.numeric(1 - sol$nll.opt / sol$nll.cn)
sol$adj[edges] = ecn;
sol$segstats = segstats
sol$segstats$cn = round(vcn)
sol$segstats$ecn.in = round(Matrix::colSums(sol$adj))
sol$segstats$ecn.out = round(Matrix::rowSums(sol$adj))
sol$segstats$edges.in = sapply(seq_along(sol$segstats),
function(x) {ix = Matrix::which(adj[,x]!=0); paste(ix, '(', sol$adj[ix,x], ')', '->', sep = '', collapse = ',')})
sol$segstats$edges.out = sapply(seq_along(sol$segstats),
function(x) {ix = Matrix::which(adj[x, ]!=0); paste('->', ix, '(', sol$adj[x,ix], ')', sep = '', collapse = ',')})
sol$segstats$eslack.in = round(sol$xopt[es.t.ix])
sol$segstats$eslack.out = round(sol$xopt[es.s.ix])
sol$eslack.in = round(sol$xopt[es.t.ix])
sol$eslack.out = round(sol$xopt[es.s.ix])
sol$slack.prior = slack.prior
return(sol)
});
sol.l = sol.l[order(sapply(sol.l, function(x) x$obj))]
if (length(sol.l)==1)
sol.l = sol.l[[1]]
return(sol.l)
}
#' @name .sid
#' @title labels segstats with sid in jbaMIP
#' @description
#'
#' Assigns stranded id "sid"
#'
#' @rdname internal
#' @noRd
#'
.sid = function(segstats)
{
## Book-keep vertices and their reverse complements
##
## map intervals to their reverse complement to couple their copy number (and edge variables)
pos.ix = which( as.logical( strand(segstats)=='+') )
neg.ix = which( as.logical( strand(segstats)=='-') )
## "original vertices"
og.ix = pos.ix
## map flipping positive to negative vertices
rev.ix = match(segstats, gr.flipstrand(segstats))
## use rev.ix to label all reverse complement pairs
rcpairs = igraph::clusters(graph.edgelist(cbind(seq_along(rev.ix), rev.ix)), 'weak')$membership
sid = ifelse(duplicated(rcpairs), -1, 1)*rcpairs
if (any(is.na(rev.ix)))
jerror('Input genome graph malformed, some nodes missing their exact reverse complement')
## "duplicates" of og.ix i.e. revcomp vertices
dup.ix = suppressWarnings(neg.ix[match(segstats[og.ix], gr.flipstrand(segstats[neg.ix]))])
if (!identical(segstats$mean[og.ix], segstats$mean[dup.ix]) | !identical(segstats$sd[og.ix], segstats$sd[dup.ix]))
jerror('Segstats mean or sd not identical for all pos / neg strand interval pairs: check segstats computation')
return(sid)
}
#' @name .esid
#' @title labels edges with esid in jbaMIP
#' @description
#'
#' Assigns stranded edge id "esid"
#'
#' @rdname internal
#' @noRd
#'
.esid = function(edges, sid)
{
edges.dt = as.data.table(edges)
edges.dt[, es := paste(sid[row], sid[col])]
edges.dt[, res := paste(-sid[col], -sid[row])]
erev.ix = match(edges.dt$es, edges.dt$res)
if (any(is.na(erev.ix)))
jerror('Input genome graph malformed, some edges missing their exact reverse complement')
rcepairs = igraph::clusters(graph.edgelist(cbind(seq_along(erev.ix), erev.ix)), 'weak')$membership
edges.dt[, esid := ifelse(duplicated(rcepairs), -1, 1)*rcepairs]
return(edges.dt$esid)
}
#' @name .constraints
#' @title generates constraints matrix Amat and constraints tracker data.table consmeta
#' @description
#'
#' @rdname internal
#'
#' @param varmeta data.table output of .varmeta tracking all variables
#' @param segstats GRanges named with "sid" (ie output of sid
#' @param edges m x 2 matrix of edges
#' @param ploidy.normal numeric normal (non-tumor) ploidy estimate (default 2)
#' @param use.L0 logical flag whether to set up L0 constraints
#' @noRd
#'
.constraints = function(varmeta, segstats, edges, ploidy.normal, use.L0)
{
consmeta = data.table() ## store meta data about constraints to keep track
Zero = sparseMatrix(1, 1, x = 0, dims = c(nrow(varmeta), nrow(varmeta)))
pid.nna = which(!is.na(segstats$mean) & !is.na(segstats$sd))
mu.all = NA
ncn = segstats$ncn
if (length(pid.nna)==0)
return(NULL)
## query varmeta for relevant indices
v.ix = varmeta[type == 'interval', id]
gamma.ix = varmeta[label == 'gamma', id]
beta.ix = varmeta[label == 'beta', id]
v.ix.c = varmeta[type == 'interval',][pid.nna, ][psid>0, id]
s.ix = varmeta[type == "residual", ][varmeta[v.ix.c, ]$pid, id]
if (is.null(ploidy.normal)){
ploidy.normal =
sum(width(segstats)[v.ix.c]*ncn[v.ix.c]) /
sum(as.numeric(width(segstats))[v.ix.c])
}
## weighted mean across vertices contributing to mean
mu.all = (width(segstats)[v.ix.c] %*% segstats$mean[v.ix.c]) /
sum(as.numeric(width(segstats)[v.ix.c]))
## set copy number constraints
Acn = Zero[rep(1, length(v.ix.c)+1), ]
Acn[cbind(seq_along(v.ix.c), v.ix.c)] = 1;
Acn[cbind(seq_along(v.ix.c), s.ix)] = 1
## taking into account (normal) variable cn
Acn[cbind(seq_along(v.ix.c), gamma.ix)] = ncn[v.ix.c]/2
Acn[cbind(seq_along(v.ix.c), beta.ix)] = -segstats$mean[v.ix.c]
## ## final "conservation" constraint
## Acn[length(v.ix.c)+1, v.ix] = width(segstats)/sum(as.numeric(width(segstats)));
## ## Acn[length(v.ix.c)+1, s.ix[length(s.ix)]] = 1
## ## Acn[length(v.ix.c)+1, gamma.ix] = 1; ## replacing with below
## Acn[length(v.ix.c)+1, gamma.ix] = ploidy.normal/2; ## taking into account (normal) variable cn
## Acn[length(v.ix.c)+1, beta.ix] = -mu.all;
bcn = rep(0, nrow(Acn)) ##
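## each Copy row thus encodes cn_i + r_i + (ncn_i/2)*gamma - mean_i*beta = 0, i.e. the
## residual r_i absorbs the gap between the integer cn and the affine-transformed mean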
consmeta = rbind(consmeta,
data.table(type = 'Copy',
label = paste('Copy', 1:nrow(Acn)),
sense = 'E',
b = bcn,
stringsAsFactors = F))
## dup constraints on vertices
## constrain every vertex to get the same copy number as its reverse complement
og.ix = varmeta[type == 'interval' & dup == FALSE, id]
dup.ix = varmeta[type == 'interval', ][match(-varmeta$psid[og.ix], psid), id]
Dcn = Zero[rep(1, length(dup.ix)),, drop = F];
Dcn[cbind(1:nrow(Dcn), dup.ix)] = 1
Dcn[cbind(1:nrow(Dcn), og.ix)] = -1
dcn = rep(0, nrow(Dcn))
sensedcn = rep("E", nrow(Dcn))
consmeta = rbind(consmeta,
data.table(type = 'Dup',
label = paste('Dup', 1:nrow(Dcn)),
sense = 'E',
b = dcn,
stringsAsFactors = F))
## dup constraints on (reverse complement) edge.slack
## (these make sure that reverse complement edge.slacks are
## given the same solution as their reverse complement)
sslack.ix = varmeta[type %in% c('source.slack') & psid>0, id]
sslack.dup.ix = varmeta[type %in% c('target.slack') & psid<0, ][match(-varmeta$psid[sslack.ix], psid), id]
tslack.ix = varmeta[type %in% c('target.slack') & psid>0, id]
tslack.dup.ix = varmeta[type %in% c('source.slack') & psid<0, ][match(-varmeta$psid[tslack.ix], psid), id]
slack.ix = c(sslack.ix, tslack.ix)
dup.slack.ix = c(sslack.dup.ix, tslack.dup.ix)
Ecn = Zero[rep(1, length(slack.ix)),, drop = F];
Ecn[cbind(1:nrow(Ecn), slack.ix)] = -1
Ecn[cbind(1:nrow(Ecn), dup.slack.ix)] = 1
ecn = rep(0, nrow(Ecn))
Dcn = rbind(Dcn, Ecn)
dcn = c(dcn, ecn);
consmeta = rbind(consmeta,
data.table(type = 'EdgeSlack',
label = paste('EdgeSlack', 1:nrow(Ecn)),
sense = 'E',
b = ecn,
stringsAsFactors = F))
Acn = rbind(Acn, Dcn)
Aineq = NULL
bineq = NULL
if (nrow(edges)>0)
{
## add edge consistency criteria
## for every node that is source of an edge
## ensure that sum of weights on outgoing edges
## = node weight
## do the same for nodes that are targets of edges
## gather up ids of edges sources and sinks
e.ix = varmeta[type == "edge", id]
e.pix = varmeta[type == "edge", pid]
so.ix = varmeta[type == 'interval', ][edges[e.pix,1], id]
ta.ix = varmeta[type == 'interval', ][edges[e.pix,2], id]
## unique sources and sink ids of edges
uso.ix = unique(so.ix)
uta.ix = unique(ta.ix)
## encode these in Bs and Bt where every row
## is a junction balance constraint
Bs = Zero[rep(1, length(uso.ix)), , drop = F]
Bt = Zero[rep(1, length(uta.ix)), , drop = F]
Bs[cbind(1:nrow(Bs), uso.ix)] = 1
Bt[cbind(1:nrow(Bt), uta.ix)] = 1
Bs[cbind(match(so.ix, uso.ix), e.ix)] = -1
Bt[cbind(match(ta.ix, uta.ix), e.ix)] = -1
## add slack edges for loose ends
uso.pid = varmeta[uso.ix, pid]
uta.pid = varmeta[uta.ix, pid]
uso.lix = varmeta[type == 'source.slack', ][uso.pid, id]
uta.lix = varmeta[type == 'target.slack', ][uta.pid, id]
Bs[cbind(1:nrow(Bs), uso.lix)] = -1
Bt[cbind(1:nrow(Bt), uta.lix)] = -1
B = rbind(Bs, Bt)
## edge duplicate constraints!
tmp.e = varmeta[type=="edge"]
## exact fold-back junctions don't have duplicate
fb.ix = tmp.e[, which(is.na(match(psid, -psid)))]
good.e.ix = setdiff(seq_len(nrow(tmp.e)), fb.ix)
if (length(good.e.ix)==0){
consmeta =
rbind(consmeta,
data.table(type = 'EdgeSource',
label = paste('EdgeSource', 1:nrow(Bs)),
sense = 'E',
b = 0),
data.table(type = 'EdgeTarget',
label = paste('EdgeTarget', 1:nrow(Bt)),
sense = 'E',
b = 0))
Aed = B
} else {
Aedup = Zero[
seq_len(tmp.e[good.e.ix, sum(dup)]),
, drop=FALSE]
reid = tmp.e[, match(psid, -psid)]
tmp.e[, rid := tmp.e[reid, id]]
ijs = tmp.e[good.e.ix][dup==FALSE, .(j1 = id, j2 = rid)][, i := 1:nrow(Aedup)]
Aedup[ijs[, cbind(i, j1)]] = 1
Aedup[ijs[, cbind(i, j2)]] = -1
consmeta =
rbind(consmeta,
data.table(type = 'EdgeSource',
label = paste('EdgeSource', 1:nrow(Bs)),
sense = 'E',
b = 0),
data.table(type = 'EdgeTarget',
label = paste('EdgeTarget', 1:nrow(Bt)),
sense = 'E',
b = 0),
data.table(type = 'EdgeDup',
label = paste('EdgeDup', 1:nrow(Aedup)),
sense = 'E',
b = 0))
## populate linear constraints
Aed = rbind(B, Aedup);
}
Amat = rbind(Acn, Aed, Aineq);
}
else
{
Amat = rbind(Acn, Aineq);
}
if (use.L0)
{
## set up indicator constraints
M = min(c(max(varmeta[type == 'interval', ub], na.rm = TRUE)+1, 1e10))
if (M>=1e10){
jwarning('Using extremely high copy number upper bounds is not recommended for this model')
}
## only make indicator constraints for non dup source and target
## slack variables
es.s.nz.ix = varmeta[type == 'source.slack.indicator', id]
es.t.nz.ix = varmeta[type == 'target.slack.indicator', id]
es.s.nz.pid = varmeta[type == 'source.slack.indicator', pid]
es.t.nz.pid = varmeta[type == 'target.slack.indicator', pid]
es.s.ix = varmeta[type == 'source.slack', ][es.s.nz.pid, id]
es.t.ix = varmeta[type == 'target.slack', ][es.t.nz.pid, id]
nz.len = length(es.s.nz.ix) + length(es.t.nz.ix)
vtype = varmeta[, vtype]
ub = varmeta[, ub]
lb = varmeta[, lb]
## add constraints to make them indicators of loose ends
## constraint 1: loose - 0.1 * loose.bool > 0
## constraint 2: loose - cn.ub * loose.bool < 0
new.i = rep(seq_len(nz.len * 2), 2)
new.j = c(rep(c(es.s.nz.ix, es.t.nz.ix), 2),
rep(c(es.s.ix, es.t.ix), 2))
new.x = c(rep(-0.1, nz.len),
rep(-M, nz.len),
rep(1, (length(es.s.ix) + length(es.t.ix)) * 2))
Amat.boolean = sparseMatrix(i = new.i,
j = new.j,
x = new.x,
dims = c(nz.len * 2, nrow(varmeta)))
b.boolean = rep(0, nz.len * 2)
sense.boolean = rep(c("G", "L"), each = nz.len)
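## e.g. a loose end with slack 3 forces its indicator b to 1 (3 - M*b <= 0 requires b >= 3/M
## with b binary), while slack 0 forces b to 0 (0 - 0.1*b >= 0); so b = 1 exactly when slack > 0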
consmeta = rbind(consmeta,
data.table(type = "SlackBoolean",
label = paste0('SlackBoolean', seq_len(nz.len * 2)),
sense = sense.boolean,
b = b.boolean))
Amat = rbind(Amat, Amat.boolean)
}
colnames(Amat) = varmeta$label
## consmeta$formula = paste(arrstring(Amat), '=', consmeta$b)
return(list(consmeta = consmeta, Amat = Amat))
}
#' @name .mipstart
#' @title generates mipstart solution from initial interval by interval mipstart matrix
#' @description
#'
#' @rdname internal
#' @param mipstart n x n matrix of edge solutions
#' @param segstats GRanges named with "sid" (ie output of sid
#' @param edges m x 2 matrix of edges
#' @param varmeta data.table output of .varmeta tracking all variables
#' @param consmeta data.table $consmeta output of .constraints tracking all constraints
#' @param Amat constraint matrix
#' @param beta scalar beta setting
#' @param gamma scalar gamma setting
#' @param use.L0 logical flag whether to set up L0 constraints
#' @noRd
#'
.mipstart = function(mipstart, segstats, edges, varmeta, consmeta, Amat, beta, gamma, use.L0)
{
## mips.dt = data.table(edges)
mips.dt = as.data.table(Matrix::which(mipstart>0, arr.ind = TRUE))
setnames(mips.dt, c("row", "col"))
mips.dt[, cn := mipstart[cbind(row, col)]]
setkeyv(mips.dt, c("row", "col"))
## convert everything to data.tables
consmeta[, id := 1:.N]
setkey(consmeta, "id")
varmeta[, id := 1:.N]
varmeta[type=="interval", mipstart := segstats$cn]
setkey(varmeta, "id")
## NODES
## recompute
## first mipstart the node copy number n_hat by rounding
cno = mips.dt[, list(cn = sum(cn, na.rm = TRUE)), keyby = 'row']
cni = mips.dt[, list(cn = sum(cn, na.rm = TRUE)), keyby = 'col']
varmeta[type == 'interval', in.e.lb := cni[list(pid), cn]]
varmeta[type == 'interval', out.e.lb := cno[list(pid), cn]]
## dial down the edge copy number when there can't be enough node
dial.down = varmeta[out.e.lb>ub | in.e.lb>ub, pid]
dial.down.rc = varmeta[type=="interval"][psid %in% varmeta[type=="interval"][pid %in% dial.down, -psid], pid]
## dial.down.to = setkey(varmeta[dial.down, .(pid, ub)], "pid")
## mips.dt[row %in% dial.down, cn := dial.down.to[.(row), ub]]
## mips.dt[col %in% dial.down, cn := dial.down.to[.(col), ub]]
mips.dt[row %in% dial.down, cn := 0]
mips.dt[col %in% dial.down.rc, cn := 0]
mips.dt[col %in% dial.down, cn := 0]
mips.dt[row %in% dial.down.rc, cn := 0]
## recompute
cno = mips.dt[, list(cn = sum(cn, na.rm = TRUE)), keyby = 'row']
cni = mips.dt[, list(cn = sum(cn, na.rm = TRUE)), keyby = 'col']
varmeta[type == 'interval', in.e.lb := cni[list(pid), cn]]
varmeta[type == 'interval', out.e.lb := cno[list(pid), cn]]
varmeta[type == "interval", mipstart := pmin(ub, pmax(mipstart, lb, out.e.lb, in.e.lb, na.rm = TRUE), na.rm=TRUE)]
varmeta[type == 'interval' & is.na(mipstart), mipstart := pmax(ceiling(segstats$cn[pid]), 0)]
nna.ix = which(!is.na(segstats$cn))
pl = sum(segstats$cn[nna.ix] * width(segstats[nna.ix])/1e6)/sum(width(segstats)[nna.ix]/1e6)
varmeta[type == 'interval' & is.na(mipstart), mipstart := ceiling(pl)]
## final cleanup
varmeta[type == 'interval', mipstart := pmin(ub, pmax(mipstart, lb, out.e.lb, in.e.lb, na.rm = TRUE), na.rm=TRUE)]
## check
node.meta = setkey(varmeta[type=="interval"], "pid")
mips.dt[, ":="(row.mips = node.meta[.(row), mipstart],
col.mips = node.meta[.(col), mipstart])]
## EDGES
varmeta[
type == 'edge',
mipstart := mips.dt[
list(as.data.table(edges[pid,,drop = FALSE])), cn
]]
varmeta[type == 'edge' & is.na(mipstart), mipstart := 0]
varmeta[, mipstart := pmax(pmin(mipstart, ub), lb)]
## SLACKS
## which is just the difference between the copy number at
## c_i and the incoming / outgoing edges
e_hat = varmeta[type == 'edge', mipstart]
n_hat = varmeta[type == 'interval', mipstart]
## Bs stores the constraints c_i - slack_s_i - sum_{j in Es(i)} e_j = 0 for all i in six
Bs = Amat[consmeta[type == 'EdgeSource', id],]
Bs.interval = Bs[, varmeta[type == "interval", id]]
Bs.interval.ij = data.table(Matrix::which(Bs.interval != 0, arr.ind=T))
six = Bs.interval.ij[, col, by=row][order(row), col]
s_slack_hat = rep(0, length(n_hat))
if (length(varmeta[type == "edge", id])>0)
s_slack_hat[six] = Bs[, varmeta[type == "edge", id]] %*% e_hat + n_hat[six]
s_slack_hat[is.na(s_slack_hat)] = 0
varmeta[type == 'source.slack', mipstart := s_slack_hat]
## Bt stores the constraints c_i - slack_t_i - sum_{j in Et(i)} e_j = 0 for all i in tix
Bt = Amat[consmeta[type == 'EdgeTarget', id],]
Bt.interval = Bt[, varmeta[type == "interval", id]]
Bt.interval.ij = data.table(Matrix::which(Bt.interval != 0, arr.ind=T))
tix = Bt.interval.ij[, col, by=row][order(row), col]
t_slack_hat = rep(0, length(n_hat))
if (length(varmeta[type == "edge", id])>0)
t_slack_hat[tix] = Bt[, varmeta[type == "edge", id]] %*% e_hat + n_hat[tix]
t_slack_hat[is.na(t_slack_hat)] = 0
varmeta[type == 'target.slack', mipstart := t_slack_hat]
## negative slacks can happen when the number of incoming and outgoing
## edges exceed the number of nodes ..
## this will only happen when lb is provided to certain edges
## in this case, we adjust the mipstart by adding copies to the parent nodes
## (and their slacks) associated with the negative slacks,
## so that everything becomes non-negative
bad.slacks = varmeta[type %in% c('source.slack', 'target.slack') & mipstart<0, pid]
slacks.to.adjust = varmeta[type %in% c('source.slack', 'target.slack'), ][pid %in% bad.slacks, ]
if (nrow(slacks.to.adjust)>0)
{
cn.to.adjust = slacks.to.adjust[, min(mipstart), keyby = pid]
nodes.to.adjust = varmeta[type == 'interval', ][slacks.to.adjust$pid, ]
nodes.to.adjust$adjust = -cn.to.adjust[list(nodes.to.adjust$pid), V1]
slacks.to.adjust$adjust = -cn.to.adjust[list(slacks.to.adjust$pid), V1]
varmeta[nodes.to.adjust$id, ]$mipstart = nodes.to.adjust$mipstart + nodes.to.adjust$adjust
varmeta[slacks.to.adjust$id, ]$mipstart = slacks.to.adjust$mipstart + slacks.to.adjust$adjust
}
varmeta[label == 'beta', mipstart := beta]
varmeta[label == 'gamma', mipstart := gamma]
## compute residual as difference between rounded and "mean" value
n_hat = varmeta[type == 'interval', mipstart]
mu_hat = ifelse(!is.na(segstats$mean), segstats$mean*beta-segstats$ncn/2*gamma, 0)
eps_hat = mu_hat - n_hat
varmeta[type == 'residual', mipstart := eps_hat]
if (use.L0)
{
ssid = varmeta[type == 'source.slack.indicator', pid]
varmeta[type == 'source.slack.indicator', ]$mipstart = sign(varmeta[type == 'source.slack', ][ssid, mipstart]>0)
tsid = varmeta[type == 'target.slack.indicator', pid]
varmeta[type == 'target.slack.indicator', ]$mipstart = sign(varmeta[type == 'target.slack', ][tsid, mipstart]>0)
}
## final check on variable upper bound
varmeta[, mipstart := pmin(mipstart, ub)]
varmeta[, mipstart := pmax(mipstart, lb)]
## sanity check
b.mipstart = Amat %*% varmeta[, mipstart]
consmeta[, mipstart := b.mipstart[, 1, drop=TRUE]]
return(varmeta$mipstart)
}
#' @name .varmeta
#' @title makes varmeta in jbaMIP
#' @description
#'
#'
#' assumes that segstats and rows of edges are named with sid
#'
#' varmeta will keep track of all variables
#' (i.e. interval, edge, source.slack, target.slack, residual, source.slack.indicator, target.slack.indicator)
#' id = actual column index in the final matrix
#' pid = integer for edges the row in the edges matrix, for everything else the index of the parent interval in segstats
#' psid = parent signed id, so that both strands of the same edges / segstats parent, have the same abs(psid), as.character(psid) also indexes names of the respective edges / segstats object
#' @rdname internal
#' @param segstats GRanges named with "sid" (ie output of sid
#' @param edges Matrix with columns named row and col, rows labeled with esid
#' @noRd
#'
.varmeta = function(segstats,
edges,
adj.lb,
adj.ub,
cn.lb,
cn.ub,
gamma,
beta,
use.L0)
{
##
## Setting up MIP variables (tracked in varmeta data.table)
##
## TODO: make gGraph conform to this
sid = as.numeric(names(segstats)) ## assume that segstats has signed integer sid
varmeta = data.table()
v.ix = nrow(varmeta) + seq_along(segstats)
## adding intervals
varmeta = data.table(
id = v.ix,
pid = seq_along(sid),
psid = sid,
label = paste0('interval', ifelse(sid>0, sid, paste0(-sid, 'r'))),
type = 'interval',
dup = sid<0,
vtype = 'I',
lb = pmax(0, cn.lb, na.rm = TRUE),
ub = pmin(cn.ub, Inf, na.rm = TRUE),
stringsAsFactors = F)
## adding residuals: interval cn - cnmle
s.ix = nrow(varmeta) + v.ix
varmeta = rbind(varmeta,
data.table(
id = s.ix,
pid = seq_along(sid),
psid = sid,
label = paste0('residual', ifelse(sid>0, sid, paste0(-sid, 'r'))),
dup = sid<0,
vtype = 'C',
lb = -Inf,
ub = Inf,
type = 'residual',
stringsAsFactors = F))
if (nrow(edges)>0)
{
if (any(tmpix <- adj.ub<0)){ ## tmp fix for negative adj.ub if any exist
adj.ub[tmpix] = 0.1 ## ie make them >0 <1 effectively constraining to 0
}
MAX.EUB = ifelse(max(cn.ub[!is.na(cn.ub)])>0, max(cn.ub[!is.na(cn.ub)]), Inf)
if (is.null(adj.ub)){
eub = rep(MAX.EUB, nrow(edges))
} else {
eub = ifelse(adj.ub[edges]==0, MAX.EUB, round(pmax(adj.ub[edges], 0)))
}
edges.dt = as.data.table(edges)[, esid := as.numeric(rownames(edges))]
edges.dt[, lb := pmax(adj.lb[edges], 0)]
edges.dt[, ub := eub]
## adding edges
e.ix = nrow(varmeta) + (1:nrow(edges.dt))
varmeta = rbind(varmeta,
data.table(
id = e.ix,
pid = 1:nrow(edges.dt),
psid = edges.dt$esid,
label = paste0('edge',
ifelse(edges.dt$esid>0,
edges.dt$esid,
paste0(-edges.dt$esid, 'r'))),
type = 'edge',
dup = edges.dt$esid<0,
vtype = 'I',
lb = edges.dt$lb,
ub = edges.dt$ub,
stringsAsFactors = F))
}
## adding gamma and beta
gb.ix = nrow(varmeta) + 1:2
varmeta = rbind(varmeta,
data.table(
id = gb.ix,
pid = rep(1,2),
psid = rep(1, 2),
label = c('gamma', 'beta'),
type = 'global',
dup = FALSE,
vtype = 'C',
lb = c(gamma, beta),
ub = c(gamma, beta),
stringsAsFactors = F))
## adding slacks
es.s.ix = nrow(varmeta)+(seq_along(v.ix)) ## adding "source slack" variable
varmeta = rbind(varmeta,
data.table(
id = es.s.ix,
pid = seq_along(sid),
psid = sid,
label = paste0('source.slack', ifelse(sid>0, sid, paste0(-sid, 'r'))),
type = 'source.slack',
dup = sid<0,
vtype = 'I',
lb = 0,
ub = Inf,
stringsAsFactors = F))
es.t.ix = nrow(varmeta)+(seq_along(v.ix)) ## adding "target slack"
varmeta = rbind(varmeta,
data.table(
id = es.t.ix,
pid = seq_along(sid),
psid = sid,
label = paste0('target.slack', ifelse(sid>0, sid, paste0(-sid, 'r'))),
type = 'target.slack',
dup = sid<0,
vtype = 'I',
lb = 0,
ub = Inf, stringsAsFactors = F))
if (use.L0)
{
## add loose end binary indicator variables for all (non dup) loose end
## source and target slack variables
lix = varmeta[type %in% c('source.slack') & !dup, id]
ilix = nrow(varmeta) + seq_along(lix)
varmeta = rbind(varmeta,
data.table(
id = ilix,
pid = varmeta$pid[lix],
psid = varmeta$psid[lix],
label = paste0('SourceSlackIndicator',
ifelse(varmeta$psid[lix]>0,
varmeta$psid[lix],
paste0(varmeta$psid[lix], 'r'))),
type = 'source.slack.indicator',
dup = varmeta$psid[lix]<0,
vtype = 'B',
lb = 0,
ub = 1,
stringsAsFactors = FALSE))
lix = varmeta[type %in% c('target.slack') & !dup, id]
ilix = nrow(varmeta) + seq_along(lix)
varmeta = rbind(varmeta,
data.table(
id = ilix,
pid = varmeta$pid[lix],
psid = varmeta$psid[lix],
label = paste0('TargetSlackIndicator', ifelse(varmeta$psid[lix]>0, varmeta$psid[lix], paste0(varmeta$psid[lix], 'r'))),
type = 'target.slack.indicator',
dup = varmeta$psid[lix]<0,
vtype = 'B',
lb = 0,
ub = 1,
stringsAsFactors = FALSE))
}
return(varmeta)
}
####################################################################
#' @name JaBbA.digest
#' @title JaBbA.digest
#' @rdname internal
#'
#' @details
#' processes JaBbA object
#' (1) collapsing segments with same copy number that lack loose ends
#' (2) (optional) +/- adds segments corresponding to loose ends
#' (3) outputting edges data frame with colors, and other formatting information
#' (4) outputting junctions GRangesList with copy number, color, lty and other plotting components
#'
#'
#' @param jab JaBbA object "undigested"
#' @param kag karyograph (original karyograph input to JaBbA), if NULL then will "redigest" JaBbA object
#' @param verbose logical flag
#' @param keep.all logical flag (default TRUE) whether to keep 0 copy junctions or collapse segments across these as well
############################################
JaBbA.digest = function(jab, kag, verbose = T, keep.all = T)
{
if (any(dim(jab$adj) != dim(kag$adj)))
jerror('JaBbA and karyograph object mismatch')
bk.adj = kag$adj ## the background graph will make sure our collapsed paths are reference adjacent
nnab = !ifelse(is.na(kag$ab.edges[,3,1]), TRUE, kag$ab.edges[,3,1]==0)
if (!keep.all & nrow(kag$ab.edges)>0) ## we can throw out unused aberrant edges by throwing them out of the background graph (used ones will still be in jab$adj)
{
bk.adj = kag$adj
bk.adj[rbind(kag$ab.edges[nnab,1:2, 1])] = 0
bk.adj[rbind(kag$ab.edges[nnab,1:2, 2])] = 0
}
adj = sign(bk.adj)*0.01 + jab$adj ## keep a hint of 0 copy edges
## mimielinski Friday, Jan 26, 2018 07:43:18 PM
## rewriting to get rid of strange edge cases
## resulting from unnecessarily having to use coordinates
## to match up loose ends with their nodes
if (any(jab$segstats$eslack.out>0 | jab$segstats$eslack.in>0, na.rm = TRUE))
{
sink.ix = which(jab$segstats$eslack.out>0)
sinks = gr.end(jab$segstats[sink.ix],ignore.strand = FALSE)
sinks$cn = jab$segstats$eslack.out[sink.ix]
sinks$partner.id = sink.ix
sinks$id = nrow(adj) + seq_along(sinks)
sinks$loose = TRUE
sinks$right = as.logical(strand(sinks)=='+')
source.ix = which(jab$segstats$eslack.in>0)
sources = gr.start(jab$segstats[source.ix],ignore.strand = FALSE)
sources$cn = jab$segstats$eslack.in[source.ix]
sources$partner.id = source.ix
sources$id = nrow(adj) + length(sinks) + seq_along(sources)
sources$loose = TRUE
sources$right = as.logical(strand(sources)=='+')
nlends = length(sources) + length(sinks)
## pad original matrix with new nodes
adj.plus = rbind(cbind(adj, sparseMatrix(1,1,x = 0, dims = c(nrow(adj), nlends))),
cbind(sparseMatrix(1,1,x = 0, dims = c(nlends, ncol(adj))), sparseMatrix(1,1,x = 0, dims = c(nlends, nlends))))
## add new edges
adj.plus[cbind(sinks$partner.id, sinks$id)] = sinks$cn+0.01
adj.plus[cbind(sources$id, sources$partner.id)] = sources$cn+0.01
adj = adj.plus
segstats = grbind(jab$segstats, sinks, sources)
values(segstats) = rrbind(values(jab$segstats), values(sinks), values(sources)) ## original segments get NA $loose here; sinks and sources carry TRUE
}
else
{
segstats = jab$segstats
segstats$loose = FALSE
}
out = list()
## now we have augmented adjacency matrix with loose ends, let's simplify the structure
## by collapsing all simple paths
collapsed = collapse.paths(adj, verbose = verbose)
## new segstats formed by reducing "collapsed' sets
segstats$set.id = collapsed$map
out$segstats = gr.fix(
gr.simplify(
segstats[unlist(lapply(collapsed$sets, sort))],
val = rep(seq_along(collapsed$sets), sapply(collapsed$sets, length))),
segstats)
tmp.ss = gr.string(gr.stripstrand(out$segstats), other.cols = 'loose')
check1 = all(table(match(tmp.ss, tmp.ss)))
## check2 = identical(1:length(collapsed$sets), sort(out$segstats$set.id))
## if (!check1 | !check2) ## quick sanity check to make sure we didn't screw up collapsing
if (!check1) ## quick sanity check to make sure we didn't screw up collapsing
jerror('collapse yielded funny / missing segments')
else
out$segstats = out$segstats[match(seq_along(collapsed$sets), out$segstats$set.id), c('cn')]
## out$segstats$og.ix = sapply(collapsed$sets, function(x) paste(sort(x), collapse = ','))
tmp.start.ix = sapply(collapsed$sets, function(x) sort(x)[1])
tmp.end.ix = sapply(collapsed$sets, function(x) -sort(-x)[1])
out$segstats$start.ix = ifelse(as.logical(strand(out$segstats)=='+'), tmp.start.ix, tmp.end.ix)
out$segstats$end.ix = ifelse(as.logical(strand(out$segstats)=='+'), tmp.end.ix, tmp.start.ix)
out$segstats$eslack.in = segstats$eslack.in[out$segstats$start.ix]
out$segstats$eslack.out = segstats$eslack.out[out$segstats$end.ix]
out$segstats$loose = F
if (length(loose.ix <- which(segstats$loose)) > 0)
out$segstats$loose = out$segstats$start.ix %in% loose.ix
adj.new.ix = out$adj = collapsed$adj*0 ## rewire copy numbers and edge indices according to collapsed
edge.ix = Matrix::which(collapsed$adj!=0, arr.ind = T)
if (nrow(edge.ix)>0)
{
out$adj[edge.ix] = round(adj[cbind(out$segstats$end.ix[edge.ix[,1]], out$segstats$start.ix[edge.ix[,2]])])
adj.new.ix[edge.ix] = 1:nrow(edge.ix)
}
out$ab.edges = array(NA, dim = c(nrow(kag$ab.edges), 3, 2), dimnames = list(NULL, c('from', 'to', 'edge.ix'), c('+', '-')))
## match ab edges to new graph, excluding any edges that aren't included in graph (i.e. not given >0 copy number)
## (tmp.ix may map some ab.edges to "internal" vertices, so need to weed these out via keep)
if (any(nnab))
{
## og junctions
tmp.ix = cbind(rep(NA, nrow(kag$ab.edges)), rep(NA, nrow(kag$ab.edges)))
tmp.ix[nnab,] = cbind(collapsed$map[kag$ab.edges[nnab,1,1]], collapsed$map[kag$ab.edges[nnab,2,1]])
keep = rep(FALSE, length(nnab))
keep[nnab] = (out$segstats$end.ix[collapsed$map[kag$ab.edges[nnab,1,1]]] == kag$ab.edges[nnab,1,1]) & (out$segstats$start.ix[collapsed$map[kag$ab.edges[nnab,2,1]]] == kag$ab.edges[nnab,2,1])
tmp.ix[!keep, ] = NA ## not really needed but let's keep it
if (any(keep))
out$ab.edges[keep,,1] = cbind(tmp.ix[keep, , drop = F], adj.new.ix[tmp.ix[keep, , drop = F]])
## rev comp junctions
tmp.ix = cbind(rep(NA, nrow(kag$ab.edges)), rep(NA, nrow(kag$ab.edges)))
tmp.ix[nnab,] = cbind(collapsed$map[kag$ab.edges[nnab,1,2]], collapsed$map[kag$ab.edges[nnab,2,2]])
keep = rep(FALSE, length(nnab))
keep[nnab] = (out$segstats$end.ix[collapsed$map[kag$ab.edges[nnab,1,2]]] == kag$ab.edges[nnab,1,2]) & (out$segstats$start.ix[collapsed$map[kag$ab.edges[nnab,2,2]]] == kag$ab.edges[nnab,2,2])
tmp.ix[!keep, ] = NA ## not really needed, but let's keep it
if (any(keep))
out$ab.edges[keep,,2] = cbind(tmp.ix[keep, , drop = F], adj.new.ix[tmp.ix[keep, , drop = F]])
}
## convert to "simplified form"
out$edges = data.table(from = edge.ix[,1], to = edge.ix[,2], cn = out$adj[edge.ix])
estr = paste(edge.ix[,1], edge.ix[,2])
abestr = paste(out$ab.edges[,1,1:2], out$ab.edges[,2,1:2])
if (nrow(out$edges)>0)
{
out$edges$type = 'reference'
if (any(ix <- estr %in% abestr))
out$edges$type[ix] = 'aberrant'
if (any(ix <- out$segstats$loose[out$edges[, from]] | out$segstats$loose[out$edges[, to]]))
out$edges$type[ix] = 'loose'
out$edges$col = ifelse(out$edges$type == 'aberrant', ifelse(out$edges$cn>0, alpha('red', 0.4), alpha('purple', 0.3)), alpha('gray', 0.2))
loose.ix = which(out$edges$type == 'loose')
out$edges$h = 1
}
if (nrow(out$edges)>0 && length(loose.ix)>0)
{
seg.map = match(out$segstats, gr.flipstrand(out$segstats)) ## maps segs to rev comp
## maps edges to rev comp
ed.map = match(paste(out$edges[, from], out$edges[, to]),
paste(seg.map[out$edges[, to]], seg.map[out$edges[, from]]))
temp.ix = which(ed.map>(seq_along(ed.map)));
ed.id = ed.map
ed.id[temp.ix] = temp.ix
## edges whose rev comp has higher id we name with their index,
## and we name their rev comp with their index
out$edges$col[loose.ix] = alpha('blue', 0.6)
rh = 0.5 + runif(length(loose.ix)/2)
out$edges$h[loose.ix] = rh[match(ed.id[loose.ix], unique(ed.id[loose.ix]))]
## out$edges$h = ifelse(out$edges$type == 'loose', rand(nrow(out$edges)), 1)
}
if (nrow(out$edges)>0)
{
out$edges$lwd = ifelse(out$edges$type == 'aberrant', 1 + log2(0.2*out$edges$cn+2), 1)
out$edges$lwd[out$edges$cn==0] = 0.2
out$edges$lty = ifelse(out$edges$type == 'loose', 3, ifelse(out$edges$cn==0, 2, 1))
out$edges$lty[out$edges$cn==0] = 3
out$edges$col[out$edges$cn==0 & out$edges$type == 'loose'] = alpha('purple', 0.6)
out$edges$cex.arrow = 0
out$edges$v = 1
out$edges$not.flat = out$edges$type == 'aberrant'
out$edges$v[out$edges$type == 'aberrant'] = 2
out$edges$h[out$edges$type == 'aberrant'] = 2
out$edges$dangle.w = 0.5
}
out$G = graph.adjacency(adj.new.ix, weighted = 'edge.ix')
out$segstats$edges.in = sapply(seq_along(out$segstats),
function(x) {ix = which(out$adj[,x]!=0); paste(ix, '(', out$adj[ix,x], ')', '->', sep = '', collapse = ',')})
out$segstats$edges.out = sapply(seq_along(out$segstats),
function(x) {ix = which(out$adj[x, ]!=0); paste('->', ix, '(', out$adj[x,ix], ')', sep = '', collapse = ',')})
pos.ix = which( as.logical( strand(out$segstats)=='+') )
out$segstats$tile.id = match(gr.stripstrand(out$segstats), gr.stripstrand(out$segstats[pos.ix]))
ss = out$segstats
ss$right = segstats$right[ss$start.ix]
ss$partner.id = segstats$partner.id[ss$start.ix]
ss$col = ifelse(ss$loose, alpha('blue', 0.3), alpha('gray', 0.5))
ss$border = ifelse(ss$loose, alpha('blue', 0.3), alpha('black', 0.5))
ss$ywid = 0.8
if (any(ss$loose))
{
# ss$cn[ss$loose] = ss$cn[ss$loose]+0.5
ss$cn[ss$loose] = segstats$cn[ss$partner.id[ss$loose]]*1.2 ## nudge loose ends just above their partner segment's cn
ss[ss$loose] = GenomicRanges::shift(ss[ss$loose], ifelse(ss[ss$loose]$right, -500, 500)) + 500
# ss[ss$loose] = gr.flipstrand(ss[ss$loose]) + 100
ss[ss$loose]$ywid = 0.001
ss[ss$loose]$col = alpha('white', 0)
ss[ss$loose]$border = alpha('white', 0)
}
out$td = out$gtrack = gTrack(ss, y.field = 'cn', edges = out$edges[order(out$edges$cn), ], name ='JaBbA', angle = 0)
out$purity = jab$purity
out$ploidy = jab$ploidy
return(out)
}
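## Hedged usage sketch for JaBbA.digest (file names here are hypothetical, not
## guaranteed pipeline outputs): digest a raw fit against its karyograph to get
## collapsed segments, formatted edges, and a ready-to-plot gTrack.
if (FALSE) {
  jab = readRDS('jabba.raw.rds')  ## hypothetical path to an undigested JaBbA object
  kag = readRDS('karyograph.rds') ## hypothetical path to the matching karyograph
  dig = JaBbA.digest(jab, kag, verbose = TRUE)
  dig$gtrack ## gTrack of the simplified graph
}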
####################
#' @name jabba.alleles
#' @rdname internal
#' jabba.alleles
#'
#' Populates allelic values for JaBbA object. This does not explicitly impose junction balance constraints on alleles, but rather just computes
#' the maximum likelihood estimate given allelic counts and the inferred total copy number on a given segment according to JaBbA
#'
#' @param jab JaBbA object
#' @param het.sites GRanges with meta data fields (see below) for alt and ref count
#' @param alt.count.field character specifying alt.count meta data field in input het.sites (default $alt)
#' @param ref.count.field character specifying ref.count meta data field in input het.sites (default $ref)
#' @param split.ab logical flag whether to split aberrant segments (segments with an ab edge entering or leaving) prior to computing allelic states (default FALSE)
#' @param uncoupled logical flag whether to not collapse segments after inferring MLE estimate (default FALSE), if FALSE will try to merge adjacent segments and populate allele-specific junctions with copy numbers on the basis of the MLE fit on individual allelic segments
#' @param conservative if TRUE then will leave certain allelic segments "unphased" if one cannot sync the high / low interval state with the incoming and / or outgoing junction state
#' @param verbose logical flag
#' @return
#' list with following fields:
#' $segstats = GRanges of input segments with $cn.high and $cn.low segments populated
#' $asegstats = GRanges of allelic segments (length is 2*length(segstats)) with high and low segments each having $cn, this is a "melted" segstats GRanges
#' $agtrack = gTrack of allelic segments and supporting input het.sites
#' $aadj = allelic adjacency matrix of allele specific junctions
#' $ab.ix = indices of aberrant edges in $aadj
#' $ref.ix = indices of reference edges in $aadj
############################################
jabba.alleles = function(jab,
het.sites, ## granges with meta data fields for alt.count and
alt.count.field = 'alt',
ref.count.field = 'ref',
baf.field = 'baf.t',
split.ab = F, ## if split.ab == T, then will split across any "aberrant" segment (i.e. segment with ab edge entering or leaving prior to computing allelic states (note: this might create gaps)
uncoupled = FALSE, ## if uncoupled, we just assign each high low allele the MLE conditioning on the total copy number
conservative = FALSE, ## if TRUE then will leave certain allelic segments "unphased" if one cannot sync the high / low interval state with the incoming and / or outgoing junction state
verbose = F
)
{
if (!all(c(alt.count.field, ref.count.field) %in% names(values(het.sites)))){
jwarning('count fields not found in meta data of het.sites input, trying BAF...')
if (!(baf.field %in% names(values(het.sites))))
jerror('BAF field not found in meta data of het.sites input either!')
else{
## outputs are re.seg$low and re.seg$high
## test deviations of observed BAF from expected by beta distribution
if (verbose)
jmessage('Processing ', length(het.sites),
' het sites using field ', baf.field)
}
} else {
## jerror('count fields not found in meta data of het.sites input')
if (verbose)
{
jmessage('Processing ', length(het.sites), ' het sites using fields ', alt.count.field, ' and ', ref.count.field)
}
het.sites$low.count = pmin(values(het.sites)[, alt.count.field], values(het.sites)[, ref.count.field])
het.sites$high.count = pmax(values(het.sites)[, alt.count.field], values(het.sites)[, ref.count.field])
het.sites = het.sites[!is.na(het.sites$low.count) & !is.na(het.sites$high.count)]
ss.p = jab$segstats[ as.logical( strand(jab$segstats)=='+' ) ]
## find the reference junctions
ord.ix = GenomicRanges::order(jab$segstats)
rev.ix = as.logical(GenomicRanges::strand(jab$segstats[ord.ix]) == '-')
ord.ix = c(ord.ix[!rev.ix], rev(ord.ix[rev.ix]))
ref.jun = cbind(ord.ix[-length(ord.ix)], ord.ix[-1])
ref.jun = ref.jun[which(jab$adj[ref.jun]>0), ]
has.ab.rand = 0
if (split.ab)
{
ab.adj = jab$adj
ab.adj[ref.jun] = 0
has.ab = as.numeric(Matrix::rowSums(ab.adj!=0)!=0 | Matrix::colSums(ab.adj!=0)!=0)[which( as.logical( strand(jab$segstats)=='+')) ]
has.ab.rand = runif(length(ss.p)) * 1e-6 * has.ab
}
keep.cn = !is.na(ss.p$cn)
if (length(has.ab.rand) == length(ss.p)) ## keep the split.ab jitter aligned with segments after dropping NA cn
has.ab.rand = has.ab.rand[keep.cn]
ss.p = ss.p[keep.cn]
re.seg = as(coverage(ss.p, weight = ss.p$cn + has.ab.rand), 'GRanges')
re.seg$cn = round(re.seg$score)
het.sites$ix = gr.match(het.sites, re.seg)
if (verbose)
{
jmessage('Computed high / low counts and matched to segs')
}
highs = split(het.sites$high.count, het.sites$ix)[as.character(seq_along(re.seg))]
lows = split(het.sites$low.count, het.sites$ix)[as.character(seq_along(re.seg))]
het.sites$cn = re.seg$cn[het.sites$ix]
purity = jab$purity
ploidy = mean(het.sites$cn, na.rm = T) ## ploidy may be slightly different from "global ploidy" depending on the distribution of sites
sw = length(het.sites)
total = sum(as.numeric(c(het.sites$high.count, het.sites$low.count)))
cn = re.seg$cn
## gamma = 2*(1-purity)/purity ## gammas and betas need to be recomputed for
## beta = (2*(1-purity)*sw + purity*ploidy*sw) / (purity * total)
gamma = 1*(1-purity)/purity ## gammas and betas need to be recomputed for (1 since we are looking at het alleles)
beta = (1*(1-purity)*sw + purity*ploidy*sw) / (purity * total)
centers = (0:(max(cn)) + gamma)/beta
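## Rationale for the centers above: a single allele at integer copy k has
## expected het-site count (k + gamma)/beta, where gamma = (1 - purity)/purity
## reflects the one germline copy contributed per allele (hence the factor of
## 1 rather than 2) and beta rescales counts so that the mean het count maps
## back onto the SNP ploidy computed above.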
if (verbose)
{
jmessage('Computed SNP ploidy and allelic copy centers')
}
## now test deviation from each absolute copy combo using poisson model
## i.e. counts ~ poisson(expected mean)
##
re.seg$low = sapply(seq_along(re.seg), function(i)
{
## if (verbose)
## cat('.')
x = lows[[i]]
if (length(x)==0)
return(NA)
y = highs[[i]]
tot.cn = cn[i]
ll = sapply(0:(floor(tot.cn/2)), function(j) sum(ppois(x,centers[j+1], log.p = T) + ppois(y,centers[tot.cn-j+1],log.p = T)))
ll = ll - min(ll)
return(which.max(ll)-1)
})
re.seg$high = re.seg$cn-re.seg$low
}
## #########################################################################
## borderline, below are common to both methods
jab$segstats$cn.low = round(gr.val(jab$segstats, re.seg, 'low', na.rm = TRUE)$low)
jab$segstats$cn.high = round(gr.val(jab$segstats, re.seg, 'high', na.rm = TRUE)$high)
na.ix = (!gr.val(jab$segstats, re.seg, 'low', FUN = function(x,w,na.rm) any(!is.na(x)))$low) |
(!gr.val(jab$segstats, re.seg, 'high', FUN = function(x,w,na.rm) any(!is.na(x)))$high)
jab$segstats$cn.low[na.ix] = jab$segstats$cn.high[na.ix] = NA
## ###########
## phasing
## ###########
## iterate through all reference junctions and apply (wishful thinking) heuristic
##
## populate n x n x 2 adjacency matrix, which we will later expand to a bigger matrix
adj.ab = jab$adj
adj.ab[ref.jun] = 0
adj.ref = jab$adj*0
adj.ref[ref.jun] = jab$adj[ref.jun]
high = low = jab$segstats[, c()]
high$cn = jab$segstats$cn.high
low$cn = jab$segstats$cn.low
high$parent = low$parent = seq_along(jab$segstats)
high$type = 'high'
low$type = 'low'
high$id = seq_along(jab$segstats)
low$id = length(jab$segstats) + seq_along(jab$segstats)
asegstats = c(high, low)
amap = cbind(high$id, low$id) ## maps segstats id x allele combos to asegstats id
aadj = sparseMatrix(1, 1, x = 0, dims = c(length(asegstats), length(asegstats)))
.flip = function(x) x %% 2+1
asegstats = c(high, low)
acn = cbind(high$cn, low$cn)
phased.out = phased.in = rep(TRUE, length(asegstats))
str = strand(asegstats)
if (verbose)
jmessage('Starting phasing ')
if (nrow(ref.jun))
{
for (k in 1:nrow(ref.jun))
{
i = ref.jun[k, 1]
j = ref.jun[k, 2]
a = acn[ref.jun[k,1],]
b = acn[ref.jun[k,2],]
phased.out[amap[i, ]] = FALSE
phased.in[amap[j, ]] = FALSE
pairs.ij = cbind(rep(c(1:2), 2), rep(c(1:2), each = 2)) ## 4 possible matches
m = setdiff(which(a[pairs.ij[,1]] == b[pairs.ij[,2]]), NA)
if (!(length(m) %in% c(0, 4))) ## 1, 2, and 3 matches are fine (3 matches occur if one interval is in allelic balance and the other is not)
{
if (length(m)==2) ## pick the phase that the alleles can handle
m = rev(m[order(as.numeric(sum(adj.ab[i, ])<=a[pairs.ij[m,1]]) + as.numeric(sum(adj.ab[, j])<=b[pairs.ij[m,2]]))])
m.ij = pairs.ij[m[1], ]
fm.ij = .flip(m.ij)
aadj[amap[i, m.ij[1]], amap[j, m.ij[2]]] = min(a[m.ij[1]], jab$adj[i, j])
aadj[amap[i, fm.ij[1]], amap[j, fm.ij[2]]] = jab$adj[i, j] - aadj[amap[i, m.ij[1]], amap[j, m.ij[2]]]
phased.out[amap[i, ]] = TRUE
phased.in[amap[j, ]] = TRUE
if (length(a.ab <- Matrix::which(adj.ab[i,]!=0))>0)
{
## if a.ab (partner) is already phased then unpopulate the non-ab allelic junction, otherwise populate both alleles of partner
## BUG: a.ab is length 2????
## hack: replace a.ab with a.ab[1]
if (any(ph <- aadj[amap[i, fm.ij[1]], amap[a.ab[1], ]] !=0))
{
aadj[amap[i, fm.ij[1]], amap[a.ab[1], ph]] = adj.ab[i, a.ab[1]]
aadj[amap[i, m.ij[1]], amap[a.ab[1], ph]] = 0
}
else
## otherwise diffuse copy into both alleles of the partner (will be resolved when we resolve phase for the partner interval)
## or collapse unphased nodes back
aadj[amap[i, fm.ij[1]], amap[a.ab[1], ]] = adj.ab[i, a.ab[1]]/2
if (!conservative)
if (a[fm.ij[1]] < adj.ab[i, a.ab[1]]) # if the allelic node cn can't handle the outgoing allelic edge flux, unphase
phased.out[amap[i, ]] = FALSE
}
if (length(b.ab <- Matrix::which(adj.ab[,j]!=0))>0)
{
## if b.ab (partner) is already phased then concentrate all of the junction copy into the aberrant allele of this interval
## BUG: why is b.ab length 2???? I thought we resolved this long ago
## hack: replace b.ab with b.ab[1]
if (any(ph <- aadj[amap[b.ab[1], ], amap[j, fm.ij[2]]] !=0))
{
aadj[amap[b.ab[1], ph], amap[j, fm.ij[2]]] = adj.ab[b.ab[1], j]
aadj[amap[b.ab[1], ph], amap[j, m.ij[2]]] = 0
}
else
## otherwise diffuse copy into both alleles of the partner (will be resolved when we resolve phase for the partner interval)
## or collapse unphased nodes back
aadj[amap[b.ab[1],], amap[j, fm.ij[2]]] = adj.ab[b.ab[1], j]/2
if (!conservative)
if (b[fm.ij[2]] < adj.ab[b.ab[1], j]) # if the allelic node cn can't handle the incoming allelic edge flux, unphase
phased.in[amap[j, ]] = FALSE
}
}
}
}
if (verbose)
jmessage('Finished phasing, finalizing.')
asegstats$phased.in = phased.in
asegstats$phased.out = phased.out
if (uncoupled)
unphased = rep(FALSE, length(asegstats))
else
unphased = !asegstats$phased.out | !asegstats$phased.in
unphased.parents = unique(asegstats$parent[unphased])
aadj.unphunph = jab$adj[unphased.parents, unphased.parents]
aadj.phph = aadj[!unphased, !unphased]
asegstats$new.ind = NA
asegstats$new.ind[!unphased] = 1:sum(!unphased)
asegstats$new.ind[unphased] = as.integer(factor(asegstats$parent[unphased], unphased.parents))
mat.collapse = sparseMatrix(which(unphased), asegstats$new.ind[unphased], x = 1, dims = c(nrow(aadj), length(unphased.parents)))
aadj.phunph = aadj[!unphased, ] %*% mat.collapse
aadj.unphph = t(mat.collapse) %*% aadj[, which(!unphased)]
aadj.final = rbind(
cbind(aadj.phph, aadj.phunph),
cbind(aadj.unphph, aadj.unphunph)
)
asegstats.unphased = asegstats[match(unphased.parents, asegstats$parent)]
asegstats.unphased$cn = jab$segstats$cn[asegstats.unphased$parent]
asegstats.final = c(asegstats[!unphased], asegstats.unphased)
asegstats.final$phased = c(rep(T, sum(!unphased)), rep(F, length(asegstats.unphased)))
asegstats.final$type[!asegstats.final$phased] = 'total'
tmp.str = gr.string(gr.stripstrand(asegstats), mb = F, other.cols = 'type');
asegstats$tile.id = as.integer(factor(tmp.str, unique(tmp.str)))
ix = GenomicRanges::order(asegstats.final)
asegstats.final = asegstats.final[ix]
aadj.final = aadj.final[ix, ix]
if (verbose)
jmessage('Annotating allelic vertices')
tmp.string = gr.string(asegstats, mb = F, other.cols = 'type'); tmp.string2 = gr.string(gr.flipstrand(asegstats), mb = F, other.cols = 'type')
asegstats$flip.ix = match(tmp.string, tmp.string2)
asegstats$phased = !unphased
asegstats.final$edges.in = sapply(seq_along(asegstats.final),
function(x) {ix = Matrix::which(aadj.final[,x]!=0); paste(ix, '(', aadj.final[ix,x], ')', '->', sep = '', collapse = ',')})
asegstats.final$edges.out = sapply(seq_along(asegstats.final),
function(x) {ix = Matrix::which(aadj.final[x, ]!=0); paste('->', ix, '(', aadj.final[x,ix], ')', sep = '', collapse = ',')})
asegstats.final$slack.in = asegstats.final$cn - Matrix::colSums(aadj.final)
asegstats.final$slack.out = asegstats.final$cn - Matrix::rowSums(aadj.final)
asegstats.final$new.ind = asegstats.final$phased.out = asegstats.final$phased.in = asegstats.final$id = NULL
asegstats.final$tile.id = as.integer(factor(gr.string(gr.stripstrand(asegstats.final), mb = F, other.cols = 'type')))
m = sparseMatrix(seq_along(asegstats.final), asegstats.final$parent, x = 1);
hh = rep(het.sites[, c()], 2)
hh$count = c(het.sites$high.count, het.sites$low.count)
hh$type = rep(c('high', 'low'), each = length(het.sites))
hh$ywid = 0.5
atd = c(
gTrack(hh, angle = 0, y.field = 'count', y0 = 0,
colormaps = list(type = c('high' = alpha('red', 0.3), 'low' = alpha('blue', 0.3))), name = 'hets', y.quantile = 0.001, lwd.border = 2),
gTrack(asegstats.final, angle = 0, y.field = 'cn', y0 = 0,
colormaps = list(type = c('high' = alpha('red', 0.3), 'low' = alpha('blue', 0.3), 'total' = alpha('purple', 0.3))), name = 'alleles')
)
out = list(
segstats = jab$segstats,
asegstats = asegstats.final,
atd = atd,
agtrack = atd,
aadj = aadj.final,
ab.ix = Matrix::which((m %*% adj.ab %*% t(m))!=0, arr.ind = T),
ref.ix = Matrix::which((m %*% adj.ref %*% t(m))!=0, arr.ind = T)
)
return(out)
}
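## Hedged usage sketch for jabba.alleles (inputs are hypothetical): het.sites
## needs per-site alt / ref counts in the named meta data fields.
if (FALSE) {
  al = jabba.alleles(jab, het.sites, alt.count.field = 'alt',
                     ref.count.field = 'ref', verbose = TRUE)
  al$asegstats ## melted high / low (and unphased 'total') allelic segments
  al$agtrack   ## gTrack for visual review alongside the total fit
}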
##############################
#' @name pp.nll
#' @rdname internal
#' pp.nll
#'
#' computes neg log likelihood ($nll) for purity, ploidy combo and mle abs copy numbers ($v), returns as list
#'
#' v can be over-ridden to compute NLL for other (eg non MLE) values
#'
#' @param segstats granges of segstats with field $mean and $sd corresponding to mean and standard deviation on estimates of fragment density
#' @param purity numeric purity value
#' @param ploidy numeric ploidy value
#' @param gamma numeric gamma value (over-rides purity, ploidy)
#' @param beta numeric beta value (over-rides purity, ploidy)
#' @param field character specifying meta data field in segstats from which to extract mean fragment density (default "mean")
#' @param field.ncn character specifying meta data field specifying germline copy number, default value is "ncn", if absent then will assume 2
#' @param v over-ride if not interested in MLE / weighted least squares fit, this should be an integer vector of length(segstats)
#' @return
#' negative log likelihood of MLE i.e. least squares model (or supplied solution) fit
##############################
pp.nll = function(segstats, purity = NA, ploidy = NA, gamma = NA, beta = NA, field = 'mean', field.ncn = 'ncn', v = NULL)
{
mu = values(segstats)[, field] ## honor the 'field' argument (default 'mean')
w = width(segstats)
Sw = sum(as.numeric(width(segstats)))
sd = segstats$sd
m0 = sum(as.numeric(mu*w))/Sw
alpha = purity
tau = ploidy
ncn = rep(2, length(mu))
if (!is.null(field.ncn))
if (field.ncn %in% names(values(segstats)))
ncn = values(segstats)[, field.ncn]
ploidy_normal = sum(w * ncn, na.rm = T) / Sw ## this will be = 2 if ncn is trivially 2
if (is.na(gamma))
gamma = 2/alpha - 2
if (is.na(beta))
beta = ( tau + ploidy_normal * gamma / 2 ) / m0
# beta = (tau + gamma)/m0
if (is.null(v))
v = round(beta*mu-gamma)
return(list(NLL = sum((v-beta*mu+ncn*gamma/2)^2/((sd)^2)), v = v))
# return(list(NLL = sum((v-beta*mu+gamma)^2/sd^2), v = v))
}
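## Minimal sketch of the transform pp.nll applies, on toy values (IRanges is
## assumed available alongside GenomicRanges): with purity 0.8 and ploidy 3,
## gamma = 2/0.8 - 2 = 0.5 and beta = (3 + 0.5)/m0 with m0 = 1.35, so the MLE
## assignment v = round(beta*mu - gamma) comes out c(2, 4) here.
if (FALSE) {
  ss = GRanges('1', IRanges(c(1, 1001), width = 1000),
               mean = c(0.9, 1.8), sd = c(0.1, 0.1))
  pp.nll(ss, purity = 0.8, ploidy = 3) ## list(NLL = ..., v = c(2, 4))
}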
#############################################################
#' @name munlist
#' @rdname internal
#' munlist
#'
#' unlists a list of vectors, matrices, or data frames into an n x k matrix
#' whose first column specifies the list item index of the entry,
#' whose second column specifies the sublist item index of the entry,
#' and whose remaining columns specify the value(s) of the vector
#' or matrices.
#'
#' force.cbind = T will force concatenation via 'cbind'
#' force.rbind = T will force concatenation via 'rbind'
#############################################################
munlist = function(x, force.rbind = F, force.cbind = F, force.list = F)
{
x = x[!sapply(x, is.null)]
if (length(x)==0)
return(NULL)
if (!any(c(force.list, force.cbind, force.rbind)))
{
if (any(sapply(x, function(y) is.null(dim(y)))))
force.list = T
if (length(unique(sapply(x, function(y) dim(y)[2]))) == 1)
force.rbind = T
if ((length(unique(sapply(x, function(y) dim(y)[1]))) == 1))
force.cbind = T
}
if (force.list)
return(cbind(ix = unlist(lapply(seq_along(x), function(y) rep(y, length(x[[y]])))),
iix = unlist(lapply(seq_along(x), function(y) seq_along(x[[y]]))),
unlist(x)))
else if (force.rbind)
return(cbind(ix = unlist(lapply(seq_along(x), function(y) rep(y, nrow(x[[y]])))),
iix = unlist(lapply(seq_along(x), function(y) 1:nrow(x[[y]]))),
do.call('rbind', x)))
else if (force.cbind)
return(t(rbind(ix = unlist(lapply(seq_along(x), function(y) rep(y, ncol(x[[y]])))),
iix = unlist(lapply(seq_along(x), function(y) 1:ncol(x[[y]]))),
do.call('cbind', x))))
}
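## Sketch of munlist's auto-detected rbind behavior: equal column counts
## trigger the rbind branch, prepending list and within-item row indices.
if (FALSE) {
  x = list(a = matrix(1:4, ncol = 2), b = matrix(5:8, ncol = 2))
  munlist(x) ## 4 x 4 matrix: ix (list index), iix (row within item), then values
}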
## cplex set max threads (warning: the parameter file applies globally per machine, so be wary of multiple processes running on the same machine)
##
.cplex_customparams = function(out.file, numthreads = 0, nodefileind = NA, treememlim = NA, workingmemlim = NA_real_)
{
CPLEX.CMD = readLines(pipe(sprintf('find %s/%s -name cplex', Sys.getenv("CPLEX_DIR"), 'cplex/bin')))
vnum = grep("Welcome to IBM",
system2(CPLEX.CMD, c("-c \"quit\""), stdout = T),
value = T)
vnum = sub("([a-z)(A-Z ]+)([0-9.]+)$", "\\2", vnum)
vnums = as.integer(unlist(strsplit(vnum, "\\.")))
## param_lines = "CPLEX Parameter File Version 12.6.0.0"
param_lines = paste("CPLEX Parameter File Version", vnum)
if (vnums[1] <= 12 && vnums[2] <= 6) {
param_lines = c(param_lines, paste("CPX_PARAM_THREADS", numthreads, sep = '\t'))
} else {
param_lines = c(param_lines, paste("CPXPARAM_Threads", numthreads, sep = '\t')) ## 20.0.0.0 version
}
if (!is.na(nodefileind)) {
if (vnums[1] <= 12 && vnums[2] <= 6) {
param_lines = c(param_lines, paste("CPX_PARAM_NODEFILEIND", nodefileind, sep = '\t'))
} else {
param_lines = c(param_lines, paste("CPXPARAM_MIP_Strategy_File", nodefileind, sep = '\t'))
}
}
if (!is.na(treememlim))
{
## # param_lines = c(param_lines, paste("CPX_PARAM_WORKDIR", getwd(), sep = '\t'))
if (vnums[1] <= 12 && vnums[2] <= 6) {
param_lines = c(param_lines, paste("CPX_PARAM_TRELIM", treememlim, sep = '\t'))
} else {
param_lines = c(param_lines, paste("CPXPARAM_MIP_Limits_TreeMemory", treememlim, sep = '\t'))
}
}
if (!is.null(workingmemlim) && !identical(workingmemlim, NA_real_)) {
## workingmemlim should be expressed in units of megabytes for CPLEX
## default is 2048
if (vnums[1] <= 12 && vnums[2] <= 6) {
param_lines = c(param_lines, paste("CPX_PARAM_WORKMEM", workingmemlim, sep = '\t'))
} else {
param_lines = c(param_lines, paste("CPXPARAM_WorkMem", workingmemlim, sep = '\t'))
}
}
writeLines(param_lines, out.file)
Sys.setenv(ILOG_CPLEX_PARAMETER_FILE=out.file)
}
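## Hedged usage sketch: write a parameter file capping CPLEX at 4 threads and
## ~4GB of working memory (MB units), then expose it via
## ILOG_CPLEX_PARAMETER_FILE; assumes CPLEX_DIR points at an installation with
## a cplex binary under cplex/bin.
if (FALSE) {
  .cplex_customparams(file.path(tempdir(), 'cplex.prm'),
                      numthreads = 4, workingmemlim = 4096)
}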
#' @name gr.tile.map
#' @rdname internal
#' gr.tile.map
#'
#' Given two tilings of the genome (eg at different resolution),
#' query and subject, outputs a length(query) list whose items are integer vectors of indices in subject
#' that overlap that query (strand non-specific)
#'
#' @note Assumes that input query and subject have no gaps (including at end) or overlaps, i.e. ignores end()
#' coordinates and only uses "starts"
#' @param query Query
#' @param subject Subject
#' @param mc.cores number of cores
#' @param verbose Default FALSE
#' @importFrom parallel mcmapply
#' @noRd
############################################
gr.tile.map = function(query, subject, mc.cores = 1, verbose = FALSE)
{
ix.q = GenomicRanges::order(query)
ix.s = GenomicRanges::order(subject)
q.chr = as.character(seqnames(query))[ix.q]
s.chr = as.character(seqnames(subject))[ix.s]
ql = split(ix.q, q.chr)
sl = split(ix.s, s.chr)
tmp = mcmapply(
function(x,y)
{
if (length(y)==0)
return(NULL)
all.pos = c(start(query)[x], start(subject)[y])
is.q = c(rep(T, length(x)), rep(F, length(y)))
all.ix = c(x, y)
ord.ix = order(all.pos)
all.pos = all.pos[ord.ix]
is.q = is.q[ord.ix]
all.ix = all.ix[ord.ix]
out = matrix(NA, nrow = length(all.pos), ncol = 2)
last.x = last.y = NA
for (i in seq_along(all.pos))
{
if (is.q[i])
{
out[i, ] = c(all.ix[i], last.y)
if (i<length(all.pos)) ## edge case where subject and query intervals share a start point, leading to two consecutive all.pos
if (all.pos[i] == all.pos[i+1])
out[i, ] = NA
last.x = all.ix[i]
}
else
{
out[i, ] = c(last.x, all.ix[i])
if (i<length(all.pos)) ## edge case where subject and query intervals share a start point, leading to two consecutive all.pos
if (all.pos[i] == all.pos[i+1])
out[i, ] = NA
last.y = all.ix[i]
}
}
out = out[Matrix::rowSums(is.na(out))==0, ]
return(out)
}, ql, sl[names(ql)], mc.cores = mc.cores, SIMPLIFY = FALSE)
m = munlist(tmp)[, -c(1:2), drop = FALSE]
out = split(m[,2], m[,1])[as.character(seq_along(query))]
names(out) = as.character(seq_along(query))
return(out)
}
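## Sketch on a toy contig: tile the same 100kb range at 10kb and 1kb (gr.tile
## is imported from gUtils), then map each coarse tile to its ~10 fine tiles.
if (FALSE) {
  g = GRanges('1', IRanges(1, 1e5))
  q = gr.tile(g, 1e4)
  s = gr.tile(g, 1e3)
  gr.tile.map(q, s) ## length(q) list of integer indices into s
}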
##################################
#' @name vaggregate
#' @rdname internal
#' @title vaggregate
#'
#' @description
#' same as aggregate except returns named vector
#' with names as first column of output and values as second
#'
#' Note: there is no need to ever use aggregate or vaggregate, just switch to data.table
#'
#' @param ... arguments to aggregate
#' @return named vector indexed by levels of "by"
#' @author Marcin Imielinski
#' @noRd
##################################
vaggregate = function(...)
{
out = aggregate(...);
return(structure(out[,ncol(out)], names = do.call(paste, lapply(names(out)[1:(ncol(out)-1)], function(x) out[,x]))))
}
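## Sketch: vaggregate is aggregate with the result flattened into a named
## vector, e.g. mean mpg per cylinder count from the built-in mtcars data.
if (FALSE) {
  vaggregate(mpg ~ cyl, data = mtcars, FUN = mean)
  ## c('4' = 26.66, '6' = 19.74, '8' = 15.10) (values rounded here)
}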
#' @name write.tab
#' @rdname internal
#' @noRd
write.tab = function(x, ..., sep = "\t", quote = F, row.names = F)
{
if (!is.data.frame(x))
x = as.data.frame(x)
x = apply(x,2,as.character)
write.table(x, ..., sep = sep, quote = quote, row.names = row.names)
}
#' @name alpha
#' @noRd
alpha = function(col, alpha)
{
col.rgb = col2rgb(col)
out = rgb(red = col.rgb['red', ]/255, green = col.rgb['green', ]/255, blue = col.rgb['blue', ]/255, alpha = alpha)
names(out) = names(col)
return(out)
}
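## Sketch: alpha() re-encodes any R color as #RRGGBBAA hex at the requested
## opacity, the form used for the translucent edge and segment colors above.
if (FALSE) {
  alpha('red', 0.4) ## "#FF000066"
}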
############################
#' @name rel2abs
#' @rdname internal
#' rel2abs
#'
#' rescales CN values from relative to "absolute" (i.e. per cancer cell copy) scale given purity and ploidy
#'
#' takes in gr with signal field "field"
#'
#' @param gr GRanges input with meta data field corresponding to mean relative copy "mean" in that interval
#' @param purity purity of sample
#' @param ploidy ploidy of sample
#' @param gamma gamma fit of solution (over-rides purity and ploidy)
#' @param beta beta fit of solution (over-rides purity and ploidy)
#' @param field meta data field in "gr" variable from which to extract signal, default "ratio"
#' @param field.ncn meta data field in "gr" variable from which to extract germline integer copy number, default "ncn", if it doesn't exist, germline copy number is assumed to be 2
#' @return
#' numeric vector of integer copy numbers
#' @noRd
############################################
rel2abs = function(gr, purity = NA, ploidy = NA, gamma = NA, beta = NA, field = 'ratio', field.ncn = 'ncn')
{
mu = values(gr)[, field]
mu[is.infinite(mu)] = NA
w = as.numeric(width(gr))
w[is.na(mu)] = NA
sw = sum(w, na.rm = T)
mutl = sum(mu * w, na.rm = T)
ncn = rep(2, length(mu))
if (!is.null(field.ncn))
if (field.ncn %in% names(values(gr)))
ncn = values(gr)[, field.ncn]
ploidy_normal = sum(w * ncn, na.rm = T) / sw ## this will be = 2 if ncn is trivially 2
if (is.na(gamma))
gamma = 2*(1-purity)/purity
if (is.na(beta))
beta = ((1-purity)*ploidy_normal + purity*ploidy) * sw / (purity * mutl)
# beta = (2*(1-purity)*sw + purity*ploidy*sw) / (purity * mutl)
# return(beta * mu - gamma)
return(beta * mu - ncn * gamma / 2)
}
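## Worked sketch on toy values: two equal-width segments with ratios 1 and 0.5
## under purity 0.5 / ploidy 4 give gamma = 2 and beta = 8, so copy numbers
## 8*c(1, 0.5) - 2 = c(6, 2), whose width-weighted mean recovers ploidy 4.
if (FALSE) {
  g = GRanges('1', IRanges(c(1, 1001), width = 1000), ratio = c(1, 0.5))
  rel2abs(g, purity = 0.5, ploidy = 4) ## c(6, 2)
}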
#####################################################
#' @name all.paths
#' @rdname internal
#' all.paths
#'
#' Low level function to enumerate all elementary paths and cycles through graph
#'
#' takes directed graph represented by n x n binary adjacency matrix A and outputs all cycles and paths between source.vertices, sink.vertices
#'
#'
#' @param A nxn adjacency matrix
#' @param all logical flag, if all = T, will include all sources (parentless vertices) and sinks (childless vertices) in path computation
#' @param ALL logical flag, if ALL = T, will also include vertices without outgoing and incoming edges in paths
#' @param sources graph indices to treat as sources (by default is empty)
#' @param sinks graph indices to treat as sinks (by default is empty)
#' @param verbose logical flag
#' @return list of integer vectors corresponding to indices in A (i.e. vertices)
#' $paths = paths indices
#' $cycles = cycle indices
#' @importFrom igraph graph.edgelist get.shortest.paths
#' @noRd
#####################################################
all.paths = function(A, all = F, ALL = F, sources = c(), sinks = c(), source.vertices = sources, sink.vertices = sinks, verbose = FALSE,...)
{
blank.vertices = which(Matrix::rowSums(A)==0 & Matrix::colSums(A)==0)
if (ALL)
all = T
if (all)
{
source.vertices = which(Matrix::rowSums(A)>0 & Matrix::colSums(A)==0)
sink.vertices = which(Matrix::colSums(A)>0 & Matrix::rowSums(A)==0)
}
out = list(cycles = NULL, paths = NULL)
node.ix = which(Matrix::rowSums(A!=0)>0 | Matrix::colSums(A!=0)>0)
if (length(node.ix)==0)
return(out)
A = A[node.ix, node.ix]
ij = which(A!=0, arr.ind = T)
B = sparseMatrix(c(ij[,1], ij[,2]), rep(1:nrow(ij), 2), x = rep(c(-1, 1), each = nrow(ij)), dims = c(nrow(A), nrow(ij)))
I = diag(rep(1, nrow(A)))
source.vertices = setdiff(match(source.vertices, node.ix), NA)
sink.vertices = setdiff(match(sink.vertices, node.ix), NA)
B2 = cbind(B, I[, source.vertices, drop = FALSE], -I[, sink.vertices, drop = FALSE])
if (verbose)
cat(sprintf('Computing paths for %s vertices and %s edges\n', nrow(B2), ncol(B2)))
K = convex.basis(B2, verbose = verbose, ...)
if (all(is.na(K)))
return(out)
K = K[, Matrix::colSums(K[1:ncol(B), ,drop = FALSE])!=0, drop = FALSE] ## remove any pure source to sink paths
is.cyc = Matrix::colSums(B %*% K[1:ncol(B), ,drop = FALSE]!=0)==0
out$cycles = lapply(which(is.cyc),
function(i)
{
k = which(K[1:ncol(B), i]!=0)
v.all = unique(as.vector(ij[k, , drop = FALSE]))
sG = graph.edgelist(ij[k, , drop = FALSE])
tmp.v = v.all[c(1,length(v.all))]
p.fwd = get.shortest.paths(sG, tmp.v[1], tmp.v[2])
p.bwd = get.shortest.paths(sG, tmp.v[2], tmp.v[1])
return(node.ix[unique(unlist(c(p.fwd, p.bwd)))])
})
out$paths = lapply(which(!is.cyc),
function(i)
{
k = K[1:ncol(B), i]
eix = which(k!=0)
v.all = unique(as.vector(ij[eix, , drop = FALSE]))
sG = graph.edgelist(ij[eix, , drop = FALSE])
io = B %*% k
v.in = which(io<0)[1]
v.out = which(io>0)[1]
return(node.ix[unlist(get.shortest.paths(sG, v.in, v.out))])
})
if (length(out$cycles)>0)
{
tmp.cix = cbind(unlist(lapply(seq_along(out$cycles), function(x) rep(x, length(out$cycles[[x]])))), unlist(out$cycles))
out$cycles = out$cycles[!duplicated(as.matrix(sparseMatrix(tmp.cix[,1], tmp.cix[,2], x = 1)))]
}
if (length(out$paths)>0)
{
tmp.pix = cbind(unlist(lapply(seq_along(out$paths), function(x) rep(x, length(out$paths[[x]])))), unlist(out$paths))
out$paths = out$paths[!duplicated(as.matrix(sparseMatrix(tmp.pix[,1], tmp.pix[,2], x = 1)))]
}
if (ALL & length(blank.vertices)>0)
out$paths = c(out$paths, lapply(blank.vertices, identity))
return(out)
}
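## Sketch on a toy 3-node graph: edges 1->2->3 plus a back edge 3->1 should
## yield the cycle {1,2,3}; declaring 1 a source and 3 a sink should also
## recover the path 1->2->3.
if (FALSE) {
  A = matrix(0, 3, 3)
  A[1,2] = A[2,3] = A[3,1] = 1
  all.paths(A, sources = 1, sinks = 3) ## list with $cycles and $paths
}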
###############################################
#' @name collapse.paths
#' @rdname internal
#' collapse.paths
#'
#' collapse simple paths in a graph G (adjacency matrix or igraph object)
#' returns m x m new adjacency matrix and map of old vertex id's to new ones
#' $adj = m x m matrix
#' $map = length n vector with indices 1 .. m
#'
#' @noRd
###############################################
collapse.paths = function(G, verbose = T)
{
if (inherits(G, 'igraph'))
G = G[,]
out = G!=0
## ## if (verbose)
## ## cat('graph size:', nrow(out), 'nodes\n')
## first identify all nodes with exactly one parent and child to do initial collapsing of graph
singletons = which(Matrix::rowSums(out)==1 & Matrix::colSums(out)==1)
## # if (verbose)
## # cat('Collapsing simple paths..\n')
sets = split(1:nrow(G), 1:nrow(G))
if (length(singletons)>0)
{
tmp = out[singletons, singletons]
cl = igraph::clusters(graph(as.numeric(t(Matrix::which(tmp, arr.ind = TRUE))), n = nrow(tmp)), 'weak')$membership
dix = unique(cl)
if (length(dix)>0)
{
for (j in dix)
{
# if (verbose)
# cat('.')
## grab nodes in this cluster
setj = singletons[which(cl == j)]
## move all members into a single set
sets[setj[1]] = list(setj)
sets[setj[-1]] = list(NULL)
## connect this node to the parent and child of the set
parent = setdiff(which(Matrix::rowSums(out[, setj, drop = FALSE])>0), setj)
child = setdiff(which(Matrix::colSums(out[setj, , drop = FALSE])>0), setj)
out[setj, c(setj, child)] = FALSE
out[c(setj, parent), setj] = FALSE
out[parent, setj[1]] = TRUE
out[setj[1], child] = TRUE
}
}
}
if (verbose)
{
# jmessage('done\nnow fixing branches\n')
}
todo = rep(FALSE, nrow(G))
todo[Matrix::rowSums(out)==1 | Matrix::colSums(out)==1] = TRUE ## could also be 1/0 or 0/1!!!
while (sum(todo)>0)
{
sets.last = sets
out.last = out
# if (verbose)
# if ((sum(todo) %% 200)==0)
# cat('todo:', sum(todo), 'num sets:', sum(!sapply(sets, is.null)), '\n')
i = which(todo)[1]
todo[i] = F
child = which(out[i, ])
parent = which(out[,i])
if (length(child)==1 & length(parent)==1) ## if there is exactly one child and one parent then we want to merge with one or both
{
## if i-child has no other parents and i-parent has no other child
## then merge i, i-parent and i-child
if (sum(out[, child])==1 & sum(out[parent, ])==1)
{
grandch = which(out[child, ])
if (length(grandch)>0)
{
out[parent, grandch] = TRUE ## parent inherits grandchildren of i
out[child, grandch] = FALSE
}
out[parent, i] = FALSE ## remove node i's edges
out[i, child] = FALSE
sets[[parent]] = c(sets[[parent]], sets[[child]], sets[[i]])
sets[c(i, child)] = list(NULL)
todo[child] = F ## no longer have to do i-child, since they have already been merged with parent
}
## otherwise if either i-child has no other parent or i-parent has no other children (but not both)
## then connect i-parent to i-child, but do not merge them (but merge ONE of them with i)
else if (sum(out[, child])==1 | sum(out[parent, ])==1)
{
## if parent has no other children then merge with him
if (sum(out[parent, ])==1)
sets[[parent]] = c(sets[[parent]], sets[[i]])
else
sets[[child]] = c(sets[[child]], sets[[i]])
out[parent, child] = TRUE
out[parent, i] = FALSE ## remove node i's edges
out[i, child] = FALSE
sets[i] = list(NULL)
}
}
else if (length(child)==1 & length(parent)>1) ## if i has more than one parent but one child, we merge with child if child has no other parents
{
if (sum(out[, child])==1)
{
sets[[child]] = c(sets[[child]], sets[[i]])
out[parent, child] = TRUE
out[parent, i] = FALSE ## remove node i's edges
out[i, child] = FALSE ## remove node i's edges
sets[i] = list(NULL)
}
}
else if (length(child)>1 & length(parent)==1) ## if i has more than one child but one parent, then merge with parent if parent has no other children
{
if (sum(out[parent, ])==1)
{
sets[[parent]] = c(sets[[parent]], sets[[i]])
out[parent, child] = TRUE
out[parent, i] = FALSE ## remove node i's edges
out[i, child] = FALSE ## remove node i's edges
sets[i] = list(NULL)
}
}
}
slen = sapply(sets, length)
ix = which(slen>0)
map = rep(NA, nrow(G))
map[unlist(sets)] = match(rep(seq_along(sets), slen), ix)
out = out[ix, ix]
colnames(out) = rownames(out) = NULL
return(list(adj = out, map = map, sets = split(seq_along(map), map)))
}
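## Sketch: a simple chain 1->2->3->4 collapses into a single node; $map sends
## all four original vertices to node 1 and $sets records the merged membership.
if (FALSE) {
  G = matrix(0, 4, 4)
  G[1,2] = G[2,3] = G[3,4] = 1
  collapse.paths(G, verbose = FALSE) ## $adj is 1 x 1, $map is c(1, 1, 1, 1)
}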
###############################################
#' @name sparse_subset
#' @rdname internal
#' sparse_subset
#'
#' given k1 x n matrix A and k2 x n matrix B
#' returns k1 x k2 matrix C whose entries ij = 1 if the set of nonzero components of row i of A is
#' a (+/- strict) subset of the nonzero components of row j of B
#'
#' @noRd
###############################################
sparse_subset = function(A, B, strict = FALSE, chunksize = 100, quiet = FALSE)
{
nz = Matrix::colSums(as.matrix(A)!=0, 1)>0
if (is.null(dim(A)) | is.null(dim(B)))
return(NULL)
C = sparseMatrix(i = c(), j = c(), dims = c(nrow(A), nrow(B)))
for (i in seq(1, nrow(A), chunksize))
{
ixA = i:min(nrow(A), i+chunksize-1)
for (j in seq(1, nrow(B), chunksize))
{
ixB = j:min(nrow(B), j+chunksize-1)
if (length(ixA)>0 & length(ixB)>0 & !quiet)
cat(sprintf('\t interval A %s to %s (%d) \t interval B %d to %d (%d)\n', ixA[1], ixA[length(ixA)], nrow(A), ixB[1], ixB[length(ixB)], nrow(B)))
if (strict)
C[ixA, ixB] = (sign((A[ixA, , drop = FALSE]!=0)) %*% sign(t(B[ixB, , drop = FALSE]!=0))) * (sign((A[ixA, , drop = FALSE]==0)) %*% sign(t(B[ixB, , drop = FALSE]!=0))>0)
else
C[ixA, ixB] = (sign(A[ixA, nz, drop = FALSE]!=0) %*% sign(t(B[ixB, nz, drop = FALSE]==0)))==0
}
}
return(C)
}
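## Sketch: the support of A's row 1 ({1}) sits inside B's row 1 ({1,2}), while
## A's row 2 ({3}) is contained in neither row of B, so only C[1,1] is set.
if (FALSE) {
  A = rbind(c(1, 0, 0), c(0, 0, 1))
  B = rbind(c(1, 1, 0), c(0, 1, 0))
  sparse_subset(A, B, quiet = TRUE) ## 2 x 2 pattern with only [1,1] nonzero
}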
#' @name convex.basis
#' @rdname internal
#' @description
#' convex.basis
#'
#' Outputs a matrix K of the convex basis of matrix A
#'
#' i.e. each column x = K[,i] is a minimal solution (with respect to sparsity) to
#' Ax = 0, x>=0
#'
#' @noRd
convex.basis = function(A, interval = 80, chunksize = 100, maxchunks = Inf,
verbose = F)
{
ZERO = 1e-8;
remaining = 1:nrow(A);
iter = 0;
i = 0;
# order = c()
numelmos = c()
K_i = I = diag(rep(1, ncol(A)))
A_i = A %*% K_i
# vector to help rescale matrix (avoid numerical issues)
mp = apply(abs(A), 1, min); # minimum absolute value of each row
mp[mp <= ZERO] = 1; # rows with (near) zero minimum get scale "1"
st = Sys.time()
# iterate through rows of A, "canceling" them out
while (length(remaining)>0)
{
if (nrow(K_i)==0 | ncol(K_i)==0)
return(matrix())
iter = iter+1;
K_last = K_i;
if (verbose)
print(Sys.time() - st)
if (verbose)
cat('Iter ', iter, '(of', nrow(A_i), ') Num basis vectors: ', nrow(K_i), " Num active components: ", sum(Matrix::rowSums(K_i!=0)), "\n")
i = remaining[which.min(Matrix::rowSums(A_i[remaining,, drop = FALSE]>=ZERO)*Matrix::rowSums(A_i[remaining,, drop = FALSE]<=(-ZERO)))] # chose "cheapest" rows
remaining = setdiff(remaining, i);
# order = c(order, i);
zero_elements = which(abs(A_i[i, ]) <= ZERO);
K_i1 = K_last[zero_elements, , drop = FALSE]; ## K_i1 = rows of K_last that are already orthogonal to row i of A
K_i2 = NULL; ## K_i2 = will store positive combs of K_last rows that are orthogonal to row i of A (will compute these below)
pos_elements = which(A_i[i, ]>ZERO)
neg_elements = which(A_i[i, ]<(-ZERO))
if (verbose)
cat('Iter ', iter, " Row ", i, ":", length(zero_elements), " zero elements ", length(pos_elements), " pos elements ", length(neg_elements), " neg elements \n")
if (length(pos_elements)>0 & length(neg_elements)>0)
for (m in seq(1, length(pos_elements), interval))
for (l in seq(1, length(neg_elements), interval))
{
ind_pos = c(m:min(c(m+interval, length(pos_elements))))
ind_neg = c(l:min(c(l+interval, length(neg_elements))))
indpairs = cbind(rep(pos_elements[ind_pos], length(ind_neg)),
rep(neg_elements[ind_neg], each = length(ind_pos))); # cartesian product of ind_pos and ind_neg
pix = rep(1:nrow(indpairs), 2)
ix = c(indpairs[,1], indpairs[,2])
## coeff = c(-A_i[i, indpairs[,2]], A_i[i, indpairs[,1]]) ## dealing with Matrix ghost
coeff = c(-A_i[i, ][indpairs[,2]], A_i[i, ][indpairs[,1]]) ##
combs = sparseMatrix(pix, ix, x = coeff, dims = c(nrow(indpairs), nrow(K_last)))
combs[cbind(pix, ix)] = coeff;
H = combs %*% K_last;
## remove duplicated rows in H (with respect to sparsity)
H = H[which(!duplicated(as.matrix(H)>ZERO)), , drop = FALSE];
## remove rows in H that have subsets in H (with respect to sparsity) ..
if ((as.numeric(nrow(H))*as.numeric(nrow(H)))>maxchunks)
{
print('Exceeding maximum number of chunks in convex.basis computation')
jerror('Exceeding maximum number of chunks in convex.basis computation')
}
keep = which(Matrix::colSums(sparse_subset(abs(H)>ZERO, abs(H)>ZERO, chunksize = chunksize, quiet = !verbose))<=1) # <=1 since every H is its own subset
H = H[keep, , drop = FALSE]
# remove rows in H that have subsets in K_i2
if (!is.null(K_i2))
if (nrow(K_i2)>0)
{
if ((as.numeric(nrow(K_i2))*as.numeric(nrow(H)))>maxchunks)
{
print('Exceeding maximum number of chunks in convex.basis computation')
jerror('Exceeding maximum number of chunks in convex.basis computation')
}
keep = which(Matrix::colSums(sparse_subset(abs(K_i2)>ZERO, abs(H)>ZERO, chunksize = chunksize, quiet = !verbose))==0)
H = H[keep, , drop = FALSE]
}
# remove rows in H that have subsets in K_i1
if (!is.null(K_i1))
if (nrow(K_i1)>0)
{
if ((as.numeric(nrow(K_i1))*as.numeric(nrow(H)))>maxchunks)
{
print('Exceeding maximum number of chunks in convex.basis computation')
jerror('Exceeding maximum number of chunks in convex.basis computation')
}
keep = which(Matrix::colSums(sparse_subset(abs(K_i1)>ZERO, abs(H)>ZERO, chunksize = chunksize, quiet = !verbose))==0)
H = H[keep, , drop = FALSE]
}
# maintain numerical stability
if ((iter %% 10)==0)
H = diag(1/apply(abs(H), 1, max)) %*% H
# K_i2 = rBind(K_i2, H)
K_i2 = rbind(K_i2, as.matrix(H))
}
# K_i = rBind(K_i1, K_i2)
K_i = rbind(K_i1, K_i2) ## new basis set
if (nrow(K_i)==0)
return(matrix())
A_i = A %*% t(K_i)
}
return(t(K_i))
}
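## Sketch: for A encoding the single constraint x1 = x2 (two redundant rows),
## the minimal nonnegative solution to Ax = 0, x >= 0 is the ray x = (1, 1),
## so the basis should be one column proportional to c(1, 1).
if (FALSE) {
  A = rbind(c(1, -1), c(-1, 1))
  convex.basis(A)
}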
############################################################
#' read.junctions: parse junction data from various common formats
#'
#' @name read.junctions
#'
#' @description Parsing various formats of structural variation data into junctions.
#'
#' @usage read.junctions(rafile,
#' keep.features = T,
#' seqlengths = NULL,
#' chr.convert = T,
#' geno = FALSE,
#' flipstrand = FALSE,
#' swap.header = NULL,
#' breakpointer = FALSE,
#' seqlevels = NULL,
#' force.bnd = FALSE,
#' skip = NA,
#' get.loose = FALSE)
#'
#' @param rafile path to the junctions file. See details for the compatible formats.
#' @param keep.features \code{logical}, if TRUE preserve meta data from the input
#' @param seqlengths a named \code{numeric} vector containing reference contig lengths
#' @param chr.convert \code{logical}, if TRUE strip "chr" prefix from contig names
#' @param geno \code{logical}, whether to parse the 'geno' fields of VCF
#' @param flipstrand \code{logical}, if TRUE will flip breakpoint strand
#' @param swap.header path to the alternative VCF header file
#' @param breakpointer \code{logical}, if TRUE will parse as breakpointer output
#' @param seqlevels vector for renaming the chromosomes
#' @param force.bnd if TRUE overwrite all junction "type" to "BND"
#' @param skip \code{numeric} lines to skip
#' @param get.loose \code{logical}, if TRUE also return loose ends (currently unused by this wrapper)
#'
#' @details
#' A junction is an unordered pair of strand-specific genomic locations (breakpoints). Within a given
#' reference genome coordinate system, we call the direction in which coordinates increase "+". A breakpoint
#' is a width 1 (\code{start==end}) genomic range with \code{strand} specified, and "+" means the side with the larger
#' coordinate is fused with the other breakpoint in a junction.
#'
#' \code{rafile} must be one of the following formats:
#' 1) Some VCF (variant call format). We currently support the VCF output
#' from a number of structural variation detection methods, namely
#' SvABA (https://github.com/walaj/svaba),
#' DELLY (https://github.com/dellytools/delly),
#' LUMPY (https://github.com/arq5x/lumpy-sv),
#' novoBreak (https://sourceforge.net/projects/novobreak/). In theory,
#' VCF defined with BND style should be compatible but be cautious
#' when using the output from other methods since
#' no universal data definition is adopted by the community yet.
#' 2) BEDPE (http://bedtools.readthedocs.io/en/latest/content/general-usage.html#bedpe-format)
#' 3) Textual output from Breakpointer
#' (http://archive.broadinstitute.org/cancer/cga/breakpointer)
#' 4) R serialized object storing junctions (.rds)
#'
#' @section Warning:
#' We assume the orientation definition in the input is consistent with ours. Check with
#' the documentation of your respective method to make sure. If the contrary, use
#' \code{flipstrand=TRUE} to reconcile.
#'
#' @return a \code{GRangesList} of the junctions
#'
#' @importFrom VariantAnnotation readVcf info geno
#'
#' @export read.junctions
###########################################################
read.junctions = function(rafile,
keep.features = T,
## seqlengths = hg_seqlengths(),
seqlengths = NULL,
chr.convert = T,
geno=FALSE,
flipstrand = FALSE,
swap.header = NULL,
breakpointer = FALSE,
seqlevels = NULL,
force.bnd = FALSE,
skip = NA,
get.loose = FALSE) {
out = gGnome:::read.juncs(rafile,
flipstrand = flipstrand,
keep.features = keep.features,
seqlengths = seqlengths,
chr.convert = chr.convert)
return(out)
}## {
## if (is.null(rafile)){
## return(GRangesList())
## } else if (inherits(rafile, "GRangesList")){
## return(verify.junctions(rafile))
## } else if (inherits(rafile, "Junction")){
## return(verify.junctions(rafile$grl))
## } else if (is.character(rafile)){
## if (!file.exists(rafile)){
## return(NULL)
## }
## if (grepl('.rds$', rafile)){
## ra = readRDS(rafile)
## ## validity check written for "junctions" class
## if (inherits(ra, "Junction")){
## ra = ra$grl
## }
## return(verify.junctions(ra))
## } else if (grepl('bedpe(\\.gz)?$', rafile)){
## ra.path = rafile
## cols = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'name', 'score', 'str1', 'str2')
## f = file(ra.path, open = "rb")
## headers = character(0)
## thisline = readLines(f, 1)
## while (grepl("^((#)|(chrom)|(chr))", thisline)) {
## headers = c(headers, thisline)
## thisline = readLines(f, 1)
## }
## ln = sum(length(headers), length(thisline))
## while (length(thisline) > 0) {
## ## thisline = readBin(f, "raw", n = 50000)
## ## sum(thisline == as.raw(10L))
## thisline = readLines(f, n = 50000)
## ln = length(thisline) + ln
## }
## lastheader = tail(headers, 1)
## ## ln = readLines(ra.path)
## if (is.na(skip)){
## ## nh = min(c(Inf, which(!grepl('^((#)|(chrom)|(chr))', ln))))-1
## nh = length(headers)
## ## if (is.infinite(nh)){
## ## nh = 1
## ## }
## } else{
## nh = skip
## }
## if ( (ln-nh) <=0) {
## ## if (get.loose){
## ## return(list(junctions = GRangesList(GRanges(seqlengths = seqlengths))[c()], loose.ends = GRanges(seqlengths = seqlengths)))
## ## }
## ## else{
## return(GRangesList(GRanges(seqlengths = seqlengths))[c()])
## ## }
## }
## if (nh ==0) {
## rafile = fread(rafile, header = FALSE)
## } else {
## if (nh == 1) {
## header_arg = TRUE
## skip_arg = 0
## bedhead = NULL
## } else if (nh > 1) {
## header_arg = F
## skip_arg = nh
## bedhead = gsub("^#", "", unlist(strsplit(lastheader, "\t|,")))
## }
## rafile = tryCatch(fread(ra.path, header = header_arg, skip = skip_arg), error = function(e) NULL)
## if (is.null(rafile)){
## rafile = tryCatch(fread(ra.path, header = header_arg, skip = skip_arg, sep = '\t'), error = function(e) NULL)
## }
## if (is.null(rafile)){
## rafile = tryCatch(fread(ra.path, header = header_arg, skip = skip_arg, sep = ','), error = function(e) NULL)
## }
## if (is.null(rafile)){
## stop('Error reading bedpe')
## }
## if (!is.null(bedhead) && identical(length(bedhead), ncol(rafile))) {
## colnames(rafile) = bedhead
## }
## }
## if (nrow(rafile)==0)
## return(GRangesList())
## ## this is not robust enough! there might be mismatching colnames
## setnames(rafile, seq_along(cols), cols)
## rafile[, str1 := ifelse(str1 %in% c('+', '-'), str1, '*')]
## rafile[, str2 := ifelse(str2 %in% c('+', '-'), str2, '*')]
## ## converting bedpe to 1-based coordinates for verify.junctions()
## rafile[, `:=`(
## end1 = ifelse(start1==end1-1, start1, end1),
## end2 = ifelse(start2==end2-1, start2, end2)
## )]
## } else if (grepl('(vcf$)|(vcf.gz$)|(vcf.bgz$)', rafile)){
## if (!is.null(seqlengths) && all(!is.na(seqlengths)))
## {
## vcf = VariantAnnotation::readVcf(
## rafile, genome = Seqinfo(
## seqnames = names(seqlengths),
## seqlengths = as.vector(seqlengths)))
## }
## else ## get seqlengths from vcf
## {
## vcf = VariantAnnotation::readVcf(
## rafile)
## }
## ## vgr = rowData(vcf) ## parse BND format
## vgr = DelayedArray::rowRanges(vcf) ## this range is identical to using read_vcf
## ## old.vgr = read_vcf(rafile, swap.header = swap.header, geno=geno)
## mc = data.table(as.data.frame(mcols(vgr)))
## ## append the INFO
## info.dt = data.table(
## as.data.frame(VariantAnnotation::info(vcf))
## )
## mc = cbind(mc, info.dt)
## values(vgr) = mc
## if (!('SVTYPE' %in% colnames(mc))) {
## jwarning('Vcf not in proper format. Is this a rearrangement vcf?')
## return(GRangesList());
## }
## if (any(w.0 <- (width(vgr)<1))){
## jwarning("Some breakpoint width==0.")
## ## right bound smaller coor
## ## and there's no negative width GR allowed
## vgr[which(w.0)] = gr.start(vgr[which(w.0)]) %-% 1
## ## bpid = names(vgr)
## ## names(vgr) = NULL ## for some reason the below lines doesn't like names sometimes
## ## vgr[which(w.0)] = GenomicRanges::shift(gr.start(vgr[which(w.0)]), -1)
## ## names(vgr) = bpid
## }
## ## BND format doesn't have duplicated rownames
## duped = sum(names(vgr) %in% names(vgr)[duplicated(names(vgr))]) == length(vgr)
## ## if (any(duplicated(names(vgr)))){
## if (duped){
## names(vgr) = NULL
## }
## ## no events
## if (length(vgr) == 0){
## return (GRangesList())
## }
## ## local function that turns old VCF to BND
## .vcf2bnd = function(vgr){
## if (!"END" %in% colnames(values(vgr))){
## jerror("Non BND SV should have the second breakpoint coor in END columns!")
## }
## if (!"CHR2" %in% colnames(values(vgr)) | any(is.na(vgr$CHR2))){
## vgr$CHR2 = as.character(seqnames(vgr))
## }
## bp2 = data.table(as.data.frame(mcols(vgr)))
## bp2[, ":="(seqnames=CHR2, start=as.numeric(END), end=as.numeric(END))]
## slbp2 = bp2[, pmax(1, end), by = seqnames][, structure(V1, names = seqnames)]
## bp2.gr = dt2gr(bp2, seqlengths = slbp2)
## mcols(bp2.gr) = mcols(vgr)
## if (!is.null(names(vgr)) & !anyDuplicated(names(vgr))){
## jid = names(vgr)
## } else {
## jid = seq_along(vgr)
## }
## names(vgr) = paste(paste0("exp", jid), "1", sep=":")
## names(bp2.gr) = paste(paste0("exp", jid), "2", sep=":")
## nm = c(names(vgr), names(bp2.gr))
## vgr = resize(grbind(vgr, bp2.gr), 1)
## names(vgr) = nm
## if (all(grepl("[_:][12]$",names(vgr)))){
## ## row naming same with Snowman
## nm <- vgr$MATEID <- names(vgr)
## ix <- grepl("1$",nm)
## vgr$MATEID[ix] = gsub("(.*?)(1)$", "\\12", nm[ix])
## vgr$MATEID[!ix] = gsub("(.*?)(2)$", "\\11", nm[!ix])
## vgr$SVTYPE="BND"
## }
## return(vgr)
## }
## ## GRIDSS FIX?
## if ("PARID" %in% colnames(mcols(vgr))) {
## vgr$MATEID = vgr$PARID
## }
## ## TODO: Delly and Novobreak
## ## fix mateids if not included
## if (!"MATEID" %in% colnames(mcols(vgr))) {
## ## TODO: don't assume every row is a different junction
## ## Novobreak, I'm looking at you.
## ## now delly...
## ## if SVTYPE is BND but no MATEID, don't pretend to be
## if (length(fake.bix <- which(values(vgr)$SVTYPE=="BND"))!=0){
## values(vgr[fake.bix])$SVTYPE = "TRA"
## }
## ## add row names just like Snowman
## if (all(names(vgr)=="N" | ## Novobreak
## is.null(names(vgr)) |
## all(grepl("^DEL|DUP|INV|BND", names(vgr)))) ## Delly
## ){
## ## otherwise if all "N", as Novobreak
## ## or starts with DEL|DUP|INV|BND, as Delly
## ## expand and match MATEID
## vgr=.vcf2bnd(vgr)
## }
## } else if (any(is.na(mid <- as.character(vgr$MATEID)))){
## ## like Lumpy, the BND rows are real BND but blended with non-BND rows
## ## treat them separately
## if (is.null(vgr$CHR2)){
## vgr$CHR2 = as.character(NA)
## }
## names(vgr) = gsub("_", ":", names(vgr))
## vgr$MATEID = sapply(vgr$MATEID, function(x) gsub("_", ":", x))
## values(vgr) = data.table(as.data.frame(values(vgr)))
## ## break up the two junctions in one INV line!
## if ("STRANDS" %in% colnames(mc) & any(ns <- sapply(vgr$STRANDS, length)>1)){
## ## first fix format errors, two strand given, but not comma separeted
## ## so you'd have taken them as single
## if (any(fuix <- sapply(vgr[which(!ns)]$STRANDS, stringr::str_count, ":")>1)){
## which(!ns)[fuix] -> tofix
## vgr$STRANDS[tofix] = lapply(vgr$STRANDS[tofix],
## function(x){
## strsplit(gsub("(\\d)([\\+\\-])", "\\1,\\2", x), ",")[[1]]
## })
## ns[tofix] = TRUE
## }
## ## for the one line two junction cases
## ## split into two lines
## vgr.double = vgr[which(ns)]
## j1 = j2 = vgr.double
## st1 = lapply(vgr.double$STRANDS, function(x)x[1])
## st2 = lapply(vgr.double$STRANDS, function(x)x[2])
## j1$STRANDS = st1
## j2$STRANDS = st2
## vgr.double = c(j1, j2)
## names(vgr.double) = dedup(names(vgr.double))
## vgr = c(vgr[which(!ns)], vgr.double)
## }
## mid <- as.logical(sapply(vgr$MATEID, length))
## vgr$loose.end = FALSE
## vgr.bnd = vgr[which(mid)]
## vgr.nonbnd = vgr[which(!mid)]
## if (length(vgr.nonbnd))
## {
## if (any(naix <- is.na(vgr.nonbnd$END)))
## {
## vgr.nonbnd$END[naix] = -1
## vgr.nonbnd$loose.end[naix] = TRUE
## }
## vgr.nonbnd = .vcf2bnd(vgr.nonbnd)
## }
## mc.bnd = data.table(as.data.frame(values(vgr.bnd)))
## mc.nonbnd = data.table(as.data.frame(values(vgr.nonbnd)))
## mc.bnd$MATEID = as.character(mc.bnd$MATEID)
## vgr.nonbnd = vgr[which(!mid)]
## if (length(loose.ix <- which(vgr.nonbnd$FILTER=="LOOSEEND" |
## grepl("gridss.*b$",
## names(vgr.nonbnd),
## perl = T)))>0){
##                 ## Non-BND rows contain loose ends
## vgr.loose = vgr.nonbnd[loose.ix]
## vgr.loose$loose.end = TRUE
## vgr.nonbnd = vgr.nonbnd[setdiff(
## seq_along(vgr.nonbnd),
## loose.ix
## )]
## }
## if (length(vgr.nonbnd)>0){
## vgr.nonbnd = .vcf2bnd(vgr.nonbnd)
## mc.nonbnd = data.table(as.data.frame(values(vgr.nonbnd)))
## vgr = c(vgr.bnd[,c()], vgr.nonbnd[,c()])
## values(vgr) = rbind(mc.bnd, mc.nonbnd)
## }
## }
## ## sanity check
## if (!any(c("MATEID", "SVTYPE") %in% colnames(mcols(vgr)))){
## jerror("MATEID or SVTYPE not included. Required")
## }
## vgr$mateid = vgr$MATEID
## ## what's this???
## vgr$svtype = vgr$SVTYPE
## if (!is.null(info(vcf)$SCTG)){
## vgr$SCTG = info(vcf)$SCTG
## }
## if (force.bnd){
## vgr$svtype = "BND"
## }
## if (sum(vgr$svtype == 'BND')==0){
## jwarning('Vcf not in proper format. Will treat rearrangements as if in BND format')
## }
## if (!all(vgr$svtype == 'BND')){
## jwarning(sprintf('%s rows of vcf do not have svtype BND, treat them as non-BND!',
## sum(vgr$svtype != 'BND')))
## }
## bix = which(vgr$svtype == "BND")
## vgr = vgr[bix]
## alt <- sapply(vgr$ALT, function(x) x[1])
## ## Determine each junction's orientation
## if ("CT" %in% colnames(mcols(vgr))){
## if (verbose)
## {
## message("CT INFO field found.")
## }
## if ("SVLEN" %in% colnames(values(vgr))){
## ## proceed as Novobreak
## ## ALERT: overwrite its orientation!!!!
## del.ix = which(vgr$SVTYPE=="DEL")
## dup.ix = which(vgr$SVTYPE=="DUP")
## vgr$CT[del.ix] = "3to5"
## vgr$CT[dup.ix] = "5to3"
## }
## ## also, Delly is like this
## ori = strsplit(vgr$CT, "to")
## iid = sapply(strsplit(names(vgr), ":"), function(x)as.numeric(x[2]))
## orimap = setNames(c("+", "-"), c("5", "3"))
## strd = orimap[sapply(seq_along(ori), function(i) ori[[i]][iid[i]])]
## strand(vgr) = strd
## vgr.pair1 = vgr[which(iid==1)]
## vgr.pair2 = vgr[which(iid==2)]
## } else if ("STRANDS" %in% colnames(mcols(vgr))){
## ## TODO!!!!!!!!!!!!!!!
## ## sort by name, record bp1 or bp2
## jmessage("STRANDS INFO field found.")
## iid = sapply(strsplit(names(vgr), ":"), function(x)as.numeric(x[2]))
## vgr$iid = iid
## vgr = vgr[order(names(vgr))]
## iid = vgr$iid
## ## get orientations
## ori = strsplit(substr(unlist(vgr$STRANDS), 1, 2), character(0))
## orimap = setNames(c("+", "-"), c("-", "+"))
## ## map strands
## strd = orimap[sapply(seq_along(ori), function(i) ori[[i]][iid[i]])]
## strand(vgr) = strd
## vgr.pair1 = vgr[which(iid==1)]
## vgr.pair2 = vgr[which(iid==2)]
## }
## else if (any(grepl("\\[|\\]", alt))){
## jmessage("ALT field format like BND")
## ## proceed as Snowman
## vgr$first = !grepl('^(\\]|\\[)', alt) ## ? is this row the "first breakend" in the ALT string (i.e. does the ALT string not begin with a bracket)
## vgr$right = grepl('\\[', alt) ## ? are the (sharp ends) of the brackets facing right or left
## vgr$coord = as.character(paste(seqnames(vgr), ':', start(vgr), sep = ''))
## vgr$mcoord = as.character(gsub('.*(\\[|\\])(.*\\:.*)(\\[|\\]).*', '\\2', alt))
## vgr$mcoord = gsub('chr', '', vgr$mcoord)
## ## add extra genotype fields to vgr
## if (all(is.na(vgr$mateid))){
## if (!is.null(names(vgr)) & !any(duplicated(names(vgr)))){
## jwarning('MATEID tag missing, guessing BND partner by parsing names of vgr')
## vgr$mateid = paste(gsub('::\\d$', '', names(vgr)),
## (sapply(strsplit(names(vgr), '\\:\\:'), function(x) as.numeric(x[length(x)])))%%2 + 1, sep = '::')
## }
## else if (!is.null(vgr$SCTG))
## {
## jwarning('MATEID tag missing, guessing BND partner from coordinates and SCTG')
## ## require(igraph)
## ucoord = unique(c(vgr$coord, vgr$mcoord))
## vgr$mateid = paste(vgr$SCTG, vgr$mcoord, sep = '_')
## if (any(duplicated(vgr$mateid)))
## {
## warning('DOUBLE WARNING! inferred mateids not unique, check VCF')
## bix = bix[!duplicated(vgr$mateid)]
## vgr = vgr[!duplicated(vgr$mateid)]
## }
## }
## else{
## jerror('Error: MATEID tag missing')
## }
## }
## vgr$mix = as.numeric(match(vgr$mateid, names(vgr)))
## pix = which(!is.na(vgr$mix))
## vgr.pair = vgr[pix]
## if (length(vgr.pair)==0){
## jerror('Error: No mates found despite nonzero number of BND rows in VCF')
## }
## vgr.pair$mix = match(vgr.pair$mix, pix)
## vix = which(seq_along(vgr.pair)<vgr.pair$mix)
## vgr.pair1 = vgr.pair[vix]
## vgr.pair2 = vgr.pair[vgr.pair1$mix]
## ## now need to reorient pairs so that the breakend strands are pointing away from the breakpoint
## ## if "first" and "right" then we set this entry "-" and the second entry "+"
## tmpix = vgr.pair1$first & vgr.pair1$right
## if (any(tmpix)){
## strand(vgr.pair1)[tmpix] = '-'
## strand(vgr.pair2)[tmpix] = '+'
## }
## ## if "first" and "left" then "-", "-"
## tmpix = vgr.pair1$first & !vgr.pair1$right
## if (any(tmpix)){
## strand(vgr.pair1)[tmpix] = '-'
## strand(vgr.pair2)[tmpix] = '-'
## }
## ## if "second" and "left" then "+", "-"
## tmpix = !vgr.pair1$first & !vgr.pair1$right
## if (any(tmpix)){
## strand(vgr.pair1)[tmpix] = '+'
## strand(vgr.pair2)[tmpix] = '-'
## }
## ## if "second" and "right" then "+", "+"
## tmpix = !vgr.pair1$first & vgr.pair1$right
## if (any(tmpix)){
## strand(vgr.pair1)[tmpix] = '+'
## strand(vgr.pair2)[tmpix] = '+'
## }
## pos1 = as.logical(strand(vgr.pair1)=='+') ## positive strand junctions shift left by one (i.e. so that they refer to the base preceding the break for these junctions
## if (any(pos1)){
## start(vgr.pair1)[pos1] = start(vgr.pair1)[pos1]-1
## end(vgr.pair1)[pos1] = end(vgr.pair1)[pos1]-1
## }
## pos2 = as.logical(strand(vgr.pair2)=='+') ## positive strand junctions shift left by one (i.e. so that they refer to the base preceding the break for these junctions
## if (any(pos2)){
## start(vgr.pair2)[pos2] = start(vgr.pair2)[pos2]-1
## end(vgr.pair2)[pos2] = end(vgr.pair2)[pos2]-1
## }
## }
## ra = grl.pivot(GRangesList(vgr.pair1[, c()], vgr.pair2[, c()]))
## ## ALERT: vgr has already been subsetted to only include BND rows
## ## bix is the original indices, so NOT compatible!
## ## this.inf = values(vgr)[bix[pix[vix]], ]
## if (exists("pix") & exists("vix")){
## this.inf = values(vgr)[pix[vix], ]
## }
## if (exists("iid")){
## this.inf = values(vgr[which(iid==1)])
## }
## if (is.null(this.inf$POS)){
## this.inf = cbind(data.frame(POS = ''), this.inf)
## }
## if (is.null(this.inf$CHROM)){
## this.inf = cbind(data.frame(CHROM = ''), this.inf)
## }
##         if (is.null(this.inf$MALT)){
## this.inf = cbind(data.frame(MALT = ''), this.inf)
## }
## this.inf$CHROM = seqnames(vgr.pair1)
## this.inf$POS = start(vgr.pair1)
## this.inf$MATECHROM = seqnames(vgr.pair2)
## this.inf$MATEPOS = start(vgr.pair2)
##         this.inf$MALT = vgr.pair2$ALT
## ## NOT SURE WHY BROKEN
## ## tmp = tryCatch(cbind(values(vgr)[bix[pix[vix]],], this.inf), error = function(e) NULL)
## ## if (!is.null(tmp))
## ## values(ra) = tmp
## ## else
## ## values(ra) = cbind(vcf@fixed[bix[pix[vix]],], this.inf)
## values(ra) = this.inf
## if (is.null(values(ra)$TIER)){
## ## baseline tiering of PASS vs non PASS variants
## ## ALERT: mind the naming convention by diff programs
## ## TODO: make sure it is compatible with Delly, Novobreak, Meerkat
## ## Snowman/SvABA uses "PASS"
## ## Lumpy/Speedseq uses "."
## values(ra)$tier = ifelse(values(ra)$FILTER %in% c(".", "PASS"), 2, 3)
## } else {
## values(ra)$tier = values(ra)$TIER
## }
## if (isTRUE(geno)){
## ## expand into a list of GRLs, named by the sample name in the VCF
## geno.dt = data.table(
## data.table(as.data.frame(VariantAnnotation::geno(vcf)$GT))
## )
## if (ncol(geno.dt)>1) {
## cnms = colnames(geno.dt)
## single.ra = ra
## ra = lapply(setNames(cnms, cnms),
## function(cnm){
## this.ra = copy(single.ra)
## this.dt = data.table(as.data.frame(values(this.ra)))
## this.geno = geno.dt[[cnm]]
## this.dt[
## , tier := ifelse(
## tier==2, ifelse(grepl("1", this.geno), 2, 3), 3)]
## values(this.ra) = this.dt
## return(verify.junctions(this.ra))
## })
## loose=FALSE ## TODO: temporary until we figure out how
## }
## }
## if (!get.loose | is.null(vgr$mix)){
## return(verify.junctions(ra))
## } else {
## npix = is.na(vgr$mix)
## ## these are possible "loose ends" that we will add to the segmentation
## vgr.loose = vgr[npix, c()]
## ## NOT SURE WHY BROKEN
## tmp = tryCatch( values(vgr)[bix[npix], ],
## error = function(e) NULL)
## if (!is.null(tmp)){
## values(vgr.loose) = tmp
## } else{
## values(vgr.loose) = cbind(vcf@fixed[bix[npix], ], info(vcf)[bix[npix], ])
## }
## return(list(junctions = verify.junctions(ra), loose.ends = vgr.loose))
## }
## } else {
## rafile = data.table::fread(rafile)
## }
## } else if (all(is.na(rafile), na.rm = TRUE)){
## return(GRangesList())
## }
## if (is.data.table(rafile)){
## rafile = as.data.frame(rafile)
## }
## if (nrow(rafile)==0){
## out = GRangesList()
## values(out) = rafile
## return(verify.junctions(out))
## }
## ## flip breaks so that they are pointing away from junction
## if (flipstrand) {
## rafile$str1 = ifelse(rafile$strand1 == '+', '-', '+')
## rafile$str2 = ifelse(rafile$strand2 == '+', '-', '+')
## }
## if (!is.null(seqlevels)) ## convert seqlevels from notation in tab delim file to actual
## {
## rafile$chr1 = seqlevels[rafile$chr1]
## rafile$chr2 = seqlevels[rafile$chr2]
## }
## if (is.null(rafile$str1)){
## rafile$str1 = rafile$strand1
## }
## if (is.null(rafile$str2)){
## rafile$str2 = rafile$strand2
## }
## if (!is.null(rafile$pos1) & !is.null(rafile$pos2)){
## if (breakpointer){
## rafile$pos1 = rafile$T_BPpos1
## rafile$pos2 = rafile$T_BPpos2
## }
## if (!is.numeric(rafile$pos1)){
## rafile$pos1 = as.numeric(rafile$pos1)
## }
## if (!is.numeric(rafile$pos2)){
## rafile$pos2 = as.numeric(rafile$pos2)
## }
## ## clean the parenthesis from the string
## rafile$str1 <- gsub('[()]', '', rafile$str1)
## rafile$str2 <- gsub('[()]', '', rafile$str2)
## ## goal is to make the ends point <away> from the junction where - is left and + is right
## if (is.character(rafile$str1) | is.factor(rafile$str1)){
## rafile$str1 = gsub('0', '-', gsub('1', '+', gsub('\\-', '1', gsub('\\+', '0', rafile$str1))))
## }
## if (is.character(rafile$str2) | is.factor(rafile$str2)){
## rafile$str2 = gsub('0', '-', gsub('1', '+', gsub('\\-', '1', gsub('\\+', '0', rafile$str2))))
## }
## if (is.numeric(rafile$str1)){
## rafile$str1 = ifelse(rafile$str1>0, '+', '-')
## }
## if (is.numeric(rafile$str2)){
## rafile$str2 = ifelse(rafile$str2>0, '+', '-')
## }
## rafile$rowid = 1:nrow(rafile)
## bad.ix = is.na(rafile$chr1) | is.na(rafile$chr2) | is.na(rafile$pos1) | is.na(rafile$pos2) | is.na(rafile$str1) | is.na(rafile$str2) | rafile$str1 == '*'| rafile$str2 == '*' | rafile$pos1<0 | rafile$pos2<0
## rafile = rafile[which(!bad.ix), ]
## if (nrow(rafile)==0){
## return(GRangesList())
## }
## seg = rbind(data.frame(chr = rafile$chr1, pos1 = rafile$pos1, pos2 = rafile$pos1, strand = rafile$str1, ra.index = rafile$rowid, ra.which = 1, stringsAsFactors = F),
## data.frame(chr = rafile$chr2, pos1 = rafile$pos2, pos2 = rafile$pos2, strand = rafile$str2, ra.index = rafile$rowid, ra.which = 2, stringsAsFactors = F))
## if (chr.convert){
## seg$chr = gsub('chr', '', gsub('25', 'M', gsub('24', 'Y', gsub('23', 'X', seg$chr))))
## }
## out = seg2gr(seg, seqlengths = seqlengths)[, c('ra.index', 'ra.which')];
## out = split(out, out$ra.index)
## } else if (!is.null(rafile$start1) & !is.null(rafile$start2) & !is.null(rafile$end1) & !is.null(rafile$end2)){
## ra1 = gr.flipstrand(GRanges(rafile$chr1, IRanges(rafile$start1, rafile$end1), strand = rafile$str1))
## ra2 = gr.flipstrand(GRanges(rafile$chr2, IRanges(rafile$start2, rafile$end2), strand = rafile$str2))
## out = grl.pivot(GRangesList(ra1, ra2))
## }
## if (keep.features){
## values(out) = rafile[, ]
## }
## ## if (!is.null(pad)){
## ## out = ra.dedup(out, pad = pad)
## ## }
## out = verify.junctions(out)
## if (!get.loose){
## return(out)
## } else{
## return(list(junctions = out, loose.ends = GRanges()))
## }
## return(out)
## ## return(Junction$new(out))
## }
## read.junctions = function(rafile,
## keep.features = T,
## ## seqlengths = hg_seqlengths(),
## seqlengths = NULL,
## chr.convert = T,
## geno=FALSE,
## flipstrand = FALSE,
## swap.header = NULL,
## breakpointer = FALSE,
## seqlevels = NULL,
## force.bnd = FALSE,
## skip = NA,
## get.loose = FALSE){
## if (is.null(rafile)){
## return(GRangesList())
## } else if (inherits(rafile, "GRangesList")){
## return(verify.junctions(rafile))
## } else if (inherits(rafile, "Junction")){
## return(verify.junctions(rafile$grl))
## } else if (is.character(rafile)){
## if (!file.exists(rafile)){
## return(NULL)
## }
## if (grepl('.rds$', rafile)){
## ra = readRDS(rafile)
## ## validity check written for "junctions" class
## if (inherits(ra, "Junction")){
## ra = ra$grl
## }
## return(verify.junctions(ra))
## } else if (grepl('bedpe(\\.gz)?$', rafile)){
## ra.path = rafile
## cols = c('chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'name', 'score', 'str1', 'str2')
## f = file(ra.path, open = "rb")
## headers = character(0)
## thisline = readLines(f, 1)
## while (grepl("^((#)|(chrom)|(chr))", thisline)) {
## headers = c(headers, thisline)
## thisline = readLines(f, 1)
## }
## ln = sum(length(headers), length(thisline))
## while (length(thisline) > 0) {
## ## thisline = readBin(f, "raw", n = 50000)
## ## sum(thisline == as.raw(10L))
## thisline = readLines(f, n = 50000)
## ln = length(thisline) + ln
## }
## lastheader = tail(headers, 1)
## ## ln = readLines(ra.path)
## if (is.na(skip)){
## ## nh = min(c(Inf, which(!grepl('^((#)|(chrom)|(chr))', ln))))-1
## nh = length(headers)
## ## if (is.infinite(nh)){
## ## nh = 1
## ## }
## } else{
## nh = skip
## }
## if ( (ln-nh) <=0) {
## ## if (get.loose){
## ## return(list(junctions = GRangesList(GRanges(seqlengths = seqlengths))[c()], loose.ends = GRanges(seqlengths = seqlengths)))
## ## }
## ## else{
## return(GRangesList(GRanges(seqlengths = seqlengths))[c()])
## ## }
## }
## if (nh ==0) {
## rafile = fread(rafile, header = FALSE)
## } else {
## if (nh == 1) {
## header_arg = TRUE
## skip_arg = 0
## bedhead = NULL
## } else if (nh > 1) {
## header_arg = F
## skip_arg = nh
## bedhead = gsub("^#", "", unlist(strsplit(lastheader, "\t|,")))
## }
## rafile = tryCatch(fread(ra.path, header = header_arg, skip = skip_arg), error = function(e) NULL)
## if (is.null(rafile)){
## rafile = tryCatch(fread(ra.path, header = header_arg, skip = skip_arg, sep = '\t'), error = function(e) NULL)
## }
## if (is.null(rafile)){
## rafile = tryCatch(fread(ra.path, header = header_arg, skip = skip_arg, sep = ','), error = function(e) NULL)
## }
## if (is.null(rafile)){
## stop('Error reading bedpe')
## }
## if (!is.null(bedhead) && identical(length(bedhead), ncol(rafile))) {
## colnames(rafile) = bedhead
## }
## }
## if (nrow(rafile)==0)
## return(GRangesList())
## ## this is not robust enough! there might be mismatching colnames
## setnames(rafile, seq_along(cols), cols)
## rafile[, str1 := ifelse(str1 %in% c('+', '-'), str1, '*')]
## rafile[, str2 := ifelse(str2 %in% c('+', '-'), str2, '*')]
## ## converting bedpe to 1-based coordinates for verify.junctions()
## rafile[, `:=`(
## end1 = ifelse(start1==end1-1, start1, end1),
## end2 = ifelse(start2==end2-1, start2, end2)
## )]
## } else if (grepl('(vcf$)|(vcf.gz$)', rafile)){
## if (!is.null(seqlengths) && all(!is.na(seqlengths)))
## {
## vcf = VariantAnnotation::readVcf(
## rafile, genome = Seqinfo(
## seqnames = names(seqlengths),
## seqlengths = as.vector(seqlengths)))
## }
## else ## get seqlengths from vcf
## {
## vcf = VariantAnnotation::readVcf(
## rafile)
## }
## ## vgr = rowData(vcf) ## parse BND format
## vgr = DelayedArray::rowRanges(vcf) ## this range is identical to using read_vcf
## ## old.vgr = read_vcf(rafile, swap.header = swap.header, geno=geno)
## mc = data.table(as.data.frame(mcols(vgr)))
## ## append the INFO
## info.dt = data.table(
## as.data.frame(VariantAnnotation::info(vcf))
## )
## mc = cbind(mc, info.dt)
## values(vgr) = mc
## if (!('SVTYPE' %in% colnames(mc))) {
## jwarning('Vcf not in proper format. Is this a rearrangement vcf?')
## return(GRangesList());
## }
## if (any(w.0 <- (width(vgr)<1))){
## jwarning("Some breakpoint width==0.")
## ## right bound smaller coor
## ## and there's no negative width GR allowed
## vgr[which(w.0)] = gr.start(vgr[which(w.0)]) %-% 1
## }
## ## BND format doesn't have duplicated rownames
## if (any(duplicated(names(vgr)))){
## names(vgr) = NULL
## }
## ## no events
## if (length(vgr) == 0){
## return (GRangesList())
## }
## ## local function that turns old VCF to BND
## .vcf2bnd = function(vgr){
## if (!"END" %in% colnames(values(vgr))){
## jerror("Non BND SV should have the second breakpoint coor in END columns!")
## }
## if (!"CHR2" %in% colnames(values(vgr)) | any(is.na(vgr$CHR2))){
## vgr$CHR2 = as.character(seqnames(vgr))
## }
## bp2 = data.table(as.data.frame(mcols(vgr)))
## bp2[, ":="(seqnames=CHR2, start=as.numeric(END), end=as.numeric(END))]
## bp2.gr = dt2gr(bp2, seqlengths = seqlengths(vgr))
## mcols(bp2.gr) = mcols(vgr)
## if (!is.null(names(vgr)) & !anyDuplicated(names(vgr))){
## jid = names(vgr)
## } else {
## jid = seq_along(vgr)
## }
## names(vgr) = paste(paste0("exp", jid), "1", sep=":")
## names(bp2.gr) = paste(paste0("exp", jid), "2", sep=":")
## vgr=resize(c(vgr, bp2.gr), 1)
## if (all(grepl("[_:][12]$",names(vgr)))){
## ## row naming same with Snowman
## nm <- vgr$MATEID <- names(vgr)
## ix <- grepl("1$",nm)
## vgr$MATEID[ix] = gsub("(.*?)(1)$", "\\12", nm[ix])
## vgr$MATEID[!ix] = gsub("(.*?)(2)$", "\\11", nm[!ix])
## vgr$SVTYPE="BND"
## }
## return(vgr)
## }
## ## TODO: Delly and Novobreak
## ## fix mateids if not included
## if (!"MATEID" %in% colnames(mcols(vgr))) {
## ## TODO: don't assume every row is a different junction
## ## Novobreak, I'm looking at you.
## ## now delly...
## ## if SVTYPE is BND but no MATEID, don't pretend to be
## if (length(fake.bix <- which(values(vgr)$SVTYPE=="BND"))!=0){
## values(vgr[fake.bix])$SVTYPE = "TRA"
## }
## ## add row names just like Snowman
## if (all(names(vgr)=="N" | ## Novobreak
## is.null(names(vgr)) |
## all(grepl("^DEL|DUP|INV|BND", names(vgr)))) ## Delly
## ){
## ## otherwise if all "N", as Novobreak
## ## or starts with DEL|DUP|INV|BND, as Delly
## ## expand and match MATEID
## vgr=.vcf2bnd(vgr)
## }
## } else if (any(is.na(mid <- as.character(vgr$MATEID)))){
## ## like Lumpy, the BND rows are real BND but blended with non-BND rows
## ## treat them separately
## if (is.null(vgr$CHR2)){
## vgr$CHR2 = as.character(NA)
## }
## names(vgr) = gsub("_", ":", names(vgr))
## vgr$MATEID = sapply(vgr$MATEID, function(x) gsub("_", ":", x))
## values(vgr) = data.table(as.data.frame(values(vgr)))
## ## break up the two junctions in one INV line!
## if ("STRANDS" %in% colnames(mc) & any(ns <- sapply(vgr$STRANDS, length)>1)){
##                 ## first fix format errors: two strands given, but not comma separated
## ## so you'd have taken them as single
## if (any(fuix <- sapply(vgr[which(!ns)]$STRANDS, stringr::str_count, ":")>1)){
## which(!ns)[fuix] -> tofix
## vgr$STRANDS[tofix] = lapply(vgr$STRANDS[tofix],
## function(x){
## strsplit(gsub("(\\d)([\\+\\-])", "\\1,\\2", x), ",")[[1]]
## })
## ns[tofix] = TRUE
## }
## ## for the one line two junction cases
## ## split into two lines
## vgr.double = vgr[which(ns)]
## j1 = j2 = vgr.double
## st1 = lapply(vgr.double$STRANDS, function(x)x[1])
## st2 = lapply(vgr.double$STRANDS, function(x)x[2])
## j1$STRANDS = st1
## j2$STRANDS = st2
## vgr.double = c(j1, j2)
## names(vgr.double) = dedup(names(vgr.double))
## vgr = c(vgr[which(!ns)], vgr.double)
## }
## mid <- as.logical(sapply(vgr$MATEID, length))
## vgr.bnd = vgr[which(mid)]
## mc.bnd = data.table(as.data.frame(values(vgr.bnd)))
## mc.bnd$MATEID = as.character(mc.bnd$MATEID)
## vgr.nonbnd = vgr[which(!mid)]
## if (length(loose.ix <- which(vgr.nonbnd$FILTER=="LOOSEEND"))>0){
##                 ## Non-BND rows contain loose ends
## vgr.loose = vgr.nonbnd[loose.ix]
## vgr.nonbnd = vgr.nonbnd[setdiff(seq_along(vgr.nonbnd), loose.ix)]
## }
## if (length(vgr.nonbnd)>0){
## vgr.nonbnd = .vcf2bnd(vgr.nonbnd)
## mc.nonbnd = data.table(as.data.frame(values(vgr.nonbnd)))
## vgr = c(vgr.bnd[,c()], vgr.nonbnd[,c()])
## values(vgr) = rbind(mc.bnd, mc.nonbnd)
## }
## }
## ## sanity check
## if (!any(c("MATEID", "SVTYPE") %in% colnames(mcols(vgr)))){
## jerror("MATEID or SVTYPE not included. Required")
## }
## vgr$mateid = vgr$MATEID
## ## what's this???
## vgr$svtype = vgr$SVTYPE
## if (!is.null(info(vcf)$SCTG)){
## vgr$SCTG = info(vcf)$SCTG
## }
## if (force.bnd){
## vgr$svtype = "BND"
## }
## if (sum(vgr$svtype == 'BND')==0){
## jwarning('Vcf not in proper format. Will treat rearrangements as if in BND format')
## }
## if (!all(vgr$svtype == 'BND')){
## jwarning(sprintf('%s rows of vcf do not have svtype BND, treat them as non-BND!',
## sum(vgr$svtype != 'BND')))
## }
## bix = which(vgr$svtype == "BND")
## vgr = vgr[bix]
## alt <- sapply(vgr$ALT, function(x) x[1])
## ## Determine each junction's orientation
## if ("CT" %in% colnames(mcols(vgr))){
## jmessage("CT INFO field found.")
## if ("SVLEN" %in% colnames(values(vgr))){
## ## proceed as Novobreak
## ## ALERT: overwrite its orientation!!!!
## del.ix = which(vgr$SVTYPE=="DEL")
## dup.ix = which(vgr$SVTYPE=="DUP")
## vgr$CT[del.ix] = "3to5"
## vgr$CT[dup.ix] = "5to3"
## }
## ## also, Delly is like this
## ori = strsplit(vgr$CT, "to")
## iid = sapply(strsplit(names(vgr), ":"), function(x)as.numeric(x[2]))
## orimap = setNames(c("+", "-"), c("5", "3"))
## strd = orimap[sapply(seq_along(ori), function(i) ori[[i]][iid[i]])]
## strand(vgr) = strd
## vgr.pair1 = vgr[which(iid==1)]
## vgr.pair2 = vgr[which(iid==2)]
## } else if ("STRANDS" %in% colnames(mcols(vgr))){
## ## TODO!!!!!!!!!!!!!!!
## ## sort by name, record bp1 or bp2
## jmessage("STRANDS INFO field found.")
## iid = sapply(strsplit(names(vgr), ":"), function(x)as.numeric(x[2]))
## vgr$iid = iid
## vgr = vgr[order(names(vgr))]
## iid = vgr$iid
## ## get orientations
## ori = strsplit(substr(unlist(vgr$STRANDS), 1, 2), character(0))
## orimap = setNames(c("+", "-"), c("-", "+"))
## ## map strands
## strd = orimap[sapply(seq_along(ori), function(i) ori[[i]][iid[i]])]
## strand(vgr) = strd
## vgr.pair1 = vgr[which(iid==1)]
## vgr.pair2 = vgr[which(iid==2)]
## }
## else if (any(grepl("\\[|\\]", alt))){
## jmessage("ALT field format like BND")
## ## proceed as Snowman
## vgr$first = !grepl('^(\\]|\\[)', alt) ## ? is this row the "first breakend" in the ALT string (i.e. does the ALT string not begin with a bracket)
## vgr$right = grepl('\\[', alt) ## ? are the (sharp ends) of the brackets facing right or left
## vgr$coord = as.character(paste(seqnames(vgr), ':', start(vgr), sep = ''))
## vgr$mcoord = as.character(gsub('.*(\\[|\\])(.*\\:.*)(\\[|\\]).*', '\\2', alt))
## vgr$mcoord = gsub('chr', '', vgr$mcoord)
## ## add extra genotype fields to vgr
## if (all(is.na(vgr$mateid)))
## if (!is.null(names(vgr)) & !any(duplicated(names(vgr)))){
## jwarning('MATEID tag missing, guessing BND partner by parsing names of vgr')
## vgr$mateid = paste(gsub('::\\d$', '', names(vgr)),
## (sapply(strsplit(names(vgr), '\\:\\:'), function(x) as.numeric(x[length(x)])))%%2 + 1, sep = '::')
## }
## else if (!is.null(vgr$SCTG))
## {
## jwarning('MATEID tag missing, guessing BND partner from coordinates and SCTG')
## ## require(igraph)
## ucoord = unique(c(vgr$coord, vgr$mcoord))
## vgr$mateid = paste(vgr$SCTG, vgr$mcoord, sep = '_')
## if (any(duplicated(vgr$mateid)))
## {
## warning('DOUBLE WARNING! inferred mateids not unique, check VCF')
## bix = bix[!duplicated(vgr$mateid)]
## vgr = vgr[!duplicated(vgr$mateid)]
## }
## }
## else{
## jerror('Error: MATEID tag missing')
## }
## vgr$mix = as.numeric(match(vgr$mateid, names(vgr)))
## pix = which(!is.na(vgr$mix))
## vgr.pair = vgr[pix]
## if (length(vgr.pair)==0){
## jerror('Error: No mates found despite nonzero number of BND rows in VCF')
## }
## vgr.pair$mix = match(vgr.pair$mix, pix)
## vix = which(seq_along(vgr.pair)<vgr.pair$mix)
## vgr.pair1 = vgr.pair[vix]
## vgr.pair2 = vgr.pair[vgr.pair1$mix]
## ## now need to reorient pairs so that the breakend strands are pointing away from the breakpoint
## ## if "first" and "right" then we set this entry "-" and the second entry "+"
## tmpix = vgr.pair1$first & vgr.pair1$right
## if (any(tmpix)){
## strand(vgr.pair1)[tmpix] = '-'
## strand(vgr.pair2)[tmpix] = '+'
## }
## ## if "first" and "left" then "-", "-"
## tmpix = vgr.pair1$first & !vgr.pair1$right
## if (any(tmpix)){
## strand(vgr.pair1)[tmpix] = '-'
## strand(vgr.pair2)[tmpix] = '-'
## }
## ## if "second" and "left" then "+", "-"
## tmpix = !vgr.pair1$first & !vgr.pair1$right
## if (any(tmpix)){
## strand(vgr.pair1)[tmpix] = '+'
## strand(vgr.pair2)[tmpix] = '-'
## }
## ## if "second" and "right" then "+", "+"
## tmpix = !vgr.pair1$first & vgr.pair1$right
## if (any(tmpix)){
## strand(vgr.pair1)[tmpix] = '+'
## strand(vgr.pair2)[tmpix] = '+'
## }
## pos1 = as.logical(strand(vgr.pair1)=='+') ## positive strand junctions shift left by one (i.e. so that they refer to the base preceding the break for these junctions
## if (any(pos1)){
## start(vgr.pair1)[pos1] = start(vgr.pair1)[pos1]-1
## end(vgr.pair1)[pos1] = end(vgr.pair1)[pos1]-1
## }
## pos2 = as.logical(strand(vgr.pair2)=='+') ## positive strand junctions shift left by one (i.e. so that they refer to the base preceding the break for these junctions
## if (any(pos2)){
## start(vgr.pair2)[pos2] = start(vgr.pair2)[pos2]-1
## end(vgr.pair2)[pos2] = end(vgr.pair2)[pos2]-1
## }
## }
## ra = grl.pivot(GRangesList(vgr.pair1[, c()], vgr.pair2[, c()]))
## ## ALERT: vgr has already been subsetted to only include BND rows
## ## bix is the original indices, so NOT compatible!
## ## this.inf = values(vgr)[bix[pix[vix]], ]
## if (exists("pix") & exists("vix")){
## this.inf = values(vgr)[pix[vix], ]
## }
## if (exists("iid")){
## this.inf = values(vgr[which(iid==1)])
## }
## if (is.null(this.inf$POS)){
## this.inf = cbind(data.frame(POS = ''), this.inf)
## }
## if (is.null(this.inf$CHROM)){
## this.inf = cbind(data.frame(CHROM = ''), this.inf)
## }
##         if (is.null(this.inf$MALT)){
## this.inf = cbind(data.frame(MALT = ''), this.inf)
## }
## this.inf$CHROM = seqnames(vgr.pair1)
## this.inf$POS = start(vgr.pair1)
## this.inf$MATECHROM = seqnames(vgr.pair2)
## this.inf$MATEPOS = start(vgr.pair2)
##         this.inf$MALT = vgr.pair2$ALT
## ## NOT SURE WHY BROKEN
## ## tmp = tryCatch(cbind(values(vgr)[bix[pix[vix]],], this.inf), error = function(e) NULL)
## ## if (!is.null(tmp))
## ## values(ra) = tmp
## ## else
## ## values(ra) = cbind(vcf@fixed[bix[pix[vix]],], this.inf)
## values(ra) = this.inf
## if (is.null(values(ra)$TIER)){
## ## baseline tiering of PASS vs non PASS variants
## ## ALERT: mind the naming convention by diff programs
## ## TODO: make sure it is compatible with Delly, Novobreak, Meerkat
## ## Snowman/SvABA uses "PASS"
## ## Lumpy/Speedseq uses "."
## values(ra)$tier = ifelse(values(ra)$FILTER %in% c(".", "PASS"), 2, 3)
## } else {
## values(ra)$tier = values(ra)$TIER
## }
## if (geno==TRUE){
## ## expand into a list of GRLs, named by the sample name in the VCF
## geno.dt = data.table(
## data.table(as.data.frame(VariantAnnotation::geno(vcf)$GT))
## )
## if (ncol(geno.dt)>1) {
## cnms = colnames(geno.dt)
## single.ra = ra
## ra = lapply(setNames(cnms, cnms),
## function(cnm){
## this.ra = copy(single.ra)
## this.dt = data.table(as.data.frame(values(this.ra)))
## this.geno = geno.dt[[cnm]]
## this.dt[
## , tier := ifelse(
## tier==2, ifelse(grepl("1", this.geno), 2, 3), 3)]
## values(this.ra) = this.dt
## return(this.ra)
## })
## loose=FALSE ## TODO: temporary until we figure out how
## }
## }
## if (!get.loose | is.null(vgr$mix)){
## return(ra)
## } else {
## npix = is.na(vgr$mix)
## ## these are possible "loose ends" that we will add to the segmentation
## vgr.loose = vgr[npix, c()]
## ## NOT SURE WHY BROKEN
## tmp = tryCatch( values(vgr)[bix[npix], ],
## error = function(e) NULL)
## if (!is.null(tmp)){
## values(vgr.loose) = tmp
## } else{
## values(vgr.loose) = cbind(vcf@fixed[bix[npix], ], info(vcf)[bix[npix], ])
## }
## return(list(junctions = ra, loose.ends = vgr.loose))
## }
## } else{
## rafile = data.table::fread(rafile)
## }
## } else if (is.na(rafile)){
## return(GRangesList())
## }
## if (is.data.table(rafile)){
## rafile = as.data.frame(rafile)
## }
## if (nrow(rafile)==0){
## out = GRangesList()
## values(out) = rafile
## return(verify.junctions(out))
## }
## ## flip breaks so that they are pointing away from junction
## if (flipstrand) {
## rafile$str1 = ifelse(rafile$strand1 == '+', '-', '+')
## rafile$str2 = ifelse(rafile$strand2 == '+', '-', '+')
## }
## if (!is.null(seqlevels)) ## convert seqlevels from notation in tab delim file to actual
## {
## rafile$chr1 = seqlevels[rafile$chr1]
## rafile$chr2 = seqlevels[rafile$chr2]
## }
## if (is.null(rafile$str1)){
## rafile$str1 = rafile$strand1
## }
## if (is.null(rafile$str2)){
## rafile$str2 = rafile$strand2
## }
## if (!is.null(rafile$pos1) & !is.null(rafile$pos2)){
## if (breakpointer){
## rafile$pos1 = rafile$T_BPpos1
## rafile$pos2 = rafile$T_BPpos2
## }
## if (!is.numeric(rafile$pos1)){
## rafile$pos1 = as.numeric(rafile$pos1)
## }
## if (!is.numeric(rafile$pos2)){
## rafile$pos2 = as.numeric(rafile$pos2)
## }
## ## clean the parenthesis from the string
## rafile$str1 <- gsub('[()]', '', rafile$str1)
## rafile$str2 <- gsub('[()]', '', rafile$str2)
## ## goal is to make the ends point <away> from the junction where - is left and + is right
## if (is.character(rafile$str1) | is.factor(rafile$str1)){
## rafile$str1 = gsub('0', '-', gsub('1', '+', gsub('\\-', '1', gsub('\\+', '0', rafile$str1))))
## }
## if (is.character(rafile$str2) | is.factor(rafile$str2)){
## rafile$str2 = gsub('0', '-', gsub('1', '+', gsub('\\-', '1', gsub('\\+', '0', rafile$str2))))
## }
## if (is.numeric(rafile$str1)){
## rafile$str1 = ifelse(rafile$str1>0, '+', '-')
## }
## if (is.numeric(rafile$str2)){
## rafile$str2 = ifelse(rafile$str2>0, '+', '-')
## }
## rafile$rowid = 1:nrow(rafile)
## bad.ix = is.na(rafile$chr1) | is.na(rafile$chr2) | is.na(rafile$pos1) | is.na(rafile$pos2) | is.na(rafile$str1) | is.na(rafile$str2) | rafile$str1 == '*'| rafile$str2 == '*' | rafile$pos1<0 | rafile$pos2<0
## rafile = rafile[which(!bad.ix), ]
## if (nrow(rafile)==0){
## return(GRangesList())
## }
## seg = rbind(data.frame(chr = rafile$chr1, pos1 = rafile$pos1, pos2 = rafile$pos1, strand = rafile$str1, ra.index = rafile$rowid, ra.which = 1, stringsAsFactors = F),
## data.frame(chr = rafile$chr2, pos1 = rafile$pos2, pos2 = rafile$pos2, strand = rafile$str2, ra.index = rafile$rowid, ra.which = 2, stringsAsFactors = F))
## if (chr.convert){
## seg$chr = gsub('chr', '', gsub('25', 'M', gsub('24', 'Y', gsub('23', 'X', seg$chr))))
## }
## out = seg2gr(seg, seqlengths = seqlengths)[, c('ra.index', 'ra.which')];
## out = split(out, out$ra.index)
## } else if (!is.null(rafile$start1) & !is.null(rafile$start2) & !is.null(rafile$end1) & !is.null(rafile$end2)){
## ra1 = gr.flipstrand(GRanges(rafile$chr1, IRanges(rafile$start1, rafile$end1), strand = rafile$str1))
## ra2 = gr.flipstrand(GRanges(rafile$chr2, IRanges(rafile$start2, rafile$end2), strand = rafile$str2))
## out = grl.pivot(GRangesList(ra1, ra2))
## }
## if (keep.features){
## values(out) = rafile[, ]
## }
## ## if (!is.null(pad)){
## ## out = ra.dedup(out, pad = pad)
## ## }
## out = verify.junctions(out)
## if (!get.loose){
## return(out)
## } else{
## return(list(junctions = out, loose.ends = GRanges()))
## }
## return(out)
## ## return(new("junctions", out))
## }
#' @name filter_oob_junctions
#' @rdname internal
#' @details
#'
#' Remove out-of-range junctions.
#'
#' A junction is removed if:
#' - either breakpoint exceeds the seqlength of its chromosome
#' - either breakpoint start is less than 1
#'
#' @param ra GRangesList object to be verified
#' @return GRangesList
filter_oob_junctions = function(ra) {
if (!length(ra)) {
return(ra)
}
pivoted.ra = grl.pivot(ra)
sl = seqlengths(ra)
bp1 = pivoted.ra[[1]]
bp2 = pivoted.ra[[2]]
start.oob = start(bp1) < 1 | start(bp2) < 1
end.oob = start(bp1) > sl[as.character(seqnames(bp1))] | start(bp2) > sl[as.character(seqnames(bp2))]
names(end.oob) = NULL
names(start.oob) = NULL
start.oob[which(is.na(start.oob))] = FALSE
end.oob[which(is.na(end.oob))] = FALSE
if (any(start.oob, na.rm = TRUE)) {
jwarning("Some junction breakpoints are < 1 and will be removed")
jwarning("Number of affected junctions: ", sum(start.oob, na.rm = T))
}
if (any(end.oob, na.rm = TRUE)) {
jwarning("Some junction breakpoints are > seqlength and will be removed")
jwarning("Number of affected junctions: ", sum(end.oob, na.rm = T))
}
return(ra[which(start.oob == FALSE & end.oob == FALSE, useNames = FALSE)])
}
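## Illustrative sketch (kept as a comment so it is not executed): a minimal
## filter_oob_junctions call on toy data with assumed seqlengths. The second
## junction's breakpoint at 2:900 exceeds the declared seqlength of 500, so
## that junction is dropped with a warning.
## sl = c("1" = 1000, "2" = 500)
## bp = function(chr, pos, str) GRanges(chr, IRanges(pos, pos), strand = str, seqlengths = sl)
## jj = GRangesList(c(bp("1", 100, "+"), bp("2", 200, "-")),
##                  c(bp("1", 300, "-"), bp("2", 900, "+")))
## filter_oob_junctions(jj) ## keeps only the first junction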
#' @name verify.junctions
#' @rdname internal
#' @details
#' Produces an error if the input fails any of the following criteria:
#' 1) is a GRangesList (or a list of GRangesList objects)
#' 2) every element is length 2
#' 3) all elements are strand-specific (i.e. "+" or "-")
#' (The historical width-1 requirement is no longer enforced; wider
#' breakpoints are handled downstream.)
#' Out-of-range junctions are then removed via filter_oob_junctions.
#' @param ra rearrangement junctions object to be verified
#' @return the input, with out-of-range junctions removed, if it passes all checks
verify.junctions = function(ra){
.ra.stop = function(ra){
stopifnot(inherits(ra, "GRangesList"))
if (length(ra)>0){
stopifnot(all(elementNROWS(ra)==2))
stopifnot(all(as.character(strand(unlist(ra))) %in% c("+", "-")))
## stopifnot(all(as.numeric(width(unlist(ra)))==1)) ## cancel this requirement but need to handle it later
}
}
if (inherits(ra, "GRangesList")){
.ra.stop(ra)
} else if (inherits(ra, "list")) {
for (i in seq_along(ra)){
.ra.stop(ra[[i]])
}
}
## stopifnot(inherits(ra, "GRangesList"))
ra = filter_oob_junctions(ra)
return(ra)
}
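## Illustrative sketch (comment only): the minimal shape verify.junctions
## accepts -- a GRangesList whose elements are length-2, fully stranded
## GRanges. Coordinates and seqnames below are arbitrary.
## j1 = GRanges(c("1", "2"), IRanges(c(1000, 5000), width = 1), strand = c("+", "-"))
## verify.junctions(GRangesList(j1))    ## passes, returns the input
## verify.junctions(GRangesList(j1[1])) ## errors: element is not length 2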
#' @name karyograph
#' @title karyograph
#' @rdname internal
#' karyograph
#'
#' @details
#' builds graph from rearrangement breakpoints +/- copy number endpoints
#' used for downstream jbaMIP and karyoMIP functions
#'
#' Input bpp is a GRangesList of signed locus pairs describing aberrant adjacencies.
#' The convention is as follows: Each locus in the input breakpoint pair points to the direction that
#' is being joined by the adjacencies i.e.
#' (-) bp points to "left" or preceding segment
#' (+) bp points to the "right" or the following segment
#'
#' eg imagine a|bp1|b
#' c|bp2|d
#' "+" bp point to the right (eg b or d), "-" bp point to the left (a or c)
#'
#' Input "tile" is a set of intervals whose endpoints are also used to partition the genome prior to the building of the
#' karyograph.
#'
#' Output karyograph connects signed genomic intervals (in a signed tiling of the reference genome) with "aberrant" and "reference" edges.
#' Reference edges connect intervals that are adjacent in the reference genome, and aberrant edges are inferred (upstream
#' of this) through cancer genome paired end analysis.
#' Note that every node, edge, and path in this karyograph has a reciprocal counterpart (its reverse complement)
#'
#' @param junctions GRangesList of junctions, where each item is a length-2 GRanges of signed locations
#' @param tile GRanges optional existing tiling of the genome (eg a copy number segmentation) from which additional segments will be created
#' @return
#' a list with the following fields
#' $tile = GRanges of length 2*n tiling of the genome corresponding to union of rearrangement breakpoints and copy number endpoints
#' $G = igraph object representing karyograph, here are the edge and vertex features
#' vertex features: $chrom, $start, $end, $width, $strand, $size, $shape, $border.width, $label, $chrom.ord, $y, $col, $weight
#' edge features: $bp.id, $weight, $from, $to, $col, $type, $line.style, $arrow.shape, $width
#' important: $type specifies which edges are "aberrant" and "reference", $bp.id specifies which input rearrangement (item in junctions)
#' a given aberrant edge came from (and is NA for reference edges)
#' $adj = 2n x 2n adjacency matrix whose nonzero entries ij show the edge.id in $G
#' $ab.adj = 2n x 2n binary matrix specifying aberrant edges
#' $ab.edges = length(junctions) x {'from', 'to', 'edge.ix'} x {'+', '-'} array mapping junction ids (indices into the input junctions list) to source and sink vertices,
#' in both orientations
############################################
karyograph = function(junctions, ## this is a grl of breakpoint pairs (eg output of read.junctions(dranger.df) where dranger is df of dranger output)
tile = NULL, # pre-existing set of intervals on top of which to build a graph (eg endpoints from a copy number based segmentation)
label.edges = FALSE)
{
if (length(junctions)>0)
{
bp.p = grl.pivot(junctions)
bp1 = suppressWarnings(gr.end(gr.fix(bp.p[[1]]), 1, ignore.strand = F))
bp2 = suppressWarnings(gr.start(gr.fix(bp.p[[2]]), 1, ignore.strand = F))
## #' mimielinski Sunday, Aug 06, 2017 06:46:15 PM
## #' fix added to handle strange [0 0] junctions outputted
## #' by Snowman ... which failed to match to any tile
## #' todo: may want to also handle junctions that
## #' fall off the other side of the chromosome
#' this should now be unnecessary due to junction filtering
## end(bp1) = pmax(1, end(bp1))
## start(bp1) = pmax(1, start(bp1))
## end(bp1) = pmax(1, end(bp1))
## start(bp1) = pmax(1, start(bp1))
if (any(as.logical(strand(bp1) == '*') | as.logical(strand(bp2) == '*')))
jerror('bp1 and bp2 must be signed intervals (i.e. either + or -)')
if (length(bp1) != length(bp2))
jerror('bp1 and bp2 inputs must have identical lengths')
## # if (sum(width(reduce(bp1))) != sum(width(bp1)) | sum(width(reduce(bp2))) != sum(width(bp2)))
## # jerror('bp1 or bp2 cannot have duplicates / overlaps (with respect to location AND strand)')
values(bp1)$bp.id = seq_along(bp1);
values(bp2)$bp.id = seq_along(bp1)+length(bp1);
sgn1 = c('-'=-1, '+'=1)[as.character(strand(bp1))] ## map breakend strand to sign
sgn2 = c('-'=-1, '+'=1)[as.character(strand(bp2))]
### HACK HACK to force seqlengths to play with each other if malformed
tmp.sl = GenomeInfoDb::seqlengths(grbind(bp1, bp2))
tmp.sl.og = tmp.sl
# tmp.sl = gr2dt(grbind(bp1, bp2))[, max(end, na.rm = TRUE), keyby = seqnames][, sl := pmax(V1+2, tmp.sl[as.character(seqnames)], na.rm = TRUE)][, structure(sl, names = as.character(seqnames))]
tmp.sl = gr2dt(grbind(bp1, bp2))[, max(end+1, na.rm = TRUE), keyby = seqnames][names(tmp.sl.og), structure(pmax(V1, tmp.sl.og, na.rm = TRUE), names = names(tmp.sl.og))]
bp1 = gr.fix(bp1, tmp.sl)
bp2 = gr.fix(bp2, tmp.sl)
# first we tile the genome around the combined breakpoints
}
else
{
if (is.null(tile))
{
tile = si2gr(junctions)
if (length(tile)==0)
{
jwarning('Empty input given, producing empty output')
return(NULL)
}
A = sparseMatrix(1,1, x = 0, dims = rep(length(tile), 2))
return(
list(tile = tile, adj = A,
G = igraph::graph.adjacency(A), ab.adj = A != 0, ab.edges = NULL, junctions = junctions))
}
junctions = GRangesList()
bp1 = bp2 = GRanges()
}
if (!is.null(tile))
{
## find disjoint union of tile and join with gaps
tile = gr.fix(tile)
tile = gr.fix(tile, bp1)
bp1 = gr.fix(bp1, tile) ## argh argh argh .. more pain avoiding hacks
strand(tile) = '+'
tile = disjoin(tile)
tile = sort(c(tile, gaps(tile)))
## make sure seqlevels / seqinfo are identical
if (!identical(sort(seqlevels(tile)), sort(seqlevels(junctions))))
{
tile = gr.fix(tile, junctions)
junctions = gr.fix(junctions, tile)
}
if(length(junctions)>0)
{
tbp = setdiff(gr.stripstrand(gr.trim(tile, 1)), gr.stripstrand(grbind(bp1, bp2)))
bp1 = gr.fix(bp1, tbp)
bp2 = gr.fix(bp2, tbp) ## seqlengths pain
tbp = gr.fix(tbp, bp1)
}
else
tbp = gr.stripstrand(gr.trim(tile, 1))
tbp = tbp[start(tbp)!=1]
if (length(tbp)>0)
tbp$seg.bp = TRUE
}
else {
tbp = NULL
}
if (length(junctions)>0){
if (length(tbp)>0)
g = gaps(gr.stripstrand(sort(c(bp1[, c()], bp2[, c()], tbp[, c()]))))
else
g = gaps(gr.stripstrand(sort(c(bp1[, c()], bp2[, c()]))))
} else {
g = gaps(gr.stripstrand(sort(tbp)))
}
g = g[as.logical( strand(g)=='*' )];
strand(g) = '+';
values(g)$bp.id = NA
values(g)$seg.bp = NA
## combine tiles and find disjoint set
tile.og = tile
tile = grbind(bp1, bp2, g, tbp);
tile = disjoin(gr.stripstrand(tile[GenomicRanges::order(gr.stripstrand(tile))]))
strand(tile) = '+'
tile = gr.fix(tile);
tile$is.tel = start(tile)==1 | end(tile) == GenomeInfoDb::seqlengths(tile)[as.character(seqnames(tile))]
values(tile)$tile.id = seq_along(tile);
## find "breakpoint" i.e. bp associated intervals, i.e. width 1 intervals that end with a bp1 or bp2 location
junc.bp = grbind(bp1, bp2)
junc.bpix = numeric()
if (length(junc.bp)>0)
junc.bpix = which(paste(seqnames(tile), end(tile)) %in% paste(seqnames(junc.bp), start(junc.bp)))
## make sure all seqlengths are compatible (so effing annoying)
tile = gr.fix(tile, bp1)
tile = gr.fix(tile, bp2)
bp1 = gr.fix(bp1, tile)
bp2 = gr.fix(bp2, tile)
## also keep track of tbp associated bp.ix
all.bp = grbind(bp1, bp2, tbp)
all.bpix = numeric()
if (length(all.bp)>0)
all.bpix = which(paste(seqnames(tile), end(tile)) %in% paste(seqnames(all.bp), start(all.bp)))
## now to build the graph, we would like to fuse all the bp associated intervals with their previous interval
## UNLESS they are preceded by another bp associated interval
##
if (length(all.bpix) > 0)
{
to.fuse = all.bpix[which(all.bpix>1 & !((all.bpix-1) %in% all.bpix))]
end(tile)[to.fuse-1] = end(tile)[to.fuse-1]+1
tile = tile[-to.fuse]
}
if (length(junc.bpix)>0)
{
## we have a partition of genomic segments flanked by tile endpoints and/or ra junctions
##
## Input junction syntax is interpreted as follows:
## a- b+ junctions connect seg ending with position a to seg starting with b+1
## a- b- junctions connect seg ending with position a to seg ending with position b (on neg strand)
## a+ b+ junctions connect seg starting with position a+1 (on negative strand) to seg starting with position b+1
## a+ b- junctions connect seg starting with position a+1 (on negative strand) to seg ending with position b (on neg strand)
## # collect all pairwise adjacencies implied by breakpoints
## # eg imagine a|bp1|b
## # c|bp2|d
## # "+" bp point to the right (eg b or d), "-" bp point to the left (a or c)
ab.pairs = cbind(
ifelse(as.logical(strand(bp1)=='+'), gr.match(GenomicRanges::shift(gr.start(bp1), 1), gr.start(tile)),
gr.match(gr.start(bp1), gr.end(tile))),
ifelse(as.logical(strand(bp2)=='+'), gr.match(GenomicRanges::shift(gr.start(bp2), 1), gr.start(tile)),
gr.match(gr.start(bp2), gr.end(tile)))
)
## ab.pairs = cbind(
## ifelse(as.logical(strand(bp1)=='+'), match(paste(seqnames(bp1), start(bp1)+1), paste(seqnames(tile), start(tile))),
## match(paste(seqnames(bp1), start(bp1)), paste(seqnames(tile), end(tile)))),
## ifelse(as.logical(strand(bp2)=='+'), match(paste(seqnames(bp2), start(bp2)+1), paste(seqnames(tile), start(tile))),
## match(paste(seqnames(bp2), start(bp2)), paste(seqnames(tile), end(tile))))
## )
ab.pairs.bpid = bp1$bp.id
pp = (sgn1*sgn2)>0 & sgn1>0;
mm = (sgn1*sgn2)>0 & sgn1<0;
pm = sgn1>0 & sgn2<0 ## the "+-" case
ab.pairs[pp,1] = -ab.pairs[pp,1] ## # ++ breakpoints --> (-b)d adjacency
ab.pairs[mm,2] = -ab.pairs[mm,2] ## -- breakpoints --> a(-c) adjacency
ab.pairs[pm, ] = -ab.pairs[pm, ] ## +- breakpoints --> (-b)(-c) adjacency
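## Worked example (comment only): a junction 1:1000(-) / 1:5000(+) -- a
## simple deletion -- has sgn1 = -1 and sgn2 = +1, so none of pp/mm/pm apply
## and the adjacency stays (a, d): the tile ending at 1000 joins the tile
## starting at 5001, both read forward. A ++ junction at the same positions
## instead negates the first index, i.e. the reverse complement of the tile
## starting at 1001 joins the tile starting at 5001.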
## # clean up adj pairs
## # remove any that have crossed a chromosome boundary from their breakpoint
## # this will occur in cases of badly formed breakpoint input (eg breakpoints that point outward
## # from their telomeres)
edge.id = rep(1:nrow(ab.pairs), 2)
ab.pairs = rbind(ab.pairs, cbind(-ab.pairs[,2], -ab.pairs[,1]));
ab.pairs.bpid = c(ab.pairs.bpid, ab.pairs.bpid)
## # build "aberrant" adjacency matrix representing directed graph of edges connecting
## # <signed> nodes.
## # note: indices of matrix represent edge labels
adj.ab = Matrix( 0,
nrow = 2*length(tile),
ncol = 2*length(tile),
dimnames = rep( list( as.character(c(seq_along(tile), -(seq_along(tile))))), 2))
tmp.ix = cbind(match(as.character(ab.pairs[,1]), rownames(adj.ab)),
match(as.character(ab.pairs[,2]), colnames(adj.ab)))
## added !is.na filters 8/4/2020 MI to take care of literal edge cases where edges point off the chromosome eg negative
## coordinates
tix = !duplicated(tmp.ix) & !is.na(tmp.ix[,1]) & !is.na(tmp.ix[,2])
adj.ab[tmp.ix[tix, , drop = F]] = ab.pairs.bpid[tix]
}
else
{
ab.pairs.bpid = edge.id = c()
ab.pairs = matrix(nrow = 0, ncol = 2);
adj.ab = Matrix(FALSE, nrow = 2*length(tile), ncol = 2*length(tile),
dimnames = rep(list(as.character(c(seq_along(tile), -(seq_along(tile))))), 2))
}
## # build reference adjacency matrix (representing consecutive segments on the reference genome)
## # note: indices of matrix represent edge labels
seg.ix = seq_along(tile)
ref.pairs = cbind(seg.ix[1:(length(seg.ix)-1)],
seg.ix[2:(length(seg.ix))])
## # ref.pairs = ref.pairs[ref.pairs[,1]>0 & ref.pairs[,2]!=length(tile), ]
ref.pairs = ref.pairs[which(as.character(seqnames(tile[ref.pairs[,1]])) ==
as.character(seqnames(tile[ref.pairs[,2]]))), , drop = FALSE]
## XT fix 08/12: edge.id could be length 0 when no aberrant junction is used
## we should still make the ref edges in that case
## if (nrow(ref.pairs)>0 & length(edge.id)>0)
if (nrow(ref.pairs)>0)
{
edge.id = c(edge.id, max(c(0, edge.id)) + rep(1:nrow(ref.pairs), 2)) ## max(c(0, .)) guards the case of no aberrant junctions, where edge.id is empty
ref.pairs = rbind(ref.pairs, cbind(-ref.pairs[,2], -ref.pairs[,1])) # reverse ref pairs
adj.ref = Matrix(0, nrow = 2*length(tile), ncol = 2*length(tile),
dimnames = rep(list(as.character(c(seq_along(tile), -(seq_along(tile))))), 2))
adj.ref[cbind(match(as.character(ref.pairs[,1]), rownames(adj.ref)),
match(as.character(ref.pairs[,2]), colnames(adj.ref)))] = nrow(ab.pairs)+1:nrow(ref.pairs)
}
else
{
adj.ref = Matrix(FALSE, nrow = 2*length(tile), ncol = 2*length(tile),
dimnames = rep(list(as.character(c(seq_along(tile), -(seq_along(tile))))), 2))
}
## current tile is partition of genome only in positive orientation + dummy intervals for breakpoints
## output tile is forward partition and followed by reverse partition
tmp.nm = as.character(c(seq_along(tile), -(seq_along(tile))))
tile = c(tile, gr.flipstrand(tile))
names(tile) = tmp.nm
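## Note on node naming: with n forward tiles, names(tile) is c("1".."n",
## "-1".."-n"), where "k" is the forward copy of tile k and "-k" its reverse
## complement; equivalently, rows/columns 1..n of the 2n x 2n adjacency
## matrices index forward tiles and (n+1)..2n their reverse copies.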
## apply ix to adj.ref and adj.ab, and create "adj" which has union of reference and aberrant junctions
## and adj.source which remembers whether edge ij was reference (value = 1) or aberrant (value = 2)
adj.source = sign(adj.ref)+2*sign(adj.ab)
adj = sign(adj.ref)+sign(adj.ab)
tryres <- try( edges <- Matrix::which(adj!=0, arr.ind=T), silent=T ) ## num edge x 2 matrix of vertex pairs
adj[edges] = 1:nrow(edges) ## re number edges across edge set
rownames(adj) = colnames(adj) = 1:nrow(adj)
G = graph.adjacency(adj ,weighted = 'edge.ix') ## edge.ix will allow us to match up edges in the adj matrix with edges in the igraph
node.ind = abs(as.numeric(V(G)$name))
## add vertex features including formatting to igraph
V(G)$chrom = as.character(seqnames(tile))[node.ind]
V(G)$start = start(tile)[node.ind]
V(G)$end = end(tile)[node.ind]
V(G)$width = width(tile)[node.ind]
V(G)$strand = sign(as.numeric(V(G)$name))
V(G)$size = 5;
V(G)$shape= c('rectangle', 'crectangle')[1 + as.numeric(V(G)$strand<0)];
V(G)$border.width = c(1, 2)[1 + as.numeric(V(G)$strand=='-')] ;
V(G)$label = paste(V(G)$chrom, ':', round(V(G)$start/1e6,0), '-', round(V(G)$end/1e6,0), sep = '')
V(G)$label[V(G)$strand<0] = paste(V(G)$chrom, ':', round(V(G)$end/1e6,0), '-', round(V(G)$start/1e6,0), sep = '')[V(G)$strand<0]
col.map = structure(brewer.master(length(seqlevels(tile))), names = seqlevels(tile))
V(G)$chrom.ord = levapply(as.numeric(V(G)$start), list(V(G)$chrom), 'rank')
V(G)$y = V(G)$chrom.ord*30
V(G)$x = chr2num(V(G)$chrom)*300 + 100*rep(c(0,1), each = length(tile)/2)
V(G)$col = col.map[V(G)$chrom]
## add edge features including formatting to igraph
E(G)$weight = 1
E(G)$from = edges[E(G)$edge.ix, 1]
E(G)$to = edges[E(G)$edge.ix, 2]
E(G)$col = c(col2hex('gray20'), col2hex('red'))[adj.source[edges[E(G)$edge.ix, ]]]
E(G)$type = c('reference', 'aberrant', 'aberrant')[adj.source[edges[E(G)$edge.ix, ]]]
E(G)$line.style = 'SEPARATE_ARROW'
E(G)$arrow.shape = 'ARROW'
E(G)$width = 1
ab.ix = E(G)$type=='aberrant' ## keep track of bp.id leading to edge
E(G)$bp.id = NA;
if (length(ab.pairs.bpid)>0)
E(G)$bp.id[ab.ix] = ab.pairs.bpid[adj.ab[cbind(E(G)$from[ab.ix], E(G)$to[ab.ix])]]
E(G)$eid = NA; ## eid pairs an edge with its reciprocal (same id in both orientations), unlike edge.ix, which indexes entries of the adjacency matrix
if (length(ab.ix)>0)
{
E(G)$eid[ab.ix] = edge.id[adj.ab[cbind(E(G)$from[ab.ix], E(G)$to[ab.ix])]]
E(G)$eid[!ab.ix] = edge.id[adj.ref[cbind(E(G)$from[!ab.ix], E(G)$to[!ab.ix])]]
}
values(tile) = values(tile)[, c('tile.id', 'is.tel')]
tile$ab.source = seq_along(tile) %in% E(G)$from[ab.ix]
tile$ab.target = seq_along(tile) %in% E(G)$to[ab.ix]
## # important: map input ra to aberrant graph edges, i.e. ab.edges matrix with $from $to and $edge.ix columns
## # and one row for each aberrant edge
ab.edges = array(NA, dim = c(length(junctions), 3, 2), dimnames = list(NULL, c('from', 'to', 'edge.ix'), c('+', '-')))
dupped = duplicated(ab.pairs.bpid) ## but there are still duplicated ones
ab.edges[,1:2,1] = cbind(match(ab.pairs[!dupped,1], names(tile)),
match(ab.pairs[!dupped,2], names(tile)))
ab.edges[,1:2,2] = cbind(match(ab.pairs[dupped,1], names(tile)), match(ab.pairs[dupped,2], names(tile)))
ab.edges[,3, 1] = match(paste(ab.edges[,1,1], '|', ab.edges[,2,1]), paste(E(G)$from, '|', E(G)$to)) ## there must be an easier way to perform this task
ab.edges[,3, 2] = match(paste(ab.edges[,1,2], '|', ab.edges[,2,2]), paste(E(G)$from, '|', E(G)$to)) ## the reverse orientation uses its own from/to columns
if (label.edges & nrow(ab.edges)>0)
{
ix = c(ab.edges[,1,1], ab.edges[,2,1], ab.edges[,1,2], ab.edges[,2,2])
tile$edges.out = tile$edges.in = ''
tile$edges.in[ix]= sapply(ix,
function(x) {ix = Matrix::which(adj[,x]!=0); paste(ix, '->', sep = '', collapse = ',')})
tile$edges.out[ix] = sapply(ix,
function(x) {ix = Matrix::which(adj[x, ]!=0); paste('->', ix, sep = '', collapse = ',')})
}
return(list(tile = tile, adj = adj, G = G, ab.adj = adj.ab != 0, ab.edges = ab.edges, junctions = junctions))
}
## FIXME: problem too large
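## Illustrative usage sketch (comment only), assuming "jj" is a junction
## GRangesList that passes verify.junctions() and "segs" an optional
## copy-number segmentation GRanges:
## kg = karyograph(jj, tile = segs)
## kg$tile              ## signed tiling: forward copies of the genome, then reverse
## igraph::E(kg$G)$type ## "reference" vs "aberrant" edges
## kg$ab.edges          ## junction id -> (from, to, edge.ix), in both orientations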
########################################
#' @name jabba2vcf
#' @rdname internal
#' jabba2vcf
#'
#' Converts JaBbA output to a VCF file according to the VCF 4.2 "BND" syntax
#'
#'
#' @param jab JaBbA object
#' @param fn output file name
#' @param sampleid sample id
#' @param hg human genome as BSgenome or ffTrack
#' @param cnv flag whether to dump in CNV format
#' @return returns character string or writes to file if specified
#' @noRd
#########################################
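## Illustrative usage sketch (comment only), assuming "jab" is a JaBbA output
## list (e.g. loaded via readRDS from a JaBbA results directory; the file
## name below is hypothetical):
## jab = readRDS("jabba.simple.rds")
## vcf.txt = jabba2vcf(jab, sampleid = "tumor1")         ## returns VCF text
## jabba2vcf(jab, fn = "jabba.vcf", sampleid = "tumor1") ## or writes to file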
jabba2vcf = function(jab, fn = NULL, sampleid = 'sample', hg = NULL, include.loose = TRUE, include.cn0 = TRUE, cnv = FALSE)
{
if (is.null(hg))
hg = tryCatch(skidb::read_hg(), error = function(e) NULL)
vcffields = c('CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'GENO')
## convert all aberrant connections into pairs of VCF rows
if (!cnv)
{
jix = which(!is.na(jab$ab.edges[,3,1])) ## these are the only junctions with breaks in the reconstruction
if (!include.cn0) ## remove from jix
{
jcn = jab$adj[jab$ab.edges[jix, 1:2, 1]]
jix = jix[jcn>0]
message('Removing cn=0')
}
abs = rbind(jab$ab.edges[jix,1:2,1])
rabs = rbind(jab$ab.edges[jix,1:2,2])
rcix = match(jab$segstats, gr.flipstrand(jab$segstats)) ## map of seg to its reverse complement
adj.ref = jab$adj ## reference graph has reference copy numbers, we obtain by zeroing out all ab.edges and loose end edges
adj.ref[rbind(jab$ab.edges[jix,1:2,1])] = 0
adj.ref[rbind(jab$ab.edges[jix,1:2,2])] = 0
## #' xtYao #' Wednesday, Mar 20, 2019 11:09:46 AM
## Fix missing $
if (any(jab$segstats$loose))
{
adj.ref[jab$segstats$loose, ] = 0
adj.ref[,jab$segstats$loose] = 0
}
if (length(jix)>0)
{
jcn = jab$adj[abs]
gr1 = gr.end(jab$segstats[abs[,1]], ignore.strand = F)[, 'cn']
gr1$jid = jix
gr1$nid = abs[,1]
gr1$acn = jcn
gr1$rcn = Matrix::rowSums(adj.ref[gr1$nid, , drop = FALSE])
gr1$ID = paste(sampleid, '_seg', jab$segstats$tile.id[abs[,1]], ifelse(as.logical(strand(gr1)=='+'), '_R', '_L'), sep = '')
gr2 = gr.start(jab$segstats[abs[,2]], ignore.strand = F)[, 'cn']
gr2$jid = jix
gr2$nid = abs[,2]
gr2$acn = jcn
gr2$rcn = Matrix::colSums(adj.ref[,gr2$nid, drop = FALSE])
gr2$ID = paste(sampleid, '_seg', jab$segstats$tile.id[abs[,2]], ifelse(as.logical(strand(gr2)=='+'), '_L', '_R'), sep = '')
gr1$mid = gr2$ID
gr2$mid = gr1$ID
gr1$REF = tryCatch(as.character(ffTrack::get_seq(hg, gr.stripstrand(gr1))), error = function(e) 'N')
gr1$ALT = ifelse(as.logical(strand(gr1)=='+') & as.logical(strand(gr2)=='+'), paste(gr1$REF, '[', seqnames(gr2), ':', start(gr2), '[', sep = ''),
ifelse(as.logical(strand(gr1)=='+') & as.logical(strand(gr2)=='-'), paste(gr1$REF, ']', seqnames(gr2), ':', start(gr2), ']', sep = ''),
ifelse(as.logical(strand(gr1)=='-') & as.logical(strand(gr2)=='+'), paste('[', seqnames(gr2), ':', start(gr2), '[', gr1$REF, sep = ''),
paste(']', seqnames(gr2), ':', start(gr2), ']', gr1$REF, sep = '')))) ## last one is A- --> B-
gr2$REF = tryCatch(as.character(ffTrack::get_seq(hg, gr.stripstrand(gr2))), error = function(e) 'N')
gr2$ALT = ifelse(as.logical(strand(gr1)=='+') & as.logical(strand(gr2)=='+'), paste(']', seqnames(gr1), ':', start(gr1), ']', gr2$REF, sep = ''),
ifelse(as.logical(strand(gr1)=='-') & as.logical(strand(gr2)=='+'), paste('[', seqnames(gr1), ':', start(gr1), '[', gr2$REF, sep = ''),
ifelse(as.logical(strand(gr1)=='+') & as.logical(strand(gr2)=='-'), paste(gr2$REF, ']', seqnames(gr1), ':', start(gr1), ']', sep = ''),
paste(gr2$REF, '[', seqnames(gr1), ':', start(gr1), '[', sep = '')))) ## last one is B- --> A-
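## Worked illustration (comment only): for breakends gr1 = 1:1000(+) and
## gr2 = 2:5000(+) with reference base N, the branches above yield
## gr1$ALT = "N[2:5000[" and gr2$ALT = "]1:1000]N", per VCF 4.2 breakend
## bracket notation; the other strand combinations follow the remaining branches.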
gr1$FILTER = gr2$FILTER = ifelse(gr2$acn != 0, "PASS", "NINC")
gr2$FORMAT = gr1$FORMAT = "GT:CN:RCN:SCN"
gr1$GENO = paste(ifelse(gr1$rcn>0, '0/1', '1'), gr1$acn, gr1$rcn, gr1$cn, sep = ":")
gr2$GENO = paste(ifelse(gr2$rcn>0, '0/1', '1'), gr2$acn, gr2$rcn, gr2$cn, sep = ":")
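## e.g. an aberrant adjacency with acn = 2, rcn = 1 on a segment with cn = 3
## yields GENO "0/1:2:1:3" (heterozygous, since the reference adjacency is
## also retained at nonzero copy number).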
gr2$CHROM = as.character(seqnames(gr2))
gr1$CHROM = as.character(seqnames(gr1))
gr2$POS = as.character(start(gr2))
gr1$POS = as.character(start(gr1))
gr1$QUAL = gr2$QUAL = '.'
gr1$INFO = paste("SVTYPE=BND", ";MATEID=", gr1$mid, ";CNADJ=", gr1$acn, ";CNRADJ=", gr1$rcn, ";CN=", gr1$cn,
";JABID=", abs[,1], ";RJABID=", rcix[abs[,1]], ";JUNCID=", seq_along(gr1), sep = '')
gr2$INFO = paste("SVTYPE=BND", ";MATEID=", gr2$mid, ";CNADJ=", gr2$acn, ";CNRADJ=", gr2$rcn, ";CN=", gr2$cn,
";JABID=", abs[,2], ";RJABID=", rcix[abs[,2]], ";JUNCID=", seq_along(gr2), sep = '')
gr1 = gr1[, vcffields]
gr2 = gr2[, vcffields]
}
else
{
gr1 = GRanges()
gr2 = GRanges()
}
## now loose ends
lix = which(jab$segstats$loose & as.logical(strand(jab$segstats)=="+"))
if (length(lix)>0 & include.loose)
{
## loose ends should be width 1, but just in case
if (is.element("passed", colnames(values(jab$segstats)))){
## with le quality
gr.loose = gr.start(jab$segstats[lix, c('passed', 'cn')])
gr.loose$QUAL = ifelse(gr.loose$passed, "PASSED", "FAILED")
} else {
## without
gr.loose = gr.start(jab$segstats[lix, c('cn')])
gr.loose$QUAL = '.'
}
gr.loose$jid = NA
gr.loose$nid = lix
gr.loose$acn = gr.loose$cn
## query parents / children of loose ends
tmp1 = apply(jab$adj[gr.loose$nid, , drop = FALSE],1, function(x) which(x!=0)[1]) ## only non NA if loose end has children (i.e. is a parent)
tmp2 = apply(jab$adj[, gr.loose$nid, drop = FALSE],2, function(x) which(x!=0)[1]) ## only non NA if loose end has parents (i.e. is child)
isp = apply(cbind(tmp1, tmp2), 1, function(x) which(is.na(x))[1])==2 ## is the loose end a parent or child of a seg?
pcid = pmax(tmp1, tmp2, na.rm = T) ## which seg is the parent / child of the loose end
## gr.loose$rcn = ifelse(isp, colSums(adj.ref[,pcid, drop = FALSE]), Matrix::rowSums(adj.ref[pcid,, drop = FALSE])) ## if loose end is parent of seg, we want num copies of that segs reference parent,
gr.loose$rcn = ifelse(isp, Matrix::colSums(adj.ref[,pcid, drop = FALSE]), Matrix::rowSums(adj.ref[pcid,, drop = FALSE])) ## if loose end is parent of seg, we want num copies of that segs reference parent,
gr.loose$cn = jab$segstats$cn[pcid]
## if loose end is the parent of a seg, then it is a "left" loose end (since + strand) otherwise "right"
gr.loose$ID = paste(sampleid, '_looseend', jab$segstats$tile.id[pcid], ifelse(isp, '_L', '_R'), sep = '')
gr.loose$mid = NA
gr.loose$REF = tryCatch(as.character(ffTrack::get_seq(hg, gr.stripstrand(gr.loose))), error = function(e) 'N')
## again, same rationale as above, parent = left loose end, child = right loose end
gr.loose$ALT = ifelse(isp, paste('.', gr.loose$REF, sep = ''), paste(gr.loose$REF, '.', sep = ''))
gr.loose$FORMAT = "GT:CN:RCN:SCN"
gr.loose$GENO = paste(ifelse(gr.loose$rcn>0, '0/1', '1'), gr.loose$acn, gr.loose$rcn, gr.loose$cn, sep = ":")
gr.loose$CHROM = as.character(seqnames(gr.loose))
gr.loose$POS = start(gr.loose)
gr.loose$FILTER = "LOOSEEND"
gr.loose$INFO = paste("SVTYPE=BND", ";CNADJ=", gr.loose$acn, ";CNRADJ=", gr.loose$rcn, ";CN=", gr.loose$cn, ";JABID=", lix, ";RJABID=", rcix[lix], sep = '')
gr.loose = gr.loose[, vcffields]
}
else
gr.loose = GRanges()
## make header
sl = GenomeInfoDb::seqlengths(jab$segstats)
header = '##fileformat=VCFv4.2'
header = c(header, sprintf('##fileDate=%s', format(Sys.Date(), '%Y%m%d')))
header = c(header, '##source=JaBbAV0.1')
if (inherits(hg, "BSgenome"))
{
header = c(header, sprintf("##reference=%s", BSgenome::sourceUrl(hg)))
header = c(header, unlist(mapply(function(x, y) sprintf('##contig=<ID=%s,length=%s,assembly=%s,species="%s">', x, y, BSgenome::providerVersion(hg), BSgenome::organism(hg)), names(sl), sl)))
}
else
{
if (!is.null(hg))
{
header = c(header, sprintf("##reference=%s", BSgenome::filename(hg)['rds']))
}
header = c(header, unlist(mapply(function(x, y) sprintf('##contig=<ID=%s,length=%s>', x, y), names(sl), sl)))
}
header = c(header,
'##INFO=<ID=MATEID,Number=.,Type=String,Description="ID of mate breakends">',
'##INFO=<ID=EVENT,Number=1,Type=String,Description="ID of event associated to breakend">',
'##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
'##INFO=<ID=CNADJ,Number=.,Type=Integer,Description="Copy number of variant adjacency at breakend">',
'##INFO=<ID=CNRADJ,Number=1,Type=Integer,Description="Copy number of reference adjacency at breakend">',
'##INFO=<ID=CN,Number=1,Type=Integer,Description="Copy number of segment containing breakend">',
'##INFO=<ID=JUNCID,Number=.,Type=Integer,Description="Index of allele(s) in JaBbA junction input pile">',
'##INFO=<ID=JABID,Number=.,Type=Integer,Description="Index of the interval containing allele(s) in JaBbA reconstruction">',
'##INFO=<ID=RJABID,Number=.,Type=Integer,Description="Index of the reverse complement of the interval containing allele(s) in JaBbA reconstruction">',
'##FILTER=<ID=PASS,Description="Junction incorporated into JaBbA MIP reconstruction at nonzero copy number">',
'##FILTER=<ID=LOOSEEND,Description="Loose end incorporated into JaBbA MIP reconstruction at unexplained copy break">',
'##FILTER=<ID=NINC,Description="Not Incorporated into JaBbA MIP reconstruction: either subclonal or false positive">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'##FORMAT=<ID=CN,Number=.,Type=String,Description="Copy number of variant adjacencies">',
'##FORMAT=<ID=RCN,Number=1,Type=String,Description="Copy number of reference adjacency at breakend">',
'##FORMAT=<ID=SCN,Number=1,Type=String,Description="Copy number of segment containing breakend">'
)
if ((length(gr1) + length(gr.loose))>0)
{
body = as.data.frame(values(c(gr1, gr2, gr.loose)))
body$ord = NA
body$ord[order(gr.stripstrand(c(gr1, gr2, gr.loose)))] = 1:nrow(body)
body = as.data.table(body)
setkeyv(body, c('CHROM', 'POS'))
## useful for merging info records in painful coordinate deduping process below
.infomelt = function(str)
{
z = lapply(strsplit(str, ";"), function(x) {y = matrix(unlist(strsplit(x, "=")), ncol = 2, byrow = T); return(structure(y[,2], names = y[,1]))})
unames = unique(unlist(sapply(z, names)))
mfields = c("MATEID", "CNADJ", "JUNCID", "JABID", "RJABID")
mergeix = unames %in% mfields
out = sapply(seq_along(unames), function(i) {
x = unames[i]
tmp = sapply(z, function(y) y[x])
if (mergeix[i])
{
if (any(is.na(tmp))) tmp[is.na(tmp)] = '.';
return(paste(tmp, collapse = ','))
}
else
return(tmp[!is.na(tmp)][1])})
names(out) = unames
return(paste(names(out), "=", out, collapse = ";", sep = ''))
}
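## e.g. .infomelt(c("SVTYPE=BND;MATEID=m1;CNADJ=1", "SVTYPE=BND;MATEID=m2;CNADJ=3"))
## returns "SVTYPE=BND;MATEID=m1,m2;CNADJ=1,3" (mergeable fields comma-joined)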
.genomelt = function(geno, format)
{
if (length(geno)==1)
return(geno)
genos = strsplit(geno, ":")
formats = strsplit(format[1], ":")[[1]]
mergeix = formats == "CN"
gtix = formats == "GT" ## should be only one such entry
out = genos[[1]] ## pick the first one as the default - this should be a vector length(formats)
if (any(gtix))
out[gtix] = paste(genos[[1]][gtix][1], '/', paste(2:length(genos), collapse = '/'), sep = '') ## add "fake allele names"
if (any(mergeix))
out[mergeix] = sapply(which(mergeix), function(x)
paste(sapply(genos, function(y) y[[x]]), collapse = ','))
return(paste(out, collapse = ":"))
}
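## e.g. .genomelt(c("1:2:1:3", "1:1:1:3"), "GT:CN:RCN:SCN")
## returns "1/2:2,1:1:3" (fake allele names in GT, CN fields comma-joined)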
## dedup and collapse breakends that share coordinates into single variant sites with several alleles
body = body[, list(
ID = ID[1],
REF = REF[1],
ALT = if (length(ALT)>1) paste(ALT, collapse = ',') else ALT,
QUAL = QUAL[1],
FILTER = if (length(FILTER)>1) paste(FILTER, collapse = ";") else FILTER,
INFO = if (length(INFO)>1) .infomelt(INFO) else INFO,
FORMAT = FORMAT[1], ## assume everything is same format
GENO = if (length(GENO)>1) .genomelt(GENO, FORMAT) else GENO,
ord = ord[1]
), by = c("CHROM", "POS")]
setkey(body, "ord")
body = as.data.frame(body)[, vcffields]
names(body)[ncol(body)] = sampleid
names(body)[1] = '#CHROM'
}
else
{
body = as.data.frame(matrix(NA, ncol = length(vcffields), dimnames = list(c(), vcffields)))[c(), ]
names(body)[1] = '#CHROM'
names(body)[ncol(body)] = sampleid
}
}
else ## CNV mode
{
sl = GenomeInfoDb::seqlengths(jab$segstats)
header = '##fileformat=VCFv4.2'
header = c(header, sprintf('##fileDate=%s', format(Sys.Date(), '%Y%m%d')))
header = c(header, '##source=JaBbAV0.1 CNV')
if (inherits(hg, "BSgenome"))
{
header = c(header, sprintf("##reference=%s", BSgenome::sourceUrl(hg)))
header = c(header, unlist(mapply(function(x, y) sprintf('##contig=<ID=%s,length=%s,assembly=%s,species="%s">', x, y, BSgenome::providerVersion(hg), BSgenome::organism(hg)), names(sl), sl)))
}
else if (inherits(hg, "ffTrack"))
{
header = c(header, sprintf("##reference=%s", BSgenome::filename(hg)['rds']))
header = c(header, unlist(mapply(function(x, y) sprintf('##contig=<ID=%s,length=%s>', x, y), names(sl), sl)))
}
else
{
header = c(header, sprintf("##reference=NA"))
header = c(header, unlist(mapply(function(x, y) sprintf('##contig=<ID=%s,length=%s>', x, y), names(sl), sl)))
}
header = c(header,
'##ALT=<ID=DEL,Description="Decreased copy number relative to reference">',
'##ALT=<ID=DUP,Description="Increased copy number relative to reference">',
'##ALT=<ID=DIP,Description="Normal diploid copy number">',
'##INFO=<ID=SVTYPE,Number=1,Type=String,Description="Type of structural variant">',
'##INFO=<ID=START,Number=1,Type=Integer,Description="Start of copy number annotated interval">',
'##INFO=<ID=END,Number=1,Type=Integer,Description="End of copy number annotated interval">',
'##INFO=<ID=JABID,Number=1,Type=Integer,Description="Index of the interval containing allele(s) in JaBbA reconstruction">',
'##INFO=<ID=RJABID,Number=1,Type=Integer,Description="Index of the reverse complement of the interval containing allele(s) in JaBbA reconstruction">',
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">',
'##FORMAT=<ID=CN,Number=1,Type=String,Description="Copy number">'
)
rcix = match(jab$segstats, gr.flipstrand(jab$segstats)) ## map of seg to its reverse complement
six = which(!is.na(jab$segstats$cn) & !jab$segstats$loose & as.character(strand(jab$segstats))=='+')
ss = jab$segstats[six]
if (length(ss)>0)
{
REF = tryCatch(as.character(ffTrack::get_seq(hg, gr.stripstrand(gr.start(ss,1)))), error = function(e) NULL)
if (is.null(REF))
{
REF = 'N'
}
body = data.frame("CHROM" = as.character(seqnames(ss)), POS = start(ss),
ID = paste(sampleid, '_seg', six, sep = ''),
REF = REF,
ALT = ifelse(ss$cn==2, "<DIP>", ifelse(ss$cn>2, "<DUP>", "<DEL>")),
QUAL = ".",
FILTER = "PASS",
INFO = paste("SVTYPE=", ifelse(ss$cn>2, "DUP", "DEL"),
";START=", start(ss), ";END=", end(ss), ";JABID=", six, ";RJABID=", rcix[six], sep = ''),
FORMAT = "GT:CN",
GENO = paste("./.", ss$cn, sep = ':'))
names(body)[1] = '#CHROM'
names(body)[ncol(body)] = sampleid
}
else
{
body = as.data.frame(matrix(NA, ncol = length(vcffields), dimnames = list(c(), vcffields)))[c(), ]
names(body)[1] = '#CHROM'
names(body)[ncol(body)] = sampleid
}
}
if (!is.null(fn))
{
writeLines(header, fn)
suppressWarnings(write.table(body, fn, sep = '\t', quote = F, row.names = F, append = T))
}
else
{
t = textConnection("out", 'w')
writeLines(header, t)
suppressWarnings(write.table(body, t, sep = '\t', quote = F, row.names = F, append = T))
close(t)
return(out)
}
}
#' @name read_vcf
#' @rdname internal
#' @title read_vcf
#'
#' @description
#'
#' wrapper around VariantAnnotation::readVcf that reads a VCF into a GRanges, with INFO fields attached as metadata columns
#'
#' @author Marcin Imielinski
#' @noRd
read_vcf = function(fn,
hg = "hg19",
swap.header = NULL,
verbose = FALSE,
add.path = FALSE,
tmp.dir = '~/temp/.tmpvcf',
...)
{
in.fn = fn
if (verbose){
cat('Loading', fn, '\n')
}
## if (!is.null(swap.header))
## {
## if (!file.exists(swap.header))
## jerror(sprintf('Swap header file %s does not exist\n', swap.header))
## system(paste('mkdir -p', tmp.dir))
## tmp.name = paste(tmp.dir, '/vcf_tmp', gsub('0\\.', '', as.character(runif(1))), '.vcf', sep = '')
## if (grepl('gz$', fn))
## system(sprintf("zcat %s | grep '^[^#]' > %s.body", fn, tmp.name))
## else
## system(sprintf("grep '^[^#]' %s > %s.body", fn, tmp.name))
## system(sprintf("cat %s.header %s.body > %s", tmp.name, tmp.name, tmp.name))
## vcf = VariantAnnotation::readVcf(tmp.name, hg, ...)
## system(sprintf("rm %s %s.body %s.header", tmp.name, tmp.name, tmp.name))
## }
## else
## QUESTION: why isn't genome recognized when ... is provided?
args = list(...)
## vcf = VariantAnnotation::readVcf(file = fn, hg, ...)
vcf = VariantAnnotation::readVcf(file = fn, genome = hg)
out = granges(vcf)
if (!is.null(values(out)))
values(out) = cbind(values(out), VariantAnnotation::info(vcf))
else
values(out) = VariantAnnotation::info(vcf)
if (add.path)
values(out)$path = in.fn
return(out)
}
#' @name write_vcf
#' @title write_vcf
#'
#' @description
#'
#' writes any GRanges vars into vcf using columns of vars to guide choice of common fields like
#' $FILTER
#' $GT
#' $REF
#' $ALT
#'
#' and adding all other fields to INFO
#'
#' @noRd
#' @author Marcin Imielinski
write_vcf = function(vars, filename, sname = "mysample", info.fields = setdiff(names(values(vars)), c("FILTER", "GT", "REF", "ALT")))
{
genoh = DataFrame(row.names = 'GT', Number = 1, Type = 'String', Description = 'Genotype') ## GT must be Type=String per the VCF spec
for (field in names(values(vars))) ## clean up vars of weird S4 data structures that are not compatible with before
{
tmp = tryCatch(as.character(values(vars)[, field]), error = function(e) NULL)
if (is.null(tmp))
{
values(vars)[, field] = NULL
warning(paste('Could not process field', field, "due to S4 conversion issues, discarding"))
next ## field has been dropped, move on to the next one
}
is.num = !all(is.na(suppressWarnings(as.numeric(tmp))))
if (!is.num)
values(vars)[, field] = tmp
}
info.fields = intersect(info.fields, names(values(vars)))
if (length(info.fields)==0) # dummy field to keep asVCF happy
{
info.fields = "DM"
vars$DM = '.'
}
is.num = sapply(info.fields, function(x) !suppressWarnings(all(is.na(as.numeric(as.character(values(vars)[, x]))))))
infoh = DataFrame(
row.names = info.fields, Number = 1,
Type = ifelse(is.num, 'Float', 'String'),
Description = paste('Field', info.fields))
if (is.null(vars$REF))
vars$REF = vars$refbase
if (is.null(vars$ALT))
vars$ALT = vars$altbase
if (is.null(vars$REF))
vars$REF = "N"
if (is.null(vars$ALT))
vars$ALT = "X"
if (is.null(vars$FILTER))
vars$FILTER = "PASS"
## vcf = asVCF(vars)
vr = VRanges(seqnames(vars), ranges(vars), ref = vars$REF, alt = vars$ALT, sampleNames = rep(sname, length(vars)))
names(vr) = names(vars)
vcf = asVCF(vr)
for (field in info.fields)
info(vcf)[[field]] = values(vars)[[field]]
## geno(vcf)$DP = vars$DP; geno(vcf)$AD = vars$AD; geno(vcf)$FT = vars$FT
info(header(vcf)) = infoh
if (is.null(vars$FILTER))
filt(vcf) = rep('PASS', length(vars))
else
filt(vcf) = vars$FILTER
## xtYao ## Thursday, Feb 25, 2021 02:55:16 PM
## use the names if they are there!!
if (!is.null(names(vars))){
rownames(vcf) = names(vars)
} else {
rownames(vcf) = vars$assembly.coord ## WHY, WHY, WHY??????
}
geno(header(vcf)) = genoh
geno(vcf)$GT = vcf$GT
writeVcf(vcf, filename)
}
#' @name levapply
#' @rdname internal
#' @title levapply
#'
#' @description
#' Applies FUN locally within each level of x and returns a vector of length(x)
#' (e.g. can compute a "local" order within levels)
#'
#' @param x input vector of data
#' @param by length(x) vector of categorical labels
#' @param FUN function that takes a length k vector and outputs a length k vector, used for processing each "level" of by
#' @return length(x) vector of outputs, the results of applying FUN to each "by" defined level of x
#' @author Marcin Imielinski
#' @noRd
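#' @examples
#' ## a minimal sketch: local order within each level of "by"
#' levapply(c(3, 1, 2, 10, 9, 8), rep(c('a', 'b'), each = 3))
#' ## [1] 2 3 1 3 2 1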
levapply = function(x, by, FUN = 'order')
{
if (!is.list(by))
by = list(by)
f = factor(do.call('paste', c(list(sep = '|'), by)))
ixl = split(seq_along(x), f);
ixv = lapply(ixl, function(y) x[y])
res = structure(unlist(lapply(ixv, FUN)), names = unlist(ixl))
out = rep(NA, length(x))
out[as.numeric(names(res))] = res;
return(out)
}
#' @name chr2num
#' @description
#' Strips the 'chr' prefix from chromosome names and converts them to numbers
#'
#' @param x character vector to strip
#' @param xy logical flag specifying whether to keep X and Y or convert to 23 and 24
#' @noRd
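#' @examples
#' chr2num(c('chr1', 'chrX', 'chrM'))            ## 1 23 25
#' chr2num(c('chr1', 'chrX', 'chrM'), xy = TRUE) ## "1" "X" "M"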
chr2num = function(x, xy = FALSE)
{
if (inherits(x, 'factor') | inherits(x, 'Rle'))
x = as.character(x)
out = gsub('chr', '', x);
if (!xy)
out = as.numeric(gsub('M', '25', gsub('Y', '24', gsub('X', '23', out))))
return(out)
}
#' @name which.indel
#' @rdname internal
#' @title which.indel
#'
#' @description
#' Among a GRangesList of junction set, find the indices of isolated, small scale tDup or DEL
#' They are in the grey area from SV to INDEL.
#'
#' @param juncs GRangesList of junctions
#' @param max.size the size cutoff in bp, any pair of breakpoints below this
#' with the correct orientation will be called
#' @return indices of the identified junctions
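#' @examples
#' ## a minimal sketch: a lone ~2kb junction with breakends on opposite
#' ## strands falls in the SV-to-INDEL grey zone
#' j = GRangesList(GRanges(1, IRanges(c(1000, 3000), width = 1),
#'                         strand = c('+', '-')))
#' which.indel(j) ## 1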
which.indel = function(juncs,
max.size = 1e4){
bps = unname(grl.unlist(juncs))
sort.grl.ix = rle((bps %Q% (order(seqnames, start)))$grl.ix)
## criterion 1: they are non-overlapping with others
iso.ix = sort.grl.ix$values[which(sort.grl.ix$lengths==2)]
out.ix = iso.ix
if (length(out.ix)==0){
return(out.ix)
}
juncs.iso = juncs[out.ix]
## criterion 2: they are smaller than max.size
iso.sizes = sv.size(juncs.iso, ignore.strand = TRUE)
small.ix = which(iso.sizes <= max.size)
out.ix = iso.ix[small.ix]
if (length(out.ix)==0){
return(out.ix)
}
## criterion 3: they need to have opposite directions
out.ix = gr2dt(bps %Q% (grl.ix %in% out.ix))[
, oppo := all(c("+", "-") %in% strand), by=grl.ix][oppo==TRUE, unique(grl.ix)]
return(out.ix)
}
#' @name sv.size
#' @rdname internal
#' @description
#' Simply the distance between pairs of breakpoints
#' @param juncs GRangesList of junctions
#' @param mc.cores parallel
#' @param ignore.strand usually TRUE
#' @return numerical vector of the same length, Inf means they are not facing each other
#' @noRd
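#' @examples
#' ## distance between the two breakpoints of each junction
#' j = GRangesList(GRanges(1, IRanges(c(1000, 3000), width = 1),
#'                         strand = c('+', '-')))
#' sv.size(j, ignore.strand = TRUE) ## 1999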
sv.size = function(juncs,
...){
bps = gUtils::grl.pivot(juncs)
return(IRanges::distance(bps[[1]], bps[[2]], ...))
}
#' @name reciprocal.cycles
#' @rdname internal
#' @description
#' Returns indices (subsets of seq_along(juncs)) corresponding to (quasi) reciprocal cycles of junctions
#' @param juncs GRangesList of junctions
#' @param paths logical, if TRUE also enumerate the (quasi) reciprocal paths left over after removing cycles
#' @param thresh numeric, maximum reference distance at which two breakpoints are considered reciprocal
#' @param mc.cores parallel
#' @return list of integer vectors of junction indices, one per cycle; if paths = TRUE, a list with elements "cycles" and "paths"
#' @noRd
reciprocal.cycles = function(juncs, paths = FALSE, thresh = 1e3, mc.cores = 1, verbose = FALSE, chunksize = 1e3)
{
bp = grl.unlist(juncs)[, c("grl.ix", "grl.iix")]
ix = split(seq_along(bp), ceiling(runif(length(bp))*ceiling(length(bp)/chunksize)))
ixu = unlist(ix)
eps = 1e-9
ij = do.call(rbind, split(seq_along(bp), bp$grl.ix))
adj = sparseMatrix(1, 1, x = FALSE, dims = rep(length(bp), 2))
## matrix of (strand aware) reference distances between breakpoint pairs
adj[ixu, ] = do.call(rbind, parallel::mclapply(ix,
function(iix)
{
if (verbose)
cat('.')
tmpm = gr.dist(bp[iix], gr.flipstrand(bp), ignore.strand = FALSE)+eps
tmpm[is.na(tmpm)] = 0
tmpm[tmpm>thresh] = 0
tmpm = as(tmpm>0, 'Matrix')
},
mc.cores = mc.cores))
if (verbose)
cat('\n')
adj = adj | Matrix::t(adj) ## symmetrize
## bidirected graph --> skew symmetric directed graph conversion
## split each junction (bp pair) into two nodes, one + and -
## arbitrarily call each bp1-->bp2 junction the "+" orientation
## then all odd proximities adjacent to bp1 will enter the "+"
## version of that junction and exit the "-" version
## new matrix will be same dimension as adj
## however the nodes will represents + and -
## orientation of junctions
## using the following conversion
## i.e.
## bp2 --> bp1 + +
## bp2 --> bp2 + -
## bp1 --> bp1 - +
## bp1 --> bp2 - -
## we'll use the same indices just to keep things confusing
junpos = bp1 = bp$grl.iix == 1
junneg = bp2 = bp$grl.iix == 2
adj2 = adj & FALSE ## clear out adj for new skew symmetric version
adj2[junpos, junpos] = adj[bp2, bp1]
adj2[junpos, junneg] = adj[bp2, bp2]
adj2[junneg, junpos] = adj[bp1, bp1]
adj2[junneg, junneg] = adj[bp1, bp2]
## strongly connected components consists of (possibly nested) cycles
cl = split(seq_along(bp), igraph::clusters(graph.adjacency(adj2), 'strong')$membership)
## choose only clusters with length > 1
cl = cl[S4Vectors::elementNROWS(cl)>1]
cl = cl[order(S4Vectors::elementNROWS(cl))]
jcl = lapply(cl, function(x) unique(sort(bp$grl.ix[x])))
jcls = sapply(jcl, paste, collapse = ' ')
jcl = jcl[!duplicated(jcls)]
if (paths)
{
adj3 = adj2
## remove all cycles and enumerate remaining paths > 1
adj3[unlist(jcl), unlist(jcl)] = FALSE
sinks = which(rowSums(adj3)==0)
sources = which(colSums(adj3)==0)
cl2 = split(seq_along(bp), igraph::clusters(graph.adjacency(adj3), 'weak')$membership)
cl2 = cl2[S4Vectors::elementNROWS(cl2)>1]
if (any(ix <- S4Vectors::elementNROWS(cl2)>2))
{ ## only need to do this for connected components that have 3 or more junctions
cl3 = do.call(c, parallel::mclapply(cl2[ix], function(x)
{
tmp.adj = adj3[x, x]
lapply(all.paths(tmp.adj, sources = sources, sinks = sinks)$paths, function(i) x[i])
}, mc.cores = mc.cores))
cl2 = c(cl2[!ix], cl3)
}
jcl2 = lapply(cl2, function(x) unique(sort(bp$grl.ix[x])))
jcls2 = sapply(jcl2, paste, collapse = ' ')
jcl2 = jcl2[!duplicated(jcls2)]
return(list(cycles = jcl, paths = jcl2))
}
return(jcl)
}
#' @name ra.merge
#' @title ra.merge
#' @description
#' Merges rearrangements represented by \code{GRangesList} objects
#'
#' Determines overlaps between two or more piles of rearrangement junctions (as named or numbered arguments) +/- padding,
#' merges those that overlap into single junctions in the output, and keeps track, for each output junction, of which
#' input junctions it was "seen in" using logical flag meta data fields prefixed by "seen.by." and then the argument name
#' (or "seen.by.ra" and the argument number)
#'
#' @param ... GRangesList representing rearrangements to be merged
#' @param pad non-negative integer specifying padding
#' @param ind logical flag (default FALSE) specifying whether the "seen.by" fields should contain indices of inputs (rather than logical flags) and NA if the given junction is missing
#' @param ignore.strand whether to ignore strand (implies all strand information will be ignored, use at your own risk)
#' @return \code{GRangesList} of merged junctions with meta data fields specifying which of the inputs each outputted junction was "seen.by"
#' @examples
#'
#' # generate some junctions
#' gr1 <- GRanges(1, IRanges(1:10, width = 1), strand = rep(c('+', '-'), 5))
#' gr2 <- GRanges(1, IRanges(4 + 1:10, width = 1), strand = rep(c('+', '-'), 5))
#' ra1 = split(gr1, rep(1:5, each = 2))
#' ra2 = split(gr2, rep(1:5, each = 2))
#'
#' ram = ra.merge(ra1, ra2)
#' values(ram) # shows the metadata with TRUE / FALSE flags
#'
#' ram2 = ra.merge(ra1, ra2, pad = 5) # more inexact matching results in more merging
#' values(ram2)
#'
#' ram3 = ra.merge(ra1, ra2, ind = TRUE) #indices instead of flags
#' values(ram3)
#' @noRd
ra.merge = function(..., pad = 0, ind = FALSE, ignore.strand = FALSE){
ra = list(...)
ra = ra[which(!sapply(ra, is.null))]
ra = ra[which(!sapply(ra, function(x) {length(x)==0}))] ## filter zero length junctions
nm = names(ra)
if (is.null(nm)){
nm = paste('ra', seq_along(ra), sep = '')
}
nm = paste('seen.by', nm, sep = '.')
if (length(nm)==0){
return(NULL)
}
out = ra[[1]]
values(out) = cbind(as.data.frame(matrix(FALSE, nrow = length(out), ncol = length(nm), dimnames = list(NULL, nm))), values(out))
if (!ind){
values(out)[, nm[1]] = TRUE
} else{
values(out)[, nm[1]] = seq_along(out)
}
if (length(ra)>1){
for (i in 2:length(ra)){
this.ra = ra[[i]]
if (length(this.ra)>0){
values(this.ra) = cbind(as.data.frame(matrix(FALSE, nrow = length(this.ra), ncol = length(nm), dimnames = list(NULL, nm))), values(this.ra))
ovix = ra.overlaps(out, this.ra, pad = pad, ignore.strand = ignore.strand)
if (!ind){
values(this.ra)[[nm[i]]] = TRUE
} else{
values(this.ra)[[nm[i]]] = seq_along(this.ra)
}
if (!ind){
if (!all(is.na(ovix))){
values(out)[, nm[i]][ovix[,1]] = TRUE
}
} else{
values(out)[, nm[i]] = NA
if (!all(is.na(ovix))){
values(out)[, nm[i]][ovix[,1]] = ovix[,1]
}
}
## which are new ranges not already present in out, we will add these
if (!all(is.na(ovix))){
nix = setdiff(seq_along(this.ra), ovix[,2])
} else{
nix = seq_along(this.ra)
}
if (length(nix)>0){
val1 = values(out)
val2 = values(this.ra)
if (ind){
val2[, nm[1:(i-1)]] = NA
}
else{
val2[, nm[1:(i-1)]] = FALSE
}
values(out) = NULL
values(this.ra) = NULL
out = grl.bind(out, this.ra[nix])
d1 = as.data.table(val1)
d2 = as.data.table(val2[nix, ])
## prevent column class mismatches
c1 = data.table(class = sapply(d1, class), cnm = colnames(d1))
c2 = data.table(class = sapply(d2, class), cnm = colnames(d2))
conflict = merge(c1, c2, by = "cnm")[class.x != class.y]
for (r in seq_len(nrow(conflict))){
d1[[conflict[r, cnm]]] = as(d1[[conflict[r, cnm]]], conflict[r, c(class.y)])
}
values(out) = rbind(d1, d2, fill = TRUE)
}
}
}
}
return(out)
}
####################################################################
#' ppgrid
#'
#' least squares grid search for purity and ploidy modes
#'
#' @param segstats GRanges object of intervals with meta data fields "mean" and "sd" (i.e. output of segstats function)
#' @param allelic logical flag, if TRUE will also look for mean_high, sd_high, mean_low, sd_low variables and choose among top solutions from top copy number according to the best allelic fit
#' @param purity.min min purity value allowed
#' @param purity.max max purity value allowed
#' @param ploidy.min min ploidy value allowed
#' @param ploidy.max max ploidy value allowed
#' @param ploidy.step grid length of ploidy values
#' @param purity.step grid length of purity values
#' @param plot whether to plot the results to file
#' @param verbose print intermediate outputs
#' @param mc.cores integer number of cores to use (default 1)
#' @return data.frame with top purity and ploidy solutions and associated gamma and beta values, for use in downstream jbaMI
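#' @examples
#' ## a synthetic sketch: noiseless segment means simulated at purity 0.6 and
#' ## ploidy mean(cn); the grid search should recover them among its top solutions
#' set.seed(1)
#' cn = sample(1:5, 20, replace = TRUE)
#' alpha = 0.6
#' mu = (alpha * cn + 2 * (1 - alpha)) / (alpha * mean(cn) + 2 * (1 - alpha))
#' ss = GRanges(1, IRanges(seq(1, by = 1e6, length.out = 20), width = 1e6),
#'              mean = mu, sd = 0.01)
#' ppgrid(ss)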
############################################
ppgrid = function(segstats,
allelic = FALSE,
purity.min = 0.01,
purity.max = 1.0,
ploidy.step = 0.01,
purity.step = 0.01,
ploidy.min = 1.2, # ploidy bounds (can be generous)
ploidy.max = 6,
plot = F,
verbose = F,
mc.cores = 1){
if (verbose)
jmessage('setting up ppgrid matrices .. \n')
if (is.na(ploidy.min)) ploidy.min = 1.2
if (is.na(ploidy.max)) ploidy.max = 6
if (is.na(purity.min)) purity.min = 0.01
if (is.na(purity.max)) purity.max = 1
## purity.guesses = seq(0, 1, purity.step)
purity.guesses = seq(pmax(0, purity.min), pmin(1.00, purity.max), purity.step)
## ploidy.guesses = seq(pmin(0.5, ploidy.min), pmax(10, ploidy.max), ploidy.step)
ploidy.guesses = seq(pmax(0.5, ploidy.min), pmax(0.5, ploidy.max), ploidy.step)
if (allelic)
if (!all(c('mean_high', 'mean_low', 'sd_high', 'sd_low') %in% names(values(segstats))))
{
jwarning('If allelic = TRUE then must have meta data fields mean_high, mean_low, sd_high, sd_low in input segstats')
allelic = FALSE
}
if (is.null(segstats$mean))
jerror('segstats must have field $mean')
segstats = segstats[!is.na(segstats$mean) & !is.na(segstats$sd)]
if (!is.null(segstats$ncn))
segstats = segstats[segstats$ncn==2, ]
## if (is.null(segstats$ncn))
## ncn = rep(2, length(mu))
## else
## ncn = segstats$ncn
if (any(tmpix <-is.infinite(segstats$mean) | is.infinite(segstats$sd)))
{
segstats$sd[tmpix] = segstats$mean[tmpix] = NA
}
segstats = segstats[!is.na(segstats$mean) & !is.na(segstats$sd), ]
if (length(segstats)==0)
jerror('No non NA segments provided')
mu = segstats$mean
w = as.numeric(width(segstats))
Sw = sum(as.numeric(width(segstats)))
sd = segstats$sd
m0 = sum(as.numeric(mu*w))/Sw
if (verbose)
cat(paste(c(rep('.', length(purity.guesses)), '\n'), collapse = ''))
NLL = matrix(unlist(parallel::mclapply(seq_along(purity.guesses), function(i)
{
if (verbose)
cat('.')
nll = rep(NA, length(ploidy.guesses))
for (j in seq_along(ploidy.guesses))
{
alpha = purity.guesses[i]
tau = ploidy.guesses[j]
gamma = 2/alpha - 2
beta = (tau + gamma)/m0
v = pmax(0, round(beta*mu-gamma))
nll[j] = sum((v-beta*mu+gamma)^2/((sd)^2))
}
return(nll)
}, mc.cores = mc.cores)), nrow = length(purity.guesses), byrow = T)
dimnames(NLL) = list(as.character(purity.guesses), as.character(ploidy.guesses))
if (verbose) {
cat('\n')
}
## rix = as.numeric(rownames(NLL))>=purity.min & as.numeric(rownames(NLL))<=purity.max
## cix = as.numeric(colnames(NLL))>=ploidy.min & as.numeric(colnames(NLL))<=ploidy.max
## NLL = NLL[rix, cix, drop = FALSE]
a = rep(NA, nrow(NLL));
b = rep(NA, ncol(NLL)+2)
b.inf = rep(Inf, ncol(NLL)+2)
# a = rep(Inf, nrow(NLL));
# b = rep(Inf, ncol(NLL)+2)
NLLc = rbind(b, cbind(a, NLL, a), b) ## padded NLL and all of its shifts
NLLul = rbind(cbind(NLL, a, a), b.inf, b)
NLLuc = rbind(cbind(a, NLL, a), b.inf, b)
NLLur = rbind(cbind(a, a, NLL), b.inf, b)
NLLcl = rbind(b, cbind(NLL, a, a), b)
NLLcr = rbind(b, cbind(a, a, NLL), b)
NLLll = rbind(b, b, cbind(NLL, a, a))
NLLlc = rbind(b, b, cbind(a, NLL, a))
NLLlr = rbind(b, b, cbind(a, a, NLL))
if (min(c(ncol(NLL), nrow(NLL)))>1) ## up up down down left right left right ba ba start
M = (NLLc < NLLul &
NLLc < NLLuc &
NLLc < NLLur &
NLLc < NLLcl &
NLLc < NLLcr &
NLLc < NLLll &
NLLc < NLLlc &
NLLc < NLLlr)[-c(1, nrow(NLLc)),
-c(1, ncol(NLLc)),
drop = FALSE]
else if (ncol(NLL)==1) ## one column, only go up and down
M = (NLLc < NLLuc & NLLc < NLLlc)[-c(1, nrow(NLLc)), -c(1, ncol(NLLc)), drop = FALSE]
else ## only row, only go left right
M = (NLLc < NLLcl & NLLc < NLLcr)[-c(1, nrow(NLLc)), -c(1, ncol(NLLc)), drop = FALSE]
if (length(M)>1)
{
ix = Matrix::which(M, arr.ind=T);
if (nrow(ix)>1)
{
C = hclust(d = dist(ix), method = 'single')
cl = cutree(C, h = min(c(nrow(NLL), ncol(NLL), 2)))
minima = ix[vaggregate(1:nrow(ix), by = list(cl), function(x) x[which.min(NLL[ix[x, , drop = FALSE]])]), , drop = FALSE]
}
else if (nrow(ix) == 0) {
## if NLL is monotonically increasing or decreasing, minima will not be found
## in this case, return the local minimum of NLL over the tested grid
minima = Matrix::which(NLL == min(NLL, na.rm = T), arr.ind = T)
}
else
minima = ix[1,, drop = FALSE]
}
else
minima = cbind(1,1)
out = data.frame(purity = as.numeric(rownames(NLL)[minima[,1]]), ploidy = as.numeric(colnames(NLL)[minima[,2]]), NLL = NLL[minima],
i = minima[,1], j = minima[,2])
out = out[order(out$NLL), , drop = FALSE]
rownames(out) = 1:nrow(out)
## Saturday, Sep 02, 2017 10:33:26 PM
## Noted floating point error, use the epsilon trick to replace '>='
## out = out[out$purity>=purity.min & out$purity<=purity.max & out$ploidy>=ploidy.min & out$ploidy<=ploidy.max, ]
eps = 1e-9
out = out[out$purity - purity.min >= -eps &
out$purity - purity.max <= eps &
out$ploidy - ploidy.min >= -eps &
out$ploidy - ploidy.max <= eps, ]
out$gamma = 2/out$purity -2
out$beta = (out$ploidy + out$gamma)/m0
out$mincn = mapply(function(gamma, beta) min(round(beta*mu-gamma)), out$gamma, out$beta)
out$maxcn = mapply(function(gamma, beta) max(round(beta*mu-gamma)), out$gamma, out$beta)
## group solutions with (nearly the same) slope (i.e. 1/beta), these should have almost identical
## NLL (also take into account in-list distance just be safe)
if (nrow(out)>1)
out$group = cutree(hclust(d = dist(cbind(100/out$beta, 1:nrow(out)), method = 'manhattan'), method = 'single'), h = 2)
else
out$group = 1
out = out[out$group<=3, ,drop = FALSE] ## only pick top 3 groups
if (allelic) ## if allelic then use allelic distance to rank best solution in group
{
## remove all NA allelic samples
segstats = segstats[!is.na(segstats$mean_high) & !is.na(segstats$sd_high) & !is.na(segstats$mean_low) & !is.na(segstats$sd_low)]
out$NLL.allelic = NA
mu = cbind(segstats$mean_high, segstats$mean_low)
w = cbind(as.numeric(width(segstats)), as.numeric(width(segstats))) ## each segment's width paired with both its high and low allele means
Sw = sum(as.numeric(width(segstats)))*2
sd = cbind(segstats$sd_high, segstats$sd_low)
m0 = sum(as.numeric(mu*w))/Sw
if (verbose)
cat(paste(c(rep('.', length(purity.guesses)), '\n'), collapse = ''))
for (i in 1:nrow(out))
{
if (verbose)
{
jmessage(sprintf('Evaluating alleles for solution %s of %s\n', i, nrow(out)))
}
alpha = out$purity[i]
tau = out$ploidy[i]
# gamma = 2/alpha - 2
gamma = 1/alpha - 1 ## 1 since we are looking at hets
beta = (tau + gamma)/m0 ## replaced with below 9/10/14
# beta = ( tau + tau_normal * gamma /2 ) / m0
# v = pmax(0, round(beta*mu-ncn*gamma/2))
v = pmax(0, round(beta*mu-gamma))
vtot = round(out$beta[i]*segstats$mean-out$gamma[i])
## MLE split of each total copy number vtot into low / high alleles, vectorized below
vlow.mle = apply(cbind(mu, sd, vtot), 1, function(x) {
tot = x[5]
if (tot == 0)
return(0)
else
{
vlow = 0:floor(tot/2)
vhigh = tot-vlow
muh = x[1]
mul = x[2]
sdh = x[3]
sdl = x[4]
tmp.nll = cbind((vlow-beta*mul+gamma)^2/(sdl)^2, (vhigh-beta*muh+gamma)^2/((sdh)^2))
return(vlow[which.min(rowSums(tmp.nll))])
}
})
out$NLL.allelic[i] = sum((cbind(vtot-vlow.mle, vlow.mle)-beta*mu+gamma)^2/sd^2)
}
out$NLL.tot = out$NLL
out$NLL = out$NLL.tot + out$NLL.allelic
out.all = out
ix = vaggregate(1:nrow(out), by = list(out$group), FUN = function(x) x[order(abs(out$NLL[x]))][1])
}
else ## otherwise choose the one that gives the lowest magnitude copy number
{
out.all = out
ix = vaggregate(1:nrow(out), by = list(out$group), FUN = function(x) x[order(abs(out$mincn[x]), out$mincn[x]<0)][1])
}
out = out[ix, , drop = FALSE]
out$NLL = vaggregate(out$NLL, by = list(out$group), FUN = min)
out.all$keep = 1:nrow(out.all) %in% ix ## keep track of other ploidy group peaks for drawing purposes
out.all = out.all[out.all$group %in% out$group, ] ## only draw the groups in the top solution
out = out.all;
out = out[order(out$group, !out$keep, out$NLL), ]
out$rank = NA
out$rank[out$keep] = 1:sum(out$keep)
out$keep = out$i = out$j = NULL
rownames(out) = NULL
return(out)
}
####################
#' @name arrstring
#' @title arrstring
#'
#' @description
#' string representation of row array as linear combination of nonzero entries
#' of that row using column names as variables
#'
#' @param A array
#' @param sep separator to use between table elements
#' @return character representation of table
#' @author Marcin Imielinski
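#' @examples
#' ## each row rendered as a linear combination of its nonzero columns
#' A = matrix(c(1, 0, 2, 3), nrow = 2, dimnames = list(NULL, c('x', 'y')))
#' arrstring(A) ## c("1*x + 2*y", "3*y")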
####################
arrstring = function(A, x = NULL, sep = ', ', sep2 = '_', signif = 3, dt = FALSE)
{
if (is.null(dim(A)))
{
A = rbind(A)
}
if (is.null(colnames(A)))
{
colnames(A) = paste0('V', 1:ncol(A))
}
if (is.null(x))
{
x = colnames(A)
}
else
{
x = signif(x, signif)
}
tmp = as.data.table(Matrix::which(A!=0, arr.ind = TRUE))
tmp[, y := A[cbind(row, col)]][ , x:= x[col]]
str = tmp[, paste(signif(y,signif), x[y!=0], sep = '*', collapse = ' + '), keyby = row][list(1:nrow(A)), V1]
return(str)
}
#' filter.loose
#'
#' analyze coverage surrounding given loose ends to evaluate quality
#' @param gg gGraph of JaBbA model
#' @param cov binned coverage data as a GRanges, with the coverage signal in the metadata column named by \code{field}
#' @param l data.table of loose ends to evaluate
#' @param purity optional, fractional purity of sample, default assumes 1
#' @param ploidy optional, ploidy of sample, default infers from gg
#' @param field optional, column name in cov.rds, default="ratio"
#' @param PTHRESH optional, threshold for GLM p-value for calling true positive loose ends, default=3.4e-7 provides consanguinity with large dataset bonferroni correction
#' @param max.epgap optional, threshold for JaBbA MIQP convergence epgap. Values over this are considered incomplete optimizations and disregarded.
#' @param verbose optional, default=FALSE
#' @return data.table containing a row for every input loose end and logical column `true.pos` indicating whether each loose end has passed all filters (TRUE) or not (FALSE)
#' @author Julie Behr
filter.loose = function(gg, cov, l, purity=NULL, ploidy=NULL, field="ratio", PTHRESH=3.4e-7, max.epgap = 1e-3, verbose=F){
## ## load coverage and beta (coverage CN fit)
## if(verbose) message("Loading coverage bins")
## cov = readRDS(cov.rds)
## cov = gr.sub(cov, "chr", "")
if(!(field %in% colnames(values(cov)))) stop("field must be a metadata column of cov")
if(field != "ratio") cov$ratio = values(cov)[, field]
if(!("tum.counts" %in% colnames(values(cov)))){
yf = ifelse("reads.corrected" %in% colnames(values(cov)), "reads.corrected", field)
cov$tum.counts = values(cov)[, yf]
}
if(!("norm.counts" %in% colnames(values(cov)))){
cov$norm.counts = 1 ## dummy to make it flat
}
if(is.null(purity)){
if (is.null(purity <- gg$meta$purity)){
jerror("Purity must be given.")
}
} ## purity = 1
if(is.null(ploidy)) {
ploidy = weighted.mean(gg$nodes$gr$cn, gg$nodes$gr %>% width, na.rm=T)
}
## remove bins with infinite values up front
cov = cov[which(is.finite(cov$ratio) & !is.na(cov$ratio))]
ratios = cov$ratio
beta = mean(ratios[is.finite(ratios)], na.rm=T) * purity/(2*(1-purity) + purity * ploidy)
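## beta = expected increment in coverage ratio per single tumor copy: the mean
## observed ratio scaled by purity over the average copy count contributed by
## tumor and contaminating normal, 2*(1-purity) + purity*ploidy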
segs = gg$nodes$gr
## segs = gr.sub(segs, "chr", "")
## l = gr.sub(l, "chr", "")
if (!is(l, "GRanges")){
try({l = dt2gr(l)})
if (inherits(l, "try-error")){
jerror("l must be a GRanges or a data.table that can be converted to a GRanges.")
}
}
## identify nodes flanking each loose end, extending up to 100kb away
o = gr.findoverlaps(segs, l+1)
segs = segs[o$query.id]; segs$leix = l[o$subject.id]$leix
sides = gr.findoverlaps(segs, l+1e5, by="leix")
values(sides) = cbind(values(sides), values(l[sides$subject.id]))
sides$fused = !is.na(gr.match(sides, l, by="leix"))
sides$wid = width(sides)
## gather coverage bins corresponding to fused & unfused sides of loose ends
if(verbose) jmessage("Overlapping coverage with loose end fused and unfused sides")
rel = gr.findoverlaps(cov, sides)
values(rel) = cbind(values(cov[rel$query.id]), values(sides[rel$subject.id]))
qq = 0.05
rel = gr2dt(rel)[, ":="(
in.quant.r = ratio >= quantile(ratio, qq, na.rm=T) & ratio <= quantile(ratio, 1-qq, na.rm=T),
good.cov=sum(is.na(tum.counts))/.N < 0.1 & sum(is.na(norm.counts))/.N < 0.1 & sum(is.na(ratio))/.N < 0.1 & wid > 5e4
), by=.(subject.id, fused)]
# dealing with tiny flanking nodes
# if a flanking node has 2 or less bins then in.quant.r will be TRUE (unless ratio is NA)
rel[, in.quant.r := ifelse(.N > 2, in.quant.r, !is.na(ratio)), by = .(subject.id, fused)]
rel[, lxxx := leix]
variances = rel[(in.quant.r), var(ratio), keyby=.(fused, lxxx)]
## only run if variances has > 0 rows:
if (!nrow(variances)) {
return(data.table())
}
variances[, side := ifelse(fused, "f_std", "u_std")]
variances[, std := sqrt(V1)]
## if unfused loose ends are present, a column should still be made
variances[, side := factor(side, levels = c("f_std", "u_std"))]
vars = dcast.data.table(variances, lxxx ~ side, value.var="std", fill = NA, drop = FALSE)
rel[is.na(in.quant.r), in.quant.r := FALSE]
rel[, tum.median := median(tum.counts[in.quant.r]), by=.(lxxx)]
rel[, norm.median := median(norm.counts[in.quant.r]), by=.(lxxx)]
rel[, tum.res := tum.counts - tum.median]
rel[, norm.res := norm.counts - norm.median]
tum.ks = rel[(in.quant.r), tryCatch(dflm(ks.test(tum.res[fused], tum.res[!fused])), error = function(e) dflm(ks.test(tum.res, tum.res))), by=lxxx][, p, by=lxxx]
norm.ks = rel[(in.quant.r), tryCatch(dflm(ks.test(norm.res[fused], norm.res[!fused])), error = function(e) dflm(ks.test(norm.res, norm.res))), by=lxxx][, p, by=lxxx]
pt1 = merge(vars, merge(tum.ks, norm.ks, by="lxxx", suffixes=c("_tum", "_norm"),all=T), by="lxxx", all=T)
pt1$lxxx = as.character(pt1$lxxx); setkey(pt1, lxxx)
pt1[, n_fdr := p.adjust(p_norm, "bonferroni")]
pt1[, t_fdr := p.adjust(p_tum, "bonferroni")]
rel[, ":="(
tumor.mean.fused = mean(tum.counts[fused], na.rm=T),
tumor.mean.unfused = mean(tum.counts[!fused], na.rm=T),
normal.mean.fused = mean(norm.counts[fused], na.rm=T),
normal.mean.unfused = mean(norm.counts[!fused], na.rm=T)
), by=leix]
## evaluate waviness across bins per loose end
rel[, good.cov := all(good.cov), by=subject.id]
if(verbose) message("Calculating waviness around loose end")
rel[, waviness := max(.waviness(start[fused], ratio[fused]), .waviness(start[!fused], ratio[!fused]), na.rm=T), by=subject.id]
## prep glm input matrix
if(verbose) message("Prepping GLM input matrix")
glm.in = melt.data.table(rel[(in.quant.r),], id.vars=c("leix", "fused"), measure.vars=c("tum.counts", "norm.counts"), value.name="counts")[, tumor := variable=="tum.counts"]
glm.in[, ix := 1:.N, by=leix]
rel2 = copy(glm.in)
setnames(glm.in, "leix", "leix2")
## calculate residuals from glm
rel2[, residual := .mod(glm.in[leix2==leix[1],]), by=leix]
## evaluate KS-test on residuals and calculate effect size
## effect will be from KS test on residuals
## estimate is replaced with difference of median coverage
if(verbose) message("Running KS-test on fused vs unfused sides of loose ends")
res = rel2[(tumor), tryCatch(dflm(ks.test(residual[fused], residual[!fused])), error=function(e) dflm(ks.test(residual, residual))), by=leix]
est = rel2[, median(counts), keyby=.(fused, tumor, leix)][, V1[tumor] / V1[!tumor], keyby=.(leix, fused)][, V1[fused]-V1[!fused], keyby=leix]
res$leix = as.character(res$leix); setkey(res, leix)
res[as.character(est$leix), estimate := est$V1]
## combine relevant fields from each test
test = rel2[(tumor), mean(counts), keyby=.(fused, leix)][, V1[fused]-V1[!fused], keyby=leix]; setnames(test, "V1", "testimate")
test$leix = as.character(test$leix); setkey(test, leix)
nest = rel2[!(tumor), mean(counts), keyby=.(fused, leix)][, V1[fused]-V1[!fused], keyby=leix]; setnames(nest, "V1", "nestimate")
nest$leix = as.character(nest$leix); setkey(nest, leix)
cnl = rev(rev(colnames(rel))[1:6])
cns = c("name", "method", "estimate", "effect", "p")
le.class = cbind(gr2dt(l[rel[!duplicated(subject.id), subject.id]]), rel[!duplicated(subject.id), cnl, with=F], res[rel[!duplicated(subject.id), as.character(leix)], cns, with=F], nest[rel[!duplicated(subject.id), as.character(leix)], "nestimate"], test[rel[!duplicated(subject.id), as.character(leix)], "testimate"])[, effect.thresh := beta]
le.class[, f.std := pt1[.(as.character(leix)), f_std]]
le.class[, u.std := pt1[.(as.character(leix)), u_std]]
le.class[, ":="(n_fdr = pt1[.(as.character(leix)), n_fdr], t_fdr = pt1[.(as.character(leix)), t_fdr])]
le.class[, bon := p.adjust(p, "bonferroni")]
## correct p values
if(verbose) message("Identifying true positives")
if(!("epgap" %in% colnames(le.class))){
le.class[, passed := !is.na(p) & p < PTHRESH & estimate > (0.6*effect.thresh) & testimate > (0.6*effect.thresh) & waviness < 2 & abs(nestimate) < (0.6*effect.thresh)]
} else {
le.class[, passed := !is.na(p) & p < PTHRESH & estimate > (0.6*effect.thresh) & testimate > (0.6*effect.thresh) & waviness < 2 & abs(nestimate) < (0.6*effect.thresh) &
epgap < max.epgap]
}
le.class[, true.pos := passed & estimate > f.std & estimate > u.std & n_fdr > 0.05 & t_fdr < 0.01]
## le.class$passed = NULL
gc()
return(le.class)
}
#' @name dflm
#' @title dflm
#' @description
#' Formats lm, glm, or fisher.test outputs into a readable data.table
#' @author Marcin Imielinski
#' @noRd
dflm = function(x, last = FALSE, nm = '')
{
if (is.null(x))
out = data.frame(name = nm, method = as.character(NA), p = as.numeric(NA), estimate = as.numeric(NA), ci.lower = as.numeric(NA), ci.upper = as.numeric(NA), effect = as.character(NA))
else if (any(c('lm', 'betareg') %in% class(x)))
{
coef = as.data.frame(summary(x)$coefficients)
colnames(coef) = c('estimate', 'se', 'stat', 'p')
if (last)
coef = coef[nrow(coef), ]
coef$ci.lower = coef$estimate - 1.96*coef$se
coef$ci.upper = coef$estimate + 1.96*coef$se
if (!is.null(summary(x)$family))
{
fam = summary(x)$family$family
if (summary(x)$family$link %in% c('log', 'logit'))
{
coef$estimate = exp(coef$estimate)
coef$ci.upper= exp(coef$ci.upper)
coef$ci.lower= exp(coef$ci.lower)
}
}
else
fam = 'Unknown'
if (!last)
nm = paste(nm, rownames(coef))
out = data.frame(name = nm,
method = fam,
stat = coef$stat,
p = signif(coef$p, 3),
estimate = coef$estimate,
ci.lower = coef$ci.lower,
ci.upper = coef$ci.upper,
effect = paste(signif(coef$estimate, 3), ' [',
signif(coef$ci.lower,3),'-',
signif(coef$ci.upper, 3), ']',
sep = ''))
}
else if (inherits(x, 'htest'))
{
if (is.null(x$estimate))
x$estimate = x$statistic
if (is.null(x$conf.int))
x$conf.int = c(NA, NA)
out = data.table(name = nm, method = x$method, estimate = x$estimate, ci.lower = x$conf.int[1], ci.upper = x$conf.int[2], effect = paste(signif(x$estimate, 3), ' [', signif(x$conf.int[1],3),'-', signif(x$conf.int[2], 3), ']', sep = ''), p = x$p.value)
}
else if (inherits(x, 'polr'))
{
coef = coef(summary(x)) %>% as.data.frame
nm = paste(nm, rownames(coef))
coef = as.data.table(coef)
setnames(coef, c('estimate', 'se', 't'))
out = data.table(name = nm) %>% cbind(coef)
out$p = pnorm(abs(out$t), lower.tail = FALSE) * 2
out[, ci.lower := estimate-1.96*se]
out[, ci.upper := estimate+1.96*se]
out[, effect := paste(signif(estimate, 3), ' [', signif(ci.lower,3),'-', signif(ci.upper, 3), ']', sep = '')]
}
else
{
out = data.frame(name = nm, method = x$method, p = signif(x$p.value, 3), estimate = x$estimate, ci.lower = x$conf.int[1], ci.upper = x$conf.int[2], effect = paste(signif(x$estimate, 3), ' [', signif(x$conf.int[1],3),'-', signif(x$conf.int[2], 3), ']', sep = ''))
}
out$effect = as.character(out$effect)
out$name = as.character(out$name)
out$method = as.character(out$method)
rownames(out) = NULL
return(as.data.table(out))
}
#' @name .waviness
#'
#' quantifies autocorrelation in coverage
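## e.g. (a rough sketch): a wavy coverage profile typically scores much higher
## than flat noise over the same bins:
## x = seq(0, 1.2e5, by = 2e3)
## .waviness(x, sin(x / 5e3))      ## large
## .waviness(x, rnorm(length(x)))  ## small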
.waviness = function(x, y, min.thresh = 5e3, max.thresh = 10e4, spar = 0.5, smooth = TRUE, filter = rep(FALSE, length(x)), trim = 10) {
if(length(x)==0) return(NA)
dat = data.table(x, y)[order(x), ]
dat[, lag := x-min(x)]
fdat = dat[!is.na(y), ][!is.infinite(y), ]
## autocorrelation
if(nrow(fdat)==0) return(NA)
fdat[, ac := as.numeric(acf(c(y, y), plot = FALSE, lag.max = length(y))$acf[-1])]
if (smooth) { ## smoothing the autocorrelation gets rid of some more noise
fdat = fdat[!is.na(lag) & !is.na(ac),]
if (nrow(fdat[!is.na(lag) & !is.na(ac),])<4)
return(NA)
fdat$ac = predict(smooth.spline(fdat$lag, fdat$ac, spar = spar), fdat$lag)$y
}
return(fdat[lag>min.thresh & lag<max.thresh, sum(ac^2)])
}
#' @name .mod
#'
#' fit data table with glm and return residuals
.mod = function(dt){
mod = dt[, glm(counts ~ tumor + fused + ix, family='gaussian')]
res = dt$counts - predict(mod, dt, type='response')
return(res)
}
#' @name .mod2
#'
#' fit data table with glm and return residuals
.mod2 = function(dt){
mod = dt[, glm(counts ~ tumor + ix, family='gaussian')]
res = dt$counts - predict(mod, dt, type='response')
return(res)
}
#' @name QCStats
#' @title QCStats
#' @description
#' Function to generate quality control stats for one (default) or multiple JaBbA outputs. It is to be run after the main JaBbA function has finished. Stats are printed to a txt file (QCStats.txt)
#'
#' @param inputDT data.table with the following columns:
#' pair: name of the sample pair
#' inputdir: directory with JaBbA results
#' @param outdir Output directory where to place the summary graphs and txt files (only used if multiple JaBbA runs are provided.)
#' @param testMode Whether to run the function in test mode or not. Only used for running unit tests. This mode returns a few specific QC values (as opposed to writing them down in a file) to be checked via test_that.
QCStats = function(inputDT,outdir,testMode=FALSE){
library(data.table)
library(gGnome)
library(ggplot2)
if(testMode){
output_gg=readRDS(system.file('extdata', "jabba.gg.rds", package = "JaBbA"))
opt.report=readRDS(system.file('extdata', "opt.report.rds", package = "JaBbA"))
kar=readRDS(system.file('extdata', "karyograph.rds", package = "JaBbA"))
fep=readRDS(system.file('extdata', "jabba.raw.rds", package = "JaBbA"))$epgap
input_segs=length(readRDS(system.file('extdata', "segs.rds", package = "JaBbA")))
output_segs=nrow(output_gg$nodes$dt)
rmse=sqrt(sum((kar$segstats$cnmle-kar$segstats$cn)^2,na.rm=TRUE))
return(c(input_segs,output_segs,rmse,fep))
}
summaryDT=data.table(pair=character(),Tier_1_Input_Junctions=numeric(),Tier_2_Input_Junctions=numeric(),Tier_3_Input_Junctions=numeric(),
Tier_1_Output_Junctions=numeric(),Tier_2_Output_Junctions=numeric(),Tier_3_Output_Junctions=numeric(),
Number_of_Segments_Input=numeric(),Number_of_Segments_Output=numeric(),Non_telomeric_Loose_Ends=numeric(),
Requested_epgap=numeric(),Final_epgap=numeric(),Converged=logical(),Rho_of_Coverage_and_CN=numeric(),p_value_of_Rho=numeric(),
r_of_Coverage_and_CN=numeric(),p_value_of_r=numeric(),RMSE_of_Coverage_and_CN=numeric())
combforScatter=list(CNMLE=vector(),CN=vector())
for(i in 1:nrow(inputDT)){
JaBba_Args=readRDS(paste0(inputDT$inputdir[i],"/cmd.args.rds"))
output_gg=readRDS(paste0(inputDT$inputdir[i], "/jabba.gg.rds"))
opt.report=readRDS(paste0(inputDT$inputdir[i], "/opt.report.rds"))
kar=readRDS(paste0(inputDT$inputdir[i], "/karyograph.rds"))
loose=length(output_gg$nodes$loose$edges.in[output_gg$nodes$loose$edges.in=="()->"])+length(output_gg$nodes$loose$edges.out[output_gg$nodes$loose$edges.out=="->()"])
input_Jtiers=table(readRDS(JaBba_Args$junctions)$dt$tier)
output_Jtiers=table(output_gg$edges[type == 'ALT']$dt$tier)
input_segs=length(readRDS(JaBba_Args$seg))
output_segs=nrow(output_gg$nodes$dt)
corr_sp=cor.test(kar$segstats$cnmle[!is.na(kar$segstats$cn)],kar$segstats$cn[!is.na(kar$segstats$cn)],method="spearman")
corr_pe=cor.test(kar$segstats$cnmle[!is.na(kar$segstats$cn)],kar$segstats$cn[!is.na(kar$segstats$cn)],method="pearson")
rmse=sqrt(sum((kar$segstats$cnmle-kar$segstats$cn)^2,na.rm=TRUE))
fep=readRDS(paste0(inputDT$inputdir[i],"/jabba.raw.rds"))$epgap
sink(paste0(inputDT$inputdir[i],"/QCStats.txt"))
cat("Stat \t Value \n")
cat(paste0("Tier_1_Input_Junctions \t",ifelse("1" %in% names(input_Jtiers), input_Jtiers[[1]], "0"),"\n"))
cat(paste0("Tier_2_Input_Junctions \t",ifelse("2" %in% names(input_Jtiers), input_Jtiers[[2]], "0"),"\n"))
cat(paste0("Tier_3_Input_Junctions \t",ifelse("3" %in% names(input_Jtiers), input_Jtiers[[3]], "0"),"\n"))
cat(paste0("Tier_1_Output_Junctions \t",ifelse("1" %in% names(output_Jtiers), output_Jtiers[[1]], "0"),"\n"))
cat(paste0("Tier_2_Output_Junctions \t",ifelse("2" %in% names(output_Jtiers), output_Jtiers[[2]], "0"),"\n"))
cat(paste0("Tier_3_Output_Junctions \t",ifelse("3" %in% names(output_Jtiers), output_Jtiers[[3]], "0"),"\n"))
cat(paste0("Number_of_Segments_Input \t",input_segs,"\n"))
cat(paste0("Number_of_Segments_Output \t",output_segs,"\n"))
cat(paste0("Non_telomeric_Loose_Ends \t",loose,"\n"))
cat(paste0("Requested_epgap \t",JaBba_Args$epgap,"\n"))
cat(paste0("Final_epgap \t",fep,"\n"))
cat(paste0("Converged \t",ifelse(JaBba_Args$epgap>fep,"TRUE","FALSE"),"\n"))
cat(paste0("Rho_of_Coverage_and_CN \t",signif(as.vector(corr_sp$estimate),digits=4),"\n"))
cat(paste0("p_value_of_Rho \t",signif(as.vector(corr_sp$p.value),digits=4),"\n"))
cat(paste0("r_of_Coverage_and_CN \t",signif(as.vector(corr_pe$estimate),digits=4),"\n"))
cat(paste0("p_value_of_r \t",signif(as.vector(corr_pe$p.value),digits=4),"\n"))
cat(paste0("RMSE_of_Coverage_and_CN \t",signif(rmse,digits=4),"\n"))
sink()
QCGraphs(StatsTxt=paste0(inputDT$inputdir[i],"/QCStats.txt"),KarDT=data.table(cn=kar$segstats$cn,cnmle=kar$segstats$cnmle),
outdir=inputDT$inputdir[i])
if(nrow(inputDT)>1){
summaryDT=rbind(summaryDT,data.table(pair=inputDT$pair[i],
Tier_1_Input_Junctions=as.numeric(ifelse("1" %in% names(input_Jtiers), input_Jtiers[["1"]], "0")),
Tier_2_Input_Junctions=as.numeric(ifelse("2" %in% names(input_Jtiers), input_Jtiers[["2"]], "0")),
Tier_3_Input_Junctions=as.numeric(ifelse("3" %in% names(input_Jtiers), input_Jtiers[["3"]], "0")),
Tier_1_Output_Junctions=as.numeric(ifelse("1" %in% names(output_Jtiers), output_Jtiers[["1"]], "0")),
Tier_2_Output_Junctions=as.numeric(ifelse("2" %in% names(output_Jtiers), output_Jtiers[["2"]], "0")),
Tier_3_Output_Junctions=as.numeric(ifelse("3" %in% names(output_Jtiers), output_Jtiers[["3"]], "0")),
Number_of_Segments_Input=input_segs,Number_of_Segments_Output=output_segs,
Non_telomeric_Loose_Ends=loose,
Requested_epgap=JaBba_Args$epgap,Final_epgap=fep,Converged=ifelse(JaBba_Args$epgap>fep,"TRUE","FALSE"),
Rho_of_Coverage_and_CN=signif(as.vector(corr_sp$estimate),digits=4),
p_value_of_Rho=signif(as.vector(corr_sp$p.value),digits=4),
r_of_Coverage_and_CN=signif(as.vector(corr_pe$estimate),digits=4),
p_value_of_r=signif(as.vector(corr_pe$p.value),digits=4),
RMSE_of_Coverage_and_CN=signif(rmse,digits=4)))
combforScatter$CNMLE=c(combforScatter$CNMLE,kar$segstats$cnmle)
combforScatter$CN=c(combforScatter$CN,kar$segstats$cn)
}
}
if(nrow(inputDT)>1){
Comb_corr_sp=cor.test(combforScatter$CNMLE,combforScatter$CN,method="spearman")
Comb_corr_pe=cor.test(combforScatter$CNMLE,combforScatter$CN,method="pearson")
Comb_rmse=sqrt(sum((combforScatter$CNMLE-combforScatter$CN)^2,na.rm=TRUE))
summaryDT=rbind(summaryDT,data.table(pair="Mean",
Tier_1_Input_Junctions=mean(summaryDT$Tier_1_Input_Junctions),
Tier_2_Input_Junctions=mean(summaryDT$Tier_2_Input_Junctions),
Tier_3_Input_Junctions=mean(summaryDT$Tier_3_Input_Junctions),
Tier_1_Output_Junctions=mean(summaryDT$Tier_1_Output_Junctions),
Tier_2_Output_Junctions=mean(summaryDT$Tier_2_Output_Junctions),
Tier_3_Output_Junctions=mean(summaryDT$Tier_3_Output_Junctions),
Number_of_Segments_Input=mean(summaryDT$Number_of_Segments_Input),Number_of_Segments_Output=mean(summaryDT$Number_of_Segments_Output),
Non_telomeric_Loose_Ends=mean(summaryDT$Non_telomeric_Loose_Ends),
Requested_epgap=mean(summaryDT$Requested_epgap),
Final_epgap=mean(summaryDT$Final_epgap),
Converged="NA",
Rho_of_Coverage_and_CN=signif(as.vector(Comb_corr_sp$estimate),digits=4),
p_value_of_Rho=signif(as.vector(Comb_corr_sp$p.value),digits=4),
r_of_Coverage_and_CN=signif(as.vector(Comb_corr_pe$estimate),digits=4),
p_value_of_r=signif(as.vector(Comb_corr_pe$p.value),digits=4),
RMSE_of_Coverage_and_CN=signif(Comb_rmse,digits=4)))
fwrite(summaryDT,paste0(outdir,"/QCSummary.csv"))
QCGraphs(StatsCsv=paste0(outdir,"/QCSummary.csv"),KarDT=data.table(cn=combforScatter$CN,cnmle=combforScatter$CNMLE),
outdir=outdir)
}
}
#' @name QCGraphs
#' @title QCGraphs
#' @description Function for generating quality graphs for the output of a JaBbA run. Can be run either for one or multiple JaBbA outputs (see below).
#'
#'
#' @param StatsTxt Route to output file of QCStats. Only provide if running for a single sample.
#' @param StatsCsv Csv with pairs (column 1) and corresponding path to QCStats file (column 2). Only provide if running for more than 1 file.
#' @param KarDT Datatable with paired up values of cn and cnmle from the output karyograph.
#' @param outdir Output directory where to place the function's graphs and txt files
QCGraphs=function(StatsTxt=NA, StatsCsv=NA, KarDT, outdir){
if(!is.na(StatsTxt)){
QCDF=read.table(StatsTxt,header=TRUE,stringsAsFactors=FALSE)
QCList=list()
for(i in 1:nrow(QCDF)){
QCList[as.character(QCDF$Stat[i])]=QCDF$Value[i]
}
}
if(!is.na(StatsCsv)){
QCDF=read.csv(StatsCsv,header=TRUE,stringsAsFactors=FALSE)
QCList=QCDF[QCDF$pair=="Mean",]
}
signifr=ifelse(QCList$"p_value_of_r"<0.01,"Significant","Non-Significant")
signifRho=ifelse(QCList$"p_value_of_Rho"<0.01,"Significant","Non-Significant")
convergence=ifelse(QCList$"Converged","reached","not reached")
subtext=paste0("Convergence ",convergence, " (epgap delta=",signif(as.numeric(QCList$Requested_epgap)-as.numeric(QCList$Final_epgap),digits=4),")")
captiontext=paste0("rho=",QCList$"Rho_of_Coverage_and_CN","(",signifRho,");"," r=",QCList$"r_of_Coverage_and_CN","(",signifr,")")
png(paste0(outdir,"/QC_CNScatter.png"))
p=ggplot(data.frame(KarDT), aes(x=cn, y=cnmle)) + geom_point()+
labs(title = "CN vs CN_Old",subtitle = subtext,
caption=captiontext)+xlab("CN")+ylab("CN_Old")+
theme_grey(base_size = 16)
print(p)
dev.off()
Frequency=c(as.integer(as.numeric(QCList$"Number_of_Segments_Input")),as.integer(as.numeric(QCList$"Number_of_Segments_Output")))
Type=c("Input","Output")
data<- data.frame(Frequency, Type)
png(paste0(outdir,"/QC_SegmentsBarplot.png"))
p=ggplot(data, aes(x=Type, y=Frequency, fill=Type)) + geom_bar(stat="identity", color="black",width=0.6)+
scale_fill_manual(values=c("#56B4E9", "#E69F00"))+
labs(title = "Genomic Segments")+ylab("Abundance")+theme_grey(base_size = 20)+geom_text(aes(label=Frequency), vjust=1.6, color="black", size=10)
print(p)
dev.off()
Frequency=c(as.numeric(QCList$Tier_1_Input_Junctions),as.numeric(QCList$Tier_2_Input_Junctions),as.numeric(QCList$Tier_3_Input_Junctions)
,as.numeric(QCList$Tier_1_Output_Junctions),as.numeric(QCList$Tier_2_Output_Junctions),as.numeric(QCList$Tier_3_Output_Junctions))
Type=c("Input","Input","Input","Output","Output","Output")
Tier=c(1,2,3,1,2,3)
data<- data.frame(Frequency, Type, Tier)
png(paste0(outdir,"/QC_JunctionsBarplot.png"))
p=ggplot(data=data, aes(x=Tier, y=Frequency, fill=Type)) +
geom_bar(stat="identity", color="black", position=position_dodge())+theme_minimal()+labs(title = "Junction Tiers",subtitle=paste0("Number of non-telomeric loose ends=",QCList$"Non_telomeric_Loose_Ends"))+theme_grey(base_size = 16)
print(p)
dev.off()
}