TitanCNA: Subclonal copy number and LOH prediction from whole genome sequencing of tumours

Documented in computeSDbwIndex correctIntegerCN correctReadDepth filterData getPositionOverlap loadAlleleCounts loadDefaultParameters outputModelParameters outputTitanResults outputTitanSegments setGenomeStyle

#' author: Gavin Ha 
#' 		Dana-Farber Cancer Institute
#'		Broad Institute
#' contact: <gavinha@gmail.com> or <gavinha@broadinstitute.org>
#' date:	  June 26, 2018

loadDefaultParameters <- function(copyNumber = 5, numberClonalClusters = 1, 
    skew = 0, hetBaselineSkew = NULL, alleleEmissionModel = "binomial", symmetric = TRUE, data = NULL) {
    if (copyNumber < 3 || copyNumber > 8) {
        stop("loadDefaultParameters: Fewer than 3 or more than 8 copies are 
             being specified. Please use minimum 3 or maximum 8 'copyNumber'.")
    }
    if (!alleleEmissionModel %in% c("binomial", "Gaussian")){
      stop("loadDefaultParameters: alleleEmissionModel must be either \"binomial\" or \"Gaussian\".")
    }
    if (!symmetric){
      message("loadDefaultParameters: symmetric=FALSE is deprecated; using symmetric=TRUE.")
    }
    ## Data without allelic skew rn is theoretical
    ## normal reference allelic ratio initialize to
    ## theoretical values
    rn <- 0.5
    if (symmetric) {
        rt = c(rn, 1, 1, 1/2, 1, 2/3, 1, 3/4, 2/4, 
            1, 4/5, 3/5, 1, 5/6, 4/6, 3/6, 1, 6/7, 
            5/7, 4/7, 1, 7/8, 6/8, 5/8, 4/8)
        rt = rt + skew
        rt[rt > 1] <- 1
        rt[rt < (rn + skew)] <- rn + skew
        ## shift heterozygous states to account for noise 
        ##   when using symmetric = TRUE
        if (!is.null(hetBaselineSkew)){
        	hetARshift <- hetBaselineSkew + 0.5
        }else if (!is.null(data)){
        	hetARshift <- median(data$ref / data$tumDepth, na.rm = TRUE)
        }else{
        	hetARshift <- 0.55
        }
        hetState <- c(4, 9, 16, 25)
        rt[hetState] <- hetARshift
        ZS = 0:24
        ct = c(0, 1, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5, 
            6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 8)
        
    } #else {
      #  rt = c(rn, 1, 1e-05, 1, 1/2, 1e-05, 1, 2/3, 
      #      1/3, 1e-05, 1, 3/4, 2/4, 1/4, 1e-05, 1, 
      #      4/5, 3/5, 2/5, 1/5, 1e-05, 1, 5/6, 4/6, 
      #      3/6, 2/6, 1/6, 1e-05, 1, 6/7, 5/7, 4/7, 
      #      3/7, 2/7, 1/7, 1e-05, 1, 7/8, 6/8, 5/8, 
      #      4/8, 3/8, 2/8, 1/8, 1e-05)
      #  rt = rt + skew
      #  rt[rt > 1] <- 1
      #  rt[rt < 0] <- 1e-05
      #  ZS = c(0, 1, 1, 2, 3, 2, 4, 5, 5, 4, 6, 7, 
      #      8, 7, 6, 9, 10, 11, 11, 10, 9, 12, 13, 
      #      14, 15, 14, 13, 12, 16, 17, 18, 19, 19, 
      #      18, 17, 16, 20, 21, 22, 23, 24, 23, 22, 
      #      21, 20)
      #  ct = c(0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 
      #      4, 4, 4, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 
      #      6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 
      #      8, 8, 8, 8, 8, 8, 8)
      #  highStates <- c(1,16:length(rt))
      #  hetState <- c(5, 13, 25, 41)
    #}
    ZS[hetState[1]] <- -1
    rn = rn + skew
    ind <- ct <= copyNumber
    hetState <- hetState[hetState <= sum(ind)]
    rt <- rt[ind]
    ZS <- ZS[ind]
    ct <- ct[ind]  #reassign rt and ZS based on specified copy number
    K <- length(rt)
    N <- nrow(data)
    ## Dirichlet hyperparameter for initial state
    ## distribution, kappaG
    kappaGHyper_base <- 100
    if (length(data$ref) > 0 && length(data$logR) > 0){
      corRho_0 <- cor(data$ref / data$tumDepth, data$logR, use = "pairwise.complete.obs")
    }else{
      corRho_0 <- NULL
    }
    var_base <- 1/20 #var(data$logR, na.rm = TRUE)
    var0_base <- 1/20 #var(data$ref / data$tumDepth, na.rm = TRUE)
    if (!is.null(data)){
      #alphaK <- 1 / (var(data$logR, na.rm = TRUE) / sqrt(K))
      #betaK <- alphaK * var(data$logR, na.rm = TRUE)
      #alphaR <- 1 / (var(data$ref / data$tumDepth, na.rm = TRUE) / sqrt(K))
      #betaR <- alphaK * var(data$ref / data$tumDepth, na.rm = TRUE)
      betaK <- 25 
      alphaK <- betaK / var(data$logR) 
      betaR <- 25 
      alphaR <- betaR / var(data$ref / data$tumDepth, na.rm = TRUE)
    }else{
      alphaK <- 10000   
      betaK <- 25
      alphaR <- 10000
      betaR <- 25
    }
    ## Gather all genotype related parameters into a list
    genotypeParams <- vector("list", 0)
    genotypeParams$rt <- rt
    genotypeParams$rn <- rn
    genotypeParams$ZS <- ZS
    genotypeParams$ct <- ct
    ## VARIANCE for Gaussian to model copy number, var
    genotypeParams$corRho_0 <- corRho_0
    genotypeParams$var_0 <- rep(var_base, K) 
    #genotypeParams$var_0[ct %in% c(2, 4, 8)] <- var_base / 10
    genotypeParams$alphaKHyper <- rep(alphaK, K)
    genotypeParams$alphaKHyper[ct >= 5] <- alphaK  #AMP(11-15),HLAMP(16-21) states
    genotypeParams$betaKHyper <- rep(betaK, K)
    genotypeParams$alleleEmissionModel <- alleleEmissionModel
    ## VARIANCE for Gaussian to model allelic fraction, varR
    genotypeParams$varR_0 <- rep(var0_base, K) 
    #genotypeParams$varR_0[hetState] <- var0_base / 10
    genotypeParams$alphaRHyper <- rep(alphaR, K)
    genotypeParams$betaRHyper <- rep(betaR, K)
    genotypeParams$kappaGHyper <- rep(kappaGHyper_base, K) + 1
    genotypeParams$kappaGHyper[hetState] <- kappaGHyper_base * 5
    genotypeParams$kappaGHyper[ct == 0] <- kappaGHyper_base / 50
    genotypeParams$piG_0 <- estimateDirichletParamsMap(genotypeParams$kappaGHyper)  #add the outlier state
    genotypeParams$outlierVar <- 10000
    genotypeParams$symmetric <- symmetric
    ## NORMAL, n
    normalParams <- vector("list", 0)
    normalParams$n_0 <- 0.5
    normalParams$alphaNHyper <- 2
    normalParams$betaNHyper <- 2
    #rm(list = c("rt", "rn", "ZS", "ct", "var_0", "kappaGHyper", "skew"))
    ## PLOIDY, phi
    ploidyParams <- vector("list", 0)
    ploidyParams$phi_0 <- 2
    ploidyParams$alphaPHyper <- 20
    ploidyParams$betaPHyper <- 42
    
    ## CELLULAR PREVALENCE, s
    sParams <- setupClonalParameters(Z = numberClonalClusters)
    sParams$piZ_0 <- estimateDirichletParamsMap(sParams$kappaZHyper)
    # return
    output <- vector("list", 0)
    output$genotypeParams <- genotypeParams
    output$ploidyParams <- ploidyParams
    output$normalParams <- normalParams
    output$cellPrevParams <- sParams
    return(output)
}


loadAlleleCounts <- function(inCounts, symmetric = TRUE, 
			genomeStyle = "NCBI", sep = "\t", header = TRUE) {
	if (is.character(inCounts)){
    ## LOAD INPUT READ COUNT DATA 
    	message("titan: Loading data ", inCounts)
    	data <- read.delim(inCounts, header = header, stringsAsFactors = FALSE, 
        		sep = sep)
        if (typeof(data[,2])!="integer" || typeof(data[,4])!="integer" || 
        		typeof(data[,6])!="integer"){
        	stop("loadAlleleCounts: Input counts file format does not 
        		match required specifications.")		
        }
    }else if (is.data.frame(inCounts)){  #inCounts is a data.frame
    	data <- inCounts
    }else{
    	stop("loadAlleleCounts: Must provide a filename or data.frame 
    		to inCounts")
    }
    ## use GenomeInfoDb
    #require(GenomeInfoDb)
    # convert to desired genomeStyle and only include autosomes, sex chromosomes
    data[, 1] <- setGenomeStyle(data[, 1], genomeStyle)
   
    ## sort chromosomes
	indChr <- orderSeqlevels(as.character(data[, 1]), X.is.sexchrom = TRUE)
	data <- data[indChr, ]
	## sort positions within each chr
	for (x in unique(data[, 1])){
		ind <- which(data[, 1] == x)
		data[ind, ] <- data[ind[sort(data[ind, 2], index.return = TRUE)$ix], ]
	}
    
    refOriginal <- as.numeric(data[, 4])
    nonRef <- as.numeric(data[, 6])
    tumDepth <- refOriginal + nonRef
    if (symmetric) {
        ref <- pmax(refOriginal, nonRef)
    } else {
        ref <- refOriginal
    }
    
    return(data.table(chr = as.character(data[, 1]), posn = data[, 2], ref = ref, 
        refOriginal = refOriginal, nonRef = nonRef, 
        tumDepth = tumDepth))
}

extractAlleleReadCounts <- function(bamFile, bamIndex, 
			positions, outputFilename = NULL, 
			pileupParam = PileupParam()){
	#require(Rsamtools)

## read in vcf file of het positions
	vcf <- BcfFile(positions)
	vcfPosns <- scanBcf(vcf)

## setup the positions of interest to generate the pileup for
	which <- GRanges(as.character(vcfPosns$CHROM), 
		IRanges(vcfPosns$POS, width = 1))
## setup addition BAM filters, such as excluding duplicate reads
	sbp <- ScanBamParam(flag = scanBamFlag(isDuplicate = FALSE), which = which)

## generate pileup using function (Rsamtools >= 1.17.11)
## this step can take a while
	tumbamObj <- BamFile(bamFile, index = bamIndex)
	counts <- pileup(tumbamObj, scanBamParam = sbp,  pileupParam = pileupParam)

## set of command to manipulate the "counts" data.frame output
##     by pileup() such that multiple nucleotides are in a single
##     row rather than in multiple rows.
	countsMerge <- xtabs(count ~ which_label + nucleotide, counts)
	
	label <- do.call(rbind, strsplit(rownames(countsMerge), ":"))
	posn <- do.call(rbind, strsplit(label[, 2],"-"))
	countsMerge <- cbind(data.frame(chr = label[, 1]), 
		position = posn[, 1], countsMerge[,1:7])
	
## GET REFERENCE AND NON-REF READ COUNTS
## this block of code is used to match up the reference and 
##   non-reference nucleotide when assigning read counts
##   final output data.frame is "countMat"
## setup output data.frame
	countMat <- data.frame(chr = vcfPosns$CHROM, 
			position = as.numeric(vcfPosns$POS), 
			ref = vcfPosns$REF, refCount = 0, 
			Nref = vcfPosns$ALT, NrefCount = 0, 
			stringsAsFactors = FALSE)

## match rows with vcf positions of interest
	countMat <- merge(countMat, countsMerge, by = c("chr","position"), 
		sort = FALSE, stringsAsFactors = FALSE)

## assign the flattened table of nucleotide counts to ref, Nref
## note that non-reference (Nref) allele is sum of other bases
##    that is not matching the ref.
	NT <- c("A", "T", "C", "G")
	for (n in 1:length(NT)){	
		indRef <- countMat$ref == NT[n]
		countMat[indRef, "refCount"] <- countMat[indRef, NT[n]]
		countMat[indRef, "NrefCount"] <- rowSums(countMat[indRef, NT[-n]])
	}

## remove "chr" string from chromosome
	countMat$chr <- gsub("chr","",countMat$chr)	
## only use autosomes and sex chrs
	countMat <- countMat[countMat$chr %in% c(as.character(1:22),"X","Y"),]
## only use first 6 columns for TitanCNA
	countMat <- countMat[,1:6]

## OUTPUT TO TEXT FILE 
	if (!is.null(outputFilename)){
		## output text file will have the same format required by TitanCNA
		message("extractAlleleReadCounts: writing to ", outputFilename)
		write.table(countMat, file = outputFilename, row.names = FALSE, 
			col.names = TRUE, quote = FALSE, sep = "\t")
	}
	return(countMat)
	#return(loadAlleleCounts(countMat))
}

## filter data by depth and mappability.  data is a
## logical vector data is a list containing all our
## data: ref, depth, logR, etc.  
filterData <- function(data, chrs = NULL, minDepth = 10, 
    maxDepth = 200, positionList = NULL, map = NULL, 
    mapThres = 0.9, centromeres = NULL, centromere.flankLength = 0) {
    genomeStyle <- seqlevelsStyle(data$chr)[1]
    if (!is.null(map)) {
        keepMap <- map >= mapThres
    } else {
        keepMap <- !logical(length = length(data$refOriginal))
    }
    if (!is.null(positionList)) {
        chrPosnList <- paste(positionList[, 1], positionList[, 
            2], sep = ":")  #chr:posn
        chrPosnData <- paste(data$chr, data$posn, sep = ":")
        keepPosn <- is.element(chrPosnData, chrPosnList)
    } else {
        keepPosn <- !logical(length = length(data$chr))
    }
    keepTumDepth <- data$tumDepth <= maxDepth & data$tumDepth >= minDepth
    cI <- keepTumDepth & !is.na(data$logR) & 
        keepMap & keepPosn
    data <- data[which(cI), ]

    ## remove centromere SNPs ##
    if (!is.null(centromeres)){
    	colnames(centromeres)[1:3] <- c("Chr", "Start", "End") 
    	centromeres$Chr <- setGenomeStyle(centromeres$Chr, genomeStyle = genomeStyle[1])
    	data <- removeCentromere(data, centromeres, flankLength = centromere.flankLength)
    }
    if (is.null(chrs)){
      keepChrs <- logical(length = length(data$chr))
    }else{
      keepChrs <- is.element(data$chr, chrs)
      data <- data[keepChrs, ]
      message("Removed Chrs: ", names(which(table(data$chr) < 2)))
      data <- data[data$chr %in% names(which(table(data$chr) > 1)), ]
    }
    return(data)
}

## input: 
# 1) data object output by loadAlleleCounts(); 6 element list: chr, posn, ref, refOriginal, nonRef, tumDepth)
# 2) data.frame containing coordinates of centromeres; 4 columns: Chr, Start, End, arbitrary
##
removeCentromere <- function(data, centromere, flankLength = 0){
	keepInd <- !logical(length = length(data$chr))
	for (c in 1:nrow(centromere)){
		ind <- which((data$chr == centromere[c, "Chr"]) &
				(data$posn >= (centromere[c, "Start"] - flankLength)) &
				(data$posn <= (centromere[c, "End"] + flankLength)))
		keepInd[ind] <- FALSE			
	}	
	message("Removed ", sum(!keepInd), " centromeric positions")
	## remove positions in all elements of list	  
	data <- data[which(keepInd), ]
  return(data)
}

## segs is a data.table object
extendSegments <- function(segs, removeCentromeres = FALSE, centromeres = NULL,
	extendToTelomeres = FALSE, seqInfo = NULL, chrs = c(1:22, "X", "Y"), genomeStyle = "NCBI"){
	newSegs <- copy(segs)
	newStartStop <- newSegs[, {totalLen = c(Start[-1], NA) - End
					extLen = round(totalLen / 2)
					End.ext = c(End[-.N] + round(extLen)[-.N], End[.N])
					Start.ext = c(Start[1], End.ext[-.N] + 1)
					list(Start=Start.ext, End=End.ext) 
				  }, by=Chromosome]
	newSegs[, Start.snp := Start]
	newSegs[, End.snp := End]
	newSegs[, Start := newStartStop[, Start]]
	newSegs[, End := newStartStop[, End]]
	stopColInd <- which(names(newSegs) == "End")
	setcolorder(newSegs, c(names(newSegs)[1:stopColInd], "Start.snp", "End.snp", names(newSegs)[(stopColInd+1):(ncol(newSegs)-2)]))
	
	if (removeCentromeres){
		if (is.null(centromeres)){
			stop("If removeCentromeres=TRUE, must provide centromeres data.table object.")
		}
		message("Removing centromeres from segments.")
		newSegs <- removeCentromereSegs(newSegs, centromeres, chrs = chrs, genomeStyle = genomeStyle)
	}
	
	if (extendToTelomeres){
		if (is.null(seqInfo)){
			stop("If extendToTelomeres=TRUE, must provide SeqInfo object with chromosome lengths.")
		}
		message("Extending segments to telomeres.")
		newSegs[, Start.telo := { endCoord = End
				endCoord[.N] = seqlengths(seq.info)[seqnames(seqInfo)[.GRP]]		
				endCoord
				}, by=Chromosome]		
	}
	return(copy(newSegs))
}


removeCentromereSegs <- function(segs, centromeres, chrs = c(1:22, "X", "Y"), genomeStyle = "NCBI"){	
	#seqlevelsStyle(chrs) <- genomeStyle
	chrs <- mapSeqlevels(chrs, genomeStyle, drop = FALSE)[1, ]
  segs <- copy(segs)
	for (i in 1:nrow(centromeres)){
		x <- as.data.frame(centromeres[i,]); 
		names(x)[1:3] <- c("Chr","Start","End")
		chrInd <- which(segs[, Chromosome == x$Chr])
		## start is in centromere ##
		## NOT POSSIBLE ##
		ind <- which(segs[chrInd, Start >= x$Start & Start <= x$End])
		if (length(ind)){
			#message("Chr:", i, "Start in centromere.")
			# move start to end of centromere
			segs[chrInd[ind], Start := x$End + 1]
			#segs[chrInd[ind], Length.snp. := NA]
		}
		## end is in centromere ##
		## NOT POSSIBLE ##
		ind <- which(segs[chrInd, End >= x$Start & End <= x$End])
		if (length(ind)){
			#message("Chr:", i, "Stop in centromere.")
			# move end to start of centromere
			segs[chrInd[ind], End := x$Start - 1]
			#segs[chrInd[ind], Length.snp. := NA]
		}
		## both start and end in centromere ##
		## NOT POSSIBLE ##
		ind <- which(segs[chrInd, Start >= x$Start & End <= x$End])
		if (length(ind)){
			#message("Chr:", i, "Seg in centromere.")
			# remove segment #
			segs <- segs[-chrInd[ind]]
		}		
		## segment spans centromere ##
		ind <- which(segs[chrInd, Start <= x$Start & End >= x$End])
		if (length(ind)){
			#message("Chr:", i, "Seg span centromere.")
			# break into 2 segments #
			newRegion1 <- copy(segs[chrInd[ind]])
			#newRegion1[, Length.snp. := NA]
			newRegion2 <- copy(newRegion1)
			newRegion1[, End := x$Start - 1] #left segment before centromere
			newRegion2[, Start := x$End + 1] #right segment after centromere
			
			# remove old segment and add in 2 new ones
			segs <- segs[-chrInd[ind]]
			segs <- rbind(segs, newRegion1, newRegion2)
		}		
	}
	## re-order the segments ##
	segs[, Chromosome := factor(Chromosome, levels = chrs)]
	segs <- segs[do.call(order, segs[, c("Chromosome", "Start")])]
	return(copy(segs))
}


excludeGarbageState <- function(params, K) {
    newParams <- params
    for (i in 1:length(newParams)) {
        if (length(newParams[[i]]) == K) {
            newParams[[i]] <- newParams[[i]][2:K]
        }
    }
    return(newParams)
}

getOverlap <- function(x, y, type = "within", colToReturn = "Copy_Number", method = "common"){
  if (!type %in% c("any", "within")){
    stop("getOverlap type must be \'any\' or \'within\'.")
  }
  cn <- rep(NA, nrow(x))
  x <- as(x, "GRanges")
  y <- as(y, "GRanges")
  hits <- findOverlaps(query = x, subject = y, type = type)
  cn[queryHits(hits)] <- values(y)[subjectHits(hits), colToReturn]
  
  # find genes split by segment #
  runs <- rle(queryHits(hits))
  splitInd <- which(runs$lengths > 1)
  if (length(splitInd) > 0){ # take the larger overlap
  	if (method == "common"){
  		for (i in 1:length(splitInd)){
  			hitInd <- which(queryHits(hits)==runs$values[splitInd[i]])
				ind <- which.max(width(IRanges::overlapsRanges(query = ranges(x), subject = ranges(y), hits = hits[hitInd])))
				cn[unique(queryHits(hits)[hitInd])] <- values(y)[subjectHits(hits)[hitInd][ind], colToReturn]
			}
		}else{
			stop("Method other than 'common' is not yet supported.")
		}  
  }
  return(cn)
}


getPositionOverlap <- function(chr, posn, dataVal) {
# use RangedData to perform overlap
	colnames(dataVal)[4] <- "logR"
	dataGR <- as(dataVal, "GRanges")		
    ## load chr/posn as data.frame first to use proper chr ordering by factors/levels
    chrDF <- data.frame(seqnames=chr, start=posn, end=posn)
    chrDF$seqnames <- factor(chrDF$seqnames, levels = unique(chr))    
    chrGR <- as(chrDF, "GRanges")
    hits <- GenomicRanges::findOverlaps(query = chrGR, subject = dataGR)
    
    ## create full dataval list ##
    hitVal <- rep(NA, length = length(chr))
    hitVal[from(hits)] <- dataGR$logR[to(hits)]
	return(hitVal) 
}

setGenomeStyle <- function(x, genomeStyle = "NCBI", species = "Homo_sapiens", 
  filterExtraChr = TRUE){
  #chrs <- genomeStyles(species)[c("NCBI","UCSC")]
  if (!genomeStyle %in% seqlevelsStyle(as.character(x))[1]){
  	x <- suppressWarnings(mapSeqlevels(as.character(x), 
  					genomeStyle, drop = FALSE)[1,])
  }
  
  if (filterExtraChr){
    autoSexMChr <- extractSeqlevelsByGroup(species = species, 
    				style = genomeStyle, group = "all")
    x <- x[x %in% autoSexMChr]
  }
  return(x)
}

correctReadDepth <- function(tumWig, normWig, gcWig, mapWig, 
	genomeStyle = "NCBI", targetedSequence = NULL) {
    #require(HMMcopy)
    
  message("Reading GC and mappability files")
  gc <- wigToGRanges(gcWig)
  map <- wigToGRanges(mapWig)
  
  ### LOAD TUMOUR AND NORMAL FILES ###
  message("Loading tumour file:", tumWig)
  tumour_reads <- wigToGRanges(tumWig)
  message("Loading normal file:", normWig)
  normal_reads <- wigToGRanges(normWig)
  
  ### set the genomeStyle: NCBI or UCSC
  #require(GenomeInfoDb)
  seqlevelsStyle(gc) <- genomeStyle
  seqlevelsStyle(map) <- genomeStyle
  seqlevelsStyle(tumour_reads) <- genomeStyle
  seqlevelsStyle(normal_reads) <- genomeStyle

  ### make sure tumour wig and gc/map wigs have same
  ### chromosomes
  gc <- gc[seqnames(gc) %in% seqnames(tumour_reads)]
  map <- map[seqnames(map) %in% seqnames(tumour_reads)]
  samplesize <- 50000
  
  ### for targeted sequencing (e.g.  exome capture),
  ### ignore bins with 0 for both tumour and normal
  ### targetedSequence = RangedData (IRanges object)
  ### containing list of targeted regions to consider;
  ### 3 columns: chr, start, end
  if (!is.null(targetedSequence)) {
    message("Analyzing targeted regions...")
    targetIR <- GRanges(ranges = IRanges(start = targetedSequence[, 2], 
                end = targetedSequence[, 3]), seqnames = targetedSequence[, 1])
    names(targetIR) <- setGenomeStyle(names(targetIR), genomeStyle)
    hits <- findOverlaps(query = tumour_reads, subject = targetIR)
    keepInd <- unique(queryHits(hits))    
    tumour_reads <- tumour_reads[keepInd, ]
    normal_reads <- normal_reads[keepInd, ]
    gc <- gc[keepInd, ]
    map <- map[keepInd, ]
    samplesize <- min(ceiling(nrow(tumour_reads) * 
        0.1), samplesize)
  }
    
  ### add GC and Map data to IRanges objects ###
  tumour_reads$gc <- gc$value
  tumour_reads$map <- map$value
  colnames(values(tumour_reads)) <- c("reads", "gc", "map")
  normal_reads$gc <- gc$value
  normal_reads$map <- map$value
  colnames(values(normal_reads)) <- c("reads", "gc", "map")
  
  ### CORRECT TUMOUR DATA FOR GC CONTENT AND
  ### MAPPABILITY BIASES ###
  message("Correcting Tumour")
  tumour_copy <- correctReadcount(tumour_reads, samplesize = samplesize)
  
  ### CORRECT NORMAL DATA FOR GC CONTENT AND
  ### MAPPABILITY BIASES ###
  message("Correcting Normal")
  normal_copy <- correctReadcount(normal_reads, samplesize = samplesize)
  
  ### COMPUTE LOG RATIO ###
  message("Normalizing Tumour by Normal")
  tumour_copy$copy <- tumour_copy$copy - normal_copy$copy
  rm(normal_copy)
  
  ### PUTTING TOGETHER THE COLUMNS IN THE OUTPUT ###
  temp <- cbind(chr = as.character(seqnames(tumour_copy)), 
      start = start(tumour_copy), end = end(tumour_copy), 
      logR = tumour_copy$copy)
  temp <- as.data.frame(temp, stringsAsFactors = FALSE)
  mode(temp$start) <- "numeric"
  mode(temp$end) <- "numeric"
  mode(temp$logR) <- "numeric"
  return(temp)
}


setupClonalParameters <- function(Z, sPriorStrength = 2) {
    alphaSHyper = rep(sPriorStrength, Z)
    betaSHyper = rep(sPriorStrength, Z)
    kappaZHyper = rep(1, Z) + 1
    
    # use naive initialization
    s_0 <- (1:Z)/10
    #s_0 <- seq(0,1-1/Z,1/Z)
    
    ## first cluster should be clonally dominant (use
    ## 0.001) ##
    s_0[1] <- 0.001
    
    output <- vector("list", 0)
    output$s_0 <- s_0
    output$alphaSHyper <- alphaSHyper
    output$betaSHyper <- betaSHyper
    output$kappaZHyper <- kappaZHyper
    return(output)
}

computeBIC <- function(maxLoglik, M, N) {
    bic <- -2 * maxLoglik + (M * log(N))
    return(bic)
}

computeSDbwIndex <- function(x, centroid.method = "median", data.type = "LogRatio", 
						use.corrected.cn = TRUE,  
						S_Dbw.method = "Halkidi", symmetric = TRUE) {
    ## input x: Titan results dataframe from
    ## 'outputTitanResults()' S_Dbw Validity Index
    ## Halkidi and Vazirgiannis (2001). Clustering
    ## Validity Assessment: Finding the Optimal
    ## Partition of a Data Set
    ## AND
    ## Tong and Tan (2009) Cluster validity based on the 
    ## improved S_Dbw index
    
    if (!data.type %in% c("LogRatio", "AllelicRatio", "HaplotypeRatio")){
    	stop("computeSDbwIndex: data.type must be either 'LogRatio' or 'AllelicRatio'")
    }
    
      if (!S_Dbw.method %in% c("Halkidi", "Tong")){
    	stop("computeSDbwIndex: S_Dbw.method must be either 'Halkidi' or 'Tong'")
    }
    if (use.corrected.cn && "Corrected_Copy_Number" %in% names(x)){
    	cn.colname <- "Corrected_Copy_Number"
    	state.colName <- "TITANstate"
    }else{
    	cn.colname <- "CopyNumber"
    	state.colName <- "TITANstate"
    }
    ## flatten copynumber-clonalclusters to single vector
    if (data.type=="LogRatio"){
    	cn <- x[, get(cn.colname)] + 1
		  cn[cn == 3] <- NA  ## remove all CN=2 positions
    	flatState <- (x[, ClonalCluster] - 1) * (max(cn, na.rm = TRUE)) + cn
    	flatState[is.na(flatState)] <- 3 ### assign all the CN=2 positions to cluster 3
    	CNdata <- scale(x[, get(data.type)])
    	x <- as.matrix(cbind(as.numeric(flatState), CNdata))
    }else if (data.type=="AllelicRatio" | data.type=="HaplotypeRatio"){
    	st <- x[, get(state.colName)] + 1
    	st[x[, which(get(state.colName) == "HET")]] <- NA
    	flatState <- (x[, ClonalCluster] - 1) * (max(st, na.rm = TRUE)) + st
    	if (symmetric){
    		flatState[is.na(flatState)] <- 4
    	}else{
    		flatState[is.na(flatState)] <- 5
    	}
    	## for allelic ratios, compute the symmetric allelic ratio
    	ARdata <- x[, pmax(get(data.type), 1 - get(data.type))]
    	ARdata <- scale(ARdata)
    	x <- as.matrix(cbind(as.numeric(flatState), ARdata))
    	rm(ARdata)
    }
    
    clust <- sort(unique(x[, 1]))
    K <- length(clust)
    N <- nrow(x)
    
    ## find average standard deviation and scatter (compactness)
    stdev <- rep(NA, K)
    scat.Ci <- rep(NA, K)
    for (i in 1:K) {
        ind.i <- x[, 1] == clust[i]
        ni <- sum(ind.i)
        stdev[i] <- var(x[ind.i, 2], na.rm = TRUE)
        
        ## compute scatter based on variances within objects
        ## of a cluster (compactness)
        var.Ci <- var(x[ind.i, 2], na.rm = TRUE)
        var.D <- var(x[, 2], na.rm = TRUE)
        if (S_Dbw.method == "Halkidi"){
        	scat.Ci[i] <- var.Ci/var.D
        }else if (S_Dbw.method == "Tong"){
        	scat.Ci[i] <- ((N - ni) / N) * (var.Ci/var.D)
        }
    }
    avgStdev <- sqrt(sum(stdev, na.rm = TRUE))/K
    
    ## compute density between clusters (separation)
    sumDensityDiff <- matrix(NA, nrow = K, ncol = K)
    for (i in 1:K) {
        # cat('Calculating S_Dbw for cluster # ',clust[i],'\n')
        ind.i <- x[, 1] == clust[i]
        ni <- sum(ind.i)
        xci <- x[ind.i, 2]

        #density of Ci
        sumDiff.xci <- sdbw.density(xci, avgStdev, method = S_Dbw.method, 
        					centroid.method = centroid.method)
        
        for (j in 1:K) {
            if (i == j) {
                next
            }
            ind.j <- x[, 1] == clust[j]
            nj <- sum(ind.j)
            xcj <- x[ind.j, 2]

            #density of Cj
            sumDiff.xcj <- sdbw.density(xcj, avgStdev, method = S_Dbw.method, 
        					centroid.method = centroid.method)
            
            ## union and midpoint of both clusters
            x.ci.cj <- union(xci, xcj)
            ci <- median(xci, na.rm = TRUE)  #centroid of cluster Ci
            cj <- median(xcj, na.rm = TRUE)  #centroid of cluster Cj
            nij <- ni + nj
            stdCiCj <- (sd(xci) + sd(xcj)) / 2
            if (S_Dbw.method == "Halkidi"){
            	cij <- (ci + cj)/2
            	#cij <- median(x.ci.cj, na.rm = TRUE)
            }else if (S_Dbw.method == "Tong"){
            	lambda <- 0.7
            	cij <- lambda * ((nj * ci + ni * cj) / nij) + (1 - lambda) * 
            		(sumDiff.xci * ci + sumDiff.xcj * cj) /
            		(sumDiff.xci + sumDiff.xcj) 
            }
            
            
            #density of union of both clusters using special centroid
            sumDiff.xci.xcj <- sdbw.density(x.ci.cj, avgStdev, stDev = stdCiCj, 
            				method = S_Dbw.method, centroid = cij, 
            				centroid.method = centroid.method)            
            
            maxDiff <- max(sumDiff.xci, sumDiff.xcj)
            if (maxDiff == 0) {
                maxDiff <- 0.1
            }
            sumDensityDiff[i, j] <- sumDiff.xci.xcj/maxDiff
        }
    }
    
    if (S_Dbw.method == "Halkidi"){
        scat <- sum(scat.Ci, na.rm = TRUE)/(K)
    }else if (S_Dbw.method == "Tong"){
   		scat <- sum(scat.Ci, na.rm = TRUE)/(K - 1)
    }    
    dens.bw <- sum(sumDensityDiff, na.rm = TRUE)/(K * (K - 1))
    
    S_DbwIndex <- scat + dens.bw
    # return(S_DbwIndex)
    return(list(S_DbwIndex = S_DbwIndex, dens.bw = dens.bw, scat = scat))
}


sdbw.density <- function(x, avgStdev, stDev = NULL, method = "Halkidi", 
					centroid = NULL, centroid.method = "median"){
	if (is.null(centroid)){
		if (centroid.method == "median") {
        	centroid <- median(x, na.rm = TRUE)  #centroid of cluster Cj
    	} else if (centroid.method == "mean") {
        	centroid <- mean(x, na.rm = TRUE)  #centroid of cluster Cj
    	}
    }
	#density of Ci
    if (method == "Halkidi"){
        sumDiff <- sum(abs(x - centroid) <= avgStdev, na.rm = TRUE) 
    }else if (method == "Tong"){
    	if (is.null(stDev)){
    		stDev <- sd(x, na.rm = TRUE)
    	}
        conf.int <- 1.96 * (stDev / sqrt(length(x)))
        sumDiff <- sum(abs(x - centroid) <= conf.int, na.rm = TRUE)  
    }
    return(sumDiff)
}


# G = sequence of states for mega-variable K =
# number of unit states per cluster excluding
# outlier state precondition: If outlierState is
# included, it must be at G=1, else HOMD is G=1
decoupleMegaVar <- function(G, K, useOutlierState = FALSE) {
    if (useOutlierState) {
        G <- G - 1  #do this to make OUT=0 and HOMD=1
        G[G == 0] <- NA  #assign NA to OUT states
    }
    newG <- G%%K
    newG[newG == 0] <- K
    newG[is.na(newG)] <- 0
    newZ <- ceiling(G/K)
    output <- vector("list", 0)
    output$G <- newG
    output$Z <- newZ
    return(output)
}

# pre-condition: outlier state is -1 if included
decodeLOH <- function(G, symmetric = TRUE) {
    T <- length(G)
    Z <- rep("NA", T)
    CN <- rep(NA, T)
    
    if (symmetric) {
        DLOH <- G == 1
        NLOH <- G == 2
        ALOH <- G == 4 | G == 6 | G == 9 | G == 12 | 
            G == 16 | G == 20
        HET <- G == 3
        GAIN <- G == 5
        ASCNA <- G == 7 | G == 10 | G == 13 | G == 
            17 | G == 21
        BCNA <- G == 8 | G == 15 | G == 24
        UBCNA <- G == 11 | G == 14 | G == 18 | G == 
            19 | G == 22 | G == 23
    } else {
        DLOH <- G == 1 | G == 2
        NLOH <- G == 3 | G == 5
        ALOH <- G == 6 | G == 9 | G == 10 | G == 14 | 
            G == 15 | G == 20 | G == 22 | G == 28 | 
            G == 29 | G == 36 | G == 37 | G == 45
        HET <- G == 4
        GAIN <- G == 7 | G == 8
        ASCNA <- G == 11 | G == 13 | G == 16 | G == 
            19 | G == 23 | G == 27 | G == 30 | G == 
            35 | G == 38 | G == 44
        BCNA <- G == 12 | G == 25 | G == 41
        UBCNA <- G == 17 | G == 18 | G == 24 | G == 
            26 | G == 31 | G == 32 | G == 33 | G == 
            34 | G == 39 | G == 40 | G == 42 | G == 
            43
    }
    HOMD <- G == 0
    OUT <- G == -1
    
    Z[HOMD] <- "HOMD"
    Z[DLOH] <- "DLOH"
    Z[NLOH] <- "NLOH"
    Z[ALOH] <- "ALOH"
    Z[HET] <- "HET"
    Z[GAIN] <- "GAIN"
    Z[ASCNA] <- "ASCNA"
    Z[BCNA] <- "BCNA"
    Z[UBCNA] <- "UBCNA"
    Z[OUT] <- "OUT"
    
    if (symmetric) {
        CN[HOMD] <- 0
        CN[DLOH] <- 1
        CN[G >= 2 & G <= 3] <- 2
        CN[G >= 4 & G <= 5] <- 3
        CN[G >= 6 & G <= 8] <- 4
        CN[G >= 9 & G <= 11] <- 5
        CN[G >= 12 & G <= 15] <- 6
        CN[G >= 16 & G <= 19] <- 7
        CN[G >= 20 & G <= 24] <- 8
    } else {
        CN[HOMD] <- 0
        CN[DLOH] <- 1
        CN[G >= 3 & G <= 5] <- 2
        CN[G >= 6 & G <= 9] <- 3
        CN[G >= 10 & G <= 14] <- 4
        CN[G >= 15 & G <= 20] <- 5
        CN[G >= 21 & G <= 28] <- 6
        CN[G >= 29 & G <= 36] <- 7
        CN[G >= 37 & G <= 45] <- 8
    }
    
    output <- vector("list", 0)
    output$G <- Z
    output$CN <- CN
    return(output)
}


outputTitanResults <- function(data, convergeParams, 
    optimalPath, filename = NULL, is.haplotypeData = FALSE, posteriorProbs = FALSE, 
    subcloneProfiles = TRUE, correctResults = TRUE, proportionThreshold = 0.05, 
    proportionThresholdClonal = 0.05, recomputeLogLik = TRUE, rerunViterbi = FALSE, verbose = TRUE) {
    
    # check if useOutlierState is in convergeParams
    if (length(convergeParams$useOutlierState) == 0) {
        stop("convergeParams does not contain element: useOutlierState.")
    }
     useOutlierState <- convergeParams$useOutlierState
     # check if symmetric is in convergeParams
    if (length(convergeParams$symmetric) == 0) {
        stop("convergeParams does not contain element: symmetric.")
    }
   
    
    ## PROCESS HMM RESULTS 
    numClust <- dim(convergeParams$s)[1]
    K <- dim(convergeParams$var)[1]
    if (useOutlierState) {
        K <- K - 1
    }
    Z <- dim(convergeParams$s)[1]
    i <- dim(convergeParams$s)[2]  #iteration of training to use (last iteration)
    partGZ <- decoupleMegaVar(optimalPath, K, useOutlierState)
    G <- partGZ$G - 1  #assign analyzed points, minus 2 so HOMD=0
    sortS <- sort(convergeParams$s[, i], decreasing = FALSE, 
        index.return = TRUE)
    s <- sortS$x
    Zclust <- partGZ$Z  #assign analyzed points
    Zclust <- sortS$ix[Zclust]  #reassign sorted cluster membership
    
    ### OUTPUT RESULTS #### Output Z ##
    Gdecode <- decodeLOH(G, symmetric = convergeParams$symmetric)
    Gcalls <- Gdecode$G
    CN <- Gdecode$CN
    Zclust[Gcalls == "HET" & CN == 2] <- NA  #diploid HET positions do not have clusters
    Sout <- rep(NA, length(Zclust))  #output cluster frequencies
    Sout[Zclust > 0 & !is.na(Zclust)] <- s[Zclust[Zclust > 
        0 & !is.na(Zclust)]]
    Sout[Zclust == 0] <- 0
    clonalHeaderStr <- rep(NA, Z)
    for (j in 1:Z) {
        clonalHeaderStr[j] <- sprintf("pClust%d", j)
    }
    
    outmat <- data.table(Chr = data$chr, Position = data$posn, RefCount = data$refOriginal)
      
    if (is.haplotypeData){
      outmat <- cbind(outmat, NRefCount = data$tumDepthOriginal - data$refOriginal, 
        Depth = data$tumDepthOriginal, 
        AllelicRatio = data$refOriginal/data$tumDepthOriginal,
        HaplotypeCount = data$ref, #data$haplotypeCount,
        HaplotypeDepth = data$tumDepth,
        #HaplotypeRatio =  sprintf("%0.2f", data$haplotypeCount/data$tumDepth), 
        HaplotypeRatio = data$HaplotypeRatio,
        PhaseSet = data$phaseSet)
    }else{
      outmat <- cbind(outmat, Depth = data$tumDepth, 
        AllelicRatio = data$refOriginal/data$tumDepth)
    }
    outmat <- cbind(outmat, LogRatio = log2(exp(data$logR)), 
        CopyNumber = CN, TITANstate = G, TITANcall = Gcalls, 
        ClonalCluster = Zclust, CellularPrevalence = 1 - Sout)
   	
    ## filter results to remove empty clusters or set normal contamination to 1.0 if few events 
    if (correctResults){
      if (verbose)
        message("outputTitanResults: Correcting results...")
      corrResults <- removeEmptyClusters(data, convergeParams, outmat, 
                                         proportionThreshold = proportionThreshold, 
                                         proportionThresholdClonal = proportionThresholdClonal, 
                                         recomputeLogLik = recomputeLogLik, verbose = verbose)
      #rerunViterbi = rerunViterbi, subcloneProfiles = subcloneProfiles, is.haplotypeData = is.haplotypeData)
      outmatOriginal <- outmat
      outmat <- corrResults$results
      convergeParams <- corrResults$convergeParams
    }else{
      corrResults <- NULL
    }
    
   	## INCLUDE SUBCLONE PROFILES 
   	if (subcloneProfiles & numClust <= 2){
   		#outmat <- as.data.frame(outmat, stringsAsFactors = FALSE)
    	outmat <- cbind(outmat, getSubcloneProfiles(outmat))
    }else{
    	message("outputTitanResults: More than 2 clusters or subclone profiles not requested.")
    }
    if (posteriorProbs) {
    	rhoG <- t(convergeParams$rhoG)
    	rhoZ <- t(convergeParams$rhoZ)
    	rhoZ <- rhoZ[, sortS$ix, drop = FALSE]
   		colnames(rhoZ) <- clonalHeaderStr
        outmat <- cbind(outmat, format(round(rhoZ, 4), 
        nsmall = 4, scientific = FALSE), format(round(rhoG, 4), 
        nsmall = 4, scientific = FALSE))
    }
    if (!is.null(filename)) {
        message("titan: Writing results to ", filename)
        write.table(format(outmat, digits = 2, scientific = FALSE), file = filename, col.names = TRUE, 
            row.names = FALSE, quote = FALSE, sep = "\t")
    }
    
    if (correctResults){
      corrmat <- outmat
      outmat <- outmatOriginal
    }else{
      corrmat <- NULL
    }
  
    return(list(results = outmat, corrResults = corrmat, convergeParams = convergeParams))
}

outputModelParameters <- function(convergeParams, results, filename, 
		S_Dbw.scale = 1, S_Dbw.method = "Tong", S_Dbw.useCorrectedCN = TRUE) {
    message("titan: Saving parameters to ", filename)
    Z <- dim(convergeParams$s)[1]
    i <- dim(convergeParams$s)[2]  #iteration of training to use (last iteration)
    sortS <- sort(convergeParams$s[, i], decreasing = FALSE, 
        index.return = TRUE)
    s <- sortS$x
    fc <- file(filename, "w+")
    norm_str <- paste0("Normal contamination estimate:\t", signif(convergeParams$n[i], digits = 4))
    write.table(norm_str, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", append = TRUE)
    ploid_str <- paste0("Average tumour ploidy estimate:\t", signif(convergeParams$phi[i], digits = 4))
    write.table(ploid_str, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", append = TRUE)
    s_str <- signif(1 - s, digits = 4)
    s_str <- gsub(" ", "", s_str)
    outStr <- paste0("Clonal cluster cellular prevalence Z=", Z , ":\t", paste(s_str, collapse = " "))
    write.table(outStr, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", append = TRUE)
    if ("HaplotypeRatio" %in% names(results)){
    	ratioColName <- "HaplotypeRatio"
    }else{
    	ratioColName <- "AllelicRatio"
    }
    for (j in 1:Z) {
        musR_str <- signif(convergeParams$muR[, j, i], digits = 4)
        musR_str <- gsub(" ", "", musR_str)
        outStr <- paste0(ratioColName, " ", convergeParams$genotypeParams$alleleEmissionModel, 
                         " means for clonal cluster Z=", j, ":\t", paste(musR_str, collapse = " "))
        write.table(outStr, file = fc, col.names = FALSE, 
            row.names = FALSE, quote = FALSE, sep = "", 
            append = TRUE)
        musC_str <- signif(log2(exp(convergeParams$muC[, j, i])), digits = 4)
        musC_str <- gsub(" ", "", musC_str)
        outStr <- paste0("logRatio Gaussian means for clonal cluster Z=", j, ":\t",paste(musC_str, collapse = " "))
        write.table(outStr, file = fc, col.names = FALSE, 
            row.names = FALSE, quote = FALSE, sep = "", 
            append = TRUE)
    }
    if (convergeParams$genotypeParams$alleleEmissionModel == "Gaussian"){
        varR_str <- signif(convergeParams$varR[, i], digits = 4)
        varR_str <- gsub(" ", "", varR_str)
        outStr <- paste0(ratioColName, " Gaussian variance:\t", paste(varR_str, collapse = " "))
        write.table(outStr, file = fc, col.names = FALSE, 
          row.names = FALSE, quote = FALSE, sep = "", append = TRUE)
    }
    var_str <- signif(convergeParams$var[, i], digits = 4)
    var_str <- gsub(" ", "", var_str)
    outStr <- paste0("logRatio Gaussian variance:\t", paste(var_str, collapse = " "))
    write.table(outStr, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", append = TRUE)
    iter_str <- paste0("Number of iterations:\t", length(convergeParams$phi))
    write.table(iter_str, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", 
        append = TRUE)
    loglik_str <- signif(convergeParams$loglik[i], digits = 6)
    loglik_str <- gsub(" ", "", loglik_str)
    outStr <- paste0("Log likelihood:\t", paste(loglik_str, collapse = " "))
    write.table(outStr, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", 
        append = TRUE)
    
    # compute SDbw_index
    sdbw.LR <- computeSDbwIndex(results, centroid.method = "median", 
    					data.type = "LogRatio", use.corrected.cn = S_Dbw.useCorrectedCN, 
    					S_Dbw.method = S_Dbw.method,
    					symmetric = convergeParams$symmetric)
    sdbw.AR <- computeSDbwIndex(results, centroid.method = "median", 
    					data.type = ratioColName, use.corrected.cn = S_Dbw.useCorrectedCN,
    					S_Dbw.method = S_Dbw.method,
    					symmetric = convergeParams$symmetric)
    ## element-wise addition -> returns list
    ## add the values for allelicRatio and logRatio
    sdbw <- mapply('+', sdbw.LR, sdbw.AR, SIMPLIFY = FALSE)        
    
    ## print out combined S_Dbw ##
	printSDbw(sdbw.LR, fc, S_Dbw.scale, "LogRatio")
    printSDbw(sdbw.AR, fc, S_Dbw.scale, ratioColName)
    printSDbw(sdbw, fc, S_Dbw.scale, "Both")
    close(fc)
    
    return(list(dens.bw = sdbw$dens.bw, scat = sdbw$scat, 
    			S_Dbw = S_Dbw.scale * sdbw$dens.bw + sdbw$scat))
    
} 

outputTitanSegments <- function(results, id, convergeParams, filename = NULL, igvfilename = NULL){
  # get all possible states in this set of results
  #stateTable <- unique(results[, c("TITANstate", "TITANcall")])
  #rownames(stateTable) <- stateTable[, 1]
  rleResults <- t(sapply(unique(results$Chr), function(x){
  	ind <- results$Chr == x
    r <- rle(results$TITANstate[ind])
  }))
	rleLengths <- unlist(rleResults[, "lengths"])
	rleValues <- unlist(rleResults[, "values"])
	numSegs <- length(rleLengths)
	
  # convert allelic ratio to symmetric ratios #
  results$AllelicRatio <- pmax(results$AllelicRatio, 1-results$AllelicRatio)
	if (!is.null(results$HaplotypeRatio)){
	  results$HaplotypeRatio <- pmax(results$HaplotypeRatio, 1-results$HaplotypeRatio)
	}
  segs <- data.table(Sample = character(), Chromosome = character(), Start_Position.bp. = integer(), 
                     End_Position.bp. = integer(), Length.snp. = integer(), Median_Ratio = numeric())
  # add HaplotypeRatio column if also present in results object
  if (!is.null(results$HaplotypeRatio)){
  	segs <- cbind(segs, Median_HaplotypeRatio = numeric())
  }
  segs <- cbind(segs, Median_logR = numeric(), TITAN_state = integer(),
                     TITAN_call = character(), Copy_Number = integer(), MinorCN = integer(), MajorCN = integer(),
                     Clonal_Cluster = integer(), Cellular_Prevalence = numeric())[1:numSegs]
	segs[, Sample := id]
	#colNames <- c("Chr", "Position", "TITANstate", "AllelicRatio", "LogRatio")
	prevInd <- 0
	for (j in 1:numSegs){
		start <- prevInd + 1
		end <- prevInd + rleLengths[j]
		segDF <- results[start:end, ]
		prevInd <- end
		numR <- nrow(segDF)
		segs[j, "Chromosome"] <- as.character(segDF[1, "Chr"])
		segs[j, "Start_Position.bp."] <- segDF[1, "Position"]
		segs[j, "TITAN_state"] <- rleValues[j]
		segs[j, "TITAN_call"] <- segDF[1, "TITANcall"]#stateTable[as.character(rleValues[j]), 2]
		segs[j, "Copy_Number"] <- segDF[1, "CopyNumber"]
		segs[j, "Median_Ratio"] <- round(median(segDF$AllelicRatio, na.rm = TRUE), digits = 6)
		segs[j, "Median_logR"] <- round(median(segDF$LogRatio, na.rm = TRUE), digits = 6)
		segs[j, "MinorCN"] <- getMajorMinorCN(rleValues[j], convergeParams$symmetric)$majorCN
		segs[j, "MajorCN"] <- getMajorMinorCN(rleValues[j], convergeParams$symmetric)$minorCN
		segs[j, "Clonal_Cluster"] <- segDF[1, "ClonalCluster"]
		segs[j, "Cellular_Prevalence"] <- segDF[1, "CellularPrevalence"]
		if (!is.null(segDF$HaplotypeRatio)){
		  segs[j, "Median_HaplotypeRatio"] <- round(median(segDF$HaplotypeRatio, na.rm = TRUE), digits = 6)
		}
		if (segDF[1, "Chr"] == segDF[numR, "Chr"]){
			segs[j, "End_Position.bp."] <- segDF[numR, "Position"]
			segs[j, "Length.snp."] <- numR
		}else{ # segDF contains 2 different chromosomes
			print(j)
		}                                      
	}
  if (!is.null(filename)){
		# write out detailed segment file #
  	#write.table(segs, file = filename, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
    fwrite(segs, file = filename, sep = "\t")  
  }
  # write out IGV seg file #
  if (!is.null(igvfilename)){
  	igv <- segs[, c("Sample", "Chromosome", "Start_Position.bp.", 
  								"End_Position.bp.", "Length.snp.", "Median_logR")]
  	colnames(igv) <- c("sample", "chr", "start", "end", "num.snps", "median.logR")
  	#write.table(igv, file = igvfilename, col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t")
    fwrite(igv, file = igvfilename, sep = "\t")  
  }
  return(segs)
}

## merge segments based on same values in given column
mergeSegsByCol <- function(segs, colToMerge = "Copy_Number", centromeres = NULL){
	rleResults <- t(sapply(unique(segs$Chr), function(x){
		ind <- segs$Chr == x
		r <- rle(segs[ind, get(colToMerge)])
	}))
	rleLengths <- unlist(rleResults[, "lengths"])
	rleValues <- unlist(rleResults[, "values"])
	numSegs <- length(rleLengths)
	
	newSegs <- NULL
	prevInd <- 0
	for (j in 1:numSegs){
		start <- prevInd + 1
		end <- prevInd + rleLengths[j]
		segDF <- segs[start:end, ]
		prevInd <- end
		numR <- nrow(segDF)
		newSegs <- rbind(newSegs, segDF[1,])
		newSegs[j, (colToMerge) := rleValues[j]]
		newSegs[j, Median_Ratio := round(median(segDF$Median_Ratio, na.rm = TRUE), digits = 6)]
		newSegs[j, Median_logR := round(median(segDF$Median_logR, na.rm = TRUE), digits = 6)]
		if (segDF[1, "Chromosome"] == segDF[numR, "Chromosome"]){
			newSegs[j, End := segDF[numR, End]]
			newSegs[j, Length.snp. := sum(segDF$Length.snp.)]
		}else{ # segDF contains 2 different chromosomes
			print(j)
		}                                      
	}
	if (!is.null(centromeres)){
		message("Removing centromeres from segments.")
		newSegs <- removeCentromereSegs(newSegs, centromeres)
	}
	return(copy(newSegs))
}

## Recompute integer CN for high-level amplifications ##
## compute logR-corrected copy number ##
correctIntegerCN <- function(cn, segs, purity, ploidy, maxCNtoCorrect.autosomes = NULL, 
		maxCNtoCorrect.X = NULL, correctHOMD = TRUE, minPurityToCorrect = 0.2, gender = "male", chrs = c(1:22, "X")){
	names <- c("HOMD","HETD","NEUT","GAIN","AMP","HLAMP", rep("HLAMP", 1000))
	names.chrX <- c("HETD","NEUT","GAIN","AMP","HLAMP", rep("HLAMP", 1000))
	cn.tmp <- copy(cn)
	segs.tmp <- copy(segs)
  if (is.null(cn.tmp[["Start"]])){
    cn.tmp[, c("Start", "End") := list(Position, Position)]
  }
  if (is.null(segs.tmp[["Start"]])){
    segs.tmp[, c("start", "end") := list(Start_Position.bp., End_Position.bp.)]
  }
  cn.gr <- as(cn.tmp, "GRanges")
  segs.gr <- as(segs.tmp, "GRanges")
  rm(cn.tmp, segs.tmp)

	## determine if Median_HaplotypeRatio (segs) and HaplotypeRatio (cn) columns exists (i.e. 10X analysis)
	segs.allelicRatioColName <- "Median_Ratio"
	if ("Median_HaplotypeRatio" %in% names(segs)){
		segs.allelicRatioColName <- "Median_HaplotypeRatio"
	}
	cn.allelicRatioColName <- "AllelicRatio"
	if ("HaplotypeRatio" %in% names(cn)){
		cn.allelicRatioColName <- "HaplotypeRatio"
	}
	## set up chromosome style
	autosomeStr <- grep("X|Y", chrs, value=TRUE, invert=TRUE)
	chrXStr <- grep("X", chrs, value=TRUE)
	
	if (is.null(maxCNtoCorrect.autosomes)){
		maxCNtoCorrect.autosomes <- segs[Chromosome %in% autosomeStr, max(Copy_Number, na.rm=TRUE)]
	}
	if (is.null(maxCNtoCorrect.X) & gender == "female" & length(chrXStr) > 0){
		maxCNtoCorrect.X <- segs[Chromosome == chrXStr, max(Copy_Number, na.rm=TRUE)]
	}
	## correct log ratio and compute corrected CN
	segs[Chromosome %in% chrs, logR_Copy_Number := logRbasedCN(Median_logR, purity, ploidy, Cellular_Prevalence, cn=2)]
  segs[Chromosome %in% chrs, Corrected_logR := log2(logR_Copy_Number / ploidy)]
	cn[Chr %in% chrs, logR_Copy_Number := logRbasedCN(LogRatio, purity, ploidy, CellularPrevalence, cn=2)]
  cn[Chr %in% chrs, Corrected_logR := log2(logR_Copy_Number / ploidy)]
	## correct allelic ratio and compute corrected major/minor CN (exclude chrX for males since no allelic CN)
	segs[Chromosome %in% chrs, Corrected_Ratio := allelicRatioBasedCN(get(segs.allelicRatioColName), logR_Copy_Number, purity, Cellular_Prevalence, rn=0.5, cn=2)]
	cn[Chr %in% chrs, Corrected_Ratio := allelicRatioBasedCN(get(cn.allelicRatioColName), logR_Copy_Number, purity, CellularPrevalence, rn=0.5, cn=2)]
	if (gender == "male" & length(chrXStr) > 0){ ## analyze chrX separately
		segs[Chromosome == chrXStr, logR_Copy_Number := logRbasedCN(Median_logR, purity, ploidy, Cellular_Prevalence, cn=1)]
    segs[Chromosome == chrXStr, Corrected_logR := log2(logR_Copy_Number / (ploidy / 2))]
		cn[Chr == chrXStr, logR_Copy_Number := logRbasedCN(LogRatio, purity, ploidy, CellularPrevalence, cn=1)]
    cn[Chr == chrXStr, Corrected_logR := log2(logR_Copy_Number / (ploidy / 2))]
		segs[Chromosome == chrXStr, Corrected_Ratio := NA]
		cn[Chr == chrXStr, Corrected_Ratio := NA]
	}

	####### assign copy number to use - Corrected_Copy_Number #######
	# 1. initialize to same TITAN calls for autosomes - no change in copy number at this point
	segs[, Corrected_Copy_Number := as.integer(Copy_Number)]
	segs[, Corrected_Call := TITAN_call]
	segs[, Corrected_MajorCN := as.integer(MajorCN)]
	segs[, Corrected_MinorCN := as.integer(MinorCN)]
	cn[, Corrected_Copy_Number := as.integer(CopyNumber)]
	cn[, Corrected_Call := TITANcall]

	if (purity >= minPurityToCorrect){
		# 2. TITAN calls adjusted for >= maxCNtoCorrect.autosomes copies - HLAMP e.g. 8 max copies)
		ind.seg.maxCN <- segs[Chromosome %in% chrs & Copy_Number >= maxCNtoCorrect.autosomes, which = TRUE]
		segs[ind.seg.maxCN, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
    ind.cn.maxCN <- cn[Chr %in% chrs & CopyNumber >= maxCNtoCorrect.autosomes, which = TRUE]
		cn[ind.cn.maxCN, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]

		# 3. TITAN calls adjust for HOMD
    ind.seg.homd <- NULL
    ind.cn.homd <- NULL
		if (correctHOMD){
			ind.seg.homd <- segs[Chromosome %in% chrs & Copy_Number == 0, which = TRUE]
			segs[ind.seg.homd, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
      ind.cn.homd <- cn[Chr %in% chrs & CopyNumber == 0, which = TRUE]
			cn[ind.cn.homd, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
		}
	
		# 4. Add corrected calls for bins with CopyNumber = NA (ie. not included in TITAN analysis)
    ind.cn.naBins <- cn[Chr %in% chrs & is.na(CopyNumber), which = TRUE]
		cn[ind.cn.naBins, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
	
		# 5. Adjust chrX copy number if purity is sufficiently high
		# males - all data points in chrX will be corrected
		# females - will already be corrected but will do special correction for copy number > maxCNtoCorrect.X (might be diff than maxCNtoCorrect.autosomes)
    ind.seg.chrX <- NULL
    ind.cn.chrX <- NULL
		if (gender == "male" & length(chrXStr) > 0){
      ind.seg.chrX <- segs[Chromosome == chrXStr, which = TRUE]
			segs[ind.seg.chrX, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
      ind.cn.chrX <- cn[Chr == chrXStr, which = TRUE]
			cn[ind.cn.chrX, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
		}else if (gender == "female"){  # already handled in Step 2
      # ind.seg.chrX <- segs[Chromosome == chrXStr & Copy_Number >= maxCNtoCorrect.X, which = TRUE]
			# segs[ind.seg.chrX, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
      # ind.cn.chrX <- cn[Chr == chrXStr & CopyNumber >= maxCNtoCorrect.X, which = TRUE]
			# cn[Cind.cn.chrX, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
		}

    # 6. Adjust copy number for inconsistent logR and copy number prediction (e.g. opposite copy number direction)
    # mostly affects outliers, which are short or single point segments
    # since chrX for males have all data corrected, it will by default not be included in this anyway
    # chrX for females are treated as regular diploid chromosomes here
    ind.seg.oppCNA <- segs[((round(logR_Copy_Number) < ploidy & Corrected_Copy_Number > ploidy) | 
                            (round(logR_Copy_Number) > ploidy & Corrected_Copy_Number < ploidy)) &
                           (abs(round(logR_Copy_Number) - Corrected_Copy_Number) > 2), which = TRUE]
    # ind.seg.oppCNA1 <- segs[((round(logR_Copy_Number / ploidy) <= 1 & Corrected_Copy_Number / ploidy > 1) | 
    #                         (round(logR_Copy_Number / ploidy) >= 1 & Corrected_Copy_Number / ploidy < 1)) &
    #                        (abs(round(logR_Copy_Number) - Corrected_Copy_Number) > 2), which = TRUE]
    segs[ind.seg.oppCNA, Corrected_Copy_Number := as.integer(round(logR_Copy_Number))]
    # correct bins overlapping adjusted segs
    if (length(ind.seg.oppCNA) > 0){    
      hits <- findOverlaps(query = cn.gr, subject = segs.gr[ind.seg.oppCNA])
      cn[queryHits(hits), Corrected_Copy_Number := segs[ind.seg.oppCNA][subjectHits(hits), Corrected_Copy_Number]]
    }
    ind.seg <- unique(c(ind.seg.maxCN, ind.seg.homd, ind.seg.chrX, ind.seg.oppCNA))
    # assign copy number for Corrected_MajorCN, Corrected_MinorCN (for corrected segments only)
    segs[ind.seg, Corrected_MajorCN := as.integer(round(Corrected_Ratio * Corrected_Copy_Number))]
    segs[ind.seg, Corrected_MinorCN := as.integer(round((1 - Corrected_Ratio) * Corrected_Copy_Number))]
	}
	
	## assign copy number call (string) based on Corrected_Copy_Number
	## for autosomes
  # Corrected_Copy_Number, Corrected_logR
	segs[, Corrected_Call := names[Corrected_Copy_Number + 1]]
  cn[, Corrected_Call := names[Corrected_Copy_Number + 1]]
	# for chrX
	if (gender == "male" & length(chrXStr) > 0){
		segs[Chromosome == chrXStr, Corrected_Call := names.chrX[Corrected_Copy_Number + 1]]
		cn[Chr == chrXStr, Corrected_Call := names.chrX[Corrected_Copy_Number + 1]]
	}else{ # female
		segs[Chromosome == chrXStr & Copy_Number >= maxCNtoCorrect.X, Corrected_Call := "HLAMP"]
		cn[Chr == chrXStr & CopyNumber >= maxCNtoCorrect.X, Corrected_Call := "HLAMP"]		
	}

	return(list(cn = copy(cn), segs = copy(segs)))
}

## compute copy number using corrected log ratio ##
logRbasedCN <- function(x, purity, ploidyT, cellPrev=NA, cn = 2){
	if (length(cellPrev) == 1 && is.na(cellPrev)){
		cellPrev <- 1
	}else{ #if cellPrev is a vector
		cellPrev[is.na(cellPrev)] <- 1
	}
	ct <- (2^x 
		* (cn * (1 - purity) + purity * ploidyT * (cn / 2)) 
		- (cn * (1 - purity)) 
		- (cn * purity * (1 - cellPrev))) 
	ct <- ct / (purity * cellPrev)
	ct <- sapply(ct, max, 1/2^6)
	return(ct)
}

allelicRatioBasedCN <- function(x, ct, purity, cellPrev=NA, rn = 0.5, cn = 2){
	if (length(cellPrev) == 1 && is.na(cellPrev)){
		cellPrev <- 1
	}else{ #if cellPrev is a vector
		cellPrev[is.na(cellPrev)] <- 1
	}
	totalAlleles <- ((1 - purity) * cn) + (purity * (1 - cellPrev)) * cn + (purity * cellPrev * ct)
	rt <- (x * totalAlleles - (((1 - purity) * rn) * cn + (purity * (1 - cellPrev) * rn * cn))) / (purity * cellPrev * ct)
	rt <- sapply(rt, min, 1)
	rt <- sapply(rt, max, 0)
	return(rt)
}

getMajorMinorCN <- function(state, symmetric = TRUE){
	majorCN <- NA
	minorCN <- NA
	if (symmetric){
		if (state==0){
			majorCN = 0; minorCN = 0;
		}else if (state==1){
			majorCN = 0; minorCN = 1;
		}else if(state==2){
			majorCN = 0; minorCN = 2;
		}else if (state==3){
			majorCN = 1; minorCN = 1;
		}else if (state==4){
			majorCN = 0; minorCN = 3;
		}else if (state==5){
			majorCN = 1; minorCN = 2;
		}else if (state==6){
			majorCN = 0; minorCN = 4;
		}else if (state==7){
			majorCN = 1; minorCN = 3;
		}else if (state==8){
			majorCN = 2; minorCN = 2;
		}else if (state==9){
			majorCN = 0; minorCN = 5;
		}else if (state==10){
			majorCN = 1; minorCN = 4;
		}else if (state==11){
			majorCN = 2; minorCN = 3;
		}else if (state==12){
			majorCN = 0; minorCN = 6;
		}else if (state==13){
			majorCN = 1; minorCN = 5;
		}else if (state==14){
			majorCN = 2; minorCN = 4;
		}else if (state==15){
			majorCN = 3; minorCN = 3;
		}else if (state==16){
			majorCN = 0; minorCN = 7;
		}else if (state==17){
			majorCN = 1; minorCN = 6;
		}else if (state==18){
			majorCN = 2; minorCN = 5;
		}else if (state==19){
			majorCN = 3; minorCN = 4;
		}else if (state==20){
			majorCN = 0; minorCN = 8;
		}else if (state==21){
			majorCN = 1; minorCN = 7;
		}else if (state==22){
			majorCN = 2; minorCN = 6;
		}else if (state==23){
			majorCN = 3; minorCN = 5;
		}else if (state==24){
			majorCN = 4; minorCN = 4;
		}
	}else{
		#stop("symmetric=FALSE not yet supported.")	
	}
	return(list(majorCN = majorCN, minorCN = minorCN))
}

printSDbw <- function(sdbw, fc, scale, data.type = ""){
	sdbw_str <- sprintf("S_Dbw dens.bw (%s):\t%0.4f ", data.type, sdbw$dens.bw)
    write.table(sdbw_str, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", 
        append = TRUE)
    sdbw_str <- sprintf("S_Dbw scat (%s):\t%0.4f ", data.type, sdbw$scat)
    write.table(sdbw_str, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", 
        append = TRUE)
    sdbw_str <- sprintf("S_Dbw validity index (%s):\t%0.4f ", 
        data.type, scale * sdbw$dens.bw + sdbw$scat)
    write.table(sdbw_str, file = fc, col.names = FALSE, 
        row.names = FALSE, quote = FALSE, sep = "", 
        append = TRUE)
}

## TODO: Add documentation
removeEmptyClusters <- function(data, convergeParams, results, proportionThreshold = 0.001, 
	proportionThresholdClonal = 0.05, recomputeLogLik = TRUE, verbose = TRUE){
  #rerunViterbi = TRUE, subcloneProfiles = FALSE, is.haplotypeData = FALSE){
	clust <- 1:nrow(convergeParams$s)
	names(clust) <- clust
	#newClust <- clust #original clusters
	for (cl in clust){
		ind <- which(results$ClonalCluster == cl)
		if (length(ind) / nrow(results) < proportionThreshold || 
				(length(ind) / nrow(results) < proportionThresholdClonal && cl == 1)){
			#newClust <- newClust[-which(names(newClust) == cl)]
			clust[cl] <- NA #assign cluster without sufficient data with NA
		}
	}
	k <- ncol(convergeParams$s)
	# sort the cellular prevalence since they are sorted in "results"
	convergeParams$s <- convergeParams$s[order(convergeParams$s[, k], decreasing = FALSE), , drop = FALSE]
	# if there is at least 1 cluster with sufficient data
	if (length(which(clust > 0)) > 0){
		#set new normal estimate as cluster 1 -> lowers purity from original estimate
	  clustToKeep <- which(!is.na(clust))
		purity <- (1 - convergeParams$s[clustToKeep[1], k]) * (1 -  convergeParams$n[k])
		convergeParams$n[k] <- 1 - purity
		#set new cellular prevalence using new clusters and renormalize to new cluster 1		
		convergeParams$s <- convergeParams$s[clustToKeep, , drop = FALSE]
		convergeParams$s[, k] <- 1 - (1 - convergeParams$s[, k]) / (1 - convergeParams$s[1, k])
		convergeParams$piZ <- convergeParams$piZ[clustToKeep, , drop = FALSE]
		convergeParams$rhoZ <- convergeParams$rhoZ[clustToKeep, , drop = FALSE]
		convergeParams$muC <- convergeParams$muC[, clustToKeep, , drop = FALSE]
		convergeParams$muR <- convergeParams$muR[, clustToKeep, , drop = FALSE]
		convergeParams$cellPrevParams <- lapply(convergeParams$cellPrevParams, "[", clustToKeep)
		#names(newClust) <- 1:length(newClust)
		clust[!is.na(clust)] <- 1:sum(!is.na(clust))

		#set new cellular prevalence and clonal cluster in results file	
		for (cl in 1:length(clust)){
			# assign data in removed cluster cl to next non-NA cluster
			if (is.na(clust[cl])){
			    # assign to the right (larger cluster number)
  				if (length(which(!is.na(clust) & names(clust) > cl)) > 0){
  					results[which(results$ClonalCluster == names(clust)[cl]), "CellularPrevalence"] <- 1 - convergeParams$s[clust[which(!is.na(clust) & names(clust) > cl)[1]], k]
  					results[which(results$ClonalCluster == names(clust)[cl]), "ClonalCluster"] <- clust[which(!is.na(clust) & names(clust) > cl)][1]
  				# assign to the left (smaller cluster number)
  				}else if (length(which(!is.na(clust) & names(clust) < cl)) > 0){
  					results[which(results$ClonalCluster == names(clust)[cl]), "CellularPrevalence"] <- 1 - convergeParams$s[clust[tail(which(!is.na(clust) & names(clust) < cl), 1)], k]
  					results[which(results$ClonalCluster == names(clust)[cl]), "ClonalCluster"] <- clust[tail(which(!is.na(clust) & names(clust) < cl), 1)]
  				}
			}else{ # update cluster and cellPrev info for kept clusters
				results[which(results$ClonalCluster == names(clust)[cl]), "CellularPrevalence"] <- 1 - convergeParams$s[clust[cl], k]
				results[which(results$ClonalCluster == names(clust)[cl]), "ClonalCluster"] <- clust[cl]		
			}
		}
	}else{ # no clusters with sufficient data
		
		# set params to only cluster with data or to default cluster01 if no cluster with data
		clustData <- which.max(table(results$ClonalCluster))
		if (length(clustData) >= 0){
			clustData <- 1
		}
		#set normal contamination to 100%
		convergeParams$n[k] <- 1 - convergeParams$s[clustData, k]
		convergeParams$s <- convergeParams$s[clustData, , drop = FALSE]
		convergeParams$s[, k] <- 0.0
		convergeParams$cellPrevParams <- lapply(convergeParams$cellPrevParams, "[", clustData)
		
			# set all clusters to 1 and all cellular prevalence to 1.0; leave HET as NA
		results[which(results$TITANcall != "HET"), "CellularPrevalence"] <- 1
		results[which(results$TITANcall != "HET"), "ClonalCluster"] <- 1
	}	
	
	# rerun viterbi with new adjusted param settings
	#if (rerunViterbi){
	#  optimalPath <- viterbiClonalCN(data,convergeParams)
	#  newResults <- outputTitanResults(data,convergeParams,optimalPath,
	#                                filename=NULL,posteriorProbs=F,subcloneProfiles=subcloneProfiles,
	#                                correctResults=FALSE, proportionThreshold = 0, is.haplotypeData=is.haplotypeData,
	#                                proportionThresholdClonal = 0, rerunViterbi = FALSE, recomputeLogLik = FALSE)
	#  results <- newResults$results
	#}
	
	if (recomputeLogLik){
	  if (verbose)
  	  message("outputTitanResults: Recomputing log-likelihood.")
  	newParams <- convergeParams
  	iter <- length(newParams$n)
  	newNumClust <- nrow(newParams$s)
  	newParams$genotypeParams$var_0 <- newParams$var[, iter]
  	newParams$genotypeParams$varR_0 <- newParams$varR[, iter]
  	newParams$genotypeParams$piG_0 <- newParams$piG[, iter]
  	newParams$normalParams$n_0 <- newParams$n[iter]
  	newParams$ploidyParams$phi_0 <- newParams$phi[iter]
  	newParams$cellPrevParams$s_0 <- newParams$s[, iter]
  	newParams$cellPrevParams$piZ_0 <- newParams$piZ[1:newNumClust, iter]
  	
  	p <- runEMclonalCN(data, newParams, maxiter=1, txnExpLen=convergeParams$txn_exp_len, 
  	                   txnZstrength=convergeParams$txn_z_strength, useOutlierState=FALSE, 
  	                   normalEstimateMethod="fixed", estimateS=FALSE,estimatePloidy=F, verbose=verbose)
    convergeParams$loglik[iter] <- tail(p$loglik, 1)
    convergeParams$muC[, , iter] <- p$muC[, , 2]
    convergeParams$muR[, , iter] <- p$muR[, , 2]
    convergeParams$rhoZ <- p$rhoZ
    convergeParams$rhoG <- p$rhoG
	}
	return(list(convergeParams = convergeParams, results = results))
}

getSubcloneProfiles <- function(titanResults){
	if (is.character(titanResults)){
		titanResults <- read.delim(titanResults, header = TRUE, 
				stringsAsFactors = FALSE, sep = "\t")
	}else if (!is.data.frame(titanResults)){
		stop("getSubcloneProfiles: titanResults is not character or
				data.frame.")
	}
	
	clonalClust <- titanResults$ClonalCluster
	clonalClust[is.na(clonalClust)] <- 0
	numClones <- as.numeric(max(clonalClust, na.rm = TRUE))
	if (is.na(numClones)){ numClones <- 0 }
	cellPrev <- unique(cbind(Cluster = titanResults$ClonalCluster, 
			Prevalence = titanResults$CellularPrevalence))
	
	if (numClones == 0 || is.infinite(numClones)){
		subc1 <- data.table(cbind(CopyNumber = as.numeric(titanResults$CopyNumber), 
				TITANcall = titanResults$TITANcall, Prevalence = "NA"))
	}
	if (numClones == 1){
		subc1Prev <- cellPrev[which(cellPrev[, "Cluster"] == "1"), "Prevalence"]
		subc1 <- data.table(cbind(CopyNumber = as.numeric(titanResults$CopyNumber), 
				TITANcall = titanResults$TITANcall,
				Prevalence = as.numeric(subc1Prev)))
	}
	if (numClones == 2){
		subc2Prev <- as.numeric(cellPrev[which(cellPrev[, "Cluster"] == "2"),
				"Prevalence"])
		subc1Prev <- as.numeric(cellPrev[which(cellPrev[, "Cluster"] == "1"),
				"Prevalence"])
		subc1Prev <- subc1Prev - subc2Prev
		subc2 <- data.table(CopyNumber = as.numeric(titanResults$CopyNumber), 
			TITANcall = titanResults$TITANcall, Prevalence = as.numeric(subc2Prev))
		#mode(subc2[, 1]) <- "numeric"; mode(subc2[, 3]) <- "numeric"
		subc1 <- copy(subc2)
		ind <- which(titanResults$ClonalCluster == 2)
		subc1[ind, CopyNumber := 2]
		subc1[ind, TITANcall := "HET"]
		subc1[, "Prevalence"] <- subc1Prev
	}
	
	## Add subclone 1, 2 and 3 if they are defined
	outMat <- data.table(Subclone1 = subc1)
	if (exists("subc2")){
		outMat <- cbind(outMat, Subclone2 = subc2)
	}
	#if (exists("subc3")){
	#	outMat <- cbind(outMat, Subclone3 = subc3, stringsAsFactors = FALSE)
	#}
	return(outMat)
}
gavinha/TitanCNA documentation built on April 22, 2021, 9:38 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
gavinha/TitanCNA
Subclonal copy number and LOH prediction from whole genome sequencing of tumours

R/utils.R
In gavinha/TitanCNA: Subclonal copy number and LOH prediction from whole genome sequencing of tumours

Documented in computeSDbwIndex correctIntegerCN correctReadDepth filterData getPositionOverlap loadAlleleCounts loadDefaultParameters outputModelParameters outputTitanResults outputTitanSegments setGenomeStyle

R Package Documentation

Browse R Packages

We want your feedback!

gavinha/TitanCNA Subclonal copy number and LOH prediction from whole genome sequencing of tumours

R/utils.R In gavinha/TitanCNA: Subclonal copy number and LOH prediction from whole genome sequencing of tumours

Documented in computeSDbwIndex correctIntegerCN correctReadDepth filterData getPositionOverlap loadAlleleCounts loadDefaultParameters outputModelParameters outputTitanResults outputTitanSegments setGenomeStyle

R Package Documentation

Browse R Packages

We want your feedback!

gavinha/TitanCNA
Subclonal copy number and LOH prediction from whole genome sequencing of tumours

R/utils.R
In gavinha/TitanCNA: Subclonal copy number and LOH prediction from whole genome sequencing of tumours