R/facopy_internal_v45.r
In facopy: Feature-based association and gene-set enrichment for copy number alteration analysis in cancer

Documented in addFeatures addVariables alterationSummary calculateCor facopy facopyEnrichment facopyPlot getFacopyInfo plotBar plotHist plotPCA plotZoom preview readCNData variableCor variableSummary

if(getRversion() >= "2.15.1") 
  utils::globalVariables(c("bp_len","code","chr_q_arm","type","altered","xmin","xmax",
                           "pos_st","pos_en","freq","value","pos","ymin","ymax",
                           "Frequency","Chromosome","facopy_msigdb","facopy_msigdbNames",
                           "GOBPPARENTS","GOCCPARENTS","GOMFPARENTS"))

#######################################################################
############################################### INTERNAL CODIFICATION #
#######################################################################
.printList = function(ttext, x, header="aliases") {
  cat(ttext,"\n",sep=":")
  for (i in seq(length(x))) {
    if (!is.null(names(x)[[i]]))
      cat(" - ", names(x)[[i]][1])
    else
      cat(" - ", x[[i]][1])
    if (length(x[[i]])>1)
      cat(" (",header,": ", paste(x[[i]],collapse=", "), ")", sep="")
    cat("\n")
  }
}
.getFacopyAnnot = function() {
  dat = data(package="facopy.annot")$results[,3]
  dat = sapply(paste(.genomes(), rep(c("_db","_feature"),each=length(.genomes())), sep=""), grep, dat, value=TRUE)
  dat = lapply(dat, function(i) gsub("^[a-zA-Z0-9]+_[a-zA-Z0-9]+_","",i))
  names(dat) = sapply(names(dat), function(i) gsub("_"," ",i))
  dat
}

getFacopyInfo = function() {
  dat = .getFacopyAnnot()
  .printList("Variable types", .types())
  .printList("Alteration combinations", .alterations())
  .printList("Input data", .methods())
  .printList("Genome builds", .genomes())
  .printList("Genomic features", dat[grep("feature",names(dat))], "bundled")
  .printList("External data sets", dat[grep("db",names(dat))], "available")
}

.genomes = function() c("hg18","hg19","mm8")
.features = function() c("ensembl","cancergene","oncogene","tumorsupressor","lincRNA","mirnas")
.allChrs = function(genome, sex) {
  if (genome%in%c("hg18","hg19")) {
    out = paste(rep(c(1:22,sex),each=2), c("p","q"), sep="")
    out[!out%in%c("13p","14p","15p","22p")]
  } else if (genome%in%c("mm8")) {
    paste(c(1:19,sex), "q", sep="")
  }
}
.methods = function() c("seqcna", "cnanorm", "patchwork", "freec", "oncosnp", "oncosnp-seq", "gap", "exomecnv", "titan", "fastcall", "excavator")
.alterCodes = function() c("1","3","11","13","12") #1:loss, 3:gain; x:noLOH, 1x:LOH
.allCodes = function() c("1","2","3","11","12","13") # in that specific order
.alterations = function() list("amplifications", "deletions", "loh", "cnas",
                               c("any","all"), "onlygain", "someloss")
.getAlterCodes = function(alteration) {
  alterCodes = list(c("3","13"), c("1","11"), c("11","13","12"), c("3","13","1","11"), 
                    c("3","13","1","11","12"), c("3"), c("1","11","12","13"))
  wh = which(!is.na(sapply(.alterations(), function(i) pmatch(tolower(alteration),i))))[1]
  if (is.na(wh))
    stop("Unknown alteration name: ", alteration)
  alterCodes[[wh]]
}
.types = function() list(categorical=c("categorical","discrete","qualitative","enumerative"),
                         quantitative=c("quantitative","continuous"))
.getTypeCode = function(type) {
  wh = which(!is.na(sapply(.types(), function(i) pmatch(tolower(type),i))))[1]
  if (is.na(wh))
    stop("Unknown variable type: ", type)
  names(.types())[[wh]]
}
.isCateg = function(fad, x) fad@vData$varTypes[x]=="categorical"
.n = function(fad, vn=NULL, vv=NULL) {
  if (!is.null(vn)) sel = fad@vData$varTable[,vn]%in%vv
  else sel = TRUE
  length(fad@vData$varTable$code[sel])
}
.unLOH = function(x) { x[x>=10] = x[x>=10] - 10; x }
.getLOH = function(x) floor(x/10)
.ord = function(x, arms, ...) order(match(x, arms), ...)
.refcols = function(n) {
	cc = c(colorRampPalette(c("#F8CA00","#bd1550"))(3),"#e00000","#926239", colorRampPalette(c("#25e361","#59BCE9","#5123c1"))(4), "#ff00cc")
	cc = cc[c(3:5,2:1,6:10)]
	cc = cc[c(2:10,1)]
	colorRampPalette(cc)(n+1)[1:n]
}

#######################################################################
########################### READING OUTPUT FROM CNA-DETECTION METHODS #
#######################################################################

.readFromInt = function(samples, samplenames, f, fargs, FUN=NULL, ...) {
  own = c()
  for (i in 1:length(samples)) {
    data = f(read.delim(file=samples[i], ...), fargs)
    own = rbind(own, cbind(rep(samplenames[i],nrow(data)), data))
    message(paste("(",i,"/",length(samples),") ",nrow(data)," calls in sample '",samplenames[i],"'",sep=""))
  }
  own[,1] = as.vector(as.character(own[,1]))
  if (!is.null(FUN)) own[,1] = sapply(own[,1], FUN)
  own
}
.pack = function(data, sex) {
  chrs = rle(data$chr)$values
  chrs = c(chrs[chrs!=c("X","Y")], sex)
  out = c()
  for (chr in chrs) {
    xx = data[data$chr==chr,]
    en = cumsum(rle(xx$type)$lengths)
    if (length(en)>1) st = c(1, en[seq(length(en)-1)]+1)
    else st = 1
    if (length(en)>0) {
      xx2 = xx[st,]
      xx2$pos_en = xx$pos_en[en]
      xx2$bp_len = sapply(seq(st), function(i) sum(xx$bp_len[st[i]:en[i]]+1))
      out = rbind(out, xx2)
    }
  }
  out
}
.makeInfo = function(cData, sex, pack=FALSE) {
  if (!is.null(sex)) if (sex=="") sex = NULL
  out = c()
  en = cumsum(rle(cData$code)$lengths)
  st = c(1, en[seq(length(en)-1)]+1)
  if (pack) for (s in seq(st)) out = rbind(out, .pack(cData[st[s]:en[s],], sex))
  if (!pack) { # sex chrs selection has already been done if 'pack', no need to redo this
    chrs = rle(cData$chr)$values
    chrs = c(chrs[!chrs%in%c("X","Y","XY","chrX","chrY","chrXY")], sex)
    out = cData[cData$chr%in%chrs,]
  }
  new("facopyInfo", cData=out, sex=sex)
}

readCNData = function(folder, method=NULL, sex=c("X"), FUN=NULL, 
                      version=NULL, window=NULL, rankThr=NULL,
                      pfbFilename, lengthThr=10, ...) {
  tryFUN = function(old, new) {
    if (is.null(old)) {
      message("Note:\nFunction applied to filenames will be: ",as.character(enquote(new)[2]))
      new
    } else { old }
  }
  methodArg = .methods()[pmatch(tolower(method),.methods())]
  if (length(methodArg)==0) {
    .readFromFolder(folder, sex, FUN, ...)
  } else if (is.na(methodArg)) {
    stop("Output data from specified copy number calling method not supported.") 
  } else if (methodArg=="seqcna") {
    .readFromSEQCNA(folder, sex, version, FUN)
  } else if (methodArg=="cnanorm") {
    if (is.null(window)) stop("Window size is required.")
    .readFromCNANORM(folder, sex, version, window, FUN)
  } else if (methodArg=="patchwork") {
    FUN = tryFUN(FUN, function(x)gsub("_Copynumbers.csv$","",x))
    .readFromPATCHWORK(folder, sex, version, FUN)
  } else if (methodArg=="freec") {
    FUN = tryFUN(FUN, function(x)gsub("_ratio.txt$","",x))
    .readFromFREEC(folder, sex, version, FUN)
  } else if (methodArg=="oncosnp") {
    if (is.null(version)) stop("Version is required for OncoSNP.")
    FUN = tryFUN(FUN, function(x)gsub(".cnvs$","",x))
    .readFromONCOSNP(folder, sex, version, rankThr, FUN)
  } else if (methodArg=="oncosnp-seq") {
    FUN = tryFUN(FUN, function(x)gsub(".cnvs$","",x))
    .readFromONCOSNPSEQ(folder, sex, version, rankThr, FUN)
  } else if (methodArg=="gap") {
    if (is.null(pfbFilename)) stop("PFB filename is required.")
    .readFromGAP(folder, pfbFilename, sex, lengthThr, FUN)
  } else if (methodArg=="exomecnv") {
    FUN = tryFUN(FUN, function(x)gsub(".segment.copynumber.txt$","",x))
    .readFromEXOMECNV(folder, sex, version, FUN)
  } else if (methodArg=="titan") {
    FUN = tryFUN(FUN, function(x)gsub(".txt$","",x))
    .readFromTITAN(folder, sex, version, FUN)
  } else if (methodArg%in%c("fastcall","excavator")) {
    FUN = tryFUN(function(x)gsub("^FastCallResults_([^.]*).*", "\\1", x))
    .readFromFASTCALL(folder, sex, version, FUN)
  } else {
    stop("Output data from specified copy number calling method not supported.") 
  }
}

.readFromFolder = function(folder, sex=c("X"), FUN=NULL, ...) {
  own = .readFromInt(list.files(folder,full.names=TRUE), list.files(folder,full.names=FALSE), function(x,fargs)I(x), NULL, FUN, ...)
  colnames(own) = c("code", "chr", "pos_st", "pos_en", "bp_len", "type")
  .makeInfo(own, sex)
}

.readFromSEQCNA = function(folder, sex=c("X"), version=NULL, FUN=NULL) {
  f = function(data, fargs) { # the output is NOT allele-specific
    data = data[,c("chrom","win.start","CN")]
    data = data[!is.na(data$CN),]
    data$CN[data$CN<1] = 1
    data$CN[data$CN>3] = 3
    l = diff(data$win.start[1:2])-1
    cbind(data, data$win.start+l, rep(l,nrow(data)))
  }
  getSample = function(x, f) grep("seqCNA_out.txt", list.files(x, full.names=f, recursive=TRUE), value=TRUE)
  own = .readFromInt(getSample(folder,TRUE), dirname(getSample(folder,FALSE)), f, NULL, FUN, header=TRUE, sep="\t")
  own[,2] = as.numeric(gsub("chr","",as.character(own[,2])))
  own = own[,c(1,2,3,5,6,4)]
  colnames(own) = c("code", "chr", "pos_st", "pos_en", "bp_len", "type")
  .makeInfo(own, sex, TRUE)
}

.readFromCNANORM = function(folder, sex=c("X"), version=NULL, window, FUN=NULL) {
  f = function(data, fargs) { # the output is NOT allele-specific
    data$Cn = round(data$SegMean.n)
    data$Cn[data$Cn<1] = 1
    data$Cn[data$Cn>3] = 3
    l = fargs$w
    data$Start = data$Pos*(l-1)+1
    cbind(data[,c("Chr","Start","Cn")], data$Start+l-1, rep(l,nrow(data)))
  }
  own = .readFromInt(list.files(folder,full.names=TRUE), list.files(folder,full.names=FALSE), f, list(w=window), FUN, header=TRUE, sep="\t")
  own = own[,c(1,2,3,5,6,4)]
  own[,2] = gsub("chr","",as.character(own[,2]))
  colnames(own) = c("code", "chr", "pos_st", "pos_en", "bp_len", "type")
  .makeInfo(own, sex, TRUE)
}

.readFromPATCHWORK = function(folder, sex=c("X"), version=NULL, FUN=function(x)gsub("_Copynumbers.csv$","",x)) {
  f = function(data, fargs) { # the output CAN be allele-specific
    mCn = data[,"mCn"]
    data = data[,c("chr","start","end","Cn")]
    data$Cn[data$Cn<1] = 1
    data$Cn[data$Cn>3] = 3
    if (! all(mCn==0)) # means that allele-specific was used
      data$Cn = data$Cn + 10*as.integer(mCn==0)
    cbind(data, data$end-data$start+1)
  }
  getSample = function(folder,f) grep(".*csv$", list.files(folder,full.names=f), value=TRUE)
  own = .readFromInt(getSample(folder,TRUE), basename(getSample(folder,FALSE)), f, NULL, FUN, header=TRUE, sep=",")
  colnames(own) = c("code", "chr", "pos_st", "pos_en", "type", "bp_len")
  own[,2] = gsub("chr","",as.character(own[,2]))
  .makeInfo(own, sex)
}

.readFromFREEC = function(folder, sex=c("X"), version=NULL, FUN=function(x)gsub("_ratio.txt$","",x)) {
  f = function(data, fargs) { # the output CAN be allele-specific
    hasGeno = FALSE
    if ("Genotype"%in%colnames(data)) { # not verified!!!
      geno = data[,"Genotype"]
      geno[is.na(geno)] = "NA"
      isHom = sapply(strsplit(geno,""), function(i)all(i==i[1]))
      hasGeno = TRUE
    }
    data = data[,c("Chromosome","Start","CopyNumber")]
    data[data[,3]<1,3] = 1
    data[data[,3]>3,3] = 3
    if (hasGeno) data[,3] = data[,3] + as.integer(isHom)
    l = diff(data$Start[1:2])-1
    cbind(data, data$Start+l, rep(l,nrow(data)))
  }
  getSample = function(folder,f) grep(".*_ratio.txt$", list.files(folder,full.names=f), value=TRUE)
  own = .readFromInt(getSample(folder,TRUE), basename(getSample(folder,FALSE)), f, NULL, FUN, header=TRUE, sep="\t")
  own = own[,c(1,2,3,5,6,4)]
  colnames(own) = c("code", "chr", "pos_st", "pos_en", "bp_len", "type")
  .makeInfo(own, sex, TRUE)
}

.readFromTITAN = function(folder, sex=c("X"), version=NULL, FUN=function(x)gsub(".txt$","",x)) {
  f = function(data, fargs) { # the output IS allele-specific
    data = data[,c("Chr","Position","TITANstate")]
    data[data[,3]%in%c(0,1,2),3] = 1
    data[data[,3]%in%c(3,5),3] = 12
    data[data[,3]%in%c(4),3] = 2
    data[data[,3]%in%c(6,9),3] = 13
    data[data[,3]%in%c(7,8),3] = 3
    data[data[,3]%in%c(10,14),3] = 13
    data[data[,3]%in%c(11,12,13),3] = 3
    data[data[,3]%in%c(15,20),3] = 13
    data[data[,3]%in%c(16,17,18,19),3] = 3
    data[data[,3]%in%c(21:100),3] = 3
    data$Start = c(1, data$Position[seq(length(data$Position)-1)]+1)
    cbind(data, data$Position-data$Start+1)
  }
  own = .readFromInt(list.files(folder,full.names=TRUE), list.files(folder,full.names=FALSE), f, NULL, FUN, header=TRUE, sep="\t")
  own = own[,c(1,2,5,3,6,4)]
  colnames(own) = c("code", "chr", "pos_st", "pos_en", "bp_len", "type")
  .makeInfo(own, sex)
}

.readFromONCOSNPSEQ = function(folder, sex=c("X"), version=1.04, rankThr=NULL, FUN=function(x)gsub(".cnvs$","",x)) {
  .readFromONCOSNPInt(folder, sex, version, rankThr, FUN, SEQ=TRUE)
}
.readFromONCOSNP = function(folder, sex=c("X"), version=1.3, rankThr=NULL, FUN=function(x)gsub(".cnvs$","",x)) {
  .readFromONCOSNPInt(folder, sex, version, rankThr, FUN, SEQ=FALSE)
}
.readFromONCOSNPInt = function(folder, sex, version, rankThr, FUN, SEQ) {
  f = function(data, fargs) { # the output IS allele-specific
    if (is.null(data$CopyNumber))
      data$CopyNumber = data$Copy.Number
    data$CopyNumber[data$CopyNumber<1] = 1
    data$CopyNumber[data$CopyNumber>3] = 3
    data$LOH[data$LOH==2] = 0
    data$CopyNumber = data$CopyNumber + 10 * data$LOH
    if (!is.null(data$Rank) && !is.null(fargs$rankThr))
      data = data[data$Rank<=fargs$rankThr,]
    if (!is.null(data$Rank)) cbind(data[,1:4], data$Rank)
    else data[,1:4]
  }
  getSample = function(folder,f) grep("*.cnvs$", list.files(folder,full.names=f), value=TRUE)
  if (SEQ==FALSE && version<1.3 && !is.null(rankThr)) {
    rankThr = NULL
    warning("Ranks are only available in OncoSNP from version 1.3 on.")
  }
  own = .readFromInt(getSample(folder,TRUE), getSample(folder,FALSE), f, list(rankThr=rankThr), FUN, header=TRUE,sep="\t",fill=TRUE)
  own = cbind(own, apply(own, 1, function(i) as.numeric(as.vector(i[4]))-as.numeric(as.vector(i[3]))))
  colnames(own)[1:6] = c("code", "chr", "pos_st", "pos_en", "type", "bp_len")
  if (version>=1.3 && ncol(own)>6) colnames(own)[7] = c("conf")
  .makeInfo(own, sex)
}

.readFromGAP = function(folder, pfbFilename, sex=c("X"), lengthThr=10, FUN=NULL) {
  f = function(data, fargs) { # the output IS allele-specific
    data = data[data$Len>=fargs$lengthThr & !is.na(data$CN) & !is.na(data$Chr), ] 
    # if CN = major allele count and there is no germline LOH (LOH column), there is somatic LOH
    isLOH = 10 * (round(data$CN)==round(data$BA) & !data$LOH)
    data$CN[data$CN<1] = 1
    data$CN[data$CN>3] = 3
    data$CN = round(data$CN) + isLOH
    data = cbind(pfb[data$Ind,c(1,3)], pfb[data$Ind_K,c(1,3)], data)
    data[, c(8,16,1,3,5,6,2,4,18)]
  }
  pfb = read.table(pfbFilename,header=TRUE,sep="\t")
  samples = list.files(folder, full.names=TRUE)
  samplenames = list.files(folder, full.names=FALSE)
  own = .readFromInt(samples, samplenames, f, list(lengthThr=lengthThr), FUN, header=TRUE,sep=" ",fill=TRUE)
  own[,2] = floor(as.numeric(as.character(own[,2]))) # may be problematic if sexual chromosomes used, but floor fun. is necessary
  own = cbind(own, apply(own, 1, function(i) as.numeric(as.vector(i[9]))-as.numeric(as.vector(i[8]))))
  colnames(own) = c("code", "chr", "type", "snp_st", "snp_en", "ind_st", "ind_en", "pos_st", "pos_en", "conf", "bp_len")
  .makeInfo(own, sex)
}

.readFromEXOMECNV = function(folder, sex=c("X"), version=NULL, FUN=function(x)gsub(".segment.copynumber.txt$","",x)) {
  f = function(data, fargs) { # the output is NOT allele-specific
    data[,4][data[,4]<1] = 1
    data[,4][data[,4]>3] = 3
    cbind(data, data[,3]-data[,2]+1)
  }
  getSample = function(folder,f) grep(".segment.copynumber.txt$", list.files(folder,full.names=f), value=TRUE)
  own = .readFromInt(getSample(folder,TRUE), basename(getSample(folder,FALSE)), f, NULL, FUN, header=FALSE, sep="\t")
  own[,2] = gsub("chr","",as.character(own[,2]))
  colnames(own) = c("code", "chr", "pos_st", "pos_en", "type", "bp_len")
  .makeInfo(own, sex)
}

.readFromFASTCALL = function(folder, sex=c("X"), version=NULL, FUN=function(x)gsub("^FastCallResults_([^.]*).*", "\\1", x)) {
  f = function(data, fargs) { # the output is NOT allele-specific
    data = data[,c("Chromosome","Start","End","CN")]
    data = data[!is.na(data$CN),]
    data$CN[data$CN<1] = 1
    data$CN[data$CN>3] = 3
    cbind(data, data[,3]-data[,2]+1)
  }
  getSample = function(x, f) grep("^FastCallResults_", list.files(x, full.names=f, recursive=TRUE), value=TRUE)
  own = .readFromInt(getSample(folder,TRUE), dirname(getSample(folder,FALSE)), f, NULL, FUN, header=TRUE, sep="\t")
  own[,2] = gsub("chr","",as.character(own[,2]))
  colnames(own) = c("code", "chr", "pos_st", "pos_en", "bp_len", "type")
  .makeInfo(own, sex)
}

#######################################################################
########################################## GENERIC INTERNAL FUNCTIONS #
#######################################################################

# necessary because aggregation in data.table package does not fill with not present factor levels
# col: annotation columns, thus 1 to 1 with respect to sample code
# ...: columns from which to produce combinations with themselves and code
.fill = function(varTable, aggr, col, emptyVal, ...) {
  eg = do.call(expand.grid,list(code=varTable$code, ...))
  st = ncol(eg)+1
  if (!is.null(col)) {
    eg = merge(eg, varTable[,c("code",col),drop=FALSE])
    colnames(eg)[st:ncol(eg)] = colnames(aggr)[st:ncol(eg)]
  }
  out = merge(aggr, eg, all=TRUE)
  out[is.na(out)] = emptyVal
  out
}

.calcFreqsInt = function(sel_features, tabs, cn, nums, vari) {
  freqs = c()
  for (i in tabs) {
    tt = i[i[,vari]%in%nums,]
    tab = table(factor(tt[,vari],levels=nums), factor(tt$type,levels=.allCodes()))
    rs = rowSums(tab[,colnames(tab)%in%cn,drop=FALSE])
    rsAll = rowSums(tab[,,drop=FALSE])
    freqs = rbind(freqs, rs / rsAll)
  }
  if (! is.null(dim(freqs))) {
    freqs[is.na(freqs)] = 0
    rownames(freqs) = sel_features$feature
    colnames(freqs)[1:length(nums)] = nums
  }
  data.frame(freqs, check.names=FALSE)
}

.makeArmsInt = function(cData, chrs, genome, varColumn=NULL) {
  if (genome%in%c("hg18","hg19")) {
	sepsText = paste(genome,"_armLimits",sep="")
	data(list=sepsText)
	seps = eval(parse(text=sepsText))
    seps = seps$limit[sapply(seps$chr_q_arm, function(i) length(grep("p",i))==1)]
  } else if (genome%in%c("mm8")) {
    seps = rep(0,length(chrs))
  }
  x = cData[,c("code","type","chr","pos_st","bp_len",varColumn)]
  out = c()
  for (i in 1:length(chrs)) {
    chr = chrs[i]
    xsub = x[x$chr==chr,]
    wh = xsub$pos_st >= seps[i]
    if (length(which(wh==FALSE)) > 0) out = rbind(out, cbind(xsub[!wh,!colnames(xsub)%in%c("chr","pos_st")], chr_q_arm=paste(chr,"p",sep="")))
    if (length(which(wh==TRUE)) > 0) out = rbind(out, cbind(xsub[wh,!colnames(xsub)%in%c("chr","pos_st")], chr_q_arm=paste(chr,"q",sep="")))
  }
  out
}

.aggregateInt = function(x, n, genome, arms, FUN, levels, varTable) {
  x = x[x$chr_q_arm%in%arms,] # disregard data in arms with no features
  aggr = data.frame(data.table(x)[,sum(bp_len), by=list(code, chr_q_arm, match.fun(FUN)(type))])
  aggr = .fill(varTable, aggr, NULL, 0, chr_q_arm=arms, match.fun=levels)
  colnames(aggr) = c("code","chr_q_arm", "type", "altered")
  #thanks to 'n', .fill not necessary
  aggr = data.frame(data.table(aggr)[,sum(altered)/n, by=list(chr_q_arm, type)])
  colnames(aggr) = c("chr_q_arm", "type", "altered")
  armlimitsText = paste(genome,"_armLimits",sep="")
  data(list=armlimitsText)
  armlimits = eval(parse(text=armlimitsText))
  lens = matrix(armlimits$limit, 2)
  lens[2,] = lens[2,]-lens[1,]
  armlengths = data.frame(chr_q_arm=as.character(armlimits$chr_q_arm), length=as.vector(lens))
  aggr = merge(data.frame(aggr), armlengths)
  aggr$freq = as.numeric(as.character(aggr$altered)) / as.numeric(as.character(aggr$length))
  # fill missing rows due to lack of alterations in specific arms
  tab = table(aggr$chr_q_arm, aggr$type)
  wh = which(tab==0, arr.ind=TRUE)
  if (nrow(wh) > 0)
    for (i in 1:nrow(wh)) {
      arm = rownames(tab)[wh[i,1]]
      len = armlengths$length[armlengths$chr_q_arm==arm]
      aggr = rbind(aggr, c(arm, colnames(tab)[wh[i,2]], 0, len, 0))
    }
  aggr[.ord(aggr$chr_q_arm, arms),]
}

#######################################################################
############################## ANNOTATION WITH VARIABLES AND FEATURES #
#######################################################################

# allows to give names to variables and their names
# allows to use only subsets of variable values
.makeVars = function(varTypes, varNames, varColumns, varValuesNames, varValues) {
  varTypes = sapply(varTypes, .getTypeCode)
  if (! length(varTypes)==length(varNames))
    stop("The lengths of the vectors for the variables names and their types do no match.")
	if (! length(varNames)==length(varColumns))
		stop("The lengths of the vectors for the variables names and their column names do no match.")
	if (! length(varNames)==length(varValues))
			stop("The lengths of the vector for the variable names and the list for their values do no match.")
	if (! identical(sapply(varValues,length), sapply(varValuesNames,length)))
		stop("The lengths of the lists for the values and their names do no match.")
	names(varTypes) = names(varColumns) = names(varValues) = names(varValuesNames) = varNames
	list(varTypes=varTypes, varNames=varNames, varColumns=varColumns, varValuesNames=varValuesNames, varValues=varValues)
}

.checkData = function(fad) {
	mandatoryColumns = c("code","chr","type","pos_st","pos_en","bp_len")
	wh = which(! mandatoryColumns%in%colnames(fad@cData))
	if (length(wh) >0)
		stop(paste("The following columns are required in the calls table: ",
				   paste(mandatoryColumns[wh],collapse=", "),".",sep=""))
	if (! all(fad@vData$varColumns%in%colnames(fad@cData)))
		stop("Not all variable column names are present in the calls table.")
	for (i in 1:length(fad@vData$varColumns)) {
		inter = intersect(fad@vData$varValues[[i]], unique(fad@cData[,fad@vData$varColumns[i]]))
		if (length(inter) == 0)
			stop(paste("Variable '",fad@vData$varNames[i],"': its defined values and those in the calls table do not match at all.",sep=""))
	}
}

# variable columns can be taken from file; all but 'code' will be taken
# variable values can be taken from file; values can be taken for one or more variables
# variable names can be taken from variable column names; all or none may be taken
# variable value names can be taken from variable values; can be taken for one or more variables
addVariables = function(fad, varInfo, varTypes, varColumns=NULL, varValues=NULL, varColumnsNames=NULL, varValuesNames=NULL, ...) {
	if (class(varInfo)=="character") vv = read.table(varInfo, header=TRUE, ...)
  else vv = varInfo
	if (! "code"%in%colnames(vv))
		stop("The necessary column 'code' is not present in the variables file.")
	cc = merge(fad@cData, cbind(vv, all=1), by="code")
	fad@cData = cc
  #
	if (is.null(varColumns)) {
	  if (!is.null(varValues))
	    stop("Variables should be specified with 'varColumns' if their values are given with 'varValues'.")
	  varColumns = colnames(vv)[colnames(vv)!="code"]
	}
  .autoComplete = function(x, y, f, ...) {
    wnull = function(o) sapply(o, is.null)
    if (is.null(x)) { x = f(y,...) } else if (any(wnull(x))) { x[wnull(x)] = f(y[wnull(x)],...) }
    x
  }
  f = function(y, vv) {
    splitc = function(x) lapply(seq_len(ncol(x)), function(i) x[,i])
    lapply(splitc(vv[,y,drop=FALSE]), function(i) names(table(i)))
  }
	varValues = .autoComplete(varValues, varColumns, f, vv)
	varColumnsNames = .autoComplete(varColumnsNames, varColumns, I)
	varValuesNames = .autoComplete(varValuesNames, varValues, I)  
  varData = .makeVars(c(varTypes,"categorical"), c(varColumnsNames,"All"), c(varColumns,"all"), 
                      c(varValuesNames,"yes"), c(varValues,1))
	fad@vData = varData
	fad@vData$varTable = cbind(vv, all=1)
	.checkData(fad)
	fad
}

addFeatures = function(fad, what=c("ensembl","cancergene","oncogene","tumorsupressor","lincRNA","mirnas")[1], 
                       genome=c("hg18","hg19","mm8")[1], lMargin=0, rMargin=0, minoverlap=1, data=NULL) {
	if (! genome%in%.genomes())
    stop("Currently, only human genome builds (hg18, hg19) and mouse mm8 build are supported.")
	if (genome%in%c("hg18","hg19")) fad@chrs = c(as.character(1:22),fad@sex)
  if (genome%in%c("mm8")) fad@chrs = c(as.character(1:19),fad@sex)
  if (is.null(data)) {
	xText = paste(genome,"_feature_",what,sep="")
	data(list=xText)
	x = eval(parse(text=xText))
  } else {
    if (class(data)=="character") {
      x = read.table(data, header=TRUE,sep="\t")
    } else if (class(data)=="data.frame") {
      reqCols = c("chr","bp_st","bp_en","feature","chr_q_arm")
      if (! all(reqCols%in%colnames(data)))
        stop("At least the following columns should be present in the external data: ", paste(reqCols,collapse=", "))
      data$chr = gsub("chr","",tolower(data$chr))
      data$chr_q_arm = gsub("chr","",tolower(data$chr_q_arm))
      data$feature = toupper(data$feature)
      x = data
    } else {
      stop("External data with information on genomic features should be a data.frame.")
    }
  }
  x = x[x$chr%in%fad@chrs,]
	if (lMargin > 0) x$bp_st = max(1, x$bp_st-lMargin)
	if (rMargin > 0) x$bp_en = x$bp_en + rMargin
	fad@features = x
	fad@genome = genome
	fad@what = what
	fad@arms = unique(as.character(x$chr_q_arm))
	fad = .calcCallFreqsInt(fad, minoverlap=minoverlap)
	fad
}

# features are assigned the status of the first alteration they overlap, where alterations are ordered in
# genomic order or, if a criterion is available (e.g. OncoSNP version>1.3, GAP), based on such criterion
.calcCallFreqsInt = function(fad, minoverlap=1) {
  # compute sample alteration status for each feature
  if (minoverlap < 1)
    stop("Minimum overlap should be a positive integer number.")
  tabs = list()
  for (chr in fad@chrs) {
    if ("conf" %in% colnames(fad@cData)) {
      cD_chr = fad@cData[fad@cData$chr==chr,c("pos_en","pos_st","code","type","conf",fad@vData$varColumns)]
      cD_chr = cD_chr[order(cD_chr$conf),]
    } else {
      cD_chr = fad@cData[fad@cData$chr==chr,c("pos_en","pos_st","code","type",fad@vData$varColumns)]
    }
    chr_features = fad@features[fad@features$chr==chr,]
    a = IRanges(start=cD_chr$pos_st, end=cD_chr$pos_en)
    b = IRanges(start=chr_features$bp_st, end=chr_features$bp_en)
    overlaps = findOverlaps(a,b, minoverlap=minoverlap)
    for (i in 1:nrow(chr_features)) {
      overTab = cD_chr[queryHits(overlaps)[subjectHits(overlaps)==i],]
      overTab = overTab[!duplicated(overTab$code),]
      tabs[[length(tabs)+1]] = overTab[,c("code","type",fad@vData$varColumns)]
    }
    message(paste("Alteration status for chr",chr,": done.", sep=""))
  }
  names(tabs) = fad@features$feature
  # compute alteration frequencies per feature by sample and variable
  freqs = list()
  for (i in 1:length(fad@vData$varNames)) {
    freqs[[fad@vData$varNames[i]]] = list()
    for (cn in .alterCodes()) {
      if (i%in%which(fad@vData$varTypes=="categorical")) {
        res = .calcFreqsInt(fad@features, tabs, cn, fad@vData$varValues[[i]], fad@vData$varColumns[i])
        freqs[[fad@vData$varNames[i]]][[cn]] = res
      }
    }
    message(paste(fad@vData$varNames[i], "variable processing: done."))
  }
  fad@fTable = freqs
  fad@gTable = tabs
  fad
}

#######################################################################
############################################### OUTPUT SUMMARY TABLES #
#######################################################################
	
preview = function(fad, folder=NULL) {
	if (is.null(folder))
		folder="."
	s1 = variableSummary(fad, file.path(folder,"variable_summary.txt"))
	s2 = alterationSummary(fad, file.path(folder,"alteration_summary.txt"))
	s3 = variableCor(fad, file.path(folder,"variable_correlations.txt"))
	list(byVar=s1, byAlt=s2, varCor=s3)
}

variableSummary = function(fad, filename=NULL) {
	getName = function(i, x) fad@vData$varValuesNames[i][[1]][which(fad@vData$varValues[i][[1]]==x)]
  ff = function(i, p, a, code) unlist(lapply(p[[1]], function(j) c(getName(i,j), round(mean(a[a[,2]==code&a[,3]==j, 4])))))
	wilcoxtest = function(x) signif(wilcox.test(x[,2] ~ as.integer(x[,1]))$p.value, 3)
	cortest = function(x) signif(unlist(cor.test(x[,2], x[,1], method="kendall")[c("p.value","estimate")]), 3)
	bpsTab = c()
  for (i in fad@vData$varNames[-length(fad@vData$varNames)]) {
    vari_col = fad@vData$varColumns[i]
    aggr = data.frame(data.table(fad@cData)[,sum(bp_len), by=list(code,.unLOH=.unLOH(type),fad@cData[,vari_col])])
    aggr = .fill(fad@vData$varTable, aggr, vari_col, 0, .unLOH=1:3)
    aggrLOH = data.frame(data.table(fad@cData)[,sum(bp_len), by=list(code,.getLOH=.getLOH(type),fad@cData[,vari_col])])
    aggrLOH = .fill(fad@vData$varTable, aggrLOH, vari_col, 0, .getLOH=0:1)
    if (.isCateg(fad,i)) {
      l = apply(combn(fad@vData$varValues[[i]],2), 2, list)
      for (p in l) {
        for (cn in c(1,3)) {
          xx = aggr[aggr[,2]==cn&aggr[,3]%in%p[[1]], 3:4]
          bpsTab = rbind(bpsTab, c(i, ifelse(cn=="3","Amplification","Deletion"), ff(i, p, aggr, cn), wilcoxtest(xx), NA))
        }
        xx = aggrLOH[aggrLOH[,2]==1&aggrLOH[,3]%in%p[[1]], 3:4]
        if (nrow(xx) > 0) bpsTab = rbind(bpsTab, c(i, "LOH", ff(i, p, aggrLOH, 1), wilcoxtest(xx), NA))
      }
    } else {
      vals = fad@vData$varTable[,vari_col]
      for (cn in c(1,3)) {
        xx = aggr[aggr[,2]==cn&aggr[,3]%in%vals, 3:4]
        bpsTab = rbind(bpsTab, c(i, ifelse(cn=="3","Amplification","Deletion"), rep(NA,4), cortest(xx)))
      }
      xx = aggrLOH[aggrLOH[,2]==1&aggrLOH[,3]%in%vals, 3:4]
      if (nrow(xx) > 0) bpsTab = rbind(bpsTab, c(i, "LOH", rep(NA,4), cortest(xx)))
    }
    message(paste(i, "variable processing: done."))
  }
	colnames(bpsTab) = c("Variable", "Alteration", "Value A", "Mean altered A", "Value B", "Mean altered B", "p value", "tau")
	rownames(bpsTab) = NULL
	if (!is.null(filename))
		write.table(bpsTab, filename, sep="\t",quote=FALSE,row.names=FALSE)
	bpsTab
}

alterationSummary = function(fad, filename=NULL) {
	out = .makeArmsInt(fad@cData, fad@chrs, fad@genome)
	if (any(as.integer(names(table(fad@cData$type))) > 10)) {
		aggrLOH = .aggregateInt(out, .n(fad), fad@genome, fad@arms, .getLOH, 0:1, fad@vData$varTable)
		aggrLOH$type = as.numeric(as.character(aggrLOH$type)) + 10
	} else {
		aggrLOH = c()
	}
	aggr = rbind(.aggregateInt(out, .n(fad), fad@genome, fad@arms, .unLOH, 1:3, fad@vData$varTable), aggrLOH)
	aggr = aggr[.ord(aggr$chr_q_arm,fad@arms,as.numeric(aggr$type)),]
	aggr$type = sapply(as.vector(aggr$type), function(i) {
								if (i==1) i="Deletion"
								else if (i==2) i="NormalPloidy"
								else if (i==3) i="Amplification"
								else if (i==10) i="NoLOH"
								else if (i==11) i="LOH"
					})
	aggr$freq = signif(as.numeric(aggr$freq), 3)
  aggr[,3] = as.integer(aggr[,3],2)
	colnames(aggr) = c("Arm", "Alteration", "Altered length", "Arm length", "Alteration frequency")
	if (!is.null(filename))
		write.table(aggr, filename, sep="\t",quote=FALSE,row.names=FALSE)
	aggr	
}

variableCor = function(fad, filename=NULL) {
	if (length(fad@vData$varNames) <= 2) # the last var is 'All'
		stop("Variable correlations can only be calculated if there are more two or more variables.")
	n = length(fad@vData$varNames)
  vc = fad@vData$varColumns[-n]
	aggr = data.frame(data.table(fad@cData)[,sum(bp_len), by=sapply(fad@cData[,c("code",vc)], list)])
	aggr = .fill(fad@vData$varTable, aggr, vc, 0)
	comb = combn(fad@vData$varNames[-n], 2)
	res = c()
	for (i in 1:ncol(comb)) {
	  types = fad@vData$varTypes[comb[1:2,i]]
    aggri = aggr[,fad@vData$varColumns[comb[,i]]]
	  aggri = aggri[apply(sapply(1:2, function(v) aggri[,v]%in%fad@vData$varValues[comb[v,i]][[1]]), 1, all) , ]
    resi = c(comb[1,i], comb[2,i], rep(NA, 6))
    if (all(types=="categorical")) {
      corTab = table(aggri)
      if (any(corTab<5)) chi = NA
      else chi = chisq.test(corTab)$p.value
      resi[3:4] = c(fisher.test(corTab)$p.value, chi)
    } else if (all(types=="quantitative")) {
      resi[7:8] = unlist(cor.test(aggri[,1], aggri[,2], method="k")[c("p.value","estimate")])
    } else {
      categ = which(types=="categorical")
      quant = which(types=="quantitative")
      f = aggri[,quant]~aggri[,categ]
      resi[5:6] = c(oneway.test(f)$p.value, kruskal.test(f)$p.value)
    }
    res = rbind(res, resi)
	}
	colnames(res) = c("Variable A", "Variable B", "Fisher", "Chisquare", "Welch_oneway", "Kruskal-Wallis", "Kendall_p", "Kendall_tau")
	rownames(res) = NULL
	res[,3:ncol(res)] = signif(as.numeric(res[,3:ncol(res)]), 3)
  if (!is.null(filename))
		write.table(res, filename, sep="\t",quote=FALSE,row.names=FALSE)
	res
}

#######################################################################
#################################### ARM-WISE BARPLOTS AND HISTOGRAMS #
#######################################################################

.checkSel = function(sel, arms) {
  wh = which(! sel%in%arms)
  if (length(wh) > 0)
    stop(paste("The following selected arms do not contain features and thus cannot be selected: ",paste(sel[wh],collapse=", "),".",sep=""))
}
.checkSelCols = function(sel, selColors) {
  if (length(sel) != length(selColors))
    stop("The selection and the slection colors vectors should be of the same length.")
}
.checkYlim = function(ylim) {
  if (length(ylim) != 2)
    stop("'ylim' should be a vector of length two.")
  if (ylim[1]>ylim[2])
    ylim = rev(ylim)
  if (ylim[1] < -1)
    ylim[1] = -1
  if (ylim[2] > 1)
    ylim[2] = 1
}
.checkAlter = function(alteration) {
  if (is.null(.getAlterCodes(alteration)))
    stop("The alteration type is not recognized.")
}
.checkVar = function(varName, fad) {
  if (length(varName)!=1)
    stop("No variables defined.")
  if (! varName%in%fad@vData$varNames)
    stop(paste("The variable name is not among those in the data: ",paste(fad@vData$varNames,collapse=", "),".",sep=""))
}
.checkVarValue = function(varName, varValue, fad) {
  values = fad@vData$varValuesNames[[varName]]
  if (! varValue%in%values)
    stop(paste("The variable value is not among those in the data: ",paste(values,collapse=", "),".",sep=""))
}

.plotBarInt = function(aggr, sel, selColors, ylim=c(-1,1), ylab, baseColor="black", genome, sex, arms, features) {
  aggr$freq = as.numeric(aggr$freq)
	aggr$freq[aggr$type==1] = aggr$freq[aggr$type==1] + aggr$freq[aggr$type==11]
	aggr$freq[aggr$type==3] = aggr$freq[aggr$type==3] + aggr$freq[aggr$type==13]
	aggr$freq[aggr$type%in%c(1,11)] = -aggr$freq[aggr$type%in%c(1,11)]
	aggr$negLOH = aggr$posLOH = aggr$neg = aggr$pos = NA
	aggr$negLOH[aggr$type==11] = aggr$freq[aggr$type==11]
	aggr$posLOH[aggr$type==13] = aggr$freq[aggr$type==13]
	aggr$neg[aggr$type==1] = aggr$freq[aggr$type==1]
	aggr$pos[aggr$type==3] = aggr$freq[aggr$type==3]

	cols = rep(baseColor, length(arms))
	if (!is.null(sel) & !is.null(selColors)) {
		pos_marked = which(arms%in%sel)
		cols[pos_marked] = selColors
	} else {
		pos_marked = 1:length(arms)
	}

  # alternate backgrounds for chromosomes
  xx = as.factor(gsub("[p|q]","",arms))
  odds = which(as.integer(as.character(xx))%%2==1)
  st = intersect(odds, which(!duplicated(xx)))-0.5
  wi = matrix(rle(as.vector(xx))$l,2)[1,]
	rect = data.frame(xmin=st, xmax=st+wi, ymin=-Inf, ymax=Inf)

	p = qplot(as.factor(chr_q_arm), pos, data=aggr, ylab=paste(ylab,"deletion/amplification frequency"), xlab="Chromosome arm") +
		geom_rect(data=rect, aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), fill="white", alpha=0.5, inherit.aes=FALSE) +
		geom_rect(aes(xmin=0,xmax=Inf,ymin=-diff(ylim)*0.005,ymax=diff(ylim)*0.005), fill="grey") +
		geom_point(aes(x=factor(chr_q_arm), y=aggr$pos, fill=factor(chr_q_arm)), shape=24, size=5, inherit.aes=FALSE) + 
		geom_point(aes(x=factor(chr_q_arm), y=aggr$posLOH, fill=factor(chr_q_arm)), shape=24, size=3, inherit.aes=FALSE) + 
		geom_point(aes(x=factor(chr_q_arm), y=aggr$neg, fill=factor(chr_q_arm)), shape=25, size=5, inherit.aes=FALSE) + 
		geom_point(aes(x=factor(chr_q_arm), y=aggr$negLOH, fill=factor(chr_q_arm)), shape=25, size=3, inherit.aes=FALSE) + 
		scale_fill_manual(values=cols) + theme(legend.position="none") + 
		scale_x_discrete(labels=arms) + scale_y_continuous(expand=c(0,0), limits=ylim)
	if (!is.null(sel)) {
		marked_aggr = aggr[aggr$chr_q_arm%in%sel&aggr$type%in%c(1,3),]
		marked_aggr = marked_aggr[.ord(marked_aggr$chr_q_arm,arms,marked_aggr$type),]
		marked_aggr = matrix(marked_aggr$freq[marked_aggr$chr_q_arm%in%sel], 2, byrow=FALSE)
		vals = t(rbind(marked_aggr,pos_marked-0.05,pos_marked+0.05))
		vals = data.frame(xmin=vals[,3],xmax=vals[,4],ymin=vals[,1],ymax=vals[,2])
		p = p + geom_rect(data=vals, aes(xmin=xmin,xmax=xmax,ymin=ymin,ymax=ymax), fill="black", inherit.aes=FALSE)
	}
	suppressWarnings(print(p))
}

plotBar = function(fad, byFeature=TRUE, sel=NULL, selColors=NULL, ylim=c(-1,1), baseColor="black", varName=NULL, value=NULL) {
  .checkSel(sel, fad@arms)
  .checkSelCols(sel, selColors)
  .checkYlim(ylim)
  if (is.null(varName)+is.null(value)==1) # only one of them is null
    stop("You should provide both a variable and its value or neither.")
  if (!is.null(varName)) {
    if (! .isCateg(fad,varName))
      stop("This type of plot is only applicable to categorical variables.")
    .checkVar(varName, fad)
    .checkVarValue(varName, value, fad)
    vn = fad@vData$varColumns[[varName]]
    vv = fad@vData$varValues[[varName]][fad@vData$varValuesNames[[varName]]%in%value]
  } else {
    vn = vv = NULL
  }
  selColors = selColors[order(factor(sel,levels=fad@arms))]
  if (byFeature) .plotBarFeatures(fad, sel, selColors, ylim, baseColor, vn, vv)
  else .plotBarBps(fad, sel, selColors, ylim, baseColor, vn, vv)
}

.plotBarFeatures = function(fad, sel, selColors, ylim, baseColor, vn, vv) {
	.checkFeatures(fad)
	gt = fad@gTable
	if (!is.null(vv)) gt = lapply(gt, function(i) i[which(i[,vn]%in%vv),])
	xx = sapply(gt, function(i) factor(i$type,levels=.allCodes()))
	xx2 = t(sapply(xx, table))
	xx3 = t(apply(xx2,1,function(i)i/sum(i)))
	rownames(xx3) = fad@features$chr_q_arm
	xx4 = melt(xx3)
	xx4 = xx4[xx4[,2]%in%.alterCodes(),]
	xx4[,1] = factor(xx4[,1], levels=fad@arms)
	xx4[,2] = factor(xx4[,2])
	aggr = data.frame(data.table(xx4)[, mean(value,na.rm=TRUE), by=list(xx4[,1],xx4[,2])])
	colnames(aggr) = c("chr_q_arm", "type", "freq")
	eg = do.call(expand.grid,list(chr_q_arm=fad@arms, type=.alterCodes()))
	aggr = merge(aggr, eg, all=TRUE)
	.plotBarInt(aggr, sel, selColors, ylim, "Mean feature", baseColor=baseColor, 
              fad@genome, fad@sex, fad@arms, fad@features)
}

.plotBarBps = function(fad, sel, selColors, ylim, baseColor, vn, vv) {
	if (!is.null(vv)) {
		out = .makeArmsInt(fad@cData, fad@chrs, fad@genome, vn)
		out = out[out[,vn]%in%vv, colnames(out)!=vn]
	} else {
		out = .makeArmsInt(fad@cData, fad@chrs, fad@genome)
	}
	out$chr_q_arm = factor(out$chr_q_arm, levels=fad@arms)
	out$type = factor(out$type, levels=.allCodes())
	aggr = .aggregateInt(out, .n(fad,vn,vv), fad@genome, fad@arms, identity, .allCodes(), fad@vData$varTable)
	.plotBarInt(aggr, sel, selColors, ylim, "Overall", baseColor=baseColor, fad@genome, fad@sex, fad@arms, fad@features)
}

plotHist = function(fad, alteration, varName, sel=NULL, selColors=NULL, selOnly=FALSE, 
                    baseColor="black", bin=0.05, xmax=1, ymax) {
	.checkSel(sel, fad@arms)
	.checkSelCols(sel, selColors)
	if (xmax <= 0)
		stop("'xmax', defining the limit in the X axis should be a positive number between 0 and 1.")
	if (xmax > 1)
		xmax = 1+bin
	if (ymax <= 0)
		stop("'ymax', defining the limit in the Y axis should be a positive number.")
	if (bin <= 0)
		stop("The bin width should be a positive number between 0 and 1.")
	if (bin > 1)
	  bin = 1
	.checkAlter(alteration)
	.checkVar(varName, fad)
  if (! .isCateg(fad, varName))
    stop("Only supported for categorical variables.")
	x = Reduce('+', lapply(.getAlterCodes(alteration), function(i) fad@fTable[[varName]][[i]]))
	x[x==1] = 0.9999999
	cc = as.factor(fad@features$chr_q_arm)
	cols = rep(baseColor, length(fad@arms))
	ord = order(factor(sel,levels=fad@arms))
	selColors = selColors[ord]
	sel = sel[ord]
	if (!is.null(sel) & !is.null(selColors)) {
		num_marked = which(levels(cc)%in%sel)
		cols[1:length(num_marked)] = selColors
	} else {
		num_marked = 1:length(cc)
	}
	cc_ord = cc[c(which(cc%in%levels(cc)[num_marked]),which(!cc%in%levels(cc)[num_marked]))]
	levs = cc_ord[!duplicated(cc_ord)]
	p = list()
	for (i in ncol(x):1) {
		dat = data.frame(Chromosome=cc, Frequency=x[,i])
		dat[,1] = factor(dat[,1], levels=levs)
		if (selOnly)
			dat = dat[dat$Chromosome%in%levels(cc)[num_marked],]
    lab = paste(varName, fad@vData$varValuesNames[fad@vData$varNames==varName][[1]][i], sep=": ")
		p[[i]] = qplot(Frequency, data=dat, geom="histogram", fill=as.factor(Chromosome), binwidth=bin) +
			theme(legend.position="none") + scale_fill_manual(values=cols) + ylab("Genes") +
			coord_cartesian(ylim=c(0,ymax), xlim=c(0,xmax))	 +
		  annotate("text", label=lab, x=xmax*0.8,y=ymax*0.9, size=6, color="white", fontface="bold", family="mono") +
			annotate("text", label=lab, x=xmax*0.8,y=ymax*0.9, size=6, family="mono")		
		if (i==ncol(x)) p[[i]] = p[[i]] + xlab(paste(alteration,"frequency"))  
		else p[[i]] = p[[i]] + xlab("")
	}
	g_legend<-function(){
		a.gplot = qplot(Frequency, data=dat[dat$Chromosome%in%levels(cc)[num_marked],], geom="histogram", fill=Chromosome, binwidth=bin) + 
			scale_fill_manual(values=selColors, labels=sel) + theme(legend.position="bottom")
		tmp = ggplot_gtable(ggplot_build(a.gplot))
		tmp$grobs[[which(sapply(tmp$grobs, function(x) x$name)=="guide-box")]]
	}
	if (!is.null(sel) & !is.null(selColors)) 
		grid.arrange(do.call(arrangeGrob, c(p,list(ncol=1))), g_legend(), nrow=2, heights=c(length(p)*5, 1))
	else
		grid.arrange(do.call(arrangeGrob, c(p,list(ncol=1))))
}

#######################################################################
################################################ ASSOCIATION ANALYSIS #
#######################################################################

.checkFeatures = function(fad) if (length(fad@gTable)==0) stop("Please run 'addFeatures' first.")

.checkModels = function(vars.m, areCateg, vars.0, argModel, toOrdered, toIntervals, strata) {
  if (length(vars.m)==0)
    stop("There are no variables in the model.")
  if (! all(vars.0%in%vars.m))
    stop("The null model is not nested within the model to test.")
  if ( !is.null(toOrdered)) {
    if (class(toOrdered)!="list") 
      stop("The ordered values should be a list whose names are within the variables used in the model.")
    if (! all(names(toOrdered)%in%names(areCateg[areCateg])))
      stop("Unknown categorical variables in the list of variables to convert to ordered.")
  }
  if ( !is.null(toIntervals)) {
    if (class(toIntervals)!="list") 
      stop("The interval breaks should be a list whose names are within the variables used in the model.")
    if (! all(names(toIntervals)%in%names(areCateg[!areCateg])))
      stop("Unknown continuous variables in the list of variables to convert to intervals.")
  }
  if (argModel=="unknown") {
    message(paste("Note: when the dependence between variables and CN is not defined,",
            "the model is limited to x1 + ... + xn | strata. See ?independence_test for more."))
  } else if (argModel%in%c("predictor","whole")) {
    if (! is.null(strata))
      warning("The 'strata' parameter is not used when CN depends on the variables. Use stratification within 'model' and 'nullModel'.")
  } else if (argModel=="response") {
    if (length(vars.m)!=1)
      stop("When variables depend on CN, association is only implemented for the case of a single variable.")
  }
}

.checkFullModel = function(m) {
  #contains the @ and ~ symbols
  if (length(grep("@",m))==0) stop("The symbol @, representing the copy number variable, must be in the model.")
  if (length(grep("~",m))==0) stop("The symbol ~, separating the response and the predictor, must be in the model.")
  m = try(formula(gsub("@","type",m)), silent=TRUE)
  if (class(m=="try-error")) stop("The model is not a valid formula. Please check it and try again.")
}

facopy = function(fad, alteration, model, nullModel=NULL, modelPart=c("response","predictor","unknown","whole")[1],
                       strata=NULL, toOrdered=NULL, toIntervals=NULL, sel=NULL, plot=FALSE, pvalThr=0.05, db=NULL,
                       link=c("logit","probit")[1], parametric=FALSE, design=c("binary","versus","lvog")[1],
                       FUN=NULL, ...) {
  same = function(a, b) all(a%in%b) & all(b%in%a)
  lwh = function(cc, vv, doFactor) {
    r = rep(0.5, length(tt[,"type"]))
    for (n in seq(length(cc))) r[which(tt[,"type"]%in%.getAlterCodes(cc[n]))] = vv[n]
    factor(r, levels=sort(unique(c(0.5,vv)))) # 0.5 is the middle value
  }
  doMultinom = function(tt, model) {
    x = formula(paste(model,"~type"))
    glm.model = try (multinom(x, data=tt, trace=FALSE), silent=TRUE)
    if (class(glm.model)[1]=="try-error") {
      NA
    } else {
      glm.0 = multinom(formula(paste(model,"~1")), data=tt, trace=FALSE)
      anova(glm.model, glm.0, test="Chisq")$P[2]
    }
  }
  ### checks
  argsModelNames = c("response","predictor","unknown","whole")
  argModel = pmatch(tolower(modelPart), argsModelNames)
  if (length(argModel)!=1 || is.na(argModel))
    stop("The 'modelPart' parameter should match one of the following: ", paste(argsModelNames,collapse=", "))
  argModel = argsModelNames[argModel]
  if (argModel=="whole" && is.null(FUN))
    stop("When providiing a full model, please also provide a function through the parameter 'FUN'.")
  .checkFeatures(fad)
  .checkAlter(alteration)
  varNames = all.vars(parse(text=model))
  dummy = sapply(varNames, .checkVar, fad)
  varNames.0 = all.vars(parse(text=ifelse(is.null(nullModel),"1",nullModel)))
  .checkModels(varNames, .isCateg(fad, varNames), varNames.0, argModel, toOrdered, toIntervals, strata)
  if (plot==TRUE && !is.null(sel)) {
    wwarning = "Plotting only available for genome-wide association (i.e. when 'sel' parameter not used)."
    warning(paste(wwarning, "Plotting disabled."))
    plot = FALSE
  }
  if (argModel=="whole") {
    .checkFullModel(model)
    if (!is.null(nullModel)) .checkFullModel(nullModel)
  }
  ### defs
  alterCode = .getAlterCodes(alteration)
  if (same(.getAlterCodes("cna"),alterCode) & design=="versus") {
    cc = c("amp","del"); vv = c(1,0)
  } else if (same(.getAlterCodes("all"),alterCode) & design=="lvog") {
    cc = c("someloss","onlygain"); vv = c(1,0)
  } else {
    cc = alteration; vv =  1
  }
  if (is.null(nullModel)) {
    nullModel = "1"
    if (argModel=="whole") {
      resp = gsub("~.*","",model)
      nullModel = paste(resp,"~",nullModel)
    }
  }
  if (argModel=="whole") {
    model = gsub("@","type",model)
    nullModel = gsub("@","type",nullModel)
  }
  tabs = fad@gTable
  if (! is.null(sel)) {
    sel = which(fad@features$feature%in%sel)
    tabs = tabs[sel]
  }
  ind = sapply(varNames, function(i) which(fad@vData$varNames==i))
  vars = fad@vData$varColumns[ind]
  ### assoc
  p1 = p2 = p3 = c()
  ps= c()
  for (i in tabs) {
    tt = i[,c(vars,"type",strata)]
    colnames(tt)[seq(ncol(tt)-1)] = fad@vData$varNames[ind]
    if (nrow(tt)==0) { ps = c(ps, NA); next }
    for (n in colnames(tt)[seq(ncol(tt)-1)]) {
      varVals = fad@vData$varValues[[n]]
      tt = tt[tt[,n]%in%varVals,]
      if (.isCateg(fad, n)) {
        if (n%in%names(toOrdered)) {
          for (o in seq(length(varVals)))
            tt[,n][tt[,n]==varVals[o]]=toOrdered[[n]][o]
        } else {
          tt[,n] = factor(tt[,n], levels=varVals)
        }
      } else if (n%in%names(toIntervals)) {
          tt[,n] = cut(tt[,n], c(-Inf,toIntervals[[n]],Inf), labels=seq(length(toIntervals[[n]])+1))
          tt[,n] = as.numeric(tt[,n])
      }
    }
    if (! is.null(strata)) {
      strataVals = fad@vData$varValues[[strata]]
      tt = tt[tt[,strata]%in%strataVals,]
      tt[,ncol(tt)] = factor(tt[,ncol(tt)], levels=strataVals)
    }
    tt[,"type"] = lwh(cc, vv)
    if (length(table(as.vector(tt[,"type"]))) < 2) {
      pvalue = NA
    } else {
      if (argModel=="unknown") {
        x = paste("type~",model)
        if (!is.null(strata)) x = paste(x,"|",strata)
        pvalue = pvalue(independence_test(formula(x), data=tt))
      } else if (argModel=="predictor") {
        glm.model = try (glm(paste("type~",model), data=tt, family=binomial(link)), silent=TRUE)
        if (class(glm.model)[1]=="try-error") {
          pvalue = NA
        } else {
          glm.0 = glm(paste("type~",nullModel),  data=tt, family=binomial(link))
          pvalue = anova(glm.model, glm.0, test="Chisq")$P[2]          
        }
      } else if (argModel=="response") { # model == varNames; length(varNames) == 1
        x = formula(paste(model,"~type"))
        if (.isCateg(fad, model)) {
          if (is.factor(tt[,model])) {
            # nnet::multinom(). FOR CATEGORICAL VALUES.
            # "The variables on the rhs of the formula should be roughly scaled to 
            # [0,1] or the fit will be slow or may not converge at all." THIS IS DONE.
            pvalue = doMultinom(tt, model)
          } else { # ordered
            if (length(table(toOrdered[[model]]))==2) { 
              # nnet::multinom() and stats::glm() do the same with 2 levels, but glm() requires 0 <= y <= 1
              pvalue = doMultinom(tt, model)
            } else {
              #MASS::polr(). FOR ORDERED VALUES. ONLY 3 OR MORE LEVELS.
              tt[,model] = factor(tt[,model], levels=toOrdered[[model]])
              method = ifelse(link=="logit","logistic","probit")
              glm.model = polr(x, data=tt, method=method)
              glm.0 = polr(formula(paste(model,"~1")), data=tt, method=method)
              pvalue = anova(glm.model, glm.0, test="Chisq")$P[2]
            }
          }
        } else {
          if (parametric) FUN = oneway_test
          else FUN = kruskal_test
          pvalue = pvalue(FUN(formula(x), data=tt))
        }
      } else { # argModel=="whole"
        res = FUN(formula(model), formula(nullModel), data=tt, ...)
        if ("htest"%in%class(res)) pvalue = res$p.value
        else if ("lm"%in%class(res)) pvalue = summary(res)$coef[2,4]
        else if (attr(class(res),"package")=="coin") pvalue = pvalue(res)
        else if ("numeric"%in%class(res)) pvalue = res
        else stop("The provided function FUN is unknown. Please provide a function whose output is a pvalue.")
      }
    }
    ps = c(ps, pvalue)
    if (length(ps)%%500==0)
      message(paste(round(length(ps)*100/length(tabs),1), "% done...",sep=""))
  }
  if (length(varNames)!=1)
    varNames = "All"
  if (plot)
    .facopyPlotInt(fad, alteration, varNames, db, ps, pvalThr)
  if (is.null(sel))
    sel = seq(nrow(fad@features))
  cbind(fad@features[sel,c("feature","bp_st","bp_en","chr_q_arm")], p_value=round(ps,4))[,c(1,5,4,2,3)]
}

facopyPlot = function(fad, alteration, varName, db=NULL) .facopyPlotInt(fad, alteration, varName, db)

.facopyPlotInt = function(fad, alteration, varName, db=NULL, pvals=NULL, pvalThr=0.05) {
  ### PLOTTING FUNCTIONS
  ######################################
  emptyPlot = function(xlab=NULL, ylab=NULL) {
    ep = ggplot(data.frame()) + geom_point() + scale_x_continuous(expand=c(0,0)) + ylab("") +
      scale_y_continuous(expand=c(0,0), limits=c(0,1)) + theme_classic() + 
      theme(axis.line=element_line(color='white'), axis.ticks=element_blank(),
            axis.text.x=element_blank(), axis.text.y=element_blank(),
            text=element_text(size=ifelse(is.null(ylab),12,20)))
    if (!is.null(xlab)) ep = ep + xlab(xlab)
    if (!is.null(ylab)) ep = ep + ylab(ylab)
    ep
  }
  printPlot = function(p) {
    g_legend = function(){
      a.gplot = p[[1]] + theme(legend.position="bottom", text=element_text(size=20))
      tmp = ggplot_gtable(ggplot_build(a.gplot))
      tmp$grobs[[which(sapply(tmp$grobs, function(x) x$name)=="guide-box")]]
    }
    if (fad@genome%in%c("hg18","hg19")) ncol = 8
    if (fad@genome%in%c("mm8")) ncol = 7
    grid.arrange(emptyPlot(NULL, paste("\n",alteration,"frequency [0,1]")), 
                 do.call(arrangeGrob, c(p,list(ncol=ncol))), emptyPlot(), g_legend(), 
                 nrow=2, ncol=2, widths=c(0.25, ncol), heights=c(12, 1))
  }
  ### DATABASE PROFILES
  ######################################
	if (! is.null(db)) {
		datText = paste(fad@genome,"_db_",db,sep="")
		data(list=datText)
		dat = eval(parse(text=datText))
		a = unlist(.alterations())[pmatch(tolower(alteration), unlist(.alterations()))]
    if (is.na(a)) stop("Unknown alteration type: ", alteration)
		if (a=="amplifications") { 
      sel = "amp"
		} else if (a=="deletions") {
      sel = "del"
		} else if (a%in%c("cnas","any","loh")) {
		  sel = c("amp", "del")
		} else {
		  sel = c("amp", "del")
      warning("Alterations from databases in parameter 'db', including '",db,"', only involve amplifications and deletions.")
		}
		dat = dat[dat$type%in%sel,]
	}
  ### PLOTTING
  ######################################
  message("Processing plot...")
	p = list()
	for (arm in .allChrs(fad@genome,fad@sex)) {
		i = length(p)+1
    sel = which(fad@features$chr_q_arm==arm)
		featnames = as.vector(fad@features$feature[sel])
    if (length(sel)==0) next
    ### frequency calculation
    varVals = fad@vData$varValuesNames[[varName]]
		subxx = list()
		for (ac in .getAlterCodes(alteration)) {
			a = fad@fTable[[varName]][[ac]][featnames,,drop=FALSE]
			if (!is.null(a)) rownames(a) = featnames
			a[is.na(a)] = 0
			subxx[[length(subxx)+1]] = a
		}
		xx = as.matrix(Reduce(`+`, subxx))
		if (nrow(xx)==0) { p[[i]] = emptyPlot(paste("No visualizable data for arm",arm));	next }
		rownames(xx) = fad@features$bp_st[sel]
		colnames(xx) = varVals
		xx2 = melt(xx)
    xx2[,2] = factor(xx2[,2], levels=varVals)
		colnames(xx2) = c("bp",varName,"value")
		xlim = c(min(xx2$bp), max(xx2$bp))
    ### plot initialization
    p[[i]] = ggplot(xx2, aes_string(x="bp",y="value",colour=varName,group=varName)) + 
		  scale_y_continuous(breaks=seq(0.25,0.75,0.25), labels=rep("",3)) + 
      # scale_x_continuous(breaks=pretty_breaks(n=3)) +
      ylab(NULL) +
		  xlab(paste(arm, " (chr",gsub("[p|q]","",arm)," position)",sep="")) +
		  coord_cartesian(ylim=c(0,1), xlim=xlim)
		### add shading for significant features 
    if (! is.null(pvals)) {
      signif = which(pvals[sel]<pvalThr & !is.na(pvals[sel]))
  		if (length(signif) > 0) {
  		  pos1 = xx2$bp[sapply(signif-1, max,1)]
  		  pos2 = xx2$bp[signif]
  		  pos3 = xx2$bp[sapply(signif+1, min,length(sel))]
  		  irang = IRanges(start=pos2-(pos2-pos1)/2, end=pos2+(pos3-pos2)/2-1)
  		  cov = coverage(irang)
  		  csum = cumsum(cov@lengths)
  		  csum[cov@values==0] = csum[cov@values==0]+1 # csum[cov@values==0] are starts of intervals
  		  if (cov@values[1]==1) csum = c(1, csum)
  		  vals = data.frame(matrix(csum,ncol=2,byrow=TRUE))
  		  colnames(vals) = c("xmin","xmax")
  		  p[[i]] = p[[i]] + geom_rect(data=vals, aes(xmin=xmin,xmax=xmax,ymin=0,ymax=1), fill="black",alpha=0.5,inherit.aes=FALSE)
  		}
    }
    ### database visualization
		if (!is.null(db)) {
			dat_chr = dat[dat$chr==gsub("[p|q]","",arm),]
			dat_chr$freq = dat_chr$freq/100
			if (nrow(dat_chr) > 0)
				p[[i]] = p[[i]] + geom_rect(data=dat_chr, aes(xmin=pos_st,xmax=pos_en,ymin=0,ymax=freq),
											inherit.aes=FALSE, fill="black",alpha=0.75)
		}
    ### points, axes and legends
    cols = .refcols(length(varVals))
    p[[i]] = p[[i]] + geom_line(size=1.5,alpha=0.5) + 
      geom_point(colour=cols[as.numeric(xx2[,2])],size=3,alpha=0.5) +
      theme(legend.position="bottom") + scale_color_manual(values=cols) 
	}
	for (i in 1:length(p))
		p[[i]] = p[[i]] + theme(legend.position="none", plot.margin=unit(c(0.02,0.02,0,0),"npc"))
  message("Generating plot, please wait while it appears on the graphics device...")
  printPlot(p)
}

#######################################################################
############################################ GENE ENRICHMENT ANALYSIS #
#######################################################################

.graphPathway = function(p, pval, genes) {
  g <- pathwayGraph(p)
  gg <- setting.graph.attributes(igraph.from.graphNEL(g))
  V(gg)$color = "#00000022"
  wh = toupper(V(gg)$label)%in%toupper(genes)
  V(gg)$color[wh] = "red"
  gg = delete.edges(gg, which(is.loop(gg, eids=E(gg))==TRUE))
  gg = as.undirected(gg)
  degrees = igraph::degree(gg,v=V(gg)) + 1
  degrees = degrees / min(degrees)
  degrees = sapply(degrees, min, 10)
  plot.igraph(gg, vertex.label.font = 2, vertex.frame.color = V(gg)$color,
              vertex.label.color = "#000000cc", vertex.label.cex = 5/sqrt(length(p@nodes)),
              vertex.size = 50/sqrt(length(p@nodes)) * sqrt(degrees),
              edge.width=10/sqrt(length(p@nodes)), edge.curved=TRUE, edge.arrow.mode=0,
              layout = layout.fruchterman.reingold(graph=gg,repulserad=vcount(gg)^3*0.005))
  mtext(paste(p@title,"\np-value: ",pval,"",sep=""))
}

.enrichment = function(dbName, db, dbCategNames, folder, allsymbols, ress, fad, plotThr) {
  # N  	black+white total	score_tg
  # K		white total			score_all
  # k		white drawn			score_eg
  # n		black+white drawn	score_sg
  allsymbols = unique(allsymbols)
  sigGeneSymbol = ress$feature
  totalgenes = length(allsymbols)
  signifgenes = length(sigGeneSymbol)
  score_sg = length(table(as.vector(ress$chr_q_arm)))
  score_tg = length(table(fad@features$chr_q_arm[sapply(fad@gTable,nrow)>0 & fad@features$feature%in%allsymbols]))
  tab = c()
  for (j in 1:length(db)) {
    wh = which(sigGeneSymbol%in%db[[j]])
    enrichmentGenes = sigGeneSymbol[wh]
    count = length(wh)
    wh_all = which(db[[j]]%in%allsymbols)
    size = length(wh_all)
    score_eg = length(table(as.vector(ress$chr_q_arm[wh])))
    score_all = length(table(as.vector(fad@features$chr_q_arm[which(fad@features$feature%in%db[[j]][wh_all])])))
    score_pvalue = phyper(score_eg, score_all, score_tg-score_all, score_sg, lower.tail=TRUE)
    expcount = signifgenes*size/totalgenes
    pvalue = phyper(count-1L, size, totalgenes-size, signifgenes, lower.tail=FALSE)
    oddsratio = count*(totalgenes-size-signifgenes+count) / ((signifgenes-count) * (size-count))
    tab = rbind(tab, c(dbCategNames[j], size, signif(expcount), count, signif(oddsratio), signif(pvalue),
                       paste(score_eg,round(score_pvalue,3),sep="_"), paste(enrichmentGenes,collapse=", ")))
  }
  tab = cbind(tab[,1:6], signif(p.adjust(tab[,6],'fdr')), tab[,7:8])
  colnames(tab) = c("Category","Size","ExpCount","Count","OddsRatio","Pvalue","PvalueAdj","Score","Genes")
  tab = tab[order(as.numeric(tab[,"Pvalue"])),]
  write.table(tab, file.path(folder,paste(dbName,".txt",sep="")), row.names=FALSE,quote=FALSE, sep="\t")
  
  canonical = c("kegg","reactome","biocarta")
  go = c("gobp","gocc","gomf")
  if (dbName%in%c(canonical,go)) {
    bottom.r = tail(which(tab[,"Pvalue"]<plotThr),1)
    if (length(bottom.r) > 0) {
      if (dbName%in%canonical) {
        dir.create(file.path(folder,dbName))
        for (r in 1:bottom.r) {
          graphObjsText = paste("facopy_",dbName,sep="")
          data(list=graphObjsText)
          graphObjs = eval(parse(text=graphObjsText))
          p = graphObjs[[tab[r,1]]]
          filename = paste(r,"_",gsub("[:|/]","_",substr(p@title,1,32)),".pdf", sep="")
          pdf(file.path(folder, dbName, filename), 12, 9)
          .graphPathway(p, tab[r,"Pvalue"], strsplit(tab[r,"Genes"],", ")[[1]])
          dev.off()
        }
      } else if (dbName%in%go) {
        onto = toupper(substr(dbName, 3,4))
        goparents = eval(parse(text=paste("GO",onto,"PARENTS",sep="")))
        sigGO.fdr = tab[seq(bottom.r),"Pvalue"]
        names(sigGO.fdr) = tab[seq(bottom.r),1]
        sigGO.term = sapply(names(sigGO.fdr), function(j) {
          name = try(getGOTerm(j)[[onto]], silent=TRUE)
          name = ifelse(class(name)=="try-error", "", name)
          paste(j,paste(strwrap(name,24),collapse="\\\n"),sigGO.fdr[j],sep="\\\n")
        })
        colors = sapply(as.numeric(sigGO.fdr), function(j) heat.colors(80)[min(80,max(1,round(1/log10(1/j)*100)))])
        names(colors) = names(sigGO.fdr)
        grph = GOGraph(names(sigGO.fdr),goparents)
        sizeFactor = round(sqrt(length(grph@nodes)))
        isSignifNode = sapply(grph@nodes, function(n) ifelse(n%in%names(sigGO.fdr),TRUE,FALSE))
        shapes = sapply(isSignifNode, ifelse, "rectangle", "circle")
        widths = sapply(isSignifNode, ifelse, 0.23, 0.1)
        heights = sapply(isSignifNode, ifelse, 0.13, 0.1)
        fontsizes = sapply(isSignifNode, ifelse, 20, 14)
        nodeAttrs = list(shape=shapes, label=sigGO.term, fillcolor=colors, 
                         width=widths*sizeFactor, height=heights*sizeFactor, fontsize=fontsizes)
        pdf(file.path(folder, paste(dbName,".pdf",sep="")), 2.0*sizeFactor, 1.4*sizeFactor)
        Rgraphviz::plot(grph, nodeAttrs=nodeAttrs)
        dev.off()
      }
    }
  }
  
  tab
}

calculateCor = function(fad, exprProfile, db=NULL) {
  if (! is.null(db)) {
    if (class(exprProfile)!="character" || class(db)!="character") 
      stop("'exprProfile' and 'db' should be characters indicating database and name of expression profile in cBio portal.")
    cor = .corInt(db, exprProfile, fad@features$feature)
  } else {
    if (class(exprProfile)=="character") {
      exprProfile = read.table(exprProfile, header=TRUE,sep="\t")
    } else if (class(exprProfile)=="data.frame") {
      reqCols = c("chr","bp_st","bp_en","feature","chr_q_arm")
      if (! all(reqCols%in%colnames(exprProfile)))
        stop("At least the following columns should be present in the external data: ", paste(reqCols,collapse=", "))
    } else {
      stop("External data with information on expression should be a data.frame.")
    }
    sampleNames = as.character(fad@vData$varTable$code)
    cnProfile = t(sapply(fad@gTable, function(i) { x = i$type; names(x) = i$code; x[sampleNames] }))
    colnames(cnProfile) = sampleNames
    ## next 2 represent a random exprProfile
    #exprProfile = cnProfile[,ncol(cnProfile)]
    #exprProfile[,] = rnorm(length(exprProfile))
    # exprProfile contains the same genes (rows) and samples (columns) as the CN data
    whrow = intersect(rownames(exprProfile),rownames(cnProfile))
    whcol = intersect(colnames(exprProfile),colnames(cnProfile))
    exprProfile = exprProfile[whrow,whcol]
    cnProfile = cnProfile[whrow,whcol]
    cor = .makeCor(cnProfile, exprProfile)
  }  
  list(cor=cor, db=db, profile=exprProfile)
}

facopyEnrichment = function(fad, geneTable, cor, outFolder, pvalThr=0.05, corThr=0.1, plotThr=0.001) {
  data(facopy_msigdb)
  data(facopy_msigdbNames)
  if (! fad@genome%in%c("hg18","hg19"))
    stop("Gene-set enrichment analysis is only available for human genomes.")
  geneTable = geneTable[order(geneTable$p_value),]
  cors = cor$cor[as.character(geneTable$feature)]
  cors_m = cbind(names(cors), round(cors,3))
  colnames(cors_m) = c("feature", "cor_expr")
  geneTable = merge(geneTable, cors_m, all.x=TRUE)
  geneTable = geneTable[order(geneTable$p_value, -as.numeric(as.character(geneTable$cor_expr))),]
  if (!is.null(outFolder)) {
    dir.create(outFolder)
    wh = which(as.vector(geneTable$cor_expr)>=corThr & as.vector(geneTable$p_value)<=pvalThr)
    univ = intersect(names(cor$cor)[cor$cor>corThr], fad@features$feature)
    for (x in names(facopy_msigdbNames))
      dummy = .enrichment(x, facopy_msigdb[[x]], facopy_msigdbNames[[x]], outFolder, univ, geneTable[wh,], fad, plotThr)
  }
  geneTable
}

.makeCor = function(m, n) {
  out = sapply(seq(nrow(m)), function(i) {
    cn = as.numeric(as.vector(m[i,]))
    expr = as.numeric(as.vector(n[i,]))
    if (!all(is.na(cn)) && !all(is.na(expr)))
      summary(lm(expr~cn))$r.squared
    else NA 
  })
  names(out) = rownames(m)
  out
}

.corInt = function(dbGeneExpr, exprProfile, x) {
  mycgds = CGDS("http://www.cbioportal.org/public-portal/")
  # test(mycgds)
  #getCancerStudies(mycgds)
  mycancerstudy = getCancerStudies(mycgds)[getCancerStudies(mycgds)[,1]==dbGeneExpr,1]
  mycaselist = grep("_all$", getCaseLists(mycgds,mycancerstudy)[,1], value=TRUE)[1]
  mygeneticprofiles = tolower(getGeneticProfiles(mycgds,mycancerstudy)[,1])
  cn_profile = grep("_gistic|_cna", mygeneticprofiles, value=TRUE)
  if (length(cn_profile)>1)
    cn_profile = cn_profile[sapply(cn_profile, function(i) nrow(getProfileData(mycgds,x[1],i,mycaselist)))>0][1]
  expr_profile = paste(mycancerstudy,"_",exprProfile,sep="")	
  n = length(x)
  chunk = 500
  cgds_gene = list()
  for (i in c(cn_profile, expr_profile)) {
    cgds_gene[[i]] = NULL
    for (j in 1:ceiling(n/chunk)) {
      st = 1 + chunk*(j-1)
      en = min(chunk*j,n)
      genes_chunk = toupper(x[st:en])
      genes_chunk = gsub("^MIRN", "MIR", genes_chunk) # match miRNA nomenclature
      trials = 0
      repeat{
        prof = try( t(getProfileData(mycgds,genes_chunk,i,mycaselist)) ,silent=TRUE)
        if (! class(prof)=="try-error") break
        trials = trials+1
        if (trials==3) message("Warning: connection to cBio portal is not stable. You might eventually get an error.")
        if (trials==5) stop("Cannot connect to cBio portal. Maybe connection is down?")
        Sys.sleep(0.5)
      }
      res = matrix(NA, ncol=ncol(prof),nrow=length(genes_chunk))
      rownames(res) = genes_chunk
      wh = genes_chunk%in%rownames(prof)
      res[wh,] = prof[genes_chunk[wh],]
      cgds_gene[[i]] = rbind(cgds_gene[[i]], res)
      message(paste(i," ",round(j*100/(ceiling(n/chunk)),2),"% done...",sep=""))
    }
  }
  .makeCor(cgds_gene[[1]], cgds_gene[[2]])
}

#######################################################################
######################################## PLOT ARM OR FEATURE VICINITY #
#######################################################################
#.plotVicinityInt(fad, NULL, gsub("[p|q]","",arm), st, en, alteration, 0, varName, arm)
.plotVicinityInt = function(fad, name=NULL, chr, st=NULL, en=NULL, 
							alteration=NULL, margin=0, varName=NULL, arm=NULL) {
	oldmargins <- par("mar")
	makeAb = function(col) {
		abline(v=c(st,en), lwd=3, col=col)
		abline(v=c(st-margin,en+margin), lwd=1, col=col)
	}
	alterCode = .getAlterCodes(alteration)
	colname = fad@vData$varColumns[[varName]]
  dfsub = fad@cData[ fad@cData[,colname]%in%fad@vData$varValues[[varName]] , ]
	dfsub$code = as.factor(dfsub$code)
	dfsub = dfsub[dfsub$chr==chr,]
	if (!is.null(st) & !(is.null(en)))
		dfsub = dfsub[dfsub$pos_st<en+margin & dfsub$pos_en>st-margin,]
	dfsub = dfsub[dfsub$type%in%alterCode,]
	if (nrow(dfsub)==0)
	  stop("There are no alterations of the selected type in the vicinity of the feature for that variable.")
	dfsub = dfsub[order(dfsub[,colname]),]
	xlim = c(min(dfsub$pos_st), max(dfsub$pos_en))
	if (is.null(name))
		for (i in levels(dfsub$code)[!levels(dfsub$code)%in%dfsub$code]) {
			ro = rep(NA, ncol(dfsub))
			names(ro) = colnames(dfsub)
			ro = as.data.frame(t(as.matrix(ro)))
			ro$code = i
			ro$pos_st = ro$pos_en = -1
			ro[,colname] = fad@cData[which(fad@cData$code==i)[1],colname]
			dfsub = rbind(dfsub, ro)
		}
	vals = - as.numeric(as.factor(dfsub[,colname])) # negative in order to plot from top to bottom
	aggr = data.frame(data.table(dfsub)[,sum(bp_len), by=list(code)]) #.fill not necessary
	aggr[,2] = rank(aggr[,2], na.last=FALSE)
	dfsub$alterRank = sapply(dfsub$code, function(i) aggr[aggr[,1]==i,2])
	dfsub = dfsub[order(vals,dfsub$alterRank),]
	vals = sort(vals)
	height = 4
	bins = dfsub$alterRank
	bins = vals*max(bins) + bins
	bins = as.numeric(as.factor(bins))
	ylim = c(0, max(bins)*(height + 1))
	layout(matrix(c(rep(1,15),2)))
	plot.new()
	plot.window(xlim, ylim)
	grid(ny=0, col="grey", lty=1)
	if (!is.null(name))
		makeAb("#000000")
	ybottom = bins * (1 + height) - height
  if (.isCateg(fad,varName)) {
	  factors = fad@vData$varValues[[varName]]
  } else {
    qu = quantile(as.numeric(names(table(fad@cData[,fad@vData$varColumns[[varName]]]))),c(0,0.5,1))
    factors = seq(qu[1],qu[3])
  }
	cols = .refcols(length(factors))
	allcols = cols[as.numeric(factor(dfsub[,colname], factors))]
	rect(dfsub$pos_st-0.5, ybottom, dfsub$pos_en+0.5, ybottom+height, col=allcols, border=0)
	grid(ny=0, col="white")
	if (!is.null(name))
		makeAb("#00000033")
	if (!is.null(name))
		title(paste(varName, ", alterations (",alteration,") near ", name, sep=""))
	else if (!is.null(arm))
		title(paste(varName, ", alterations (",alteration,") in chr", arm, sep=""))
	axis(1)
  mtext(text=paste("Chromosome",chr,"position"), cex=1, side=1, line=2.5)
  par(mar=c(0,0,0,0))
	plot.new()
	if (.isCateg(fad,varName)) {
	  legend("top", legend=fad@vData$varValuesNames[[varName]], col=cols, horiz=TRUE, lwd=10, box.col="white", cex=1.3)
	  uniq = fad@vData$varTable[,c("code",colname)]
	  uniq = uniq[uniq[,colname]%in%factors,]
	  uniq[,2] = factor(uniq[,2], levels=factors)
    tab = table(merge(aggr, uniq)[,3])
    if (is.null(arm)) tab = paste(tab, "of", table(uniq[,2]))
	  legend("bottom", legend=tab, col=cols, horiz=TRUE, lwd=10, bty="n", cex=1.3)
  } else {
    image(y=c(0.2,0.6), z=matrix(seq(length(allcols)),ncol=1), col=allcols, axes=FALSE,xlab="",ylab="", add=TRUE)
    legend("topleft", legend=qu[1], cex=1.3, bty="n", adj=1)
    legend("top", legend=qu[2], cex=1.3, bty="n")
    legend("topright", legend=qu[3], cex=1.3, bty="n")
  }
	par(mar=oldmargins)
}

plotZoom = function(fad, what=c("feature","arm"), name, alteration, varName, margin=0) {
  arg = function(x) match.arg(tolower(x),c("feature","arm"))
  if (arg(what)=="feature") .plotVicinity(fad, name, alteration, varName, margin)
  if (arg(what)=="arm") .plotArm(fad, name, alteration, varName)
}

.plotVicinity = function(fad, name, alteration, varName, margin=0) {
	info = fad@features[fad@features$feature==name,]
	.plotVicinityInt(fad, name, info$chr, info$bp_st, info$bp_en, alteration, margin, varName, NULL)
}

.plotArm = function(fad, arm, alteration, varName) {
	sepsText = paste(fad@genome,"_armLimits",sep="")
	data(list=sepsText)
	seps = eval(parse(text=sepsText))
	wh = which(seps$chr_q_arm==arm)
	en = seps$limit[wh]
	if (fad@genome%in%c("hg18","hg19")) st = ifelse(length(grep("p",arm))==1, 0, seps$limit[wh-1])
  if (fad@genome%in%c("mm8")) st = 0
	.plotVicinityInt(fad, NULL, gsub("[p|q]","",arm), st, en, alteration, 0, varName, arm)
}

#######################################################################
######################################## PRINCIPAL COMPONENT ANALYSIS #
#######################################################################

.makeMatrixInt = function(gtab, alterCode, samples, design)	{
	same = function(a, b) all(a%in%b) & all(b%in%a)
	lwh = function(cc, vv) lapply(gtab, function(g) {
		r = rep(0, length(samples))
		for (i in seq(length(cc))) r[which(samples%in%g$code[g$type%in%cc[[i]]])] = vv[i]
		r
	})
	if (same(.getAlterCodes("cna"),alterCode) & design=="versus") {
		xx = lwh(list(.getAlterCodes("amp"),.getAlterCodes("del")), c(1,-1))
	} else if (same(.getAlterCodes("all"),alterCode) & design=="lvog") {
		xx = lwh(list(.getAlterCodes("someloss"),.getAlterCodes("onlygain")), c(-1,1))
	} else {
		xx = lwh(list(alterCode), 1)
	}
	do.call(rbind, xx)
}

plotPCA = function(fad, alteration, varName, sel=NULL, 
					design=c("binary","versus","lvog")[1], do.plot=TRUE, by.size=TRUE, cex=4) {
					# versus design requires alteration="CNA"
					# lvog (loss versus only gain) design requires alteration="any","all"
	.checkFeatures(fad)
	.checkAlter(alteration)
	.checkVar(varName, fad)
	.checkSel(sel, fad@arms)
	if (cex < 1)
		stop("The size of the points 'cex' should be a positive number.")
	alterCode = .getAlterCodes(alteration)
	samples = names(table(fad@cData$code))
	rang = 1:length(fad@gTable)
	if (!is.null(sel)) rang = rang[fad@features$chr_q_arm%in%sel]
	m = .makeMatrixInt(fad@gTable[rang], alterCode, samples, design)
	vc = fad@vData$varColumns[[varName]]
	vv = fad@vData$varValues[[varName]]
	dat = fad@vData$varTable[,c("code",vc)]
	dat = dat[dat[,vc]%in%vv,]
	wh = which(fad@cData[,vc]%in%vv & fad@cData$type%in%alterCode)
	aggr = data.frame(data.table(fad@cData[wh,])[, sum(bp_len), by=list(code)])
	aggr[,2] = rank(aggr[,2], na.last=FALSE)
  alterRank = sapply(dat$code, function(i) aggr[aggr[,1]==i,2])
  alterRank[sapply(alterRank, length)==0] = 0
	dat$alterRank = unlist(alterRank)
	colnames(m) = samples
	m = m[,colnames(m)%in%dat$code]
	rownames(m) = fad@features$feature[rang]
	pc = PCA(t(m), graph=FALSE)
	if (do.plot) {
		ord = unlist(sapply(rownames(pc$ind$coord), function(i) which(dat$code==i)))
		cols = .refcols(length(vv))
		cols_t = paste(cols,"44",sep="")
		allcols = cols[as.numeric(as.factor(dat[ord,vc]))]
		allcols_t = cols_t[as.numeric(as.factor(dat[ord,vc]))]
		s = 2
		if (by.size)
			s = seq(1,6,length=nrow(dat))[dat$alterRank[ord]]
		labs = paste("PC",1:2, " (", round(pc$eig[1:2,2],2), "%)", sep="")
		ymin = min(pc$ind$coord[,2])
		ymax = max(pc$ind$coord[,2])
    ytop = ymax+(ymax-ymin)*0.12
		plot(pc$ind$coord[,1:2], xlab=labs[1], ylab=labs[2], cex=cex, pch=21, col=allcols, lwd=s, ylim=c(ymin,ytop), 
			main=paste("PCA clustering (",alteration,", ",varName,"), ",design," design",sep=""),	frame.plot=FALSE)
		points(pc$ind$coord[,1:2], xlab=labs[1], ylab=labs[2], cex=cex, pch=19, col=allcols_t)
    if (.isCateg(fad,varName)) {
      legend("top", legend=fad@vData$varValuesNames[[varName]], col=cols, horiz=TRUE, lwd=10, box.col="white", cex=1.3)
		} else {
		  xmin = min(pc$ind$coord[,1])
		  xmax = max(pc$ind$coord[,1])
  		image(x=seq(xmin,xmax,l=length(cols)), y=c(ymax+(ytop-ymax)*0.6,ytop-(ytop-ymax)*0.2),
            z=matrix(rep(seq(length(cols)),2),ncol=2), col=cols, axes=FALSE,xlab="",ylab="", add=TRUE)
      qu = quantile(dat[,vc],c(0,0.5,1))
  		legend("topleft", legend=qu[1], cex=1.2, bty="n", adj=1)
  		legend("top", legend=qu[2], cex=1.2, bty="n")
  		legend("topright", legend=qu[3], cex=1.2, bty="n")
		}
	}
	pc
}
Any scripts or data that you put into this service are public.
facopy documentation built on May 2, 2018, 2:30 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
facopy
Feature-based association and gene-set enrichment for copy number alteration analysis in cancer

R/facopy_internal_v45.r
In facopy: Feature-based association and gene-set enrichment for copy number alteration analysis in cancer

Documented in addFeatures addVariables alterationSummary calculateCor facopy facopyEnrichment facopyPlot getFacopyInfo plotBar plotHist plotPCA plotZoom preview readCNData variableCor variableSummary

Try the facopy package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

facopy Feature-based association and gene-set enrichment for copy number alteration analysis in cancer

R/facopy_internal_v45.r In facopy: Feature-based association and gene-set enrichment for copy number alteration analysis in cancer

Documented in addFeatures addVariables alterationSummary calculateCor facopy facopyEnrichment facopyPlot getFacopyInfo plotBar plotHist plotPCA plotZoom preview readCNData variableCor variableSummary

Try the facopy package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

facopy
Feature-based association and gene-set enrichment for copy number alteration analysis in cancer

R/facopy_internal_v45.r
In facopy: Feature-based association and gene-set enrichment for copy number alteration analysis in cancer