#' @title Survival analysis with univariate Cox regression package (dnet)
#' @description TCGAvisualize_SurvivalCoxNET can help an user to identify a group of survival genes that are
#' significant from univariate Kaplan Meier Analysis and also for Cox Regression.
#' It shows in the end a network build with community of genes with similar range of pvalues from
#' Cox regression (same color) and that interaction among those genes is already validated in
#' literatures using the STRING database (version 9.1).
#' TCGAvisualize_SurvivalCoxNET perform survival analysis with univariate Cox regression
#' and package (dnet) using following functions wrapping from these packages:
#' \enumerate{
#' \item survival::coxph
#' \item igraph::subgraph.edges
#' \item igraph::layout.fruchterman.reingold
#' \item igraph::spinglass.community
#' \item igraph::communities
#' \item dnet::dRDataLoader
#' \item dnet::dNetInduce
#' \item dnet::dNetPipeline
#' \item dnet::visNet
#' \item dnet::dCommSignif
#' }
#' @details TCGAvisualize_SurvivalCoxNET allow user to perform the complete workflow using coxph
#' and dnet package related to survival analysis with an identification of gene-active networks from
#' high-throughput omics data using gene expression and clinical data.
#' \enumerate{
#' \item Cox regression survival analysis to obtain hazard ratio (HR) and p-values
#' \item fit a Cox proportional hazards model and ANOVA (Chisq test)
#' \item Network comunites
#' \item An igraph object that contains a functional protein association network in human.
#' The network is extracted from the STRING database (version 9.1).
#' Only those associations with medium confidence (score>=400) are retained.
#' \item restrict to those edges with high confidence (score>=700)
#' \item extract network that only contains genes in pvals
#' \item Identification of gene-active network
#' \item visualisation of the gene-active network itself
#' \item the layout of the network visualisation (fixed in different visuals)
#' \item color nodes according to communities (identified via a spin-glass model and simulated annealing)
#' \item node sizes according to degrees
#' \item highlight different communities
#' \item visualize the subnetwork
#' }
#' @param clinical_patient is a data.frame using function 'clinic' with information
#' related to barcode / samples such as bcr_patient_barcode, days_to_death ,
#' days_to_last_followup , vital_status, etc
#' @param dataGE is a matrix of Gene expression (genes in rows, samples in cols) from TCGAprepare
#' @param Genelist is a list of gene symbols where perform survival KM.
#' @param org.Hs.string an igraph object that contains a functional protein association network
#' in human. The network is extracted from the STRING database (version 10).
#' @param scoreConfidence restrict to those edges with high confidence (eg. score>=700)
#' @param titlePlot is the title to show in the final plot.
#' @export
#' @return net IGRAPH with related Cox survival genes in community (same pval and color) and with
#' interactions from STRING database.
TCGAvisualize_SurvivalCoxNET <- function(
clinical_patient,
dataGE,
Genelist,
org.Hs.string,
scoreConfidence = 700,
titlePlot = "TCGAvisualize_SurvivalCoxNET Example"
){
check_package("survival")
check_package("dnet")
check_package("igraph")
combined_score <- NULL
pdf("SurvivalCoxNETOutput.pdf", width = 15, height = 10)
## fit a Cox proportional hazards model for age, gender, tumor type
cfu <- clinical_patient[clinical_patient[,"bcr_patient_barcode"] %in% substr(colnames(dataGE),1,12),]
rownames(cfu) <- cfu$bcr_patient_barcode
cfu <- cfu %>% dplyr::select(
"bcr_patient_barcode",
"days_to_last_follow_up",
"days_to_death",
"vital_status",
"age_at_diagnosis",
"gender"
)
rownames(cfu) <- cfu$bcr_patient_barcode
cfu <- cfu[,-1]
colnames(cfu) <- c("time","timeDead","status","Age","Gender")
cfu[which(cfu$status == "Alive"),"status"]<-0
cfu[which(cfu$status == "Dead"),"time"]<- cfu[which(cfu$status=="Dead"),"timeDead"]
cfu[which(cfu$status == "Dead"),"status"]<-1
cfu$Gender <- tolower(cfu$Gender)
cfu <- cfu[,-2]
cfu <- as.data.frame(subset(cfu, select=c("time","status")))
cfu$time <- as.numeric(cfu$time)
cfu$status <- as.numeric(cfu$status)
data <-as.data.frame(dataGE)
colnames(data) <- substr(colnames(data),1,12)
commonSamples <- intersect(rownames(cfu), colnames(data))
data <- t(data)
data <- data[commonSamples,Genelist]
#rownames(tabSurvKMfilt) <- tabSurvKMfilt$mRNA
#colnames(Cancer_rnaseqv2) <- substr(colnames(Cancer_rnaseqv2),1,12)
#md_selected<-log2(Cancer_rnaseqv2[rownames(tabSurvKMfilt),rownames(cfu)])
md_selected <- data
pd <- cfu
## survival analysis to obtain hazard ratio (HR) and pvaules
HR <- rep(1, ncol(md_selected))
pvals <- rep(1, ncol(md_selected))
for(i in 1:ncol(md_selected)){
## fit a Cox proportional hazards model
data <- cbind(pd, gene = md_selected[,i])
data <- data [which(data$gene !="-Inf"),]
fit <- survival::coxph(formula=survival::Surv(time,status) ~., data=data)
## ANOVA (Chisq test)
cox <- summary(fit)
cat(paste( (ncol(md_selected)-i),".",sep=""))
# res <- as.matrix(anova(fit, test="Chisq"))
HR[i] <- as.numeric(cox$coefficients[2])
pvals[i] <- as.numeric(cox$logtest[3])
}
names(HR) <- colnames(md_selected)
names(pvals) <- colnames(md_selected)
# Network comunites >>=
# An igraph object that contains a functional protein association network
# in human. The network is extracted from the STRING database (version 9.1).
# Only those associations with medium confidence (score>=400) are retained.
# org.Hs.string <- dRDataLoader(RData='org.Hs.string')
# restrict to those edges with high confidence (score>=700)
# with(org.Hs.string,{
# network <- subgraph.edges(org.Hs.string, eids=E(org.Hs.string)[combined_score>=scoreConfidence])})
#network
network <- igraph::subgraph.edges(org.Hs.string, eids=igraph::E(org.Hs.string)[combined_score>=scoreConfidence])
# extract network that only contains genes in pvals
ind <- match(igraph::V(network)$symbol, names(pvals))
## for extracted graph
nodes_mapped <- igraph::V(network)$name[!is.na(ind)]
network <- dnet::dNetInduce(g=network, nodes_query=nodes_mapped, knn=0,
remove.loops=FALSE, largest.comp=TRUE)
igraph::V(network)$name <- igraph::V(network)$symbol
# Identification of gene-active network
net <- dnet::dNetPipeline(g=network, pval=pvals, method="customised",
significance.threshold=5e-02)
# visualisation of the gene-active network itself
## the layout of the network visualisation (fixed in different visuals)
glayout <- igraph::layout.fruchterman.reingold(net)
## color nodes according to communities (identified via a spin-glass model and simulated annealing)
com <- igraph::spinglass.community(net, spins=25)
com$csize <- sapply(1:length(com),function(x) sum(com$membership==x))
vgroups <- com$membership
colormap <- "yellow-darkorange"
palette.name <- supraHex::visColormap(colormap=colormap)
mcolors <- palette.name(length(com))
vcolors <- mcolors[vgroups]
com$significance <- dnet::dCommSignif(net, com)
## node sizes according to degrees
vdegrees <- igraph::degree(net)
## highlight different communities
mark.groups <- igraph::communities(com)
mark.col <- supraHex::visColoralpha(mcolors, alpha=0.2)
mark.border <- supraHex::visColoralpha(mcolors, alpha=0.2)
edge.color <- c("#C0C0C0", "#000000")[igraph::crossing(com,net)+1]
edge.color <- supraHex::visColoralpha(edge.color, alpha=0.5)
## visualise the subnetwrok
dnet::visNet(g=net, glayout=glayout, vertex.label=igraph::V(net)$geneSymbol,
vertex.color=vcolors, vertex.frame.color=vcolors,
vertex.shape="sphere", mark.groups=mark.groups, mark.col=mark.col,
mark.border=mark.border, mark.shape=1, mark.expand=10,
edge.color=edge.color, newpage=FALSE, vertex.label.color="blue",
vertex.label.dist=0.4, vertex.label.font=2, main = titlePlot)
legend_name <- paste("C",1:length(mcolors)," (n=",com$csize,", pval=",signif(com$significance,digits=2),")",sep='')
legend("topleft", legend=legend_name, fill=mcolors, bty="n", cex=1.4)
dev.off()
return(net)
}
#' @title Principal components analysis (PCA) plot
#' @description
#' TCGAvisualize_PCA performs a principal components analysis (PCA) on the given data matrix
#' and returns the results as an object of class prcomp, and shows results in PCA level.
#' @param dataFilt A filtered dataframe or numeric matrix where each row represents a gene,
#' each column represents a sample from function TCGAanalyze_Filtering
#' @param dataDEGsFiltLevel table with DEGs, log Fold Change (FC), false discovery rate (FDR),
#' the gene expression level, etc, from function TCGAanalyze_LevelTab.
#' @param ntopgenes number of DEGs genes to plot in PCA
#' @param group1 a string containing the barcode list of the samples in in control group
#' @param group2 a string containing the barcode list of the samples in in disease group
#' the name of the group
#' @import ggplot2
#' @export
#' @return principal components analysis (PCA) plot of PC1 and PC2
#' @examples
#' # normalization of genes
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(tabDF = dataBRCA, geneInfo = geneInfo,
#' method = "geneLength")
#' # quantile filter of genes
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut = 0.25)
#' # Principal Component Analysis plot for ntop selected DEGs
#' # selection of normal samples "NT"
#' group1 <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' # selection of normal samples "TP"
#' group2 <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' pca <- TCGAvisualize_PCA(dataFilt,dataDEGsFiltLevel, ntopgenes = 200, group1, group2)
TCGAvisualize_PCA <- function(dataFilt,dataDEGsFiltLevel ,ntopgenes,group1, group2) {
ComparisonSelected <- "Normal vs Tumor"
TitlePlot <- paste0("PCA ", "top ", ntopgenes,
" Up and down diff.expr genes between ",
ComparisonSelected)
dataFilt <- dataFilt[!duplicated(GenesCutID(rownames(dataFilt))),]
rownames(dataFilt) <- GenesCutID(rownames(dataFilt))
Genelist <- rownames(dataDEGsFiltLevel)[1:ntopgenes]
commonGenes <- intersect(Genelist, rownames(dataFilt) )
expr2 <- dataFilt[commonGenes,]
color1 <- "blue"
color2 <- "red"
nsample1 <- length(group1)
nsample2 <- length(group2)
#sampleColors <- rep(c(color1,color2), c(nsample1, nsample2))
#sampleColors <- rep(c("blue","red"), c(length(group1),
# length(group2)))
sampleColors <- c(rep("blue", length(group1)),
rep("red", length(group2)))
names(sampleColors) <- colnames(expr2)
cancer.pca <- stats::prcomp(t(expr2),cor = TRUE)
g <- ggbiplot(cancer.pca, obs.scale = 1, var.scale = 1,
groups = sampleColors, ellipse = TRUE, circle = FALSE)
g <- g + scale_colour_manual(name = "",
values = c("blue" = "blue","red" = "red"))
with(g,
g <- g + geom_point(aes(colour = sampleColors), size = 3)
)
#shape = tabClusterNew$Study)
g <- g + theme(legend.direction = 'horizontal', legend.position = 'top')
g <- g + ggtitle(TitlePlot)
print(g)
return(cancer.pca)
}
#' @title barPlot for a complete Enrichment Analysis
#' @description
#' The figure shows canonical pathways significantly overrepresented (enriched) by the DEGs
#' (differentially expressed genes).
#' The most statistically significant canonical pathways identified
#' in DEGs list are listed according to their p value corrected FDR (-Log) (colored bars)
#' and the ratio of list genes found in each pathway over the total number of
#' genes in that pathway (Ratio, red line).
#' @param tf is a list of gene symbols
#' @param GOBPTab is results from TCGAanalyze_EAcomplete related to Biological Process (BP)
#' @param GOCCTab is results from TCGAanalyze_EAcomplete related to Cellular Component (CC)
#' @param GOMFTab is results from TCGAanalyze_EAcomplete related to Molecular Function (MF)
#' @param PathTab is results from TCGAanalyze_EAcomplete related to Pathways EA
#' @param nBar is the number of bar histogram selected to show (default = 10)
#' @param nRGTab is the gene signature list with gene symbols.
#' @param filename Name for the pdf. If null it will return the plot.
#' @param color A vector of colors for each barplot. Deafult: c("orange", "cyan","green","yellow")
#' @param text.size Text size
#' @param xlim Upper limit of the x-axis.
#' @param mfrow Vector with number of rows/columns of the plot. Default 2 rows/2 columns "c(2,2)"
#' @param fig.width Default 30
#' @param fig.height Default 15
#' @export
#' @import graphics
#' @return Complete barPlot from Enrichment Analysis showing significant (default FDR < 0.01)
#' BP,CC,MF and pathways enriched by list of genes.
#' @examples
#' Genelist <- c("FN1","COL1A1")
#' ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist)
#' TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP),
#' GOBPTab = ansEA$ResBP,
#' GOCCTab = ansEA$ResCC,
#' GOMFTab = ansEA$ResMF,
#' PathTab = ansEA$ResPat,
#' nRGTab = Genelist,
#' nBar = 10,
#' filename="a.pdf")
#' \dontrun{
#' Genelist <- rownames(dataDEGsFiltLevel)
#' system.time(ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist))
#' # Enrichment Analysis EA (TCGAVisualize)
#' # Gene Ontology (GO) and Pathway enrichment barPlot
#' TCGAvisualize_EAbarplot(tf = rownames(ansEA$ResBP),
#' GOBPTab = ansEA$ResBP,
#' GOCCTab = ansEA$ResCC,
#' GOMFTab = ansEA$ResMF,
#' PathTab = ansEA$ResPat,
#' nRGTab = Genelist,
#' nBar = 10)
#'}
TCGAvisualize_EAbarplot <- function(
tf,
GOMFTab,
GOBPTab,
GOCCTab,
PathTab,
nBar,
nRGTab,
filename = "TCGAvisualize_EAbarplot_Output.pdf",
text.size = 1.0,
mfrow = c(2, 2),
xlim = NULL,
fig.width = 30,
fig.height = 15,
color = c("orange", "cyan","green","yellow")
){
check_package("EDASeq")
if(!is.null(filename)) {
pdf(filename, width = fig.width, height = fig.height)
}
splitFun <- function(tf, Tab, nBar){
tmp <- lapply(Tab[tf, ], function(x) strsplit(x, ";"))
names(tmp) <- NULL
tmp <- matrix(unlist(tmp), ncol = 4, byrow = TRUE)
if (nrow(tmp) == 0 | tmp[1, 1] == "NA") return(matrix(0, ncol = 2))
tmp <- tmp[tmp[, 1] != "NA", , drop = FALSE]
tmp <- as.data.frame(tmp, stringsAsFactors = FALSE)
tmp[, 2] <- as.numeric(sub(" FDR= ", "", tmp[, 2]))
tmp[, 3] <- as.numeric(unlist(strsplit(matrix(unlist(strsplit(tmp[, 3],
"=")), nrow = 2)[2, ], ")")))
tmp[, 4] <- as.numeric(unlist(strsplit(matrix(unlist(strsplit(tmp[, 4],
"=")), nrow = 2)[2, ], ")")))
if (nrow(tmp) < nBar) nBar <- nrow(tmp)
tmp[, 2] <- -log10(tmp[, 2])
o <- order(tmp[, 2], decreasing = TRUE)
toPlot <- tmp[o[nBar:1], 1:2]
toPlot[, 1] <- paste0(toPlot[, 1], " (n=", tmp[o[nBar:1], 4], ")")
toPlot[, 3] <- tmp[o[nBar:1], 4]/tmp[o[nBar:1], 3]
return(toPlot)
}
par(mfrow = mfrow)
if(!missing(GOBPTab)){
if(!is.null(GOBPTab) & !all(is.na(GOBPTab))){
# Plotting GOBPTab
toPlot <- splitFun(tf, GOBPTab, nBar)
xAxis <- EDASeq::barplot(toPlot[, 2], horiz = TRUE, col = color[1],
main = "GO:Biological Process", xlab = "-log10(FDR)",xlim = xlim)
labs <- matrix(unlist(strsplit(toPlot[, 1], "~")), nrow = 2)[2, ]
text(x = 1, y = xAxis, labs, pos = 4, cex = text.size)
lines(x = toPlot[, 3], y = xAxis, col = "red")
points(x = toPlot[, 3], y = xAxis, col = "red")
axis(side = 3, at = pretty(range(0:1)), col = "red")
}
}
if(!missing(GOCCTab)){
if(!is.null(GOCCTab) & !all(is.na(GOCCTab))){
# Plotting GOCCTab
toPlot <- splitFun(tf, GOCCTab, nBar)
xAxis <- EDASeq::barplot(toPlot[, 2], horiz = TRUE, col = color[2],
main = "GO:Cellular Component", xlab = "-log10(FDR)",xlim = xlim)
labs <- matrix(unlist(strsplit(toPlot[, 1], "~")), nrow = 2)[2, ]
text(x = 1, y = xAxis, labs, pos = 4, cex = text.size)
lines(x = toPlot[, 3], y = xAxis, col = "red")
points(x = toPlot[, 3], y = xAxis, col = "red")
axis(side = 3, at = pretty(range(0:1)), col = "red")
}
}
if(!missing(GOMFTab)){
if(!is.null(GOMFTab) & !all(is.na(GOMFTab))){
# Plotting GOMFTab
toPlot <- splitFun(tf, GOMFTab, nBar)
xAxis <- EDASeq::barplot(toPlot[, 2], horiz = TRUE, col = color[3],
main = "GO:Molecular Function", xlab = "-log10(FDR)",xlim = xlim)
labs <- matrix(unlist(strsplit(toPlot[, 1], "~")), nrow = 2)[2, ]
text(x = 1, y = xAxis, labs, pos = 4, cex = text.size)
lines(x = toPlot[, 3], y = xAxis, col = "red")
points(x = toPlot[, 3], y = xAxis, col = "red")
axis(side = 3, at = pretty(range(0:1)), col = "red")
}
}
if(!missing(PathTab)){
if(!is.null(PathTab) & !all(is.na(PathTab))){
# Plotting PathTab
toPlot <- splitFun(tf, PathTab, nBar)
xAxis <- EDASeq::barplot(toPlot[, 2], horiz = TRUE, col = color[4],
main = "Pathways", xlab = "-log10(FDR)",xlim = xlim)
labs <- toPlot[, 1]
text(x = 1, y = xAxis, labs, pos = 4, cex = text.size)
lines(x = toPlot[, 3], y = xAxis, col = "red")
points(x = toPlot[, 3], y = xAxis, col = "red")
#axis(side = 1, at = pretty(range(0:1)), col = "red", line = 2.5)
axis(side = 3, at = pretty(range(0:1)), col = "red")
}
}
#par(new = TRUE)
#plot(toPlot[, 3], xAxis, axes = FALSE, bty = "n", xlab = "",
# ylab = "", col = "blue")
#par(new = TRUE)
#plot(toPlot[, 3], xAxis, type = "l", axes = FALSE, bty = "n", xlab = "",
# ylab = "", col = "blue")
#axis(side = 2, at = pretty(range(xAxis)))
#axis(side = 1, at = pretty(range(toPlot[, 3])), col = "red", line=2.5)
#axis(side = 3, at = pretty(range(toPlot[, 3])), col = "red")
if (is.null(nrow(nRGTab))) {
nRG <- length(nRGTab)
} else {
nRG <- nRGTab[tf, "RegSizeTF"]
}
mainLab <- paste(tf, " (nRG = ", nRG, ")", sep = "")
mtext(mainLab, side = 3, line = -1, outer = TRUE, font = 2)
if(!is.null(filename)) dev.off()
}
#' @title Barplot of subtypes and clinical info in groups of gene expression clustered.
#' @description
#' Barplot of subtypes and clinical info in groups of gene expression clustered.
#' @param DFfilt write
#' @param DFclin write
#' @param DFsubt write
#' @param data_Hc2 write
#' @param Subtype write
#' @param cbPalette Define the colors of the bar.
#' @param filename The name of the pdf file
#' @param width Image width
#' @param height Image height
#' @param dpi Image dpi
#' @import ggplot2
#' @export
#' @return barplot image in pdf or png file
TCGAvisualize_BarPlot <- function(DFfilt,
DFclin,
DFsubt,
data_Hc2,
Subtype,
cbPalette,
filename,
width,
height,
dpi){
if(Subtype =="AGE"){
dataClinNew <- dataClin
colnames(dataClin)[which(colnames(dataClin) == "age_at_initial_pathologic_diagnosis")] <- "AGE"
dataClin$AGE <- as.numeric(as.character(dataClin$AGE))
dataClin <- cbind(dataClin, AGE2 = matrix(0,nrow(dataClin),1))
dataClin[ dataClin$AGE <= 32, "AGE2"] <- "<=32 yr"
dataClin[ dataClin$AGE >= 41, "AGE2"] <- ">=41 yr"
dataClin[ dataClin$AGE2 == 0, "AGE2"] <- "33-40 yr"
dataClin$AGE <- as.character(dataClin$AGE2)
DFclin <- dataClin
}
ans <- hclust(ddist <- dist(DFfilt), method = "ward.D2")
hhc <- data_Hc2[[4]]$consensusTree
consensusClusters<-data_Hc2[[4]]$consensusClass
sampleOrder <- consensusClusters[hhc$order]
consensusClusters <- as.factor(data_Hc2[[4]]$clrs[[1]])
names(consensusClusters) <- attr(ddist, "Labels")
names(consensusClusters) <- substr(names(consensusClusters),1,12)
# adding information about gropus from consensus Cluster in clinical data
DFclin <- cbind(DFclin, groupsHC = matrix(0,nrow(DFclin),1))
rownames(DFclin) <- DFclin$bcr_patient_barcode
for( i in 1: nrow(DFclin)){
currSmp <- DFclin$bcr_patient_barcode[i]
DFclin[currSmp,"groupsHC"] <- as.character(consensusClusters[currSmp])
}
DFclin_filt <- DFclin[DFclin$bcr_patient_barcode %in% DFsubt$patient,]
DFclin_filt <- DFclin_filt[order(DFclin_filt$bcr_patient_barcode),]
DFsubt <- DFsubt[order(DFsubt$patient),]
DFclin_merged <- cbind(DFclin_filt,DFsubt)
subtype_sel <- colnames(DFclin_merged) == Subtype
DFclin_merged[,Subtype] <- as.character(DFclin_merged[,Subtype])
DFclin_merged <- DFclin_merged[!(is.na(DFclin_merged[,Subtype])),]
DFclin_merged <- DFclin_merged[DFclin_merged[,Subtype] !="NA",]
groupsColors <- levels(as.factor(DFclin_merged$groupsHC))
for(j in 1:length(table(DFclin_merged$groupsHC))){
curCol <- groupsColors[j]
DFclin_merged[DFclin_merged$groupsHC == curCol,"groupsHC"]<-paste0("EC",j)
}
subtypeCluster <- factor(DFclin_merged[,Subtype])
pplot <- qplot(factor(DFclin_merged$groupsHC),
data=DFclin_merged, geom="bar",
fill=subtypeCluster , xlab="Expression group") +
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black")) +
guides(fill=guide_legend(title=Subtype)) +
theme(legend.position="top",
legend.text = element_text(size = 18),
legend.title = element_text(size=18, face="bold")) +
theme( legend.text = element_text(size = 18),
legend.title = element_text(size = 18),
axis.text= element_text(size = 30),
axis.title.x= element_text(size = 22),
axis.title.y= element_text(size = 30)) +
scale_fill_manual(values=cbPalette)
ggsave(pplot, filename = filename, width = width, height = height, dpi = dpi)
message(paste("Plot saved in: ", file.path(getwd(),filename)))
}
#' @title Heatmap with more sensible behavior using heatmap.plus
#' @description Heatmap with more sensible behavior using heatmap.plus
#' @param data The object to with the heatmap data (expression, methylation)
#' @param col.metadata Metadata for the columns (samples). It should have on of the following columns:
#' barcode (28 characters) column to match with the samples. It will also work with
#' "bcr_patient_barcode"(12 chars),"patient"(12 chars),"sample"(16 chars) columns but as one patient might
#' have more than one sample, this coul lead to errors in the annotation.
#' The code will throw a warning in case two samples are from the same patient.
#' @param row.metadata Metadata for the rows genes (expression) or probes (methylation)
#' @param col.colors A list of names colors
#' @param row.colors A list of named colors
#' @param type Select the colors of the heatmap values. Possible values are
#' "expression" (default), "methylation"
#' @param show_column_names Show column names names? Default: FALSE
#' @param show_row_names Show row names? Default: FALSE
#' @param cluster_rows Cluster rows ? Default: FALSE
#' @param cluster_columns Cluster columns ? Default: FALSE
#' @param filename Filename to save the heatmap. Default: heatmap.png
#' @param width figure width
#' @param height figure height
#' @param sortCol Name of the column to be used to sort the columns
#' @param title Title of the plot
#' @param rownames.size Rownames size
#' @param color.levels A vector with the colors (low level, middle level, high level)
#' @param extremes Extremes of colors (vector of 3 values)
#' @param values.label Text of the levels in the heatmap
#' @param heatmap.legend.color.bar Heatmap legends values type.
#' Options: "continuous", "discrete"
#' @param scale Use z-score to make the heatmap?
#' If we want to show differences between genes, it is good to make Z-score by samples
#' (force each sample to have zero mean and standard deviation=1).
#' If we want to show differences between samples, it is good to make Z-score by genes
#' (force each gene to have zero mean and standard deviation=1).
#' Possibilities: "row", "col". Default "none"
#' @examples
#' row.mdat <- matrix(c("FALSE","FALSE",
#' "TRUE","TRUE",
#' "FALSE","FALSE",
#' "TRUE","FALSE",
#' "FALSE","TRUE"
#' ),
#' nrow = 5, ncol = 2, byrow = TRUE,
#' dimnames = list(
#' c("probe1", "probe2","probe3","probe4","probe5"),
#' c("duplicated", "Enhancer region")))
#' dat <- matrix(c(0.3,0.2,0.3,1,1,0.1,1,1,0, 0.8,1,0.7,0.7,0.3,1),
#' nrow = 5, ncol = 3, byrow = TRUE,
#' dimnames = list(
#' c("probe1", "probe2","probe3","probe4","probe5"),
#' c("TCGA-DU-6410",
#' "TCGA-DU-A5TS",
#' "TCGA-HT-7688")))
#'
#' mdat <- data.frame(patient=c("TCGA-DU-6410","TCGA-DU-A5TS","TCGA-HT-7688"),
#' Sex=c("Male","Female","Male"),
#' COCCluster=c("coc1","coc1","coc1"),
#' IDHtype=c("IDHwt","IDHMut-cod","IDHMut-noncod"))
#'
#'TCGAvisualize_Heatmap(dat,
#' col.metadata = mdat,
#' row.metadata = row.mdat,
#' row.colors = list(duplicated = c("FALSE" = "pink",
#' "TRUE"="green"),
#' "Enhancer region" = c("FALSE" = "purple",
#' "TRUE"="grey")),
#' col.colors = list(Sex = c("Male" = "blue", "Female"="red"),
#' COCCluster=c("coc1"="grey"),
#' IDHtype=c("IDHwt"="cyan",
#' "IDHMut-cod"="tomato"
#' ,"IDHMut-noncod"="gold")),
#' type = "methylation",
#' show_row_names=TRUE)
#' @export
#' @return Heatmap plotted in the device
TCGAvisualize_Heatmap <- function(
data,
col.metadata,
row.metadata,
col.colors = NULL,
row.colors = NULL,
show_column_names = FALSE,
show_row_names = FALSE,
cluster_rows = FALSE,
cluster_columns = FALSE,
sortCol,
extremes = NULL,
rownames.size = 12,
title = NULL,
color.levels = NULL,
values.label = NULL,
filename = "heatmap.pdf",
width = 10,
height = 10,
type = "expression",
scale = "none",
heatmap.legend.color.bar = "continuous"){
check_package("ComplexHeatmap")
check_package("circlize")
# STEP 1 add columns labels (top of heatmap)
ha <- NULL
if(!missing(col.metadata)) {
if(!is.null(col.metadata)) {
id <- NULL
if("patient" %in% colnames(col.metadata)) {
id <- "patient"
size <- 12
}
if("barcode" %in% colnames(col.metadata)) {
id <- "barcode"
stopifnot(nchar(col.metadata[,id])[1] == 28)
size <- 28
}
if("bcr_patient_barcode" %in% colnames(col.metadata)) {
id <- "bcr_patient_barcode"
stopifnot(nchar(col.metadata[,id])[1] == 12)
size <- 12
}
if("sample" %in% colnames(col.metadata)) {
id <- "sample"
stopifnot(nchar(col.metadata[,id])[1] == 16)
size <- 16
}
if(is.null(id)) {
message("=============== INNPUT ERROR =================")
message("I'm expecting one of these columns:")
message(" => barcode")
message(" Has the complete barcode (TCGA-AA-3833-01A-01D-0904-05)")
message(" => bcr_patient_barcode")
message(" Has the patient barcode (TCGA-AA-3833)")
message(" => patient")
message(" Has the patient barcode (TCGA-AA-3833)")
message(" => sample")
message(" Has the sample barcode (TCGA-AA-3833-01A)")
message("-----------------------------------------------")
message("Obs: The complete barcode is the recommended one, as the others might lead to errors")
return(NULL)
}
stopifnot(nchar(as.character(col.metadata[,id])[1]) == size)
message(paste0("Reorganizing: col.metadata order should be the same of the data object"))
df <- col.metadata[match(substr(colnames(data),1,size), col.metadata[,id]),]
df[,id] <- NULL
duplicated.samples <- any(sapply(col.metadata[,id],
function(x) {length(grep(x,col.metadata[,id])) > 1 }))
if(duplicated.samples){
warning("Some samples are from the same patient, this might lead to the wrong upper annotation")
}
if (!missing(sortCol)) {
message(paste0("Sorting columns based on column: ",
sortCol))
column_order <- order(df[,sortCol])
}
if(is.null(col.colors)) {
ha <- ComplexHeatmap::HeatmapAnnotation(df = df)
} else {
ha <- ComplexHeatmap::HeatmapAnnotation(df = df,
col = col.colors)
}
}
}
# STEP 2 Create heatmap
if(is.null(color.levels)) {
if (type == "expression") color.levels <- c("green", "white", "red")
if (type == "methylation") color.levels <- c("blue", "white", "red")
}
# If we want to show differences between genes, it is good to make Z-score by samples
# (force each sample to have zero mean and standard deviation=1).
# If we want to show differences between samples, it is good to make Z-score by genes
# (force each gene to have zero mean and standard deviation=1).
if(scale == "row"){
message("Calculating z-scores for the rows....")
data <- t(scale(t(data)))
all.na <- apply(data,1, function(x) all(is.na(x)))
data <- data[!all.na,]
} else if(scale == "col"){
message("Calculiating z-scores for the columns....")
data <- scale(data)
}
if(is.null(extremes)) {
if(min(data,na.rm = TRUE) < 0) {
extremes <- c(min(data,na.rm = TRUE), (max(data,na.rm = TRUE) + min(data,na.rm = TRUE))/2, max(data,na.rm = TRUE))
} else {
extremes <- c(0, max(data,na.rm = TRUE)/2, max(data,na.rm = TRUE))
}
}
if (type == "expression") color <- circlize::colorRamp2(extremes, color.levels)
if (type == "methylation") color <- circlize::colorRamp2(extremes, color.levels)
# Creating plot title
if(is.null(title)) {
if(type == "methylation") title <- "DNA methylation heatmap"
if(type == "expression") title <- "Expression heatmap"
}
# Change label type
heatmap_legend_param <- list()
if(heatmap.legend.color.bar == "continuous" && type == "methylation"){
heatmap_legend_param <- c(list(color_bar = "continuous"),heatmap_legend_param)
if(!scale %in% c("row","col")) heatmap_legend_param <- list(color_bar = "continuous", at = c(0,0.2,0.4,0.6,0.8, 1), legend_height = unit(3, "cm"), labels = c("0.0 (hypomethylated)",0.2,0.4,0.6,0.8,"1.0 (hypermethylated)"))
}
if(heatmap.legend.color.bar == "continuous" && type == "expression"){
heatmap_legend_param <- c(list(color_bar = "continuous"),heatmap_legend_param)
}
# Change label reference
if(is.null(values.label)){
if(type == "methylation") values.label <- "DNA methylation level"
if(type == "expression") {
values.label <- "Expression"
if(scale != "none") values.label <- paste0(values.label, "(z-score)")
}
}
if(!missing(sortCol) & heatmap.legend.color.bar == "continuous"){
heatmap <- ComplexHeatmap::Heatmap(
data,
name = values.label,
top_annotation = ha,
col = color,
row_names_gp = grid::gpar(fontsize = rownames.size),
show_row_names = show_row_names,
cluster_rows = cluster_rows,
cluster_columns = cluster_columns,
show_column_names = show_column_names,
column_order = column_order,
column_title = title,
heatmap_legend_param = heatmap_legend_param)
} else if(missing(sortCol) & heatmap.legend.color.bar == "continuous"){
heatmap <- ComplexHeatmap::Heatmap(
data,
name = values.label,
top_annotation = ha,
col = color,
show_row_names = show_row_names,
row_names_gp = grid::gpar(fontsize = rownames.size),
cluster_rows = cluster_rows,
cluster_columns = cluster_columns,
show_column_names = show_column_names,
column_title = title,
heatmap_legend_param = heatmap_legend_param)
} else if(!missing(sortCol)){
heatmap <- ComplexHeatmap::Heatmap(
data,
name = values.label,
top_annotation = ha,
col = color,
row_names_gp = grid::gpar(fontsize = rownames.size),
show_row_names = show_row_names,
cluster_rows = cluster_rows,
cluster_columns = cluster_columns,
show_column_names = show_column_names,
column_order = column_order,
column_title = title)
} else {
heatmap <- ComplexHeatmap::Heatmap(
data,
name = values.label,
top_annotation = ha,
col = color,
row_names_gp = grid::gpar(fontsize = rownames.size),
show_row_names = show_row_names,
cluster_rows = cluster_rows,
cluster_columns = cluster_columns,
show_column_names = show_column_names,
column_title = title,
heatmap_legend_param = heatmap_legend_param)
}
# STEP 3 row labels (right side)
if (!missing(row.metadata)) {
if (!is.null(row.metadata)) {
for (i in 1:ncol(row.metadata)) {
if (!missing(row.colors) && !is.null(row.colors[[colnames(row.metadata)[i]]])) {
color <- row.colors[[colnames(row.metadata)[i]]]
x = ComplexHeatmap::Heatmap(
row.metadata[,i] ,
name = colnames(row.metadata)[i],
width = unit(0.5, "cm"),
show_row_names = FALSE, col = color )
} else {
x = ComplexHeatmap::Heatmap(
row.metadata[,i] ,
name = colnames(row.metadata)[i],
width = unit(0.5, "cm"),
show_row_names = FALSE)
}
heatmap <- ComplexHeatmap::add_heatmap(heatmap,x)
}
}
}
if(!is.null(filename)){
if(tools::file_ext(filename) == "png") png(filename, width = width, height = height )
if(tools::file_ext(filename) == "pdf") pdf(filename, width = width, height = height )
ComplexHeatmap::draw(heatmap)
dev.off()
} else {
ComplexHeatmap::draw(heatmap)
}
}
# unlist labels
# Help function that unlists a list into a vector
unlistlabels <- function(lab) {
dummy <- unlist(lab)
labels <- c()
labels <- c(labels, as.character(dummy))
return(labels)
}
#' Creating a oncoprint
#' @param mut A dataframe from the mutation annotation file (see TCGAquery_maf from TCGAbiolinks)
#' @param genes Gene list
#' @param filename name of the pdf
#' @param color named vector for the plot
#' @param height pdf height
#' @param width pdf width
#' @param rm.empty.columns If there is no alteration in that sample, whether remove it on the oncoprint
#' @param show.row.barplot Show barplot annotation on rows?
#' @param show.column.names Show column names? Default: FALSE
#' @param rows.font.size Size of the fonts
#' @param column.names.size Size of the fonts of the columns names
#' @param dist.col distance between columns in the plot
#' @param dist.row distance between rows in the plot
#' @param label.font.size Size of the fonts
#' @param row.order Order the genes (rows) Default:TRUE. Genes with more mutations will be in the first rows
#' @param col.order Order columns. Default:TRUE.
#' @param annotation Matrix or data frame with the annotation.
#' Should have a column bcr_patient_barcode with the same ID of the mutation object
#' @param annotation.position Position of the annotation "bottom" or "top"
#' @param label.title Title of the label
#' @param annotation.legend.side Position of the annotation legend
#' @param heatmap.legend.side Position of the heatmap legend
#' @param information Which column to use as information from MAF.
#' Options: 1) "Variant_Classification" (The information will be "Frame_Shift_Del", "Frame_Shift_Ins",
#' "In_Frame_Del", "In_Frame_Ins", "Missense_Mutation", "Nonsense_Mutation",
#' "Nonstop_Mutation", "RNA", "Silent" , "Splice_Site", "Targeted_Region", "Translation_Start_Site")
#' 2) "Variant_Type" (The information will be INS,DEL,SNP)
#' @importFrom data.table dcast setDT setDF :=
#' @examples
#' \dontrun{
#' library(dplyr)
#' query <- GDCquery(
#' project = "TCGA-CHOL",
#' data.category = "Simple Nucleotide Variation",
#' access = "open",
#' legacy = FALSE,
#' data.type = "Masked Somatic Mutation",
#' workflow.type = "Aliquot Ensemble Somatic Variant Merging and Masking"
#')
#' GDCdownload(query)
#' mut <- GDCprepare(query)
#' TCGAvisualize_oncoprint(mut = mut, genes = mut$Hugo_Symbol[1:10], rm.empty.columns = TRUE)
#' TCGAvisualize_oncoprint(
#' mut = mut, genes = mut$Hugo_Symbol[1:10],
#' filename = "onco.pdf",
#' color = c("background"="#CCCCCC","DEL"="purple","INS"="yellow","SNP"="brown")
#' )
#' clin <- GDCquery_clinic("TCGA-ACC","clinical")
#' clin <- clin[,c("bcr_patient_barcode","disease","gender","tumor_stage","race","vital_status")]
#' TCGAvisualize_oncoprint(
#' mut = mut, genes = mut$Hugo_Symbol[1:20],
#' filename = "onco.pdf",
#' annotation = clin,
#' color=c("background"="#CCCCCC","DEL"="purple","INS"="yellow","SNP"="brown"),
#' rows.font.size=10,
#' heatmap.legend.side = "right",
#' dist.col = 0,
#' label.font.size = 10
#' )
#' }
#' @export
#' @return A oncoprint plot
TCGAvisualize_oncoprint <- function(
mut,
genes,
filename,
color,
annotation.position = "bottom",
annotation,
height,
width = 10,
rm.empty.columns = FALSE,
show.column.names = FALSE,
show.row.barplot = TRUE,
label.title = "Mutation",
column.names.size = 8,
label.font.size = 16,
rows.font.size = 16,
dist.col = 0.5,
dist.row = 0.5,
information = "Variant_Type",
row.order = TRUE,
col.order = TRUE,
heatmap.legend.side = "bottom",
annotation.legend.side = "bottom"
){
check_package("ComplexHeatmap")
check_package("circlize")
check_package("grid")
if(missing(mut)) stop("Missing mut argument")
mut <- setDT(mut)
mut$value <- 1
if(rm.empty.columns == FALSE) all.samples <- unique(mut$Tumor_Sample_Barcode)
mut$Hugo_Symbol <- as.character(mut$Hugo_Symbol)
if(!missing(genes) & !is.null(genes)) mut <- subset(mut, mut$Hugo_Symbol %in% genes)
if(!rm.empty.columns){
formula <- paste0("Tumor_Sample_Barcode + Hugo_Symbol ~ ", information)
suppressMessages({mat <- dcast(mut, as.formula(formula),value.var = "value",fill = 0,drop = FALSE)})
} else {
formula <- paste0("Tumor_Sample_Barcode + Hugo_Symbol ~ ", information)
suppressMessages({mat <- dcast(mut, as.formula(formula),value.var = "value",fill = 0,drop = TRUE)})
}
# mutation in the file
columns <- colnames(mat)[-c(1:2)]
# value will be a collum with all the mutations
mat$value <- ""
for ( i in columns){
mat[,i] <- replace(mat[,i,with = FALSE],mat[,i,with = FALSE]>0,paste0(i,";"))
mat[,i] <- replace(mat[,i,with = FALSE],mat[,i,with = FALSE]==0,"")
mat[,value:=paste0(value,get(i))]
}
# After the gene selection, some of the mutation might not exist
# we will remove them to make the oncoprint work
mutation.type <- c()
for (i in columns){
if(length(grep(i,mat$value)) > 0) mutation.type <- c(mutation.type,i)
}
# now we have a matrix with pairs samples/genes mutations
# we want a matrix with samples vs genes mutations with the content being the value
mat <- setDF(dcast(mat, Tumor_Sample_Barcode~Hugo_Symbol, value.var="value",fill=""))
rownames(mat) <- mat[,1]
mat <- mat[,-1]
if(rm.empty.columns == FALSE) {
aux <- data.frame(row.names = all.samples[!all.samples %in% rownames(mat)])
if(nrow(aux) > 0) {
aux[,colnames(mat)] <- ""
mat <- rbind(mat,aux)
}
}
alter_fun = function(x, y, w, h, v) {
n = sum(v)
h = h * 0.9
# use `names(which(v))` to correctly map between `v` and `col`
# Oncoprint with one value does not have names in v
if(is.null(names(v))){
if(v){
grid::grid.rect(
x,
y - h * 0.5 + 1:n / n * h,
w - unit(dist.col, "mm"),
1 / n * h,
gp = grid::gpar(fill = setdiff(color,color["background"]), col = NA),
just = "top"
)
} else {
grid::grid.rect(
x,
y,
w - unit(dist.col, "mm"),
h - unit(dist.row, "mm"),
gp = grid::gpar(fill = color["background"], col = NA)
)
}
} else {
if (length(names(which(v)))) {
grid::grid.rect(
x,
y - h * 0.5 + 1:n / n * h,
w - unit(dist.col, "mm"),
1 / n * h,
gp = grid::gpar(fill = color[names(which(v))], col = NA),
just = "top"
)
} else {
grid::grid.rect(
x,
y,
w - unit(dist.col, "mm"),
h - unit(dist.row, "mm"),
gp = grid::gpar(fill = color["background"], col = NA)
)
}
}
}
# get only the colors to the mutations
# otherwise it gives errors
if(missing(color)){
check_package("grDevices")
color <- c(grDevices::rainbow(length(mutation.type)), "#CCCCCC")
names(color) <- c(mutation.type,"background")
} else{
if("background" %in% names(color)) {
color <- color[c(mutation.type,"background")]
} else {
color <- c(color[mutation.type],"background"= "#CCCCCC")
}
}
print(color)
# header are samples, rows genes
mat <- t(mat)
if(!missing(height)) height <- length(genes)/2
if(!missing(filename)) pdf(filename,width = width,height = height)
if(missing(annotation)) annotation <- NULL
if(!is.null(annotation)){
if(!"bcr_patient_barcode" %in% colnames(annotation))
stop("bcr_patient_barcode column should be in the annotation")
idx <- match(substr(colnames(mat),1,12),annotation$bcr_patient_barcode)
if(all(is.na(idx)))
stop(" We couldn't match the columns names with the bcr_patient_barcode column in the annotation object")
annotation <- annotation[idx,]
annotation$bcr_patient_barcode <- NULL
n.col <- sum(sapply(colnames(annotation), function(x) {
length(unique(annotation[,x]))
}))
# add automatic colors: not working
get.color <- function(df,col){
idx <- which(colnames(df) == col)
start <- 1
if(idx != 1) start <- length(na.omit(unique(unlist(c(df[,1:(idx-1)]))))) + 1
end <- start + length(na.omit(unique(df[,col]))) -1
diff.colors <- c("purple","thistle","deeppink3","magenta4","lightsteelblue1","black",
"chartreuse","lightgreen","maroon4","darkslategray",
"lightyellow3","darkslateblue","firebrick1","aquamarine",
"dodgerblue4","bisque4","moccasin","indianred1",
"yellow","gray93","cyan","darkseagreen4",
"lightgoldenrodyellow","lightpink","sienna1",
"darkred","palevioletred","tomato4","blue",
"mediumorchid4","royalblue1","magenta2","darkgoldenrod1")
return(diff.colors[start:end])
}
col.annot <- lapply(colnames(annotation), function(x) {
#idx <- which(colnames(annotation) == x) - 1
#print(idx/n.col)
ret <- get.color(annotation,x)
#ret <- rainbow(length(unique(annotation[,x])),start = idx/n.col,alpha=0.5)
names(ret) <- as.character(na.omit(unique(annotation[,x])))
return(ret)
})
names(col.annot) <- colnames(annotation)
annotHeatmap <-
ComplexHeatmap::HeatmapAnnotation(
df = annotation,
col = col.annot,
annotation_legend_param = list(
title_gp = grid::gpar(fontsize = label.font.size,
fontface =
"bold"),
labels_gp =
grid::gpar(fontsize = label.font.size),
#sizelabels
grid_height =
unit(8, "mm")
)
)
}
if(heatmap.legend.side == "bottom") {
nrow <- 1
title_position <- "leftcenter"
} else {
nrow <- 10
title_position <- "topcenter"
}
if(is.null(annotation) & !row.order & !col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
row_order = NULL,
remove_empty_columns = FALSE,
show_column_names = show.column.names,
column_order = NULL, # Do not sort the columns
alter_fun = alter_fun, col = color,
column_names_gp = grid::gpar(fontsize = column.names.size),
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(!is.null(annotation) & annotation.position == "bottom" & !row.order & !col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
row_order = NULL,
remove_empty_columns = FALSE,
column_names_gp = grid::gpar(fontsize = column.names.size),
show_column_names = show.column.names,
column_order = NULL, # Do not sort the columns
alter_fun = alter_fun, col = color,
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
bottom_annotation = annotHeatmap,
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(!is.null(annotation) & annotation.position == "top" & !row.order & !col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
row_order = NULL,
remove_empty_columns = FALSE,
column_names_gp = grid::gpar(fontsize = column.names.size),
show_column_names = show.column.names,
column_order = NULL, # Do not sort the columns
alter_fun = alter_fun, col = color,
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
top_annotation = annotHeatmap,
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(is.null(annotation) & row.order & !col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
remove_empty_columns = FALSE,
show_column_names = show.column.names,
column_order = NULL, # Do not sort the columns
alter_fun = alter_fun, col = color,
column_names_gp = grid::gpar(fontsize = column.names.size),
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(!is.null(annotation) & annotation.position == "bottom" & row.order & !col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
remove_empty_columns = FALSE,
column_names_gp = grid::gpar(fontsize = column.names.size),
show_column_names = show.column.names,
column_order = NULL, # Do not sort the columns
alter_fun = alter_fun, col = color,
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
bottom_annotation = annotHeatmap,
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(!is.null(annotation) & annotation.position == "top" & row.order & !col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
remove_empty_columns = FALSE,
show_column_names = show.column.names,
column_order = NULL, # Do not sort the columns
alter_fun = alter_fun, col = color,
column_names_gp = grid::gpar(fontsize = column.names.size),
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
top_annotation = annotHeatmap,
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(is.null(annotation) & !row.order & col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
row_order = NULL,
remove_empty_columns = FALSE,
show_column_names = show.column.names,
alter_fun = alter_fun, col = color,
column_names_gp = grid::gpar(fontsize = column.names.size),
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(!is.null(annotation) & annotation.position == "bottom" & !row.order & col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
row_order = NULL,
remove_empty_columns = FALSE,
column_names_gp = grid::gpar(fontsize = column.names.size),
show_column_names = show.column.names,
alter_fun = alter_fun, col = color,
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
bottom_annotation = annotHeatmap,
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(!is.null(annotation) & annotation.position == "top" & !row.order & col.order){
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
row_order = NULL,
remove_empty_columns = FALSE,
column_names_gp = grid::gpar(fontsize = column.names.size),
show_column_names = show.column.names,
alter_fun = alter_fun, col = color,
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
top_annotation = annotHeatmap,
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(is.null(annotation) & row.order & col.order){
save(mat,color,alter_fun,file = "test.rda")
p <- ComplexHeatmap::oncoPrint(mat, get_type = function(x) strsplit(x, ";")[[1]],
remove_empty_columns = FALSE,
show_column_names = show.column.names,
alter_fun = alter_fun,
col = color,
column_names_gp = grid::gpar(fontsize = column.names.size),
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
heatmap_legend_param = list(title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
} else if(!is.null(annotation) & annotation.position == "bottom" & row.order & col.order){
p <- ComplexHeatmap::oncoPrint(
mat, get_type = function(x) strsplit(x, ";")[[1]],
remove_empty_columns = FALSE,
column_names_gp = grid::gpar(fontsize = column.names.size),
#show_row_barplot = show.row.barplot,
show_column_names = show.column.names,
alter_fun = alter_fun, col = color,
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#axis_gp = grid::gpar(fontsize = rows.font.size),# size of axis
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
#row_barplot_width = unit(2, "cm"), #size barplot
bottom_annotation = annotHeatmap,
heatmap_legend_param = list(
title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow,
title_position = title_position
)
)
} else if(!is.null(annotation) & annotation.position == "top" & row.order & col.order){
p <- ComplexHeatmap::oncoPrint(
mat, get_type = function(x) strsplit(x, ";")[[1]],
remove_empty_columns = FALSE,
show_column_names = show.column.names,
alter_fun = alter_fun, col = color,
column_names_gp = grid::gpar(fontsize = column.names.size),
row_names_gp = grid::gpar(fontsize = rows.font.size), # set size for row names
pct_gp = grid::gpar(fontsize = rows.font.size), # set size for percentage labels
#column_title = "OncoPrint for TCGA LGG, genes in Glioma signaling",
#column_title_gp = grid::gpar(fontsize = 11),
top_annotation = annotHeatmap,
heatmap_legend_param = list(
title = label.title, at = names(color),
labels = names(color),
title_gp = grid::gpar(fontsize = label.font.size, fontface = "bold"),
labels_gp = grid::gpar(fontsize = label.font.size), # size labels
grid_height = unit(8, "mm"), # vertical distance labels
nrow = nrow, title_position = title_position
)
)
}
ComplexHeatmap::draw(p, heatmap_legend_side = heatmap.legend.side,
annotation_legend_side = annotation.legend.side)
if(!missing(filename)) {
dev.off()
message(paste0("File saved as: ", filename ))
}
}
#' @title Creates a volcano plot for DNA methylation or gene expression
#' @description Creates a volcano plot from the
#' gene expression and DNA methylation analysis.
#' @details
#' Creates a volcano plot from the gene expression and DNA methylation analysis.
#' Please see the vignette for more information
#' @param x x-axis data (i.e. Diff mean beta-values or Log2FC).
#' @param y FDR adjusted p-value (q-value). This data will be transformed to -log10 values.
#' @param y.cut q-values threshold (i.e. 0.01, 10^-10)
#' @param x.cut x-axis threshold. Default: 0.0 If you give only one number (e.g. 0.2) the cut-offs will be
#' -0.2 and 0.2. Or you can give different cut-offs as a vector (e.g. c(-0.3,0.4))
#' @param filename File name: volcano.pdf, volcano.svg, volcano.png. If NULL returns
#' the ggplot object.
#' @param legend Legend title
#' @param color vector of colors to be used in graph
#' @param title main title. If not specified it will be
#' "Volcano plot (group1 vs group2)
#' @param ylab y axis text. Default: -Log10 FDR corrected P-values
#' @param xlab x axis text. Default: No text. Examples of input:
#' expression(paste(Log[2], "FoldChange"))
#' @param xlim x limits to cut image (i.e. c(-4,4))
#' @param ylim y limits to cut image (i.e. c(-1,10))
#' @param height Figure height
#' @param width Figure width
#' @param names Names to be plotted if significant.
#' Should be the same size of x and y
#' @param names.fill Names should be filled in a color box? Default: TRUE
#' @param names.size Size of the names text
#' @param dpi Figure dpi
#' @param label vector of labels to be used in the figure.
#' Example: c("Not Significant","Hypermethylated in group1",
#' "Hypomethylated in group1"))#'
#' @param highlight List of genes/probes to be highlighted. It should be in the names argument.
#' @param highlight.color Color of the points highlighted
#' @param show.names What names will be showed? Possibilities: "both", "significant", "highlighted"
#' @export
#' @return Saves the volcano plot in the current folder
#' @examples
#' log2_foldchange <- runif(200, -2, 2)
#' fdr <- runif(200, 0.01, 1)
#' TCGAVisualize_volcano(
#' x = log2_foldchange,
#' y = fdr,
#' x.cut = 1.5,
#' y.cut = 0.01,
#' title = "Title example",
#' xlab = expression(paste(Log[2], "FoldChange"))
#' )
#' \dontrun{
#' beta_diff <- runif(200, -1, 1)
#' fdr <- runif(200, 0.01, 1)
#' TCGAVisualize_volcano(
#' x = beta_diff,
#' y = fdr,
#' x.cut = 1.5,
#' y.cut = 0.01,
#' title = "Title example",
#' xlab = expression(paste("DNA Methylation difference (", beta, "-values)"))
#' )
#' TCGAVisualize_volcano(
#' x,
#' y,
#' filename = NULL,
#' y.cut = 10000000,
#' x.cut=0.8,
#' names = rep("AAAA",length(x)),
#' legend = "Status",
#' names.fill = FALSE
#' )
#'
#' TCGAVisualize_volcano(
#' x,
#' y,
#' filename = NULL,
#' y.cut = 10000000,
#' x.cut = 0.8,
#' names = as.character(1:length(x)),
#' legend = "Status",
#' names.fill = TRUE, highlight = c("1","2"),
#' show = "both"
#' )
#' TCGAVisualize_volcano(
#' x,
#' y,
#' filename = NULL,
#' y.cut = 10000000,
#' x.cut = c(-0.3,0.8),
#' names = as.character(1:length(x)),
#' legend = "Status",
#' names.fill = TRUE,
#' highlight = c("1","2"),
#' show = "both"
#' )
#' }
#' while (!(is.null(dev.list()["RStudioGD"]))){dev.off()}
TCGAVisualize_volcano <- function(
x,
y,
filename = "volcano.pdf",
ylab = expression(paste(-Log[10]," (FDR corrected P-values)")),
xlab = NULL,
title = "Volcano plot",
legend = NULL,
label = NULL,
xlim = NULL,
ylim = NULL,
color = c("black", "red", "green"),
names = NULL,
names.fill = TRUE,
show.names = "significant",
x.cut = 0,
y.cut = 0.01,
height = 5,
width = 10,
highlight = NULL,
highlight.color = "orange",
names.size = 4,
dpi = 300)
{
if (!is.null(names)) {
if (all(grepl("\\|", names))) {
names <- strsplit(names, "\\|")
names <- unlist(lapply(names, function(x)
x[1]))
}
}
.e <- environment()
threshold <- rep("1", length(x))
names(color) <- as.character(1:3)
if (is.null(label)) {
label = c(
"1" = "Not Significant",
"2" = "Up regulated",
"3" = "Down regulated"
)
} else {
names(label) <- as.character(1:3)
}
# get significant data
sig <- y < y.cut
sig[is.na(sig)] <- FALSE
# If x.cut
if (length(x.cut) == 1) {
x.cut.min <- -1 * x.cut
x.cut.max <- x.cut
}
if (length(x.cut) == 2) {
x.cut.min <- x.cut[1]
x.cut.max <- x.cut[2]
}
# hypermethylated/up regulated samples compared to old state
up <- x > x.cut.max
up[is.na(up)] <- FALSE
if (any(up & sig)) threshold[up & sig] <- "2"
# hypomethylated/ down regulated samples compared to old state
down <- x < x.cut.min
down[is.na(down)] <- FALSE
if (any(down & sig)) threshold[down & sig] <- "3"
if (!is.null(highlight)) {
idx <- which(names %in% highlight)
if (length(idx) > 0) {
threshold[which(names %in% highlight)] <- "4"
color <- c(color, highlight.color)
names(color) <- as.character(1:4)
}
}
df <- data.frame(
x = x,
y = y,
threshold = threshold
)
# As last color should be the highlighthed, we need to order all the vectors
if (!is.null(highlight)) {
order.idx <- order(df$threshold)
down <- down[order.idx]
sig <- sig[order.idx]
up <- up[order.idx]
df <- df[order.idx,]
names <- names[order.idx]
}
# Plot a volcano plot
p <- ggplot(
data = df,
aes(
x = x ,
y = -1 * log10(y),
colour = threshold
),
environment = .e
) +
geom_point() +
ggtitle(title) + ylab(ylab) + xlab(xlab) +
geom_vline(aes(xintercept = x.cut.min),
colour = "black",
linetype = "dashed") +
geom_vline(aes(xintercept = x.cut.max),
colour = "black",
linetype = "dashed") +
geom_hline(aes(yintercept = -1 * log10(y.cut)),
colour = "black",
linetype = "dashed") +
scale_color_manual(
breaks = as.numeric(names(label)),
values = color,
labels = label,
name = legend
) +
theme_bw() + theme(
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.text = element_text(size = 10),
plot.title = element_text(hjust = 0.5),
axis.line.x = element_line(colour = "black"),
axis.line.y = element_line(colour = "black"),
legend.position = "top",
legend.key = element_rect(colour = 'white')
)
# Label points with the textxy function from the calibrate plot
if (!is.null(names)) {
# With the names the user can highlight the significant genes, up and down
# or the ones highlighted
if (show.names == "significant") {
idx <- (up & sig) | (down & sig)
important <- c("2", "3")
} else if (show.names == "highlighted") {
if (!is.null(highlight)) {
idx <- (names %in% highlight)
important <- c("4")
} else {
message("Missing highlight argument")
return(NULL)
}
} else if (show.names == "both") {
if (!is.null(highlight)) {
idx <- (up & sig) | (down & sig) | (names %in% highlight)
important <- c("2", "3", "4")
} else {
message("Missing highlight argument")
return(NULL)
}
} else {
message("Wrong highlight argument")
return(NULL)
}
if (any(threshold %in% important)) {
check_package("ggrepel")
if (names.fill) {
p <- p + ggrepel::geom_label_repel(
data = subset(df, threshold %in% important),
aes(label = names[idx], fill = threshold),
size = names.size,
show.legend = FALSE,
fontface = 'bold',
color = 'white',
box.padding = unit(0.35, "lines"),
segment.colour = "grey",
point.padding = unit(0.3, "lines")
) + scale_fill_manual(values = color[as.numeric(important)])
} else {
p <- p + ggrepel::geom_text_repel(
data = subset(df, threshold %in% important),
aes(label = names[idx]),
size = names.size,
show.legend = FALSE,
segment.colour = "grey",
fontface = 'bold',
color = 'black',
point.padding = unit(0.3, "lines"),
box.padding = unit(0.5, 'lines')
)
}
}
}
if(!is.null(xlim)) {
p <- p + xlim(xlim)
}
if(!is.null(ylim)) {
p <- p + ylim(ylim)
}
if (!is.null(filename)) {
message("Saving file as: ", filename)
ggsave(
p,
filename = filename,
width = width,
height = height,
dpi = dpi
)
} else {
return(p)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.