#' @title Hierarchical cluster analysis
#' @description Hierarchical cluster analysis using several methods such as
#' ward.D", "ward.D2", "single", "complete", "average" (= UPGMA),
#' "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).
#' @param tabDF is a dataframe or numeric matrix, each row represents a gene,
#' each column represents a sample come from TCGAPrepare.
#' @param method is method to be used for generic cluster such as 'hclust'
#' or 'consensus'
#' @param methodHC is method to be used for Hierarchical cluster.
#' @import stats
#' @export
#' @return object of class hclust if method selected is 'hclust'.
#' If method selected is 'Consensus' returns a list of length maxK
#' (maximum cluster number to evaluate.). Each element is a list containing
#' consensusMatrix (numerical matrix), consensusTree (hclust), consensusClass
#' (consensus class assignments). ConsensusClusterPlus also produces images.
TCGAanalyze_Clustering <-
function(tabDF, method, methodHC = "ward.D2") {
if (!requireNamespace("ConsensusClusterPlus", quietly = TRUE)) {
stop("ConsensusClusterPlus is needed. Please install it.",
call. = FALSE)
}
if (method == "hclust") {
ans <- hclust(ddist <- dist(tabDF), method = methodHC)
}
if (method == "consensus") {
sHc <- hclust(ddist <- dist(tabDF), method = methodHC)
ans <-
ConsensusClusterPlus::ConsensusClusterPlus(
ddist,
maxK = 7,
pItem = 0.9,
reps = 1000,
title = "mc_consensus_k7_1000",
clusterAlg = "hc",
innerLinkage = "ward.D2",
finalLinkage = "complete",
plot = 'pdf',
writeTable = TRUE
)
}
return(ans)
}
#' @title Array Array Intensity correlation (AAIC) and correlation boxplot to define outlier
#' @description TCGAanalyze_Preprocessing perform Array Array Intensity correlation (AAIC).
#' It defines a square symmetric matrix of spearman correlation among samples.
#' According this matrix and boxplot of correlation samples by samples it is possible
#' to find samples with low correlation that can be identified as possible outliers.
#' @param object of gene expression of class RangedSummarizedExperiment from TCGAprepare
#' @param cor.cut is a threshold to filter samples according their spearman correlation in
#' samples by samples. default cor.cut is 0
#' @param filename Filename of the image file
#' @param width Image width
#' @param height Image height
#' @param datatype is a string from RangedSummarizedExperiment assay
#' @importFrom grDevices dev.list
#' @importFrom SummarizedExperiment assays
#' @export
#' @return Plot with array array intensity correlation and boxplot of correlation samples by samples
TCGAanalyze_Preprocessing <- function(object,
cor.cut = 0,
filename = NULL,
width = 1000,
height = 1000,
datatype = names(assays(object))[1]) {
# This is a work around for raw_counts and raw_count
if (grepl("raw_count", datatype) &
any(grepl("raw_count", names(assays(object)))))
datatype <-
names(assays(object))[grepl("raw_count", names(assays(object)))]
if (!any(grepl(datatype, names(assays(object)))))
stop(
paste0(
datatype,
" not found in the assay list: ",
paste(names(assays(object)), collapse = ", "),
"\n Please set the correct datatype argument."
)
)
if (!(is.null(dev.list()["RStudioGD"]))) {
dev.off()
}
if (is.null(filename))
filename <- "PreprocessingOutput.png"
png(filename, width = width, height = height)
par(oma = c(10, 10, 10, 10))
ArrayIndex <- as.character(1:length(colData(object)$barcode))
pmat_new <- matrix(0, length(ArrayIndex), 4)
colnames(pmat_new) <- c("Disease", "platform", "SampleID", "Study")
rownames(pmat_new) <- as.character(colData(object)$barcode)
pmat_new <- as.data.frame(pmat_new)
pmat_new$Disease <- as.character(colData(object)$definition)
pmat_new$platform <- "platform"
pmat_new$SampleID <- as.character(colData(object)$barcode)
pmat_new$Study <- "study"
tabGroupCol <-
cbind(pmat_new, Color = matrix(0, nrow(pmat_new), 1))
for (i in seq_along(unique(tabGroupCol$Disease))) {
tabGroupCol[which(tabGroupCol$Disease == tabGroupCol$Disease[i]), "Color"] <-
rainbow(length(unique(tabGroupCol$Disease)))[i]
}
# pmat <- as.matrix(pData(phenoData(object)))
pmat <- pmat_new
phenodepth <- min(ncol(pmat), 3)
order <- switch(
phenodepth + 1,
ArrayIndex,
order(pmat[, 1]),
order(pmat[, 1], pmat[, 2]),
order(pmat[, 1],
pmat[, 2], pmat[, 3])
)
arraypos <-
(1:length(ArrayIndex)) * (1 / (length(ArrayIndex) - 1)) - (1 / (length(ArrayIndex) - 1))
arraypos2 = seq(1:length(ArrayIndex) - 1)
for (i in 2:length(ArrayIndex)) {
arraypos2[i - 1] <- (arraypos[i] + arraypos[i - 1]) / 2
}
layout(matrix(c(1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 2, 3, 3, 3, 4), 4, 4, byrow = TRUE))
c <- cor(assay(object, datatype)[, order], method = "spearman")
image(c,
xaxt = "n",
yaxt = "n",
#xlab = "Array Samples",
#ylab = "Array Samples",
main = "Array-Array Intensity Correlation after RMA")
for (i in 1:length(names(table(tabGroupCol$Color)))) {
currentCol <- names(table(tabGroupCol$Color))[i]
pos.col <- arraypos[which(tabGroupCol$Color == currentCol)]
lab.col <-
colnames(c)[which(tabGroupCol$Color == currentCol)]
#axis(1, labels = lab.col , at = pos.col, col = currentCol,lwd = 6,las = 2)
axis(
2,
labels = lab.col ,
at = pos.col,
col = currentCol,
lwd = 6,
las = 2
)
}
m <-
matrix(pretty(c, 10), nrow = 1, ncol = length(pretty(c, 10)))
image(m,
xaxt = "n",
yaxt = "n",
ylab = "Correlation Coefficient")
axis(2,
labels = as.list(pretty(c, 10)),
at = seq(0, 1, by = (1 / (
length(pretty(c, 10)) - 1
))))
abline(h = seq((1 / (
length(pretty(c, 10)) - 1
)) / 2,
1 - (1 / (
length(pretty(c, 10)) - 1
)),
by = (1 / (
length(pretty(c, 10)) - 1
))))
box()
boxplot(
c,
outline = FALSE,
las = 2,
lwd = 6,
# names = NULL,
col = tabGroupCol$Color,
main = "Boxplot of correlation samples by samples after normalization"
)
dev.off()
samplesCor <- rowMeans(c)
objectWO <- assay(object, datatype)[, samplesCor > cor.cut]
colnames(objectWO) <- colnames(object)[samplesCor > cor.cut]
return(objectWO)
}
#' @title survival analysis (SA) univariate with Kaplan-Meier (KM) method.
#' @description TCGAanalyze_SurvivalKM perform an univariate Kaplan-Meier (KM) survival analysis (SA).
#' It performed Kaplan-Meier survival univariate using complete follow up with all days
#' taking one gene a time from Genelist of gene symbols.
#' For each gene according its level of mean expression in cancer samples,
#' defining two thresholds for quantile
#' expression of that gene in all samples (default ThreshTop=0.67,ThreshDown=0.33) it is possible
#' to define a threshold of intensity of gene expression to divide the samples in 3 groups
#' (High, intermediate, low).
#' TCGAanalyze_SurvivalKM performs SA between High and low groups using following functions
#' from survival package
#' \enumerate{
#' \item survival::Surv
#' \item survival::survdiff
#' \item survival::survfit
#' }
#' @param clinical_patient is a data.frame using function 'clinic' with information
#' related to barcode / samples such as bcr_patient_barcode, days_to_death ,
#' days_to_last_follow_up , vital_status, etc
#' @param dataGE is a matrix of Gene expression (genes in rows, samples in cols) from TCGAprepare
#' @param Genelist is a list of gene symbols where perform survival KM.
#' @param Survresult is a parameter (default = FALSE) if is TRUE will show KM plot and results.
#' @param ThreshTop is a quantile threshold to identify samples with high expression of a gene
#' @param ThreshDown is a quantile threshold to identify samples with low expression of a gene
#' @param p.cut p.values threshold. Default: 0.05
#' @param group1 a string containing the barcode list of the samples in in control group
#' @param group2 a string containing the barcode list of the samples in in disease group
#' @export
#' @return table with survival genes pvalues from KM.
#' @examples
#' # Selecting only 20 genes for example
#' dataBRCAcomplete <- log2(dataBRCA[1:20,] + 1)
#'
#' # clinical_patient_Cancer <- GDCquery_clinic("TCGA-BRCA","clinical")
#' clinical_patient_Cancer <- data.frame(
#' bcr_patient_barcode = substr(colnames(dataBRCAcomplete),1,12),
#' vital_status = c(rep("alive",3),"dead",rep("alive",2),rep(c("dead","alive"),2)),
#' days_to_death = c(NA,NA,NA,172,NA,NA,3472,NA,786,NA),
#' days_to_last_follow_up = c(3011,965,718,NA,1914,423,NA,5,656,1417)
#' )
#'
#' group1 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("NT"))
#' group2 <- TCGAquery_SampleTypes(colnames(dataBRCAcomplete), typesample = c("TP"))
#'
#' tabSurvKM <- TCGAanalyze_SurvivalKM(clinical_patient_Cancer,
#' dataBRCAcomplete,
#' Genelist = rownames(dataBRCAcomplete),
#' Survresult = FALSE,
#' p.cut = 0.4,
#' ThreshTop = 0.67,
#' ThreshDown = 0.33,
#' group1 = group1, # Control group
#' group2 = group2) # Disease group
#'
#' # If the groups are not specified group1 == group2 and all samples are used
#' \dontrun{
#' tabSurvKM <- TCGAanalyze_SurvivalKM(clinical_patient_Cancer,
#' dataBRCAcomplete,
#' Genelist = rownames(dataBRCAcomplete),
#' Survresult = TRUE,
#' p.cut = 0.2,
#' ThreshTop = 0.67,
#' ThreshDown = 0.33)
#' }
TCGAanalyze_SurvivalKM <- function(
clinical_patient,
dataGE,
Genelist,
Survresult = FALSE,
ThreshTop = 0.67,
ThreshDown = 0.33,
p.cut = 0.05,
group1,
group2
) {
check_package("survival")
# Check which genes we really have in the matrix
Genelist <- intersect(rownames(dataGE), Genelist)
# Split gene expression matrix btw the groups
dataCancer <- dataGE[Genelist, group2, drop = FALSE]
dataNormal <- dataGE[Genelist, group1, drop = FALSE]
colnames(dataCancer) <- substr(colnames(dataCancer), 1, 12)
cfu <-
clinical_patient[clinical_patient[, "bcr_patient_barcode"] %in% substr(colnames(dataCancer), 1, 12), ]
if ("days_to_last_followup" %in% colnames(cfu))
colnames(cfu)[grep("days_to_last_followup", colnames(cfu))] <-
"days_to_last_follow_up"
cfu <-
as.data.frame(subset(
cfu,
select = c(
"bcr_patient_barcode",
"days_to_death",
"days_to_last_follow_up",
"vital_status"
)
))
# Set alive death to inf
if (length(grep("alive", cfu$vital_status, ignore.case = TRUE)) > 0)
cfu[grep("alive", cfu$vital_status, ignore.case = TRUE), "days_to_death"] <-
"-Inf"
# Set dead follow up to inf
if (length(grep("dead", cfu$vital_status, ignore.case = TRUE)) > 0)
cfu[grep("dead", cfu$vital_status, ignore.case = TRUE), "days_to_last_follow_up"] <-
"-Inf"
cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ]
cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ]
followUpLevel <- FALSE
#FC_FDR_table_mRNA
tabSurv_Matrix <-
matrix(0, nrow(as.matrix(rownames(dataNormal))), 8)
colnames(tabSurv_Matrix) <- c(
"mRNA",
"pvalue",
"Cancer Deaths",
"Cancer Deaths with Top",
"Cancer Deaths with Down",
"Mean Tumor Top",
"Mean Tumor Down",
"Mean Normal"
)
tabSurv_Matrix <- as.data.frame(tabSurv_Matrix)
cfu$days_to_death <- as.numeric(as.character(cfu$days_to_death))
cfu$days_to_last_follow_up <-
as.numeric(as.character(cfu$days_to_last_follow_up))
rownames(cfu) <- cfu[, "bcr_patient_barcode"] #mod1
cfu <- cfu[!(is.na(cfu[, "days_to_last_follow_up"])), ]
cfu <- cfu[!(is.na(cfu[, "days_to_death"])), ]
cfu_complete <- cfu
ngenes <- nrow(as.matrix(rownames(dataNormal)))
# Evaluate each gene
for (i in 1:nrow(as.matrix(rownames(dataNormal)))) {
cat(paste0((ngenes - i), "."))
mRNAselected <- as.matrix(rownames(dataNormal))[i]
mRNAselected_values <-
dataCancer[rownames(dataCancer) == mRNAselected, ]
mRNAselected_values_normal <-
dataNormal[rownames(dataNormal) == mRNAselected, ]
if (all(mRNAselected_values == 0))
next # All genes are 0
tabSurv_Matrix[i, "mRNA"] <- mRNAselected
# Get Thresh values for cancer expression
mRNAselected_values_ordered <-
sort(mRNAselected_values, decreasing = TRUE)
mRNAselected_values_ordered_top <-
as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshTop)[1])
mRNAselected_values_ordered_down <-
as.numeric(quantile(as.numeric(mRNAselected_values_ordered), ThreshDown)[1])
mRNAselected_values_newvector <- mRNAselected_values
if (!is.na(mRNAselected_values_ordered_top)) {
# How many samples do we have
numberOfSamples <- length(mRNAselected_values_ordered)
# High group (above ThreshTop)
lastelementTOP <-
max(which(
mRNAselected_values_ordered > mRNAselected_values_ordered_top
))
# Low group (below ThreshDown)
firstelementDOWN <-
min(
which(
mRNAselected_values_ordered <= mRNAselected_values_ordered_down
)
)
samples_top_mRNA_selected <-
names(mRNAselected_values_ordered[1:lastelementTOP])
samples_down_mRNA_selected <-
names(mRNAselected_values_ordered[firstelementDOWN:numberOfSamples])
# Which samples are in the intermediate group (above ThreshLow and below ThreshTop)
samples_UNCHANGED_mRNA_selected <-
names(mRNAselected_values_newvector[which((mRNAselected_values_newvector) > mRNAselected_values_ordered_down &
mRNAselected_values_newvector < mRNAselected_values_ordered_top
)])
cfu_onlyTOP <-
cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_top_mRNA_selected, ]
cfu_onlyDOWN <-
cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_down_mRNA_selected, ]
cfu_onlyUNCHANGED <-
cfu_complete[cfu_complete[, "bcr_patient_barcode"] %in% samples_UNCHANGED_mRNA_selected, ]
cfu_ordered <- NULL
cfu_ordered <- rbind(cfu_onlyTOP, cfu_onlyDOWN)
cfu <- cfu_ordered
ttime <- as.numeric(cfu[, "days_to_death"])
sum(status <- ttime > 0) # morti
deads_complete <- sum(status <- ttime > 0)
ttime_only_top <- cfu_onlyTOP[, "days_to_death"]
deads_top <- sum(ttime_only_top > 0)
if (dim(cfu_onlyDOWN)[1] >= 1) {
ttime_only_down <- cfu_onlyDOWN[, "days_to_death"]
deads_down <- sum(ttime_only_down > 0)
} else {
deads_down <- 0
}
tabSurv_Matrix[i, "Cancer Deaths"] <- deads_complete
tabSurv_Matrix[i, "Cancer Deaths with Top"] <- deads_top
tabSurv_Matrix[i, "Cancer Deaths with Down"] <-
deads_down
tabSurv_Matrix[i, "Mean Normal"] <-
mean(as.numeric(mRNAselected_values_normal))
dataCancer_onlyTop_sample <-
dataCancer[, samples_top_mRNA_selected, drop = FALSE]
dataCancer_onlyTop_sample_mRNASelected <-
dataCancer_onlyTop_sample[rownames(dataCancer_onlyTop_sample) == mRNAselected, ]
dataCancer_onlyDown_sample <-
dataCancer[, samples_down_mRNA_selected, drop = FALSE]
dataCancer_onlyDown_sample_mRNASelected <-
dataCancer_onlyDown_sample[rownames(dataCancer_onlyDown_sample) == mRNAselected, ]
tabSurv_Matrix[i, "Mean Tumor Top"] <-
mean(as.numeric(dataCancer_onlyTop_sample_mRNASelected))
tabSurv_Matrix[i, "Mean Tumor Down"] <-
mean(as.numeric(dataCancer_onlyDown_sample_mRNASelected))
ttime[!status] <-
as.numeric(cfu[!status, "days_to_last_follow_up"])
ttime[which(ttime == -Inf)] <- 0
ttime <- survival::Surv(ttime, status)
rownames(ttime) <- rownames(cfu)
legendHigh <- paste(mRNAselected, "High")
legendLow <- paste(mRNAselected, "Low")
tabSurv_pvalue <- tryCatch({
tabSurv <-
survival::survdiff(ttime ~ c(rep(
"top", nrow(cfu_onlyTOP)
), rep(
"down", nrow(cfu_onlyDOWN)
)))
tabSurv_chis <- unlist(tabSurv)$chisq
tabSurv_pvalue <-
as.numeric(1 - pchisq(abs(tabSurv$chisq), df = 1))
}, error = function(e) {
return(Inf)
})
tabSurv_Matrix[i, "pvalue"] <- tabSurv_pvalue
if (Survresult == TRUE) {
titlePlot <-
paste("Kaplan-Meier Survival analysis, pvalue=",
tabSurv_pvalue)
plot(
survival::survfit(ttime ~ c(
rep("low", nrow(cfu_onlyTOP)), rep("high", nrow(cfu_onlyDOWN))
)),
col = c("green", "red"),
main = titlePlot,
xlab = "Days",
ylab = "Survival"
)
legend(
100,
1,
legend = c(legendLow, legendHigh),
col = c("green", "red"),
text.col = c("green", "red"),
pch = 15
)
print(tabSurv)
}
} #end if
} #end for
tabSurv_Matrix[tabSurv_Matrix == "-Inf"] <- 0
tabSurvKM <- tabSurv_Matrix
# Filtering by selected pvalue < 0.01
tabSurvKM <- tabSurvKM[tabSurvKM$mRNA != 0, ]
tabSurvKM <- tabSurvKM[tabSurvKM$pvalue < p.cut, ]
tabSurvKM <- tabSurvKM[!duplicated(tabSurvKM$mRNA), ]
rownames(tabSurvKM) <- tabSurvKM$mRNA
tabSurvKM <- tabSurvKM[, -1]
tabSurvKM <-
tabSurvKM[order(tabSurvKM$pvalue, decreasing = FALSE), ]
colnames(tabSurvKM) <-
gsub("Cancer", "Group2", colnames(tabSurvKM))
colnames(tabSurvKM) <-
gsub("Tumor", "Group2", colnames(tabSurvKM))
colnames(tabSurvKM) <-
gsub("Normal", "Group1", colnames(tabSurvKM))
return(tabSurvKM)
}
#' @title Filtering mRNA transcripts and miRNA selecting a threshold.
#' @description
#' TCGAanalyze_Filtering allows user to filter mRNA transcripts and miRNA,
#' samples, higher than the threshold defined quantile mean across all samples.
#' @param tabDF is a dataframe or numeric matrix, each row represents a gene,
#' each column represents a sample come from TCGAPrepare
#' @param method is method of filtering such as 'quantile', 'varFilter', 'filter1', 'filter2'
#' @param qnt.cut is threshold selected as mean for filtering
#' @param var.func is function used as the per-feature filtering statistic.
#' See genefilter documentation
#' @param var.cutoff is a numeric value. See genefilter documentation
#' @param eta is a parameter for filter1. default eta = 0.05.
#' @param foldChange is a parameter for filter2. default foldChange = 1.
#' @export
#' @return A filtered dataframe or numeric matrix where each row represents a gene,
#' each column represents a sample
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataNorm <- TCGAanalyze_Normalization(tabDF = dataBRCA,
#' geneInfo = geneInfo,
#' method = "geneLength")
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm, method = "quantile", qnt.cut = 0.25)
TCGAanalyze_Filtering <- function(tabDF,
method,
qnt.cut = 0.25,
var.func = IQR,
var.cutoff = 0.75,
eta = 0.05,
foldChange = 1) {
if (method == "quantile") {
GeneThresh <- as.numeric(quantile(rowMeans(tabDF), qnt.cut))
geneFiltered <- names(which(rowMeans(tabDF) > GeneThresh))
tabDF_Filt <- tabDF[geneFiltered,]
}
if (method == "varFilter") {
check_package("genefilter")
tabDF_Filt <- genefilter::varFilter(
tabDF,
var.func = IQR,
var.cutoff = 0.75,
filterByQuantile = TRUE
)
}
if (method == "filter1") {
normCounts <- tabDF
geData <- t(log(1 + normCounts, 2))
filter <-
apply(geData, 2, function(x)
sum(quantile(x, probs = c(1 - eta, eta)) * c(1, -1)))
tabDF_Filt <- geData[, which(filter > foldChange)]
}
if (method == "filter2") {
geData <- tabDF
filter <-
apply(geData, 2, function(x)
prod(quantile(x, probs = c(1 - eta, eta)) - 10) < 0)
tabDF_Filt <- geData[, which(filter)]
}
return(tabDF_Filt)
}
#' @title normalization mRNA transcripts and miRNA using EDASeq package.
#' @description
#' TCGAanalyze_Normalization allows user to normalize mRNA transcripts and miRNA,
#' using EDASeq package.
#'
#' Normalization for RNA-Seq Numerical and graphical
#' summaries of RNA-Seq read data. Within-lane normalization procedures
#' to adjust for GC-content effect (or other gene-level effects) on read counts:
#' loess robust local regression, global-scaling, and full-quantile normalization
#' (Risso et al., 2011). Between-lane normalization procedures to adjust for
#' distributional differences between lanes (e.g., sequencing depth):
#' global-scaling and full-quantile normalization (Bullard et al., 2010).
#'
#' For istance returns all mRNA or miRNA with mean across all
#' samples, higher than the threshold defined quantile mean across all samples.
#'
#' TCGAanalyze_Normalization performs normalization using following functions
#' from EDASeq
#' \enumerate{
#' \item EDASeq::newSeqExpressionSet
#' \item EDASeq::withinLaneNormalization
#' \item EDASeq::betweenLaneNormalization
#' \item EDASeq::counts
#' }
#' @param tabDF Rnaseq numeric matrix, each row represents a gene,
#' each column represents a sample
#' @param geneInfo Information matrix of 20531 genes about geneLength and gcContent.
#' Two objects are provided: TCGAbiolinks::geneInfoHT,TCGAbiolinks::geneInfo
#' @param method is method of normalization such as 'gcContent' or 'geneLength'
#' @export
#' @return Rnaseq matrix normalized with counts slot holds the count data as a matrix
#' of non-negative integer count values, one row for each observational unit (gene or the like),
#' and one column for each sample.
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
TCGAanalyze_Normalization <-
function(tabDF, geneInfo, method = "geneLength") {
check_package("EDASeq")
# Check if we have a SE, we need a gene expression matrix
if (is(tabDF, "SummarizedExperiment")) tabDF <- assay(tabDF)
geneInfo <- geneInfo[!is.na(geneInfo[, 1]), ]
geneInfo <- as.data.frame(geneInfo)
geneInfo$geneLength <-
as.numeric(as.character(geneInfo$geneLength))
geneInfo$gcContent <- as.numeric(as.character(geneInfo$gcContent))
if (method == "gcContent") {
tmp <- as.character(rownames(tabDF))
tmp <- strsplit(tmp, "\\|")
geneNames <- matrix("", ncol = 2, nrow = length(tmp))
j <- 1
while (j <= length(tmp)) {
geneNames[j, 1] <- tmp[[j]][1]
geneNames[j, 2] <- tmp[[j]][2]
j <- j + 1
}
tmp <- which(geneNames[, 1] == "?")
geneNames[tmp, 1] <- geneNames[tmp, 2]
tmp <- table(geneNames[, 1])
tmp <- which(geneNames[, 1] %in% names(tmp[which(tmp > 1)]))
geneNames[tmp, 1] <- paste(geneNames[tmp, 1], geneNames[tmp, 2], sep = ".")
tmp <- table(geneNames[, 1])
rownames(tabDF) <- geneNames[, 1]
rawCounts <- tabDF
commonGenes <- intersect(rownames(geneInfo), rownames(rawCounts))
geneInfo <- geneInfo[commonGenes, ]
rawCounts <- rawCounts[commonGenes, ]
timeEstimated <- format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2)
message(
messageEstimation <- paste(
"I Need about ",
timeEstimated,
"seconds for this Complete Normalization Upper Quantile",
" [Processing 80k elements /s] "
)
)
ffData <- as.data.frame(geneInfo)
rawCounts <- floor(rawCounts)
message("Step 1 of 4: newSeqExpressionSet ...")
tmp <- EDASeq::newSeqExpressionSet(as.matrix(rawCounts), featureData = ffData)
#fData(tmp)[, "gcContent"] <- as.numeric(geneInfo[, "gcContent"])
message("Step 2 of 4: withinLaneNormalization ...")
tmp <- EDASeq::withinLaneNormalization(
tmp, "gcContent",
which = "upper",
offset = TRUE
)
message("Step 3 of 4: betweenLaneNormalization ...")
tmp <- EDASeq::betweenLaneNormalization(
tmp,
which = "upper",
offset = TRUE
)
normCounts <- log(rawCounts + .1) + EDASeq::offst(tmp)
normCounts <- floor(exp(normCounts) - .1)
message("Step 4 of 4: .quantileNormalization ...")
tmp <- t(.quantileNormalization(t(normCounts)))
tabDF_norm <- floor(tmp)
}
if (method == "geneLength") {
tabDF <- tabDF[!(GenesCutID(as.matrix(rownames(tabDF))) == "?"), ]
tabDF <- tabDF[!(GenesCutID(as.matrix(rownames(tabDF))) == "SLC35E2"), ]
rownames(tabDF) <- GenesCutID(as.matrix(rownames(tabDF)))
tabDF <- tabDF[rownames(tabDF) != "?",]
tabDF <- tabDF[!duplicated(rownames(tabDF)), !duplicated(colnames(tabDF))]
tabDF <- tabDF[rownames(tabDF) %in% rownames(geneInfo), ]
tabDF <- as.matrix(tabDF)
geneInfo <-
geneInfo[rownames(geneInfo) %in% rownames(tabDF),]
geneInfo <- geneInfo[!duplicated(rownames(geneInfo)),]
toKeep <- which(geneInfo[, "geneLength"] != 0)
geneInfo <- geneInfo[toKeep,]
tabDF <- tabDF[toKeep,]
geneInfo <- as.data.frame(geneInfo)
tabDF <- round(tabDF)
commonGenes <- intersect(rownames(tabDF), rownames(geneInfo))
tabDF <- tabDF[commonGenes, ]
geneInfo <- geneInfo[commonGenes, ]
timeEstimated <-
format(ncol(tabDF) * nrow(tabDF) / 80000, digits = 2)
message(
messageEstimation <- paste(
"I Need about ",
timeEstimated,
"seconds for this Complete Normalization Upper Quantile",
" [Processing 80k elements /s] "
)
)
message("Step 1 of 4: newSeqExpressionSet ...")
system.time(tabDF_norm <-
EDASeq::newSeqExpressionSet(tabDF, featureData = geneInfo))
message("Step 2 of 4: withinLaneNormalization ...")
system.time(
tabDF_norm <-
EDASeq::withinLaneNormalization(
tabDF_norm,
"geneLength",
which = "upper",
offset = FALSE
)
)
message("Step 3 of 4: betweenLaneNormalization ...")
system.time(
tabDF_norm <-
EDASeq::betweenLaneNormalization(tabDF_norm, which = "upper", offset = FALSE)
)
message("Step 4 of 4: exprs ...")
#system.time(tabDF_norm <- EDASeq::exprs(tabDF_norm))
system.time(tabDF_norm <- EDASeq::counts(tabDF_norm))
}
return(tabDF_norm)
}
#' @title Differential expression analysis (DEA) using edgeR or limma package.
#' @description
#' TCGAanalyze_DEA allows user to perform Differentially expression analysis (DEA),
#' using edgeR package or limma to identify differentially expressed genes (DEGs).
#' It is possible to do a two-class analysis.
#'
#' TCGAanalyze_DEA performs DEA using following functions from edgeR:
#' \enumerate{
#' \item edgeR::DGEList converts the count matrix into an edgeR object.
#' \item edgeR::estimateCommonDisp each gene gets assigned the same dispersion estimate.
#' \item edgeR::exactTest performs pair-wise tests for differential expression between two groups.
#' \item edgeR::topTags takes the output from exactTest(), adjusts the raw p-values using the
#' False Discovery Rate (FDR) correction, and returns the top differentially expressed genes.
#' }
#' TCGAanalyze_DEA performs DEA using following functions from limma:
#' \enumerate{
#' \item limma::makeContrasts construct matrix of custom contrasts.
#' \item limma::lmFit Fit linear model for each gene given a series of arrays.
#' \item limma::contrasts.fit Given a linear model fit to microarray data, compute estimated coefficients and standard errors for a given set of contrasts.
#' \item limma::eBayes Given a microarray linear model fit, compute moderated t-statistics, moderated F-statistic, and log-odds of differential expression by empirical Bayes moderation of the standard errors towards a common value.
#' \item limma::toptable Extract a table of the top-ranked genes from a linear model fit.
#' }
#' @param mat1 numeric matrix, each row represents a gene,
#' each column represents a sample with Cond1type
#' @param mat2 numeric matrix, each row represents a gene,
#' each column represents a sample with Cond2type
#' @param metadata Add metadata
#' @param Cond1type a string containing the class label of the samples in mat1
#' (e.g., control group)
#' @param Cond2type a string containing the class label of the samples in mat2
#' (e.g., case group)
#' @param pipeline a string to specify which package to use ("limma" or "edgeR")
#' @param method is 'glmLRT' (1) or 'exactTest' (2) used for edgeR
#' (1) Fit a negative binomial generalized log-linear model to
#' the read counts for each gene
#' (2) Compute genewise exact tests for differences in the means between
#' two groups of negative-binomially distributed counts.
#' @param fdr.cut is a threshold to filter DEGs according their p-value corrected
#' @param logFC.cut is a threshold to filter DEGs according their logFC
#' @param elementsRatio is number of elements processed for second for time consumation estimation
#' @param batch.factors a vector containing strings to specify options for batch correction. Options are "Plate", "TSS", "Year", "Portion", "Center", and "Patients"
#' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data
#' @param paired boolean to account for paired or non-paired samples. Set to TRUE for paired case
#' @param log.trans boolean to perform log cpm transformation. Set to TRUE for log transformation
#' @param trend boolean to perform limma-trend pipeline. Set to TRUE to go through limma-trend
#' @param MAT matrix containing expression set as all samples in columns and genes as rows. Do not provide if mat1 and mat2 are used
#' @param contrast.formula string input to determine coefficients and to design contrasts in a customized way
#' @param Condtypes vector of grouping for samples in MAT
#' @param voom boolean to perform voom transformation for limma-voom pipeline. Set to TRUE for voom transformation
#' @export
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut = 0.25)
#' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' dataDEGs <- TCGAanalyze_DEA(mat1 = dataFilt[,samplesNT],
#' mat2 = dataFilt[,samplesTP],
#' Cond1type = "Normal",
#' Cond2type = "Tumor")
#'
#' @return table with DEGs containing for each gene logFC, logCPM, pValue,and FDR, also for each contrast
TCGAanalyze_DEA <-
function (mat1,
mat2,
metadata = TRUE,
Cond1type,
Cond2type,
pipeline = "edgeR",
method = "exactTest",
fdr.cut = 1,
logFC.cut = 0,
elementsRatio = 30000,
batch.factors = NULL,
ClinicalDF = data.frame(),
paired = FALSE,
log.trans = FALSE,
voom = FALSE,
trend = FALSE,
MAT = data.frame(),
contrast.formula = "",
Condtypes = c())
{
if (pipeline == "limma") {
if (!requireNamespace("limma", quietly = TRUE)) {
stop("limma is needed. Please install it.",
call. = FALSE)
}
}
if (!requireNamespace("edgeR", quietly = TRUE)) {
stop("edgeR is needed. Please install it.",
call. = FALSE)
}
table.code <- c(
"TP",
"TR",
"TB",
"TRBM",
"TAP",
"TM",
"TAM",
"THOC",
"TBM",
"NB",
"NT",
"NBC",
"NEBV",
"NBM",
"CELLC",
"TRB",
"CELL",
"XP",
"XCL"
)
names(table.code) <- c(
"01",
"02",
"03",
"04",
"05",
"06",
"07",
"08",
"09",
"10",
"11",
"12",
"13",
"14",
"20",
"40",
"50",
"60",
"61"
)
if (nrow(MAT) == 0) {
TOC <- cbind(mat1, mat2)
Cond1num <- ncol(mat1)
Cond2num <- ncol(mat2)
}
else {
TOC <- MAT
}
if (metadata == TRUE) {
my_IDs <- get_IDs(TOC)
Plate <- factor(my_IDs$plate)
Condition <- factor(my_IDs$condition)
TSS <- factor(my_IDs$tss)
Portion <- factor(my_IDs$portion)
Center <- factor(my_IDs$center)
Patients <- factor(my_IDs$patient)
}
if (paired == TRUE) {
matched.query <- TCGAquery_MatchedCoupledSampleTypes(my_IDs$barcode,
table.code[unique(my_IDs$sample)])
my_IDs <- subset(my_IDs, barcode == matched.query)
TOC <- TOC[, (names(TOC) %in% matched.query)]
}
if (nrow(ClinicalDF) > 0) {
names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <-
"patient"
ClinicalDF$age_at_diag_year <-
floor(clinical$age_at_diagnosis / 365)
ClinicalDF$diag_year <- ClinicalDF$age_at_diag_year +
clinical$year_of_birth
diag_yearDF <- ClinicalDF[, c("patient", "diag_year")]
my_IDs <- merge(my_IDs, ClinicalDF, by = "patient")
Year <- as.factor(my_IDs$diag_year)
}
options <- c("Plate", "TSS", "Year", "Portion", "Center",
"Patients")
if (length(batch.factors) == 0) {
message("Batch correction skipped since no factors provided")
}
else
for (o in batch.factors) {
if (o %in% options == FALSE)
stop(paste0(o, " is not a valid batch correction factor"))
if (o == "Year" & nrow(ClinicalDF) == 0)
stop(
"batch correction using diagnosis year needs clinical info. Provide Clinical Data in arguments"
)
}
additiveformula <- paste(batch.factors, collapse = "+")
message("----------------------- DEA -------------------------------")
if (nrow(MAT) == 0) {
message(message1 <- paste(
"there are Cond1 type",
Cond1type,
"in ",
Cond1num,
"samples"
))
message(message2 <- paste(
"there are Cond2 type",
Cond2type,
"in ",
Cond2num,
"samples"
))
message(message3 <-
paste("there are ", nrow(TOC), "features as miRNA or genes "))
}
else {
message(message3 <-
paste("there are ", nrow(TOC), "features as miRNA or genes "))
}
timeEstimated <- format(ncol(TOC) * nrow(TOC) / elementsRatio,
digits = 2)
message(
messageEstimation <- paste(
"I Need about ",
timeEstimated,
"seconds for this DEA. [Processing 30k elements /s] "
)
)
colnames(TOC) <- paste0("s", 1:ncol(TOC))
if (length(Condtypes) > 0) {
tumorType <- factor(x = Condtypes, levels = unique(Condtypes))
}
else {
tumorType <- factor(x = rep(c(Cond1type, Cond2type),
c(Cond1num, Cond2num)),
levels = c(Cond1type, Cond2type))
}
if (length(batch.factors) == 0 & length(Condtypes) > 0) {
if (pipeline == "edgeR")
design <- model.matrix( ~ tumorType)
else
design <- model.matrix( ~ 0 + tumorType)
} else if (length(batch.factors) == 0 &
length(Condtypes) == 0) {
if (pipeline == "edgeR")
design <- model.matrix( ~ tumorType)
else
design <- model.matrix( ~ 0 + tumorType)
} else if (length(batch.factors) > 0 & length(Condtypes) == 0) {
if (pipeline == "edgeR")
formula <- paste0("~tumorType+", additiveformula)
else
formula <- paste0("~0+tumorType+", additiveformula)
design <- model.matrix(eval(parse(text = formula)))
} else if (length(batch.factors) > 0 & length(Condtypes) > 0) {
if (pipeline == "edgeR") {
formula <- paste0("~tumorType+", additiveformula)
if (length(Condtypes) > 2)
formula <- paste0("~0+tumorType+", additiveformula)
}
else
formula <- paste0("~0+tumorType+", additiveformula)
design <- model.matrix(eval(parse(text = formula)))
}
if (pipeline == "edgeR") {
if (method == "exactTest") {
DGE <- edgeR::DGEList(TOC, group = rep(c(Cond1type,
Cond2type), c(Cond1num, Cond2num)))
disp <- edgeR::estimateCommonDisp(DGE)
tested <- edgeR::exactTest(disp, pair = c(Cond1type,
Cond2type))
logFC_table <- tested$table
tableDEA <-
edgeR::topTags(tested, n = nrow(tested$table))$table
tableDEA <- tableDEA[tableDEA$FDR <= fdr.cut,]
tableDEA <- tableDEA[abs(tableDEA$logFC) >= logFC.cut,]
}
else if (method == "glmLRT") {
if (length(unique(tumorType)) == 2) {
aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType)
aDGEList <-
edgeR::estimateGLMCommonDisp(aDGEList, design)
aDGEList <-
edgeR::estimateGLMTagwiseDisp(aDGEList, design)
aGlmFit <-
edgeR::glmFit(
aDGEList,
design,
dispersion = aDGEList$tagwise.dispersion,
prior.count.total = 0
)
aGlmLRT <- edgeR::glmLRT(aGlmFit, coef = 2)
tableDEA <-
cbind(aGlmLRT$table,
FDR = p.adjust(aGlmLRT$table$PValue, "fdr"))
tableDEA <- tableDEA[tableDEA$FDR < fdr.cut, ]
tableDEA <-
tableDEA[abs(tableDEA$logFC) > logFC.cut, ]
if (all(grepl("ENSG", rownames(tableDEA))))
tableDEA <-
cbind(tableDEA, map.ensg(genes = rownames(tableDEA))[, 2:3])
}
else if (length(unique(tumorType)) > 2) {
aDGEList <- edgeR::DGEList(counts = TOC, group = tumorType)
colnames(design)[1:length(levels(tumorType))] <-
levels(tumorType)
prestr = "makeContrasts("
poststr = ",levels=design)"
commandstr = paste(prestr, contrast.formula,
poststr, sep = "")
commandstr = paste0("limma::", commandstr)
cont.matrix <- eval(parse(text = commandstr))
aDGEList <- edgeR::estimateGLMCommonDisp(aDGEList,
design)
aDGEList <- edgeR::estimateGLMTagwiseDisp(aDGEList,
design)
aGlmFit <-
edgeR::glmFit(
aDGEList,
design,
dispersion = aDGEList$tagwise.dispersion,
prior.count.total = 0
)
tableDEA <- list()
for (mycoef in colnames(cont.matrix)) {
message(paste0("DEA for", " :", mycoef))
aGlmLRT <-
edgeR::glmLRT(aGlmFit, contrast = cont.matrix[, mycoef])
#print("---toptags---")
#print(topTags(aGlmLRT, adjust.method = "fdr",
# sort.by = "PValue"))
tt <- aGlmLRT$table
tt <-
cbind(tt, FDR = p.adjust(aGlmLRT$table$PValue,
"fdr"))
tt <-
tt[(tt$FDR < fdr.cut & abs(as.numeric(tt$logFC)) >
logFC.cut),]
tableDEA[[as.character(mycoef)]] <- tt
if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]]))))
tableDEA[[as.character(mycoef)]] <-
cbind(tableDEA[[as.character(mycoef)]],
map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[,
2:3])
}
}
}
else
stop(
paste0(
method,
" is not a valid DEA method option. Choose 'exactTest' or 'glmLRT' "
)
)
}
else if (pipeline == "limma") {
if (log.trans == TRUE)
logCPM <- edgeR::cpm(TOC, log = TRUE, prior.count = 3)
else
logCPM <- TOC
if (voom == TRUE) {
message("Voom Transformation...")
logCPM <- limma::voom(logCPM, design)
}
if (length(unique(tumorType)) == 2) {
colnames(design)[1:2] <- c(Cond1type, Cond2type)
contr <- paste0(Cond2type, "-", Cond1type)
cont.matrix <- limma::makeContrasts(contrasts = contr,
levels = design)
fit <- limma::lmFit(logCPM, design)
fit <- limma::contrasts.fit(fit, cont.matrix)
if (trend == TRUE) {
fit <- limma::eBayes(fit, trend = TRUE)
}
else {
fit <- limma::eBayes(fit, trend = FALSE)
}
tableDEA <-
limma::topTable(
fit,
coef = 1,
adjust.method = "fdr",
number = nrow(TOC)
)
limma::volcanoplot(fit, highlight = 10)
index <- which(tableDEA[, 4] < fdr.cut)
tableDEA <- tableDEA[index,]
neg_logFC.cut <- -1 * logFC.cut
index <- which(abs(as.numeric(tableDEA$logFC)) >
logFC.cut)
tableDEA <- tableDEA[index,]
}
else if (length(unique(tumorType)) > 2) {
DGE <- edgeR::DGEList(TOC, group = tumorType)
colnames(design)[1:length(levels(tumorType))] <-
levels(tumorType)
prestr = "makeContrasts("
poststr = ",levels=colnames(design))"
commandstr = paste(prestr, contrast.formula, poststr,
sep = "")
commandstr = paste0("limma::", commandstr)
cont.matrix <- eval(parse(text = commandstr))
fit <- limma::lmFit(logCPM, design)
fit <- limma::contrasts.fit(fit, cont.matrix)
if (trend == TRUE)
fit <- limma::eBayes(fit, trend = TRUE)
else
fit <- limma::eBayes(fit, trend = FALSE)
tableDEA <- list()
for (mycoef in colnames(cont.matrix)) {
tableDEA[[as.character(mycoef)]] <- limma::topTable(
fit,
coef = mycoef,
adjust.method = "fdr",
number = nrow(MAT)
)
message(paste0("DEA for", " :", mycoef))
tempDEA <- tableDEA[[as.character(mycoef)]]
index.up <- which(tempDEA$adj.P.Val < fdr.cut &
abs(as.numeric(tempDEA$logFC)) > logFC.cut)
tableDEA[[as.character(mycoef)]] <-
tempDEA[index.up,]
if (all(grepl("ENSG", rownames(tableDEA[[as.character(mycoef)]]))))
tableDEA[[as.character(mycoef)]] <-
cbind(tableDEA[[as.character(mycoef)]],
map.ensg(genes = rownames(tableDEA[[as.character(mycoef)]]))[,
2:3])
}
}
}
else
stop(paste0(
pipeline,
" is not a valid pipeline option. Choose 'edgeR' or 'limma'"
))
message("----------------------- END DEA -------------------------------")
return(tableDEA)
}
#' @title Batch correction using ComBat and Voom transformation using limma package.
#' @description
#' TCGAbatch_correction allows user to perform a Voom correction on gene expression data and have it ready for DEA.
#' One can also use ComBat for batch correction for exploratory analysis. If batch.factor or adjustment argument is "Year"
#' please provide clinical data. If no batch factor is provided, the data will be voom corrected only
#'
#' TCGAanalyze_DEA performs DEA using following functions from sva and limma:
#' \enumerate{
#' \item limma::voom Transform RNA-Seq Data Ready for Linear Modelling.
#' \item sva::ComBat Adjust for batch effects using an empirical Bayes framework.
#' }
#' @param tabDF numeric matrix, each row represents a gene,
#' each column represents a sample
#' @param batch.factor a string containing the batch factor to use for correction. Options are "Plate", "TSS", "Year", "Portion", "Center"
#' @param adjustment vector containing strings for factors to adjust for using ComBat. Options are "Plate", "TSS", "Year", "Portion", "Center"
#' @param UnpublishedData if TRUE perform a batch correction after adding new data
#' @param ClinicalDF a dataframe returned by GDCquery_clinic() to be used to extract year data
#' @param AnnotationDF a dataframe with column Batch indicating different batches of the samples in the tabDF
#' @export
#' @return data frame with ComBat batch correction applied
TCGAbatch_Correction <- function (tabDF,
batch.factor = NULL,
adjustment = NULL,
ClinicalDF = data.frame(),
UnpublishedData = FALSE,
AnnotationDF = data.frame())
{
if (!requireNamespace("sva", quietly = TRUE)) {
stop("sva is needed. Please install it.",
call. = FALSE)
}
if (UnpublishedData == TRUE) {
batch.factor <- as.factor(AnnotationDF$Batch)
batch_corr <-
sva::ComBat(
dat = tabDF,
batch = batch.factor,
par.prior = TRUE,
prior.plots = TRUE
)
}
if (UnpublishedData == FALSE) {
if (length(batch.factor) == 0 & length(adjustment) == 0)
message("batch correction will be skipped")
else if (batch.factor %in% adjustment) {
stop(paste0("Cannot adjust and correct for the same factor|"))
}
my_IDs <- get_IDs(tabDF)
if (length(batch.factor) > 0 || length(adjustment) > 0)
if ((nrow(ClinicalDF) > 0 & batch.factor == "Year") ||
("Year" %in% adjustment == TRUE & nrow(ClinicalDF) >
0)) {
names(ClinicalDF)[names(ClinicalDF) == "bcr_patient_barcode"] <-
"patient"
ClinicalDF$age_at_diag_year <-
floor(ClinicalDF$age_at_diagnosis / 365)
ClinicalDF$diag_year <-
ClinicalDF$age_at_diag_year +
ClinicalDF$year_of_birth
diag_yearDF <-
ClinicalDF[, c("patient", "diag_year")]
Year <- merge(my_IDs, diag_yearDF, by = "patient")
Year <- Year$diag_year
Year <- as.factor(Year)
}
else if (nrow(ClinicalDF) == 0 & batch.factor == "Year") {
stop("Cannot extract Year data. Clinical data was not provided")
}
Plate <- as.factor(my_IDs$plate)
Condition <- as.factor(my_IDs$condition)
TSS <- as.factor(my_IDs$tss)
Portion <- as.factor(my_IDs$portion)
Sequencing.Center <- as.factor(my_IDs$center)
design.matrix <- model.matrix( ~ Condition)
design.mod.combat <- model.matrix( ~ Condition)
options <-
c("Plate", "TSS", "Year", "Portion", "Sequencing Center")
if (length(batch.factor) > 1)
stop("Combat can only correct for one batch variable. Provide one batch factor")
if (batch.factor %in% options == FALSE)
stop(paste0(o, " is not a valid batch correction factor"))
for (o in adjustment) {
if (o %in% options == FALSE)
stop(paste0(o, " is not a valid adjustment factor"))
}
adjustment.data <- c()
for (a in adjustment) {
if (a == "Sequencing Center")
a <- Sequencing.Center
adjustment.data <-
cbind(eval(parse(text = a)), adjustment.data)
}
if (batch.factor == "Sequencing Center")
batch.factor <- Sequencing.Center
batchCombat <- eval(parse(text = batch.factor))
if (length(adjustment) > 0) {
adjustment.formula <- paste(adjustment, collapse = "+")
adjustment.formula <- paste0("+", adjustment.formula)
adjustment.formula <-
paste0("~Condition", adjustment.formula)
print(adjustment.formula)
model <-
data.frame(batchCombat, row.names = colnames(tabDF))
design.mod.combat <-
model.matrix(eval(parse(text = adjustment.formula)),
data = model)
}
print(unique(batchCombat))
batch_corr <- sva::ComBat(
dat = tabDF,
batch = batchCombat,
mod = design.mod.combat,
par.prior = TRUE,
prior.plots = TRUE
)
}
return(batch_corr)
}
##Function to take raw counts by removing rows filtered after norm and filter process###
#' @title Use raw count from the DataPrep object which genes are removed by normalization and filtering steps.
#' @description function to keep raw counts after filtering and/or normalizing.
#' @param DataPrep DataPrep object returned by TCGAanalyze_Preprocessing()
#' @param DataFilt Filtered data frame containing samples in columns and genes in rows after normalization and/or filtering steps
#' @examples
#' \dontrun{
#' dataPrep_raw <- UseRaw_afterFilter(dataPrep, dataFilt)
#' }
#' @export
#' @return Filtered return object similar to DataPrep with genes removed after normalization and filtering process.
UseRaw_afterFilter <- function(DataPrep, DataFilt) {
rownames(DataPrep) <-
lapply(rownames(DataPrep), function(x)
gsub("[[:punct:]]\\d*", "", x))
filtered.list <- setdiff(rownames(DataPrep), rownames(DataFilt))
Res <- DataPrep[!rownames(DataPrep) %in% filtered.list,]
return(Res)
}
#' @title Adding information related to DEGs genes from DEA as mean values in two conditions.
#' @description
#' TCGAanalyze_LevelTab allows user to add information related to DEGs genes from
#' Differentially expression analysis (DEA) such as mean values and in two conditions.
#' @param FC_FDR_table_mRNA Output of dataDEGs filter by abs(LogFC) >=1
#' @param typeCond1 a string containing the class label of the samples
#' in TableCond1 (e.g., control group)
#' @param typeCond2 a string containing the class label of the samples
#' in TableCond2 (e.g., case group)
#' @param TableCond1 numeric matrix, each row represents a gene, each column
#' represents a sample with Cond1type
#' @param TableCond2 numeric matrix, each row represents a gene, each column
#' represents a sample with Cond2type
#' @param typeOrder typeOrder
#' @export
#' @return table with DEGs, log Fold Change (FC), false discovery rate (FDR),
#' the gene expression level
#' for samples in Cond1type, and Cond2type, and Delta value (the difference
#' of gene expression between the two
#' conditions multiplied logFC)
#' @examples
#' dataNorm <- TCGAbiolinks::TCGAanalyze_Normalization(dataBRCA, geneInfo)
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataBRCA, method = "quantile", qnt.cut = 0.25)
#' samplesNT <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("NT"))
#' samplesTP <- TCGAquery_SampleTypes(colnames(dataFilt), typesample = c("TP"))
#' dataDEGs <- TCGAanalyze_DEA(dataFilt[,samplesNT],
#' dataFilt[,samplesTP],
#' Cond1type = "Normal",
#' Cond2type = "Tumor")
#' dataDEGsFilt <- dataDEGs[abs(dataDEGs$logFC) >= 1,]
#' dataTP <- dataFilt[,samplesTP]
#' dataTN <- dataFilt[,samplesNT]
#' dataDEGsFiltLevel <- TCGAanalyze_LevelTab(dataDEGsFilt,"Tumor","Normal",
#' dataTP,dataTN)
TCGAanalyze_LevelTab <- function(FC_FDR_table_mRNA,
typeCond1,
typeCond2,
TableCond1,
TableCond2,
typeOrder = TRUE) {
TF_enriched <- as.matrix(rownames(FC_FDR_table_mRNA))
TableLevel <- matrix(0, nrow(TF_enriched), 6)
TableLevel <- as.data.frame(TableLevel)
colnames(TableLevel) <-
c("mRNA", "logFC", "FDR", typeCond1, typeCond2, "Delta")
TableLevel[, "mRNA"] <- TF_enriched
Tabfilt <-
FC_FDR_table_mRNA[which(rownames(FC_FDR_table_mRNA) %in%
TF_enriched), ]
TableLevel[, "logFC"] <-
as.numeric(Tabfilt[TF_enriched, ][, "logFC"])
TableLevel[, "FDR"] <- as.numeric(Tabfilt[TF_enriched, ][, "FDR"])
MeanTumor <- matrix(0, nrow(TF_enriched), 1)
MeanDiffTumorNormal <- matrix(0, nrow(TF_enriched), 1)
for (i in 1:nrow(TF_enriched)) {
TableLevel[i, typeCond1] <-
mean(as.numeric(TableCond1[rownames(TableCond1) %in%
TF_enriched[i] ,]))
TableLevel[i, typeCond2] <-
mean(as.numeric(TableCond2[rownames(TableCond2) %in%
TF_enriched[i] ,]))
}
TableLevel[, "Delta"] <- as.numeric(abs(TableLevel[, "logFC"]) *
TableLevel[, typeCond1])
TableLevel <-
TableLevel[order(as.numeric(TableLevel[, "Delta"]),
decreasing = typeOrder), ]
rownames(TableLevel) <- TableLevel[, "mRNA"]
if (all(grepl("ENSG", rownames(TableLevel))))
TableLevel <-
cbind(TableLevel, map.ensg(genes = rownames(TableLevel))[, 2:3])
return(TableLevel)
}
#' @title Enrichment analysis for Gene Ontology (GO) [BP,MF,CC] and Pathways
#' @description
#' Researchers, in order to better understand the underlying biological
#' processes, often want to retrieve a functional profile of a set of genes
#' that might have an important role. This can be done by performing an
#' enrichment analysis.
#'
#'We will perform an enrichment analysis on gene sets using the TCGAanalyze_EAcomplete
#'function. Given a set of genes that are
#'up-regulated under certain conditions, an enrichment analysis will find
#'identify classes of genes or proteins that are #'over-represented using
#'annotations for that gene set.
#' @param TFname is the name of the list of genes or TF's regulon.
#' @param RegulonList List of genes such as TF's regulon or DEGs where to find enrichment.
#' @export
#' @return Enrichment analysis GO[BP,MF,CC] and Pathways complete table enriched by genelist.
#' @examples
#' Genelist <- c("FN1","COL1A1")
#' ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist)
#' \dontrun{
#' Genelist <- rownames(dataDEGsFiltLevel)
#' system.time(ansEA <- TCGAanalyze_EAcomplete(TFname="DEA genes Normal Vs Tumor",Genelist))
#' }
TCGAanalyze_EAcomplete <- function(TFname, RegulonList) {
# This is a verification of the input
# in case the List is like Gene|ID
# we will get only the Gene
if (all(grepl("\\|", RegulonList))) {
RegulonList <- strsplit(RegulonList, "\\|")
RegulonList <- unlist(lapply(RegulonList, function(x)
x[1]))
}
print(
paste(
"I need about ",
"1 minute to finish complete ",
"Enrichment analysis GO[BP,MF,CC] and Pathways... "
)
)
ResBP <- TCGAanalyze_EA(TFname, RegulonList, DAVID_BP_matrix,
EAGenes, GOtype = "DavidBP")
print("GO Enrichment Analysis BP completed....done")
ResMF <- TCGAanalyze_EA(TFname, RegulonList, DAVID_MF_matrix,
EAGenes, GOtype = "DavidMF")
print("GO Enrichment Analysis MF completed....done")
ResCC <- TCGAanalyze_EA(TFname, RegulonList, DAVID_CC_matrix,
EAGenes, GOtype = "DavidCC")
print("GO Enrichment Analysis CC completed....done")
ResPat <- TCGAanalyze_EA(TFname, RegulonList, listEA_pathways,
EAGenes, GOtype = "Pathway")
print("Pathway Enrichment Analysis completed....done")
ans <-
list(
ResBP = ResBP,
ResMF = ResMF,
ResCC = ResCC,
ResPat = ResPat
)
return(ans)
}
#' @title Enrichment analysis of a gene-set with GO [BP,MF,CC] and pathways.
#' @description
#' The rational behind a enrichment analysis ( gene-set, pathway etc) is to compute
#' statistics of whether the overlap between the focus list (signature) and the gene-set
#' is significant. ie the confidence that overlap between the list is not due to chance.
#' The Gene Ontology project describes genes (gene products) using terms from
#' three structured vocabularies: biological process, cellular component and molecular function.
#' The Gene Ontology Enrichment component, also referred to as the GO Terms" component, allows
#' the genes in any such "changed-gene" list to be characterized using the Gene Ontology terms
#' annotated to them. It asks, whether for any particular GO term, the fraction of genes
#' assigned to it in the "changed-gene" list is higher than expected by chance
#' (is over-represented), relative to the fraction of genes assigned to that term in the
#' reference set.
#' In statistical terms it perform the analysis tests the null hypothesis that,
#' for any particular ontology term, there is no difference in the proportion of genes
#' annotated to it in the reference list and the proportion annotated to it in the test list.
#' We adopted a Fisher Exact Test to perform the EA.
#' @param GeneName is the name of gene signatures list
#' @param TableEnrichment is a table related to annotations of gene symbols such as
#' GO[BP,MF,CC] and Pathways. It was created from DAVID gene ontology on-line.
#' @param RegulonList is a gene signature (lisf of genes) in which perform EA.
#' @param GOtype is type of gene ontology Biological process (BP), Molecular Function (MF),
#' Cellular componet (CC)
#' @param FDRThresh pvalue corrected (FDR) as threshold to selected significant
#' BP, MF,CC, or pathways. (default FDR < 0.01)
#' @param EAGenes is a table with informations about genes
#' such as ID, Gene, Description, Location and Family.
#' @param GeneSymbolsTable if it is TRUE will return a table with GeneSymbols in common GO or pathways.
# @export
#' @import stats
#' @return Table with enriched GO or pathways by selected gene signature.
#' @examples
#' \dontrun{
#' EAGenes <- get("EAGenes")
#' RegulonList <- rownames(dataDEGsFiltLevel)
#' ResBP <- TCGAanalyze_EA(GeneName="DEA genes Normal Vs Tumor",
#' RegulonList,DAVID_BP_matrix,
#' EAGenes,GOtype = "DavidBP")
#'}
TCGAanalyze_EA <-
function (GeneName,
RegulonList,
TableEnrichment,
EAGenes,
GOtype,
FDRThresh = 0.01,
GeneSymbolsTable = FALSE)
{
topPathways <- nrow(TableEnrichment)
topPathways_tab <- matrix(0, 1, topPathways)
topPathways_tab <- as.matrix(topPathways_tab)
rownames(topPathways_tab) <- GeneName
rownames(EAGenes) <- toupper(rownames(EAGenes))
EAGenes <- EAGenes[!duplicated(EAGenes[, "ID"]),]
rownames(EAGenes) <- EAGenes[, "ID"]
allgene <- EAGenes[, "ID"]
current_pathway_from_EA <- as.matrix(TableEnrichment[, GOtype])
TableNames <- gsub("David",
"",
paste("Top ", GOtype, " n. ",
1:topPathways, " of ", topPathways, sep = ""))
colnames(topPathways_tab) <- TableNames
topPathways_tab <- as.data.frame(topPathways_tab)
table_pathway_enriched <-
matrix(1, nrow(current_pathway_from_EA),
8)
colnames(table_pathway_enriched) <-
c(
"Pathway",
"GenesInPathway",
"Pvalue",
"FDR",
"CommonGenesPathway",
"PercentPathway",
"PercentRegulon",
"CommonGeneSymbols"
)
table_pathway_enriched <- as.data.frame(table_pathway_enriched)
for (i in 1:nrow(current_pathway_from_EA)) {
table_pathway_enriched[i, "Pathway"] <-
as.character(current_pathway_from_EA[i,])
if (nrow(TableEnrichment) == 589) {
genes_from_current_pathway_from_EA <-
GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] ==
as.character(current_pathway_from_EA[i,]),][,
"Molecules"], ",")
}
else {
genes_from_current_pathway_from_EA <-
GeneSplitRegulon(TableEnrichment[TableEnrichment[GOtype] ==
as.character(current_pathway_from_EA[i,]),][,
"Molecules"], ", ")
}
genes_common_pathway_TFregulon <-
as.matrix(intersect(
toupper(RegulonList),
toupper(genes_from_current_pathway_from_EA)
))
if (length(genes_common_pathway_TFregulon) != 0) {
current_pathway_commongenes_num <-
length(genes_common_pathway_TFregulon)
seta <- allgene %in% RegulonList
setb <- allgene %in% genes_from_current_pathway_from_EA
ft <- fisher.test(seta, setb)
FisherpvalueTF <- ft$p.value
table_pathway_enriched[i, "Pvalue"] <-
as.numeric(FisherpvalueTF)
if (FisherpvalueTF < 0.01) {
current_pathway_commongenes_percent <- paste("(",
format((
current_pathway_commongenes_num / length(genes_from_current_pathway_from_EA)
) *
100,
digits = 2
), "%)")
current_pathway_commongenes_num_with_percent <-
gsub(
" ",
"",
paste(
current_pathway_commongenes_num,
current_pathway_commongenes_percent,
"pv=",
format(FisherpvalueTF, digits = 2)
)
)
table_pathway_enriched[i, "CommonGenesPathway"] <-
length(genes_common_pathway_TFregulon)
table_pathway_enriched[i, "CommonGeneSymbols"] <-
paste0(genes_common_pathway_TFregulon, collapse = ";")
table_pathway_enriched[i, "GenesInPathway"] <-
length(genes_from_current_pathway_from_EA)
table_pathway_enriched[i, "PercentPathway"] <-
as.numeric(table_pathway_enriched[i,
"CommonGenesPathway"]) /
as.numeric(table_pathway_enriched[i,
"GenesInPathway"]) * 100
table_pathway_enriched[i, "PercentRegulon"] <-
as.numeric(table_pathway_enriched[i,
"CommonGenesPathway"]) /
length(RegulonList) *
100
}
}
}
table_pathway_enriched <-
table_pathway_enriched[order(table_pathway_enriched[,
"Pvalue"], decreasing = FALSE),]
table_pathway_enriched <-
table_pathway_enriched[table_pathway_enriched[,
"Pvalue"] < 0.01,]
table_pathway_enriched[, "FDR"] <-
p.adjust(table_pathway_enriched[,
"Pvalue"], method = "fdr")
table_pathway_enriched <-
table_pathway_enriched[table_pathway_enriched[,
"FDR"] < FDRThresh,]
table_pathway_enriched <-
table_pathway_enriched[order(table_pathway_enriched[,
"FDR"], decreasing = FALSE),]
table_pathway_enriched_filt <-
table_pathway_enriched[table_pathway_enriched$FDR < FDRThresh, ]
if (nrow(table_pathway_enriched) > 0) {
tmp <- table_pathway_enriched
tmp <- paste(
tmp[, "Pathway"],
"; FDR= ",
format(tmp[,
"FDR"], digits = 3),
"; (ng=",
round(tmp[, "GenesInPathway"]),
"); (ncommon=",
format(tmp[, "CommonGenesPathway"],
digits = 2),
")",
sep = ""
)
tmp <- as.matrix(tmp)
topPathways_tab <-
topPathways_tab[, 1:nrow(table_pathway_enriched),
drop = FALSE]
topPathways_tab[1,] <- tmp
}
else {
topPathways_tab <- NA
}
if (GeneSymbolsTable == FALSE) {
return(topPathways_tab)
}
if (GeneSymbolsTable == TRUE) {
return(table_pathway_enriched_filt)
}
}
#' @title Differentially expression analysis (DEA) using limma package.
#' @description Differentially expression analysis (DEA) using limma package.
#' @param FC.cut write
#' @param AffySet A matrix-like data object containing log-ratios or log-expression values
#' for a series of arrays, with rows corresponding to genes and columns to samples
#' @examples
#' \dontrun{
#' to add example
#' }
#' @export
#' @return List of list with tables in 2 by 2 comparison
#' of the top-ranked genes from a linear model fitted by DEA's limma
TCGAanalyze_DEA_Affy <- function(AffySet, FC.cut = 0.01) {
if (!requireNamespace("Biobase", quietly = TRUE)) {
stop("Biobase package is needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("limma", quietly = TRUE)) {
stop("limma package is needed for this function to work. Please install it.",
call. = FALSE)
}
Pdatatable <- Biobase::phenoData(AffySet)
f <- factor(Pdatatable$Disease)
groupColors <- names(table(f))
tmp <- matrix(0, length(groupColors), length(groupColors))
colnames(tmp) <- groupColors
rownames(tmp) <- groupColors
tmp[upper.tri(tmp)] <- 1
sample_tab <- Pdatatable
f <- factor(Pdatatable$Disease)
design <- model.matrix( ~ 0 + f)
colnames(design) <- levels(f)
fit <-
limma::lmFit(AffySet, design) ## fit is an object of class MArrayLM.
groupColors <- names(table(Pdatatable$Disease))
CompleteList <- vector("list", sum(tmp))
k <- 1
for (i in 1:length(groupColors)) {
col1 <- colnames(tmp)[i]
for (j in 1:length(groupColors)) {
col2 <- rownames(tmp)[j]
if (i != j) {
if (tmp[i, j] != 0) {
Comparison <- paste(col2, "-", col1, sep = "")
if (i == 4 &&
j == 6) {
Comparison <- paste(col1, "-", col2, sep = "")
}
if (i == 5 &&
j == 6) {
Comparison <- paste(col1, "-", col2, sep = "")
}
print(paste(i, j, Comparison, "to do..."))
cont.matrix <-
limma::makeContrasts(I = Comparison, levels = design)
fit2 <- limma::contrasts.fit(fit, cont.matrix)
fit2 <- limma::eBayes(fit2)
sigI <-
limma::topTable(
fit2,
coef = 1,
adjust.method = "BH",
sort.by = "B",
p.value = 0.05,
lfc = FC.cut,
number = 50000
)
sigIbis <-
sigI[order(abs(as.numeric(sigI$logFC)), decreasing = TRUE), ]
names(CompleteList)[k] <- gsub("-", "_", Comparison)
CompleteList[[k]] <- sigIbis
k <- k + 1
}
}
}
}
return(CompleteList)
}
#' @title Generate network
#' @description TCGAanalyze_analyseGRN perform gene regulatory network.
#' @param TFs a vector of genes.
#' @param normCounts is a matrix of gene expression with genes in rows and samples in columns.
#' @param kNum the number of nearest neighbors to consider to estimate the mutual information.
#' Must be less than the number of columns of normCounts.
#' @export
#' @return an adjacent matrix
TCGAanalyze_analyseGRN <- function(TFs, normCounts, kNum) {
if (!requireNamespace("parmigene", quietly = TRUE)) {
stop(
"parmigene package is needed for this function to work. Please install it.",
call. = FALSE
)
}
MRcandidates <- intersect(rownames(normCounts), TFs)
# Mutual information between TF and genes
sampleNames <- colnames(normCounts)
geneNames <- rownames(normCounts)
messageMI_TFgenes <-
paste(
"Estimation of MI among [",
length(MRcandidates),
" TRs and ",
nrow(normCounts),
" genes].....",
sep = ""
)
timeEstimatedMI_TFgenes1 <-
length(MRcandidates) * nrow(normCounts) / 1000
timeEstimatedMI_TFgenes <-
format(timeEstimatedMI_TFgenes1 * ncol(normCounts) / 17000,
digits = 2)
messageEstimation <-
print(
paste(
"I Need about ",
timeEstimatedMI_TFgenes,
"seconds for this MI estimation. [Processing 17000k elements /s] "
)
)
system.time(miTFGenes <-
parmigene::knnmi.cross(normCounts[MRcandidates,], normCounts, k = kNum))
return(miTFGenes)
}
#' @title Generate pathview graph
#' @description TCGAanalyze_Pathview pathway based data integration and visualization.
#' @param dataDEGs dataDEGs
#' @param pathwayKEGG pathwayKEGG
#' @export
#' @return an adjacent matrix
#' @examples
#' \dontrun{
#' dataDEGs <- data.frame(mRNA = c("TP53","TP63","TP73"), logFC = c(1,2,3))
#' TCGAanalyze_Pathview(dataDEGs)
#' }
TCGAanalyze_Pathview <-
function(dataDEGs, pathwayKEGG = "hsa05200") {
if (!requireNamespace("clusterProfiler", quietly = TRUE)) {
stop("clusterProfiler needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("pathview", quietly = TRUE)) {
stop("pathview needed for this function to work. Please install it.",
call. = FALSE)
}
# Converting Gene symbol to gene ID
eg = as.data.frame(
clusterProfiler::bitr(
dataDEGs$mRNA,
fromType = "SYMBOL",
toType = "ENTREZID",
OrgDb = "org.Hs.eg.db"
)
)
eg <- eg[!duplicated(eg$SYMBOL), ]
dataDEGs <- dataDEGs[dataDEGs$mRNA %in% eg$SYMBOL, ]
dataDEGs <- dataDEGs[order(dataDEGs$mRNA, decreasing = FALSE), ]
eg <- eg[order(eg$SYMBOL, decreasing = FALSE), ]
dataDEGs$GeneID <- eg$ENTREZID
dataDEGsFiltLevel_sub <-
subset(dataDEGs, select = c("GeneID", "logFC"))
genelistDEGs <- as.numeric(dataDEGsFiltLevel_sub$logFC)
names(genelistDEGs) <- dataDEGsFiltLevel_sub$GeneID
hsa05200 <- pathview::pathview(
gene.data = genelistDEGs,
pathway.id = pathwayKEGG,
species = "hsa",
limit = list(gene = as.integer(max(
abs(genelistDEGs)
)))
)
}
#' @title infer gene regulatory networks
#' @description TCGAanalyze_networkInference taking expression data as input,
#' this will return an adjacency matrix of interactions
#' @param data expression data, genes in columns, samples in rows
#' @param optionMethod inference method, chose from aracne, c3net, clr and mrnet
#' @export
#' @return an adjacent matrix
TCGAanalyze_networkInference <-
function(data, optionMethod = "clr") {
# Converting Gene symbol to gene ID
if (optionMethod == "c3net") {
if (!requireNamespace("c3net", quietly = TRUE)) {
stop(
"c3net package is needed for this function to work. Please install it.",
call. = FALSE
)
}
net <- c3net::c3net(t(data))
} else {
if (!requireNamespace("minet", quietly = TRUE)) {
stop(
"minet package is needed for this function to work. Please install it.",
call. = FALSE
)
}
net <- minet::minet(data, method = optionMethod)
}
return(net)
}
#' Creates a plot for GAIA output (all significant aberrant regions.)
#' @description
#' This function is a auxiliary function to visualize GAIA output
#' (all significant aberrant regions.)
#' @param calls A matrix with the following columns: Chromossome, Aberration Kind
#' Region Start, Region End, Region Size and score
#' @param threshold Score threshold (orange horizontal line in the plot)
#' @export
#' @importFrom graphics abline axis legend plot points
#' @return A plot with all significant aberrant regions.
#' @examples
#' call <- data.frame("Chromossome" = rep(9,100),
#' "Aberration Kind" = rep(c(-2,-1,0,1,2),20),
#' "Region Start [bp]" = 18259823:18259922,
#' "Region End [bp]" = 18259823:18259922,
#' "score" = rep(c(1,2,3,4),25))
#' gaiaCNVplot(call,threshold = 0.01)
#' call <- data.frame("Chromossome" = rep(c(1,9),50),
#' "Aberration Kind" = rep(c(-2,-1,0,1,2),20),
#' "Region Start [bp]" = 18259823:18259922,
#' "Region End [bp]" = 18259823:18259922,
#' "score" = rep(c(1,2,3,4),25))
#' gaiaCNVplot(call,threshold = 0.01)
gaiaCNVplot <- function (calls, threshold = 0.01) {
Calls <-
calls[order(calls[, grep("start", colnames(calls), ignore.case = TRUE)]), ]
Calls <-
Calls[order(Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]), ]
rownames(Calls) <- NULL
Chromo <- Calls[, grep("chr", colnames(calls), ignore.case = TRUE)]
Gains <-
apply(Calls, 1, function(x)
ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 1, x["score"], 0))
Losses <-
apply(Calls, 1, function(x)
ifelse(x[grep("aberration", colnames(calls), ignore.case = TRUE)] == 0, x["score"], 0))
plot(
Gains,
ylim = c(-max(Calls[, "score"] + 2), max(Calls[, "score"] + 2)),
type = "h",
col = "red",
xlab = "Chromosome",
ylab = "Score",
xaxt = "n"
)
points(-(Losses), type = "h", col = "blue")
# Draw origin line
abline(h = 0, cex = 4)
# Draw threshold lines
abline(
h = -log10(threshold),
col = "orange",
cex = 4,
main = "test"
)
abline(
h = log10(threshold),
col = "orange",
cex = 4,
main = "test"
)
uni.chr <- unique(Chromo)
temp <- rep(0, length(uni.chr))
for (i in 1:length(uni.chr)) {
temp[i] <- max(which(uni.chr[i] == Chromo))
}
for (i in 1:length(temp)) {
abline(v = temp[i],
col = "black",
lty = "dashed")
}
nChroms <- length(uni.chr)
begin <- c()
for (d in 1:nChroms) {
chrom <- sum(Chromo == uni.chr[d])
begin <- append(begin, chrom)
}
temp2 <- rep(0, nChroms)
for (i in 1:nChroms) {
if (i == 1) {
temp2[1] <- (begin[1] * 0.5)
}
else if (i > 1) {
temp2[i] <- temp[i - 1] + (begin[i] * 0.5)
}
}
uni.chr[uni.chr == 23] <- "X"
uni.chr[uni.chr == 24] <- "Y"
for (i in 1:length(temp)) {
axis(1,
at = temp2[i],
labels = uni.chr[i],
cex.axis = 1)
}
legend(
x = 1,
y = max(Calls[, "score"] + 2),
y.intersp = 0.8,
c("Amp"),
pch = 15,
col = c("red"),
text.font = 3
)
legend(
x = 1,
y = -max(Calls[, "score"] + 0.5),
y.intersp = 0.8,
c("Del"),
pch = 15,
col = c("blue"),
text.font = 3
)
}
#' Get a matrix of interactions of genes from biogrid
#' @description
#' Using biogrid database, it will create a matrix of gene interactions.
#' If columns A and row B has value 1, it means the gene A and gene B interacts.
#' @param tmp.biogrid Biogrid table
#' @export
#' @param names.genes List of genes to filter from output. Default: consider all genes
#' @return A matrix with 1 for genes that interacts, 0 for no interaction.
#' @examples
#' names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1")
#' tmp.biogrid <- data.frame("Official.Symbol.Interactor.A" = names.genes.de,
#' "Official.Symbol.Interactor.B" = rev(names.genes.de))
#' net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)
#' \dontrun{
#' file <- paste0("http://thebiogrid.org/downloads/archives/",
#' "Release%20Archive/BIOGRID-3.4.133/BIOGRID-ALL-3.4.133.tab2.zip")
#' downloader::download(file,basename(file))
#' unzip(basename(file),junkpaths =TRUE)
#' tmp.biogrid <- read.csv(gsub("zip","txt",basename(file)),
#' header=TRUE, sep="\t", stringsAsFactors=FALSE)
#' names.genes.de <- c("PLCB1","MCL1","PRDX4","TTF2","TACC3", "PARP4","LSM1")
#' net.biogrid.de <- getAdjacencyBiogrid(tmp.biogrid, names.genes.de)
#' }
getAdjacencyBiogrid <- function(tmp.biogrid, names.genes = NULL) {
it.a <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[1]
it.b <- grep("Symbol", colnames(tmp.biogrid), value = TRUE)[2]
if (is.null(names.genes)) {
names.genes <-
sort(union(unique(tmp.biogrid[, it.a]), unique(tmp.biogrid[, it.b])))
ind <- seq(1, nrow(tmp.biogrid))
} else {
ind.A <- which(tmp.biogrid[, it.a] %in% names.genes)
ind.B <- which(tmp.biogrid[, it.b] %in% names.genes)
ind <- intersect(ind.A, ind.B)
}
mat.biogrid <- matrix(
0,
nrow = length(names.genes),
ncol = length(names.genes),
dimnames = list(names.genes, names.genes)
)
for (i in ind) {
mat.biogrid[tmp.biogrid[i, it.a], tmp.biogrid[i, it.b]] <-
mat.biogrid[tmp.biogrid[i, it.b], tmp.biogrid[i, it.a]] <- 1
}
diag(mat.biogrid) <- 0
return(mat.biogrid)
}
#' Get GDC samples with both DNA methylation (HM450K) and Gene expression data from
#' GDC databse
#' @description
#' For a given TCGA project it gets the samples (barcode) with both DNA methylation and Gene expression data
#' from GDC database
#' @param project A GDC project
#' @param n Number of samples to return. If NULL return all (default)
#' @param legacy Access legacy (hg19) or harmonized database (hg38).
#' @return A vector of barcodes
#' @export
#' @examples
#' # Get ACC samples with both DNA methylation (HM450K) and gene expression aligned to hg19
#' samples <- matchedMetExp("TCGA-UCS", legacy = TRUE)
matchedMetExp <- function(project, legacy = FALSE, n = NULL) {
if (legacy) {
# get primary solid tumor samples: DNA methylation
message("Download DNA methylation information")
met450k <- GDCquery(
project = project,
data.category = "DNA methylation",
platform = "Illumina Human Methylation 450",
legacy = TRUE,
sample.type = c("Primary Tumor")
)
# get primary solid tumor samples: RNAseq
message("Download gene expression information")
exp <- GDCquery(
project = project,
data.category = "Gene expression",
data.type = "Gene expression quantification",
platform = "Illumina HiSeq",
file.type = "results",
sample.type = c("Primary Tumor"),
legacy = TRUE
)
} else {
# get primary solid tumor samples: DNA methylation
message("Download DNA methylation information")
met450k <- GDCquery(
project = project,
data.category = "DNA Methylation",
platform = "Illumina Human Methylation 450",
sample.type = c("Primary Tumor")
)
# get primary solid tumor samples: RNAseq
message("Download gene expression information")
exp <- GDCquery(
project = project,
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "HTSeq - Counts"
)
}
met450k.tp <- met450k$results[[1]]$cases
# Get patients with samples in both platforms
exp.tp <- exp$results[[1]]$cases
patients <-
unique(substr(exp.tp, 1, 15)[substr(exp.tp, 1, 12) %in% substr(met450k.tp, 1, 12)])
if (!is.null(n))
patients <- patients[1:n] # get only n samples
return(patients)
}
#' @title Generate Stemness Score based on RNASeq (mRNAsi stemness index) Malta et al., Cell, 2018
#' @description TCGAanalyze_Stemness generate the mRNAsi score
#' @param stemSig is a vector of the stemness Signature generated using gelnet package
#' @param dataGE is a matrix of Gene expression (genes in rows, samples in cols) from TCGAprepare
#' @param annotation as default is FALSE.
#' If annotation == subtype it returns the molecular subtype of a sample.
#' If annotation == sampleType it returns the type of a sample (normal or tumor)
#' @export
#' @return table with samples and stemness score
#' @examples
#' # Selecting TCGA breast cancer (10 samples) for example stored in dataBRCA
#' dataNorm <- TCGAanalyze_Normalization(tabDF = dataBRCA, geneInfo = geneInfo)
#' # quantile filter of genes
#' dataFilt <- TCGAanalyze_Filtering(tabDF = dataNorm,
#' method = "quantile",
#' qnt.cut = 0.25)
#' dataBRCA_stemness <- TCGAanalyze_Stemness(stemSig = PCBC_stemSig,
#' dataGE = dataFilt, annotation = "sampleType")
TCGAanalyze_Stemness <- function(stemSig,
dataGE,
annotation = FALSE) {
reads <- dataGE
X <- reads
w <- stemSig
commonStemsigGenes <- intersect(names(w), rownames(X))
X <- X[commonStemsigGenes, ]
w <- w[rownames(X)]
# Score the Matrix X using Spearman correlation.
s <-
apply(X, 2, function(z) {
cor(z, w, method = "sp", use = "complete.obs")
})
## Scale the scores to be between 0 and 1
s <- s - min(s)
s <- s / max(s)
dataSce_stemness <- cbind(s)
dataAnnotationSC <- matrix(0, ncol(reads), 2)
colnames(dataAnnotationSC) <- c("Sample", "Annotation")
dataAnnotationSC <- as.data.frame(dataAnnotationSC)
dataAnnotationSC$Sample <- colnames(reads)
rownames(dataAnnotationSC) <- colnames(reads)
dataAnnotationSC <-
cbind(dataAnnotationSC, StemnessScore = rep(0, nrow(dataAnnotationSC)))
dataAnnotationSC[rownames(dataSce_stemness), "StemnessScore"] <-
as.numeric(dataSce_stemness)
colnames(dataAnnotationSC)[1] <- "Sample"
if (annotation == "sampleType") {
sampleTP <-
TCGAquery_SampleTypes(barcode = dataAnnotationSC$Sample, typesample = "TP")
sampleNT <-
TCGAquery_SampleTypes(barcode = dataAnnotationSC$Sample, typesample = "NT")
dataAnnotationSC[sampleTP, "Annotation"] <- "TP"
dataAnnotationSC[sampleNT, "Annotation"] <- "NT"
}
if (annotation == "subtype") {
dataSubt <- TCGAquery_subtype(tumor = "BRCA")
for (i in 1:nrow(dataAnnotationSC)) {
curSample <- dataAnnotationSC$Sample[i]
dataAnnotationSC$Annotation <-
dataSubt[dataSubt$patient %in% substr(curSample, 1, 12), "BRCA_Subtype_PAM50"]
}
}
return(dataAnnotationSC)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.