#############################################################################
# TCGA-Assembler : An open-source R program for downloading, processing and analyzing public TCGA data.
# Copyright (C) <2014> <Yitan Zhu>
# This file is part of TCGA-Assembler.
# TCGA-Assembler is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# TCGA-Assembler is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with TCGA-Assembler. If not, see <http://www.gnu.org/licenses/>.
############################################################################
##############################################################################
# TCGA-Assembler Version 1.0.3 Module B
##############################################################################
library(RCurl);
library(httr);
library(stringr);
library(HGNChelper);
######################### Main Functions of Module B #############################################################
CombineMultiPlatformData<-function(inputDataList, combineStyle = "Intersect")
{
options(warn=-1);
for (i in 1:length(inputDataList))
{
# Keep only one sample for a tissue type of a patient. Usually, there is only one sample of a tissue type of a patient existing in data.
inputDataList[[i]]$Data = ToPatientData(inputDataList[[i]]$Data);
if (i == 1)
{
sample = colnames(inputDataList[[i]]$Data);
}
# For each genomic feature, keep only one row of data.
Result = CombineRedundantFeature(Data = inputDataList[[i]]$Data, Des = inputDataList[[i]]$Des);
inputDataList[[i]]$Des = Result$Des;
inputDataList[[i]]$Data = Result$Data;
if (combineStyle == "Intersect")
{
sample = sort(intersect(sample, colnames(inputDataList[[i]]$Data)));
}
if (combineStyle == "Union")
{
sample = sort(union(sample, colnames(inputDataList[[i]]$Data)));
}
if (dim(inputDataList[[i]]$Des)[2] == 1)
{
inputDataList[[i]]$Des = cbind(inputDataList[[i]]$Des, Description = cbind(rep("", dim(inputDataList[[i]]$Des)[1])));
}
else
{
if ((dim(inputDataList[[i]]$Des)[2] == 3) && (inputDataList[[i]]$dataType == "CNA"))
{
inputDataList[[i]]$Des = cbind(inputDataList[[i]]$Des[, 1, drop = FALSE],
Description = paste(inputDataList[[i]]$Des[, 2], inputDataList[[i]]$Des[, 3], sep = ""));
}
}
if (inputDataList[[i]]$dataType == "miRNAExp")
{
for (kk in 1:dim(inputDataList[[i]]$Des)[1])
{
inputDataList[[i]]$Des[kk, 1] = paste(substr(inputDataList[[i]]$Des[kk, 1], 5, 7), substr(inputDataList[[i]]$Des[kk, 1], 9, 100), sep = "");
}
inputDataList[[i]]$Des[, 1] = toupper(inputDataList[[i]]$Des[, 1]);
}
# for combining methylation data at CpG site level.
if (inputDataList[[i]]$dataType == "Methylation")
{
if (sum(toupper(c("REF", "GeneSymbol", "ChromosomeID", "CoordinateID")) %in% toupper(colnames(inputDataList[[i]]$Des))) == 4)
{
inputDataList[[i]]$Des = cbind(inputDataList[[i]]$Des[, "GeneSymbol", drop = FALSE],
Description = paste(inputDataList[[i]]$Des[, "REF"], inputDataList[[i]]$Des[, "ChromosomeID"],
inputDataList[[i]]$Des[, "CoordinateID"], sep = "|"));
inputDataList[[i]]$Des[, "Description"] = gsub(" ", "", inputDataList[[i]]$Des[, "Description"]);
}
}
inputDataList[[i]]$Des = switch(inputDataList[[i]]$dataType,
GeneExp = cbind(inputDataList[[i]]$Des[, "GeneSymbol", drop = FALSE],
Platform = rep("GE", dim(inputDataList[[i]]$Des)[1]), Description = inputDataList[[i]]$Des[, 2]),
ProteinExp = cbind(inputDataList[[i]]$Des[, "GeneSymbol", drop = FALSE],
Platform = rep("PE", dim(inputDataList[[i]]$Des)[1]), Description = inputDataList[[i]]$Des[, 2]),
Methylation = cbind(inputDataList[[i]]$Des[, "GeneSymbol", drop = FALSE],
Platform = rep("ME", dim(inputDataList[[i]]$Des)[1]), Description = inputDataList[[i]]$Des[, 2]),
CNA = cbind(inputDataList[[i]]$Des[, "GeneSymbol", drop = FALSE],
Platform = rep("CN", dim(inputDataList[[i]]$Des)[1]), Description = inputDataList[[i]]$Des[, 2]),
miRNAExp = cbind(inputDataList[[i]]$Des[, "GeneSymbol", drop = FALSE],
Platform = rep("miRExp", dim(inputDataList[[i]]$Des)[1]), Description = inputDataList[[i]]$Des[, 2]));
# check NA gene. which(inputDataList[[i]]$Des[, "GeneSymbol"] != "NA")
ID = intersect(which(!is.na(inputDataList[[i]]$Des[, "GeneSymbol"])),
which(inputDataList[[i]]$Des[, "GeneSymbol"] != "?"));
inputDataList[[i]]$Des = inputDataList[[i]]$Des[ID, , drop = FALSE];
inputDataList[[i]]$Data = inputDataList[[i]]$Data[ID, , drop = FALSE];
}
for (i in 1:length(inputDataList))
{
# Get the data of samples that should be kept
if (combineStyle == "Intersect")
{
inputDataList[[i]]$Data = inputDataList[[i]]$Data[, sample, drop = FALSE];
}
if (combineStyle == "Union")
{
tempData = matrix(NA, dim(inputDataList[[i]]$Data)[1], length(sample));
colnames(tempData) = sample;
tempData[, colnames(inputDataList[[i]]$Data)] = inputDataList[[i]]$Data;
inputDataList[[i]]$Data = tempData;
}
rownames(inputDataList[[i]]$Data) = NULL;
rownames(inputDataList[[i]]$Des) = NULL;
}
# Combine the datasets into matrix format
Data = inputDataList[[1]]$Data;
Des = inputDataList[[1]]$Des;
for (i in 2:length(inputDataList))
{
Data = rbind(Data, inputDataList[[i]]$Data);
Des = rbind(Des, inputDataList[[i]]$Des);
}
OrderID = order(as.character(Des[, "GeneSymbol"]), as.character(Des[, "Platform"]),
as.character(Des[, "Description"]), na.last = TRUE, decreasing = FALSE);
Data = Data[OrderID, , drop = FALSE];
Des = Des[OrderID, , drop = FALSE];
rownames(Data) = NULL;
rownames(Des) = NULL;
Result = list(Des = Des, Data = Data);
options(warn=0);
Result;
}
ExtractTissueSpecificSamples<-function(inputData, tissueType, singleSampleFlag, sampleTypeFile = "./SupportingFiles/TCGASampleType.txt")
{
options(warn=-1);
TCGABarcode = colnames(inputData);
TCGABarcode = sapply(strsplit(TCGABarcode, split = "-"), function(x){
Str = paste(x[1], x[2], x[3], substr(x[4], 1, 2), sep = "-");
Str;
});
if (singleSampleFlag == TRUE)
{
DuplicatedLabel = duplicated(TCGABarcode);
ID = which(DuplicatedLabel == FALSE);
inputData = inputData[, ID, drop = FALSE];
TCGABarcode = TCGABarcode[ID];
}
TCGADataLabel = sapply(strsplit(TCGABarcode, split = "-"), function(x)x[4]);
TCGADataLabelValue = as.numeric(substr(TCGADataLabel, 1, 2));
sampleType = read.table(file = sampleTypeFile, header = TRUE, sep = "\t", na.string = "", quote = "", stringsAsFactors = FALSE);
Code = sampleType[which(sampleType[, "Options"] %in% tissueType), "Code"];
if (length(Code) == 0)
{
writeLines("Error: no valid tissue or cell type specified");
inputData = NULL;
}
else
{
ID = which(TCGADataLabelValue %in% Code);
if (length(ID) == 0)
{
writeLines(paste("There is no sample of ", paste(tissueType, collapse = ", "), sep = ""));
inputData = NULL;
}
else
{
inputData = inputData[, ID, drop = FALSE];
writeLines(paste(length(ID), " samples of ", paste(tissueType, collapse = ", "), " are extracted.", sep = ""));
}
}
options(warn=0);
inputData;
}
CalculateSingleValueMethylationData<-function(input, regionOption, DHSOption, outputFileName, outputFileFolder, chipAnnotationFile = "./SupportingFiles/MethylationChipAnnotation.rda")
{
options(warn=-1);
dir.create(path = outputFileFolder, recursive = TRUE);
if (sum(! regionOption %in% c("TSS1500", "TSS200", "5'UTR", "1stExon", "Body", "3'UTR", "All")) > 0)
{
stop("Error: invalid regionOption.");
}
if (sum(! DHSOption %in% c("DHS", "notDHS", "Both")) > 0)
{
stop("Error: invalid DHSOption");
}
regionOptionO = regionOption;
if (regionOptionO == "TSS1500")
{
regionOption = c(regionOption, "TSS200");
}
if ((regionOptionO != "All") || (DHSOption != "Both"))
{
load(chipAnnotationFile);
ID = 1:dim(MethylAnno)[1];
if (regionOptionO != "All")
{
ID = sort(unique(which(MethylAnno[, "UCSC_RefGene_Group"] %in% regionOption)));
}
if (DHSOption == "DHS")
{
ID = sort(intersect(ID, which(MethylAnno[, "DHS"] == TRUE)));
}
else
{
if (DHSOption == "notDHS")
{
ID = sort(intersect(ID, setdiff(1:dim(MethylAnno)[1], which(MethylAnno[, "DHS"] == TRUE))));
}
}
MethylAnno = MethylAnno[ID, , drop = FALSE];
ID = which(input$Des[, "REF"] %in% MethylAnno[, "IlmnID"]);
input$Des = input$Des[ID, , drop = FALSE];
input$Data = input$Data[ID, , drop = FALSE];
}
# Check whether conflict with NA gene
ID = which(!is.na(input$Des[, "GeneSymbol"]));
input$Des = input$Des[ID, , drop = FALSE];
input$Data = input$Data[ID, , drop = FALSE];
UniGene = unique(input$Des[, "GeneSymbol"]);
Data = matrix(NA, length(UniGene), dim(input$Data)[2]);
for (i in 1:length(UniGene))
{
if ((i %% 2000 == 0) || (i == length(UniGene)))
{
writeLines(paste("Calculation ", round(i/length(UniGene)*100, digits = 2), "% done.", sep = ""));
}
Data[i, ] = colMeans(input$Data[which(input$Des[, "GeneSymbol"] == UniGene[i]), , drop = FALSE], na.rm = TRUE);
}
colnames(Data) = colnames(input$Data);
Des = cbind(GeneSymbol = UniGene, SingleValueType = rep(paste(regionOptionO, DHSOption, sep = "|"), length(UniGene)));
# draw and save a box plot of data
png(filename = paste(outputFileFolder, "/", outputFileName, "__", regionOptionO, "__", DHSOption, "__boxplot.png", sep = ""), width = 30*dim(Data)[2]+300, height = 1500, units = "px");
par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
boxplot(Data);
dev.off();
# save data files.
rownames(Data) = NULL;
rownames(Des) = NULL;
save(Des, Data, file = paste(outputFileFolder, "/", outputFileName, "__", regionOptionO, "__", DHSOption, ".rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, "__", regionOptionO, "__", DHSOption, ".txt", sep = ""),
quote = FALSE, sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
options(warn=0);
return(list(Data = Data, Des = Des));
}
MergeMethylationData<-function(input1, input2, outputFileName, outputFileFolder)
{
options(warn=-1);
dir.create(path = outputFileFolder, recursive = TRUE);
input1_O = input1;
input2_O = input2;
rownames(input1$Des) = paste(input1$Des[, "REF"], input1$Des[, "GeneSymbol"], sep = "||");
rownames(input1$Data) = paste(input1$Des[, "REF"], input1$Des[, "GeneSymbol"], sep = "||");
rownames(input2$Des) = paste(input2$Des[, "REF"], input2$Des[, "GeneSymbol"], sep = "||");
rownames(input2$Data) = paste(input2$Des[, "REF"], input2$Des[, "GeneSymbol"], sep = "||");
CommonID = intersect(rownames(input1$Des), rownames(input2$Des));
input1$Des = input1$Des[CommonID, , drop = FALSE];
input1$Data = input1$Data[CommonID, , drop = FALSE];
input2$Des = input2$Des[CommonID, , drop = FALSE];
input2$Data = input2$Data[CommonID, , drop = FALSE];
if (dim(input1_O$Des)[1] >= dim(input2_O$Des)[1])
{
Des = input1$Des;
}
else
{
Des = input2$Des;
}
Data = cbind(input1$Data, input2$Data);
# draw and save a box plot of combined data before normalization
png(filename = paste(outputFileFolder, "/", outputFileName, "__BeforeNormalizationBoxplot.png", sep = ""), width = 30*dim(Data)[2]+300, height = 1500, units = "px");
par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
boxplot(Data);
dev.off();
Data = normalize.quantiles(Data);
# draw and save a box plot of combined data after normalization
png(filename = paste(outputFileFolder, "/", outputFileName, "__AfterNormalizationBoxplot.png", sep = ""), width = 30*dim(Data)[2]+300, height = 1500, units = "px");
par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
boxplot(Data);
dev.off();
# save data files.
rownames(Data) = NULL;
rownames(Des) = NULL;
save(Des, Data, file = paste(outputFileFolder, "/", outputFileName, ".rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, ".txt", sep = ""), quote = FALSE,
sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
options(warn=0);
return(list(Data = Data, Des = Des));
}
ProcessCNAData<-function(inputFilePath, outputFileName, outputFileFolder, refGenomeFile)
{
options(warn=-1);
dir.create(path = outputFileFolder, recursive = TRUE);
# Load reference genomes, which give the genomic locations of genes
RefGenome = read.table(file = refGenomeFile, header = TRUE, sep = "\t", na.string = "", quote = "", stringsAsFactors = FALSE);
NumGene = dim(RefGenome)[1];
mode(RefGenome[, "txStarts"]) = "numeric";
mode(RefGenome[, "txEnds"]) = "numeric";
# Load copy number data
InData = read.table(file = inputFilePath, header = TRUE, sep = "\t", na.string = "", quote = "", stringsAsFactors = FALSE, check.names = FALSE);
InData[, "Sample"] = toupper(InData[, "Sample"]);
TCGAID = InData[, "Sample"];
UniqueTCGAID = unique(TCGAID);
InData[, "Chromosome"] = as.character(InData[, "Chromosome"]);
InData[, "Chromosome"] = paste("CHR", InData[, "Chromosome"], sep = "");
IDID = which(InData[, "Chromosome"] == "CHR23");
InData[IDID, "Chromosome"] = "CHRX";
IDID = which(InData[, "Chromosome"] == "CHR24");
InData[IDID, "Chromosome"] = "CHRY";
InData[, "Start"] = as.numeric(InData[, "Start"]);
InData[, "End"] = as.numeric(InData[, "End"]);
InData[, "Segment_Mean"] = as.numeric(InData[, "Segment_Mean"]);
# Calculate gene-level copy number
Data = matrix(NA, NumGene, length(UniqueTCGAID));
for (i in 1:length(UniqueTCGAID))
{
IDi = which(InData[, "Sample"] == UniqueTCGAID[i]);
InDatai = InData[IDi, c("Chromosome", "Start", "End", "Segment_Mean"), drop = FALSE];
for (j in 1:NumGene)
{
IDj = which(InDatai[, "Chromosome"] == RefGenome[j, "Chromosome"]);
if (length(IDj) > 0)
{
StartP = InDatai[IDj, "Start"];
EndP = InDatai[IDj, "End"];
Ratio = InDatai[IDj, "Segment_Mean"];
Overlap = ifelse((EndP >= RefGenome[j, "txStarts"]) & (RefGenome[j, "txEnds"] >= StartP), pmin(EndP, RefGenome[j, "txEnds"]) - pmax(StartP, RefGenome[j, "txStarts"]) + 1, 0);
IDNonNA = which(!is.na(Ratio));
Ratio = Ratio[IDNonNA];
Overlap = Overlap[IDNonNA];
if (sum(Overlap, na.rm = TRUE) > 0)
{
Data[j, i] = sum(Overlap * Ratio, na.rm = TRUE)/sum(Overlap, na.rm = TRUE);
}
}
}
if ((i %% 20 == 0) & (i < length(UniqueTCGAID)))
{
writeLines(paste("Calculating gene copy number , ", round(i/length(UniqueTCGAID)*100, digits = 2), "% done.", sep = ""));
}
}
writeLines("Calculating gene copy number, 100% done.");
colnames(Data) = UniqueTCGAID;
Des = as.matrix(RefGenome[, 1:3, drop = FALSE]);
#Check and Correct gene symbol
Des = CheckGeneSymbol(Des);
# Draw and save a box plot
png(filename = paste(outputFileFolder, "/", outputFileName, "__boxplot.png", sep = ""), width = 30*dim(Data)[2]+300, height = 1500, units = "px");
par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
boxplot(Data);
dev.off();
# save data files.
rownames(Data) = NULL;
rownames(Des) = NULL;
save(Des, Data, file = paste(outputFileFolder, "/", outputFileName, ".rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, ".txt", sep = ""), quote = FALSE,
sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
options(warn=0);
return(list(Data = Data, Des = Des));
}
ProcessRNASeqData<-function(inputFilePath, outputFileName, outputFileFolder, dataType, verType)
{
options(warn=-1);
dir.create(path = outputFileFolder, recursive = TRUE);
# Read in data.
InData = read.table(file = inputFilePath, header = TRUE, sep = "\t", na.string = "", stringsAsFactors = FALSE, quote = "", check.names = FALSE);
InData = InData[2:dim(InData)[1], , drop = FALSE];
if ((dataType == "GeneExp") & (verType == "RNASeqV1"))
{
REF = InData[, 1];
RPKM = as.matrix(InData[, seq(4, dim(InData)[2], 3), drop = FALSE]);
mode(RPKM) = "numeric";
GeneSymbol = sapply(strsplit(REF, split = "\\|"), function(x)x[1]);
EntrezID = sapply(strsplit(REF, split = "\\|"), function(x)x[2]);
Des = cbind(GeneSymbol = GeneSymbol, EntrezID = EntrezID);
Data = RPKM;
#Check and Correct gene symbol
Des = CheckGeneSymbol(Des);
}
if (dataType == "ExonExp")
{
REF = InData[, 1];
RPKM = as.matrix(InData[, seq(4, dim(InData)[2], 3), drop = FALSE]);
mode(RPKM) = "numeric";
Des = cbind(ExonID = REF);
Data = RPKM;
}
if ((dataType == "GeneExp") & (verType == "RNASeqV2"))
{
REF = InData[, 1];
NormalizedCount = as.matrix(InData[, 2:dim(InData)[2], drop = FALSE]);
mode(NormalizedCount) = "numeric";
GeneSymbol = sapply(strsplit(REF, split = "\\|"), function(x)x[1]);
EntrezID = sapply(strsplit(REF, split = "\\|"), function(x)x[2]);
Des = cbind(GeneSymbol = GeneSymbol, EntrezID = EntrezID);
Data = NormalizedCount;
#Check and Correct gene symbol
Des = CheckGeneSymbol(Des);
}
if (verType == "Microarray")
{
dataType = "GeneExp";
REF = InData[, 1];
Data = as.matrix(InData[, 2:dim(InData)[2], drop = FALSE]);
mode(Data) = "numeric";
Des = cbind(GeneSymbol = REF, EntrezID = rep("", length(REF)));
#Check and Correct gene symbol
Des = CheckGeneSymbol(Des);
}
if (dataType == "GeneExp")
{
# Draw and save a box plot
png(filename = paste(outputFileFolder, "/", outputFileName, "__boxplot.png", sep = ""), width = 30*dim(Data)[2]+300, height = 1500, units = "px");
par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
nonNaID = which(!is.na(Data));
if ((sum(Data[nonNaID]<0)==0) && (max(Data[nonNaID])>50))
{
boxplot(log2(Data), main = "Boxplot drawn based on log2 tranformed data.");
} else {
boxplot(Data);
}
dev.off();
}
# save data files
rownames(Data) = NULL;
rownames(Des) = NULL;
save(Des, Data, file = paste(outputFileFolder, "/", outputFileName, ".rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, ".txt", sep = ""), quote = FALSE,
sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
options(warn=0);
return(list(Data = Data, Des = Des));
}
ProcessmiRNASeqData<-function(inputFilePath, outputFileName, outputFileFolder, fileSource = "TCGA-Assembler")
{
options(warn=-1);
dir.create(path = outputFileFolder, recursive = TRUE);
# Read in data.
InData = read.table(file = inputFilePath, header = TRUE, sep = "\t", na.string = "", quote = "", stringsAsFactors = FALSE, check.names = FALSE);
InData = InData[2:dim(InData)[1], , drop = FALSE];
# divide read cout data and RPM data
REF = InData[, 1];
if (fileSource == "Firehose")
{
Count = as.matrix(InData[, seq(2, dim(InData)[2], 3), drop = FALSE]);
RPM = as.matrix(InData[, seq(3, dim(InData)[2], 3), drop = FALSE]);
}
if (fileSource == "TCGA-Assembler")
{
Count = as.matrix(InData[, seq(2, dim(InData)[2], 2), drop = FALSE]);
RPM = as.matrix(InData[, seq(3, dim(InData)[2], 2), drop = FALSE]);
}
# rownames(Count) = REF;
mode(Count) = "numeric";
# rownames(RPM) = REF;
mode(RPM) = "numeric";
# # Draw and save box plot
# png(filename = paste(outputFileFolder, "/", outputFileName, "__ReadCount.boxplot.png", sep = ""), width = 30*dim(Count)[2]+300, height = 1500, units = "px");
# par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
# boxplot(log2(Count));
# dev.off();
# png(filename = paste(outputFileFolder, "/", outputFileName, "__RPM.boxplot.png", sep = ""), width = 30*dim(RPM)[2]+300, height = 1500, units = "px");
# par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
# boxplot(log2(RPM));
# dev.off();
#save data files
Des = cbind(GeneSymbol = REF);
Data = Count;
rownames(Data) = NULL;
rownames(Des) = NULL;
save(Des, Data, file = paste(outputFileFolder, "/", outputFileName, "__ReadCount.rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, "__ReadCount.txt", sep = ""), quote = FALSE,
sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
Data = RPM;
rownames(Data) = NULL;
save(Des, Data, file = paste(outputFileFolder, "/", outputFileName, "__RPM.rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, "__RPM.txt", sep = ""), quote = FALSE,
sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
options(warn=0);
return(list(Data = Data, Des = Des));
}
ProcessMethylation450Data<-function(inputFilePath, outputFileName, outputFileFolder, fileSource = "TCGA-Assembler")
{
options(warn=-1);
dir.create(path = outputFileFolder, recursive = TRUE);
writeLines("Loading data.")
if (fileSource == "Firehose")
{
# read in data
GetMethylation450Data(inputFilePath, outputFileFolder, outputFileName);
load(paste(outputFileFolder, "/", outputFileName, ".TempFiles/", outputFileName, ".TempData.rda", sep = ""));
unlink(x = paste(outputFileFolder, "/", outputFileName, ".TempFiles", sep = ""), recursive = TRUE, force = TRUE);
Methy450Des = as.matrix(Methy450Des);
}
if (fileSource == "TCGA-Assembler")
{
# read in data
InData = read.table(file = inputFilePath, header = TRUE, sep = "\t", na.string = "", quote = "", stringsAsFactors = FALSE, check.names = FALSE);
Methy450Des = as.matrix(InData[, 1:4, drop = FALSE]);
colnames(Methy450Des) = c("REF", "GeneSymbol", "ChromosomeID", "CoordinateID");
ID = which(Methy450Des[, "GeneSymbol"] == "DISCONTUNUED");
Methy450Des[ID, "GeneSymbol"] = "";
Methy450Data = as.matrix(InData[, 5:dim(InData)[2], drop = FALSE]);
mode(Methy450Data) = "numeric";
rm("InData");
}
# some probes have more than one gene symbols, collpase the data
# AddDes = matrix("", dim(Methy450Des)[1], 4);
AddDes = matrix("", 100000, 4);
colnames(AddDes) = colnames(Methy450Des);
# AddData = matrix(NA, dim(Methy450Data)[1], dim(Methy450Data)[2]);
AddData = matrix(NA, 100000, dim(Methy450Data)[2]);
colnames(AddData) = colnames(Methy450Data)
AddIndex = 0;
for (ProbeID in 1:dim(Methy450Des)[1])
{
GrepID = grep(";", Methy450Des[ProbeID, "GeneSymbol"]);
if ((ProbeID %% 50000 == 0) || (ProbeID == dim(Methy450Des)[1]))
{
writeLines(paste("Process CpG sites corresponding to multiple genes, ", ceiling(ProbeID/dim(Methy450Des)[1]*100), "% done.", sep = ""));
}
if (length(GrepID) > 0)
{
TempStr = strsplit(Methy450Des[ProbeID, "GeneSymbol"], split = ";");
Methy450Des[ProbeID, "GeneSymbol"] = TempStr[[1]][1];
for (TempStri in 2:length(TempStr[[1]]))
{
TempAddDes = Methy450Des[ProbeID, , drop = FALSE];
TempAddDes[1, "GeneSymbol"] = TempStr[[1]][TempStri];
AddIndex = AddIndex + 1;
AddDes[AddIndex, ] = TempAddDes;
AddData[AddIndex, ] = Methy450Data[ProbeID, ];
}
}
}
# writeLines(paste("Number of added rows is ", AddIndex, sep = ""));
Methy450Des = rbind(Methy450Des, AddDes[1:AddIndex, , drop = FALSE]);
Methy450Data = rbind(Methy450Data, AddData[1:AddIndex, , drop = FALSE]);
Methy450Des[, "ChromosomeID"] = gsub(" ", "", Methy450Des[, "ChromosomeID"]);
Methy450Des[, "CoordinateID"] = gsub(" ", "", Methy450Des[, "CoordinateID"]);
OrderID = order(as.numeric(Methy450Des[, "ChromosomeID"]), as.numeric(Methy450Des[, "CoordinateID"]), Methy450Des[, "GeneSymbol"], na.last = TRUE, decreasing = FALSE);
Des = Methy450Des[OrderID, , drop = FALSE];
Data = Methy450Data[OrderID, , drop = FALSE];
#Check and Correct gene symbol
Des = CheckGeneSymbol(Des);
# Draw and save box plot
png(filename = paste(outputFileFolder, "/", outputFileName, "__boxplot.png", sep = ""), width = 30*dim(Data)[2]+300, height = 1500, units = "px");
par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
boxplot(Data);
dev.off();
# save output data files
rownames(Data) = NULL;
rownames(Des) = NULL;
save(Data, Des, file = paste(outputFileFolder, "/", outputFileName, ".rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, ".txt", sep = ""),
quote = FALSE, sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
options(warn=0);
return(list(Data = Data, Des = Des));
}
CheckGeneSymbol<-function(Des)
{
data(hgnc.table);
hgnc.table = rbind(hgnc.table, c("13-SEP", "SEPT7P2"));
rID = intersect(which(toupper(hgnc.table[, "Symbol"]) == toupper("NCRNA00185")),
which(toupper(hgnc.table[, "Approved.Symbol"]) == toupper("TTTY14")));
hgnc.table = hgnc.table[sort(setdiff(1:dim(hgnc.table)[1], rID)), , drop = FALSE];
hgnc.table = hgnc.table[which(!is.na(hgnc.table[, "Approved.Symbol"])), , drop = FALSE];
regex = "[0-9]\\-(JAN|FEB|MAR|APR|MAY|JUN|JUL|AUG|SEP|OCT|NOV|DEC)|[0-9]\\.[0-9][0-9]E\\+[[0-9][0-9]";
MonthID = grep(pattern = regex, hgnc.table[, 1], ignore.case = TRUE);
MonthMappingTable = hgnc.table[MonthID, , drop = FALSE];
Des[, "GeneSymbol"] = sub("(.*C[0-9XY]+)ORF(.+)", "\\1orf\\2", Des[, "GeneSymbol"]);
ID = intersect(which(!Des[, "GeneSymbol"] %in% hgnc.table[, "Approved.Symbol"]), which(Des[, "GeneSymbol"] %in% hgnc.table[, "Symbol"]));
DesOrg = Des;
if (length(ID) > 0)
{
Des[ID, "GeneSymbol"] = sapply(Des[ID, "GeneSymbol"], function(x)paste(hgnc.table[hgnc.table[, "Symbol"] == x, "Approved.Symbol"], collapse = "___"));
}
# writeLines("Changed genes are");
# print(cbind(DesOrg[ID, "GeneSymbol"], Des[ID, "GeneSymbol"]));
ID = intersect(which(!Des[, "GeneSymbol"] %in% hgnc.table[, "Approved.Symbol"]), which(toupper(Des[, "GeneSymbol"]) %in% toupper(MonthMappingTable[, "Symbol"])));
if (length(ID) > 0)
{
Des[ID, "GeneSymbol"] = sapply(Des[ID, "GeneSymbol"], function(x)paste(MonthMappingTable[toupper(MonthMappingTable[, "Symbol"]) == toupper(x), "Approved.Symbol"], collapse = "___"));
}
# writeLines("Changed genes are");
# print(cbind(DesOrg[ID, "GeneSymbol"], Des[ID, "GeneSymbol"]));
Des;
}
ProcessRPPADataWithGeneAnnotation<-function(inputFilePath, outputFileName, outputFileFolder)
{
options(warn=-1);
dir.create(path = outputFileFolder, recursive = TRUE);
# Read in data
InData = read.table(file = inputFilePath, header = TRUE, sep = "\t", na.string = "", quote = "", stringsAsFactors = FALSE, check.names = FALSE);
OutData = matrix(NA, 0, dim(InData)[2]-1);
# Split the gene symbol and protein antibody into two separate columns
# Duplicate rows for proteins coded by multiple genes.
Gene = c();
Antibody = c();
for (i in 1:dim(InData)[1])
{
GeneAntibodyStr = InData[i, 1];
GeneStr = strsplit(GeneAntibodyStr, split = "\\|")[[1]][1];
AntibodyStr = strsplit(GeneAntibodyStr, split = "\\|")[[1]][2];
GeneStr = unlist(strsplit(GeneStr, split = " "));
Gene = c(Gene, GeneStr);
Antibody = c(Antibody, rep(AntibodyStr, length = length(GeneStr)));
OutData = rbind(OutData, InData[rep(i, length = length(GeneStr)), 2:dim(InData)[2], drop = FALSE]);
}
colnames(OutData) = colnames(InData)[2:dim(InData)[2]];
Des = cbind(GeneSymbol = Gene, ProteinAntibody = Antibody);
# check gene symbol
Des = CheckGeneSymbol(Des);
Data = as.matrix(OutData);
mode(Data) = "numeric";
# save output data files
rownames(Data) = NULL;
rownames(Des) = NULL;
save(Data, Des, file = paste(outputFileFolder, "/", outputFileName, ".rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, ".txt", sep = ""),
quote = FALSE, sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
# draw and save boxplot
png(filename = paste(outputFileFolder, "/", outputFileName, "__boxplot.png", sep = ""), width = 30*dim(Data)[2]+300, height = 1500, units = "px");
par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
boxplot(Data);
dev.off();
options(warn=0);
return(list(Data = Data, Des = Des));
}
ProcessMethylation27Data<-function(inputFilePath, outputFileName, outputFileFolder, fileSource = "TCGA-Assembler")
{
options(warn=-1);
dir.create(path = outputFileFolder, recursive = TRUE);
if (fileSource == "TCGA-Assembler")
{
# read in data
InData = read.table(file = inputFilePath, header = TRUE, sep = "\t", na.string = "", quote = "", stringsAsFactors = FALSE, check.names = FALSE);
Methy27Des = as.matrix(InData[, 1:4, drop = FALSE]);
colnames(Methy27Des) = c("REF", "GeneSymbol", "ChromosomeID", "CoordinateID");
ID = which(Methy27Des[, "GeneSymbol"] == "DISCONTUNUED");
Methy27Des[ID, "GeneSymbol"] = "";
Methy27Data = as.matrix(InData[, 5:dim(InData)[2], drop = FALSE]);
mode(Methy27Data) = "numeric";
}
if (fileSource == "Firehose")
{
# read in data
InData = read.table(file = inputFilePath, header = TRUE, sep = "\t", na.string = "", quote = "", stringsAsFactors = FALSE, check.names = FALSE);
InData = InData[2:dim(InData)[1], , drop = FALSE];
# remove redudant columns of CpG site descriptions
GeneSymbol = as.matrix(InData[, seq(3, dim(InData)[2], 4), drop = FALSE]);
ID = which(GeneSymbol == "DISCONTUNUED");
GeneSymbol[ID] = "";
GeneSymbol = t(unique(t(GeneSymbol)));
ChromID = as.matrix(InData[, seq(4, dim(InData)[2], 4), drop = FALSE]);
ChromID = t(unique(t(ChromID)));
CoordID = as.matrix(InData[, seq(5, dim(InData)[2], 4), drop = FALSE]);
CoordID = t(unique(t(CoordID)));
Methy27Data = as.matrix(InData[, seq(2, dim(InData)[2], 4), drop = FALSE]);
mode(Methy27Data) = "numeric";
REF = InData[, 1];
# Check whether the probe information is the same between samples
if (dim(GeneSymbol)[2] != 1)
{
stop("Some probe's gene symbols are not consistent between samples");
}
if (dim(ChromID)[2] != 1)
{
stop("Some probe's chromosome IDs are not consistent between samples");
}
if (dim(CoordID)[2] != 1)
{
stop("Some probe's genmoe coordinates are not consistent between samples");
}
ChromID = toupper(ChromID);
Methy27Des = cbind(REF, GeneSymbol, ChromID, CoordID);
colnames(Methy27Des) = c("REF", "GeneSymbol", "ChromosomeID", "CoordinateID");
}
# some probes have more than one gene symbols, collpase the data
AddGeneDes = matrix("", dim(Methy27Des)[1]*3, 4);
AddData = matrix(NA, dim(Methy27Data)[1]*3, dim(Methy27Data)[2]);
AddIndex = 0;
for (ProbeID in 1:dim(Methy27Des)[1])
{
GrepID = grep(";", Methy27Des[ProbeID, "GeneSymbol"]);
if (length(GrepID) > 0)
{
TempStr = strsplit(Methy27Des[ProbeID, "GeneSymbol"], split = ";");
Methy27Des[ProbeID, "GeneSymbol"] = TempStr[[1]][1];
for (TempStri in 2:length(TempStr[[1]]))
{
TempAddGeneDes = c(Methy27Des[ProbeID, "REF"], TempStr[[1]][TempStri], Methy27Des[ProbeID, "ChromosomeID"], Methy27Des[ProbeID, "CoordinateID"]);
AddIndex = AddIndex + 1;
AddGeneDes[AddIndex, ] = TempAddGeneDes;
AddData[AddIndex, ] = Methy27Data[ProbeID, ];
}
}
}
colnames(AddGeneDes) = colnames(Methy27Des);
colnames(AddData) = colnames(Methy27Data);
Methy27Des = rbind(Methy27Des, AddGeneDes[1:AddIndex, , drop = FALSE]);
Methy27Data = rbind(Methy27Data, AddData[1:AddIndex, , drop = FALSE]);
Methy27Des[, "ChromosomeID"] = gsub(" ", "", Methy27Des[, "ChromosomeID"]);
Methy27Des[, "CoordinateID"] = gsub(" ", "", Methy27Des[, "CoordinateID"]);
OrderID = order(as.numeric(Methy27Des[, "ChromosomeID"]), as.numeric(Methy27Des[, "CoordinateID"]), Methy27Des[, "GeneSymbol"], na.last = TRUE, decreasing = FALSE);
Methy27Des = Methy27Des[OrderID, , drop = FALSE];
Methy27Data = Methy27Data[OrderID, , drop = FALSE];
Data = Methy27Data;
Des = Methy27Des;
# check gene symbol
Des = CheckGeneSymbol(Des);
# Draw box plot
png(filename = paste(outputFileFolder, "/", outputFileName, "__boxplot.png", sep = ""), width = 30*dim(Data)[2]+300, height = 1500, units = "px");
par(las = 2, mai = c(4.5, 1, 0.5, 1), cex = 1.5);
boxplot(Data);
dev.off();
# save output data files
rownames(Data) = NULL;
rownames(Des) = NULL;
save(Data, Des, file = paste(outputFileFolder, "/", outputFileName, ".rda", sep = ""));
write.table(cbind(Des, Data), file = paste(outputFileFolder, "/", outputFileName, ".txt", sep = ""),
quote = FALSE, sep = "\t", na = "", col.names = TRUE, row.names = FALSE);
options(warn=0);
return(list(Data = Data, Des = Des));
}
######################### Auxiliary Functions of Module B #############################################################
ToPatientData<-function(TCGAData)
{
TCGABarcode = colnames(TCGAData);
TCGABarcode = sapply(strsplit(TCGABarcode, split = "-"), function(x){
Str = paste(x[1], x[2], x[3], substr(x[4], 1, 2), sep = "-");
Str;
});
DuplicatedLabel = duplicated(TCGABarcode);
ID = which(DuplicatedLabel == FALSE);
TCGAData = TCGAData[, ID, drop = FALSE];
TCGABarcode = TCGABarcode[ID];
colnames(TCGAData) = TCGABarcode;
TCGAData;
}
CombineRedundantFeature <- function(Data, Des)
{
RowName = Des[, 1];
if (dim(Des)[2] > 1)
{
for (i in 2:dim(Des)[2])
{
RowName = paste(RowName, Des[, i], sep = "||");
}
}
UniqueRowName = unique(RowName);
if (length(RowName) == length(UniqueRowName))
{
Result = list(Data = Data, Des = Des);
}
else
{
IDKeep = c();
for (i in 1:length(UniqueRowName))
{
IDi = which(RowName == UniqueRowName[i]);
if (length(IDi) > 1)
{
Data[IDi[1], ] = apply(Data[IDi, , drop = FALSE], 2, mean, na.rm = TRUE);
}
IDKeep = c(IDKeep, IDi[1]);
}
IDKeep = sort(IDKeep);
Result = list(Data = Data[IDKeep, , drop = FALSE], Des = Des[IDKeep, , drop = FALSE]);
}
Result;
}
normalize.quantiles <- function(Data)
{
VectorData = as.vector(Data);
VectorData = VectorData[!is.na(VectorData)];
VectorData = sort(VectorData, decreasing = FALSE);
NumVectorData = length(VectorData);
NormData = matrix(NA, dim(Data)[1], dim(Data)[2]);
for (i in 1:dim(Data)[2])
{
if ((i %% 10 == 0) || (i == dim(Data)[2]))
{
writeLines(paste("Normalizing data, ", round(i/dim(Data)[2]*100, digits = 2), "% done.", sep = ""));
}
IDNonNA = which(!is.na(Data[, i]));
RankNonNA = rank(Data[IDNonNA, i], ties.method = "average");
NumNonNA = length(IDNonNA);
Sep = round(seq(1, NumVectorData, (NumVectorData-1)/NumNonNA));
NormData[IDNonNA, i] = sapply(RankNonNA, function(x)
{
NumTie = sum(RankNonNA == x);
mean(VectorData[(Sep[ceiling(x-NumTie/2)]+1):Sep[floor(x+NumTie/2)+1]]);
})
}
rownames(NormData) = rownames(Data);
colnames(NormData) = colnames(Data);
NormData;
}
GetMethylation450Data<-function(InputFileName, OutputFileFolder, outputFileName)
{
OriginalOutputFileFolder = OutputFileFolder;
OutputFileFolder = paste(OutputFileFolder, "/", outputFileName, ".TempFiles/", sep = "");
dir.create(OutputFileFolder, showWarnings = TRUE, recursive = FALSE, mode = "0777");
FileHandle = file(InputFileName, "rt");
on.exit(close(FileHandle));
ReadIn = readLines(FileHandle, 1);
ColumnName1 = strsplit(ReadIn, split = "\t")[[1]];
ReadIn = readLines(FileHandle, 1);
ColumnName2 = strsplit(ReadIn, split = "\t")[[1]];
LineNum = 10000;
Iter = 0;
Flag = TRUE;
while (Flag)
{
ReadIn = readLines(FileHandle, LineNum);
if (length(ReadIn) > 0)
{
Iter = Iter + 1;
if (Iter %% 5 == 0)
{
writeLines(paste("Load ", floor(Iter/49*100), "% of the humanmethylation450 data set.", sep = ""));
}
if (length(ReadIn) == 1)
{
TempMatrix = rbind(strsplit(ReadIn, split = "\t")[[1]]);
colnames(TempMatrix) = ColumnName1;
TempMethyData = rbind(TempMatrix[, seq(2, dim(TempMatrix)[2], 4), drop = FALSE]);
GeneSymbol = unique(TempMatrix[, seq(3, dim(TempMatrix)[2], 4)]);
ChromID = unique(TempMatrix[, seq(4, dim(TempMatrix)[2], 4)]);
CoordID = unique(TempMatrix[, seq(5, dim(TempMatrix)[2], 4)]);
REF = unique(TempMatrix[, 1]);
ChromID = toupper(ChromID);
# Check whether the probe information is the same between samples
if (length(GeneSymbol) != 1)
{
stop("Some probe's gene symbols are not consistent between samples");
}
if (length(ChromID) != 1)
{
stop("Some probe's chromosome IDs are not consistent between samples");
}
if (length(CoordID) != 1)
{
stop("Some probe's genmoe coordinates are not consistent between samples");
}
TempDes = cbind(REF, GeneSymbol, ChromID, CoordID);
}
else
{
ReadInSep = strsplit(ReadIn, split = "\t");
TempMatrix = matrix(NA, length(ReadIn), length(ColumnName1));
for (i in 1:length(ReadInSep))
{
TempMatrix[i, ] = ReadInSep[[i]];
}
colnames(TempMatrix) = ColumnName1;
GeneSymbol = TempMatrix[, seq(3, dim(TempMatrix)[2], 4), drop = FALSE];
ChromID = TempMatrix[, seq(4, dim(TempMatrix)[2], 4), drop = FALSE];
CoordID = TempMatrix[, seq(5, dim(TempMatrix)[2], 4), drop = FALSE];
TempMethyData = as.matrix(TempMatrix[, seq(2, dim(TempMatrix)[2], 4), drop = FALSE]);
GeneSymbol = t(unique(t(GeneSymbol)));
ChromID = t(unique(t(ChromID)));
CoordID = t(unique(t(CoordID)));
REF = TempMatrix[, 1];
ChromID = toupper(ChromID);
# Check whether the probe information is the same between samples
if (dim(GeneSymbol)[2] != 1)
{
stop("Some probe's gene symbols are not consistent between samples");
}
if (dim(ChromID)[2] != 1)
{
stop("Some probe's chromosome IDs are not consistent between samples");
}
if (dim(CoordID)[2] != 1)
{
stop("Some probe's genmoe coordinates are not consistent between samples");
}
TempDes = cbind(REF, GeneSymbol, ChromID, CoordID);
}
colnames(TempDes) = c("REF", "GeneSymbol", "ChromosomeID", "CoordinateID");
mode(TempMethyData) = "numeric";
TempMethyData = round(x = TempMethyData, digits = 4);
save(TempMethyData, TempDes, file = paste(OutputFileFolder, "Methy450Data", Iter, ".rda", sep = ""));
}
if (length(ReadIn) < LineNum)
{
Flag = FALSE;
}
}
writeLines(paste("Load 100% of the humanmethylation450 data set.", sep = ""));
rm(ReadIn, ReadInSep, TempMatrix, GeneSymbol, ChromID, CoordID, TempMethyData, REF, TempDes)
load(paste(OutputFileFolder, "Methy450Data", 1, ".rda", sep = ""));
Methy450Des = TempDes;
Methy450Data = TempMethyData;
for (i in 2:Iter)
{
load(paste(OutputFileFolder, "Methy450Data", i, ".rda", sep = ""));
Methy450Des = rbind(Methy450Des, TempDes);
Methy450Data = rbind(Methy450Data, TempMethyData);
}
save(Methy450Data, Methy450Des, file = paste(OutputFileFolder, outputFileName, ".TempData.rda", sep = ""));
}
######################### Check whether this is the most updated version of TCGA-Assembler #####################################################
VCwebContent = try(content(GET("http://health.bsd.uchicago.edu/yji/soft.html"), as = "text"), silent = TRUE);
if (class(VCwebContent) == "try-error")
{
rm(VCwebContent);
} else {
VCstartID = str_locate_all(string = VCwebContent, pattern = "LinkToCheckTCGA-AssebmlerVersionNumber");
if (dim(VCstartID[[1]])[1] == 0)
{
rm(VCwebContent, VCstartID);
} else {
VCstartID = VCstartID[[1]][1, "end"]+1;
VCwebContent = substr(VCwebContent, VCstartID, nchar(VCwebContent));
VCstartID = str_locate_all(string = VCwebContent, pattern = "href=\"")[[1]][1, "end"]+1;
VCendID = str_locate_all(string = VCwebContent, pattern = "\">")[[1]][1, "start"]-1;
VCurl = substr(VCwebContent, VCstartID, VCendID);
VCwebContent = try(content(GET(VCurl), as = "text"), silent = TRUE);
if (class(VCwebContent) == "try-error")
{
rm(VCstartID, VCwebContent, VCendID, VCurl);
} else {
VCstartID = str_locate_all(string = VCwebContent, pattern = "CheckVersionNumber1");
if (dim(VCstartID[[1]])[1] == 0)
{
rm(VCstartID, VCwebContent, VCendID, VCurl);
} else {
VCstartID = VCstartID[[1]][1, "end"]+1;
VCwebContent = substr(VCwebContent, VCstartID, nchar(VCwebContent));
VCstartID = str_locate_all(string = VCwebContent, pattern = "\">")[[1]][1, "end"]+1;
VCendID = str_locate_all(string = VCwebContent, pattern = "</span>")[[1]][1, "start"]-1;
VCnewestVersionNum = substr(VCwebContent, VCstartID, VCendID);
if (VCnewestVersionNum != "1.0.3")
{
writeLines("\n");
writeLines("***************************************************************");
writeLines("A new version of TCGA-Assembler is available!")
writeLines(paste("Please download version ", VCnewestVersionNum, " at ", VCurl, sep = ""));
writeLines("***************************************************************");
writeLines("\n");
}
rm(VCstartID, VCwebContent, VCendID, VCnewestVersionNum, VCurl);
}
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.