readNanoStringGeoMxSet <-
function(dccFiles,
pkcFiles,
phenoDataFile,
phenoDataSheet,
phenoDataDccColName = "Sample_ID",
phenoDataColPrefix = "",
protocolDataColNames = NULL,
experimentDataColNames = NULL,
configFile = NULL,
analyte = "RNA",
defaultPKCVersions = NULL,
...)
{
# check inputs
if (!(sum(grepl("\\.dcc$",dccFiles)) == length(dccFiles) && length(dccFiles) > 0L)){
stop("Specify valid dcc files." )
}
if (!(sum(grepl("\\.pkc$",pkcFiles)) == length(pkcFiles) && length(pkcFiles) > 0L)){
stop( "Specify valid PKC files." )
}
# Read data rccFiles
data <- structure(lapply(dccFiles, readDccFile), names = basename(dccFiles))
# Create assayData
assay <- lapply(data, function(x)
structure(x[["Code_Summary"]][["Count"]],
names = rownames(x[["Code_Summary"]])))
# Create phenoData
if (is.null(phenoDataFile)) {
stop("Please specify an input for phenoDataFile.")
} else {
pheno <- readxl::read_xlsx(phenoDataFile, col_names = TRUE, sheet = phenoDataSheet, ...)
pheno <- data.frame(pheno, stringsAsFactors = FALSE, check.names = FALSE)
j <- colnames(pheno)[colnames(pheno) == phenoDataDccColName]
if (length(j) == 0L){
stop("Column `phenoDataDccColName` not found in `phenoDataFile`")
} else if (length(j) > 1L){
stop("Multiple columns in `phenoDataFile` match `phenoDataDccColName`")
}
# check protocolDataColNames
if (!(all(protocolDataColNames %in% colnames(pheno))) &
!(is.null(protocolDataColNames))) {
stop("Columns specified in `protocolDataColNames` are not found in `phenoDataFile`")
}
# check experimentDataColNames
if (!(all(experimentDataColNames %in% colnames(pheno))) &
!(is.null(experimentDataColNames))) {
stop("Columns specified in `experimentDataColNames` are not found in `phenoDataFile`")
}
# add ".dcc" to the filenames if there is none
pheno[[j]] <- ifelse(grepl(".dcc", pheno[[j]]), paste0(pheno[[j]]),
paste0(pheno[[j]], ".dcc"))
if ("slide name" %in% colnames(pheno)) {
ntcs <- which(tolower(pheno[["slide name"]]) == "no template control")
if (length(ntcs) > 0) {
ntcData <- lapply(seq_along(ntcs), function(x) {
ntcID <- pheno[ntcs[x], j]
if(!is.na(ntcs[x + 1L])) {
ntcNames <- rep(ntcID, ntcs[x + 1L] - ntcs[x])
ntcCounts <-
rep(sum(assay[[ntcID]]), ntcs[x + 1L] - ntcs[x])
ntcDF <- data.frame("NTC_ID"=ntcNames, "NTC"=ntcCounts)
} else {
ntcNames <- rep(ntcID, dim(pheno)[1L] - ntcs[x] + 1L)
ntcCounts <-
rep(sum(assay[[ntcID]]), dim(pheno)[1L] - ntcs[x] + 1L)
ntcDF <- data.frame("NTC_ID"=ntcNames, "NTC"=ntcCounts)
}
return(ntcDF)
})
if (length(ntcs) > 1L) {
ntcData <- do.call(rbind, ntcData)
} else {
ntcData <- ntcData[[1L]]
}
pheno <- cbind(pheno, ntcData)
pheno <- pheno[!rownames(pheno) %in% ntcs, ]
assay <- assay[!names(assay) %in% unique(pheno[["NTC_ID"]])]
data <- data[!names(data) %in% unique(pheno[["NTC_ID"]])]
protocolDataColNames <- c(protocolDataColNames, "NTC_ID", "NTC")
}
}
rownames(pheno) <- pheno[[j]]
zeroReads <- names(which(lapply(assay, length) == 0L))
if (length(zeroReads) > 0L) {
warning("The following DCC files had no counts: ",
paste0(zeroReads, sep=", "),
"These will be excluded from the GeoMxSet object.")
pheno <- pheno[!rownames(pheno) %in% zeroReads, ]
assay <- assay[!names(assay) %in% zeroReads]
data <- data[!names(data) %in% zeroReads]
}
missingDCCFiles <- pheno[[j]][!pheno[[j]] %in% names(assay)]
missingPhenoData <- names(assay)[!names(assay) %in% pheno[[j]]]
assay <- assay[names(assay) %in% pheno[[j]]]
data <- data[names(data) %in% pheno[[j]]]
pheno <- pheno[names(assay), , drop = FALSE]
if (length(missingDCCFiles) > 0L) {
warning("DCC files missing for the following: ",
paste0(missingDCCFiles, sep=", "),
"These will be excluded from the GeoMxSet object.")
}
if (length(missingPhenoData) > 0L) {
warning("Annotations missing for the following: ",
paste0(missingPhenoData, sep=", "),
"These will be excluded from the GeoMxSet object.")
}
pheno[[j]] <- NULL
if (phenoDataColPrefix != "") {
colnames(pheno) <- paste0(phenoDataColPrefix, colnames(pheno))
protocolDataColNames <- paste0(phenoDataColPrefix, protocolDataColNames)
}
if("area" %in% tolower(colnames(pheno))){
areaCol <- colnames(pheno)[which(tolower(colnames(pheno)) == "area")]
pheno[[areaCol]] <- as.numeric(pheno[[areaCol]])
}
if("nuclei" %in% tolower(colnames(pheno))){
nucleiCol <- colnames(pheno)[which(tolower(colnames(pheno)) == "nuclei")]
pheno[[nucleiCol]] <- as.numeric(pheno[[nucleiCol]])
}
if("aoinucleicount" %in% tolower(colnames(pheno))){
nucleiCol <- colnames(pheno)[which(tolower(colnames(pheno)) == "aoinucleicount")]
pheno[[nucleiCol]] <- as.numeric(pheno[[nucleiCol]])
}
pheno <- Biobase::AnnotatedDataFrame(pheno,
dimLabels = c("sampleNames", "sampleColumns"))
}
#stopifnot(all(sapply(feature, function(x) identical(feature[[1L]], x))))
if (is.null(pkcFiles)) {
stop("Please specify an input for pkcFiles")
} else if (!is.null(pkcFiles)) {
pkcData <- readPKCFile(pkcFiles, default_pkc_vers=defaultPKCVersions)
pkcHeader <- S4Vectors::metadata(pkcData)
# pkcHeader[["PKCFileDate"]] <- as.character(pkcHeader[["PKCFileDate"]])
pkcData$RTS_ID <- gsub("RNA", "RTS00", pkcData$RTS_ID)
pkcData <- as.data.frame(pkcData)
rownames(pkcData) <- pkcData[["RTS_ID"]]
}
probeAssay <- lapply(names(data), function(x)
data.frame(data[[x]][["Code_Summary"]],
Sample_ID = x))
probeAssay <- do.call(rbind, probeAssay)
# check for missing probes in PKC that are in assay
missingProbes <- setdiff(unique(probeAssay[["RTS_ID"]]), rownames(pkcData))
if (length(missingProbes) > 0L){
warning("Not all probes are found within PKC probe metadata.",
" The following probes are ignored from analysis",
" and were most likely removed from metadata while",
" resolving multiple module PKC version conflicts.\n",
paste(missingProbes, sep=", "))
}
if(!is.null(configFile)){
pkcProbes <- compareToConfig(config = configFile, pkcProbes = pkcData, pkcHeader = pkcHeader)
pkcData <- pkcProbes$pkcData
pkcHeader <- pkcProbes$pkcHeader
pkcFiles <- paste0(unique(pkcData$Module), ".pkc")
}
if(tolower(analyte) == "protein"){
analyte <- "Protein"
}else if(tolower(analyte) == "rna"){
analyte <- "RNA"
}
pkcs <- names(which(tolower(pkcHeader$AnalyteType) == tolower(analyte)))
if(length(pkcs) == 0){
stop(paste("Given analyte is not valid: options =", paste(unique(pkcHeader$AnalyteType), collapse = ", ")))
}
pkcFiles <- paste0(pkcs, ".pkc")
pkcData <- pkcData[pkcData$Module %in% pkcs,]
for(i in names(pkcHeader)){
pkcHeader[[i]] <- pkcHeader[[i]][names(pkcHeader[[i]]) %in% pkcs]
}
# Handle older assay probe labels
if (any(startsWith(pkcData[["RTS_ID"]][1L], "RTS")) &
any(startsWith(probeAssay[["RTS_ID"]][1L], "RNA"))) {
# replace RNA with RTS00 in probeAssay[["RTS_ID"]]
probeAssay[["RTS_ID"]] <- gsub("^RNA", "RTS00", probeAssay[["RTS_ID"]])
}
probeAssay <- probeAssay[probeAssay[["RTS_ID"]] %in% pkcData[["RTS_ID"]],]
zeroProbes <- setdiff(rownames(pkcData), unique(probeAssay[["RTS_ID"]]))
zeroProbeAssay <- data.frame(RTS_ID=pkcData[zeroProbes, "RTS_ID"],
Count=rep(0, length(zeroProbes)),
Sample_ID=rep(probeAssay[1, "Sample_ID"], length(zeroProbes)))
probeAssay <- rbind(probeAssay, zeroProbeAssay)
probeAssay[["Module"]] <- pkcData[probeAssay[["RTS_ID"]], "Module"]
probeAssay <- reshape2::dcast(probeAssay, RTS_ID + Module ~ Sample_ID,
value.var="Count", fill=0)
rownames(probeAssay) <- probeAssay[, "RTS_ID"]
if(!all(names(data) %in% names(probeAssay))){
missingAnalyteDCC <- names(data)[which(!names(data) %in% names(probeAssay))]
warning("The following DCC files had no counts and will be excluded from the GeoMxSet object: ",
paste0(missingAnalyteDCC, sep=", "))
data <- data[which(names(data) %in% names(probeAssay))]
pheno <- pheno[names(data),]
}
assay <- as.matrix(probeAssay[, names(data)])
# Create featureData
feature <- pkcData[rownames(assay), , drop = FALSE]
# change the colnames of feature data to match with dimLabels
colnames(feature)[which(colnames(feature)=="Target")] <- "TargetName"
feature <- AnnotatedDataFrame(feature,
dimLabels = c("featureNames", "featureColumns"))
# Create experimentData
if (!(is.null(experimentDataColNames))) {
experimentList <-
lapply(experimentDataColNames,
function(experimentDataColName) {
unique(S4Vectors::na.omit(pheno@data[[experimentDataColName]]))})
names(experimentList) <- experimentDataColNames
experiment <-
Biobase::MIAME(name = "",
other = c(experimentList,
pkcHeader,
list(shiftedByOne=FALSE)))
} else {
experiment <-
Biobase::MIAME(name = "",
other = c(pkcHeader,
list(shiftedByOne=FALSE)))
}
# Create annotation
annotation <- sort(sapply(strsplit(pkcFiles, "/"), function(x) x[length(x)]))
if(!identical(annotation, paste0(sort(unique(probeAssay[["Module"]])), ".pkc"))) {
stop("Name mismatch between pool and PKC files")
}
# Create protocolData
protocol <-
do.call(dplyr::bind_rows,
lapply(names(data), function(i) {
cbind(data[[i]][["Header"]], data[[i]][["Scan_Attributes"]],
data[[i]][["NGS_Processing_Attributes"]])
}))
protocol <- data.frame(protocol,
pheno@data[, which(colnames(pheno@data) %in% protocolDataColNames)],
check.names = FALSE)
pheno <- pheno[, setdiff(colnames(pheno@data),
c(protocolDataColNames, experimentDataColNames))]
annot_labelDescription <-
data.frame(
labelDescription=rep(NA_character_, length(protocolDataColNames) + 1L),
row.names = c(protocolDataColNames, "DeduplicatedReads"),
stringsAsFactors = FALSE)
protocol <-
protocol[, names(protocol) %in% c(rownames(.dccMetadata[["protocolData"]]),
rownames(annot_labelDescription))]
protocol <- AnnotatedDataFrame(protocol,
rbind(.dccMetadata[["protocolData"]],
annot_labelDescription),
dimLabels = c("sampleNames", "sampleColumns"))
# Create NanoStringGeoMxSet
GxT <- NanoStringGeoMxSet(assayData = assay,
phenoData = pheno,
featureData = feature,
experimentData = experiment,
annotation = annotation,
protocolData = protocol,
check = FALSE,
dimLabels = c("RTS_ID", "SampleID"),
analyte = analyte)
if(analyte(GxT) == "Protein"){
GxT <- suppressWarnings(aggregateCounts(GxT))
}
return(GxT)
}
#' Compare given PKC probes to probes in config file
#'
#' Check if extra PKCs are given based on probes in config file
#'
#' @param config file path to config file
#' @param pkcProbes probe information from readPKCFile
#' @param pkcHeader pkc metadata from readPKCFile
#'
compareToConfig <- function(config, pkcProbes, pkcHeader){
if(!file.exists(config)){
stop("Given config file does not exist")
}
config <- readLines(config)
targets <- config[(which(config == "[Targets]")+1):length(config)]
targets <- str_split(string = targets, pattern = " = ", simplify = T)[,1]
extraProbes <- which(!pkcProbes$RTS_ID %in% targets)
if(length(extraProbes) > 0){
extraPKCs <- unique(pkcProbes$Module[extraProbes])
for(pkc in extraPKCs){
if(all(which(pkcProbes$Module == pkc) %in% extraProbes) == FALSE){
extraPKCs <- extraPKCs[-which(extraPKCs == pkc)]
}
}
if(length(extraPKCs) > 0){
warning(paste0("Extra PKC files were provided and will be removed from further analysis: ", paste(extraPKCs, collapse = ", ")),
immediate. = TRUE)
pkcs <- unique(pkcProbes$Module[!pkcProbes$Module %in% extraPKCs])
pkcProbes <- pkcProbes[pkcProbes$Module %in% pkcs,]
for(i in names(pkcHeader)){
pkcHeader[[i]] <- pkcHeader[[i]][names(pkcHeader[[i]]) %in% pkcs]
}
}
}
return(list(pkcData=pkcProbes, pkcHeader=pkcHeader))
}
# analyteSubset <- function(object, analyte){
# if(length(unique(fData(object)$AnalyteType)) > 1){
# if(!tolower(analyte) %in% tolower(unique(fData(object)$AnalyteType))){
# stop(paste("Given analyte is not in dataset; options:", paste(unique(fData(object)$AnalyteType), collapse = ", ")))
# }
#
# objects <- list()
# pkcs <- object@annotation
#
# object <- subset(object, subset = tolower(AnalyteType) == tolower(analyte))
# object@annotation <- pkcs[pkcs %in% paste0(unique(fData(object)$Module), ".pkc")]
#
# return(object)
# }else{
# warning("Only one analyte present, no subsetting neccesary")
# }
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.