genefu: Computation of Gene Expression-Based Signatures in Breast Cancer

Documented in oncotypedx

#' @title Function to compute the OncotypeDX signature as published by
#'   Paik et al. in 2004.
#'
#' @description
#' This function computes signature scores and risk classifications from
#'   gene expression values following the algorithm used for the OncotypeDX
#'   signature as published by Paik et al. 2004.
#'
#' @usage
#' oncotypedx(data, annot, do.mapping = FALSE, mapping, do.scaling=TRUE,
#'   verbose = FALSE)
#'
#' @param data Matrix of gene expressions with samples in rows and
#'   probes in columns, dimnames being properly defined.
#' @param annot Matrix of annotations with at least one column named
#'   "EntrezGene.ID", dimnames being properly defined.
#' @param do.mapping TRUE if the mapping through Entrez Gene ids must
#'   be performed (in case of ambiguities, the most variant probe is kept
#'   for each gene), FALSE otherwise. Note that for Affymetrix HGU
#'   datasets, the mapping is not necessary.
#' @param mapping Matrix with columns "EntrezGene.ID" and "probe" used
#'   to force the mapping such that the probes are not selected based on
#'   their variance.
#' @param do.scaling Should the data be scaled?
#' @param verbose TRUE to print informative messages, FALSE otherwise.
#'
#' @details
#' Note that for Affymetrix HGU datasets, the mapping is not necessary.
#'
#' @return
#' A list with items:
#' - score: Continuous signature scores
#' - risk: Binary risk classification, 1 being high risk and 0 being low risk.
#' - mapping: Mapping used if necessary.
#' - probe: If mapping is performed, this matrix contains the correspondence
#'   between the gene list (aka signature) and gene expression data.
#'
#' @references
#' S. Paik, S. Shak, G. Tang, C. Kim, J. Bakker, M. Cronin, F. L. Baehner,
#'   M. G. Walker, D. Watson, T. Park, W. Hiller, E. R. Fisher, D. L. Wickerham,
#'   J. Bryant, and N. Wolmark (2004) "A Multigene Assay to Predict Recurrence
#'   of Tamoxifen-Treated, Node-Negative Breast Cancer", New England Journal
#'   of Medicine, 351(27):2817-2826.
#'
#' @examples
#' # load GENE70 signature
#' data(sig.oncotypedx)
#' # load NKI dataset
#' data(nkis)
#' # compute relapse score
#' rs.nkis <- oncotypedx(data=data.nkis, annot=annot.nkis, do.mapping=TRUE)
#' table(rs.nkis$risk)
#'
#' @md
#' @export
oncotypedx <- function(data, annot, do.mapping=FALSE, mapping, do.scaling=TRUE,
                       verbose=FALSE) {
	
	if (!exists('sig.oncotypedx')) data(sig.oncotypedx, envir=environment())

	## the reference genes are not taken into account due to their absence from most platforms
	sig2 <- sig.oncotypedx[sig.oncotypedx[ , "group"] != "reference",  , drop=FALSE]
	dimnames(sig2)[[1]] <- sig2[ , "probe.affy"]
	gt <- nrow(sig2)
	if(do.mapping) { ## not an affy HGU platform
		gid1 <- as.numeric(as.character(sig2[ ,"EntrezGene.ID"]))
		names(gid1) <- dimnames(sig2)[[1]]
		gid2 <- as.numeric(as.character(annot[ ,"EntrezGene.ID"]))
		names(gid2) <- dimnames(annot)[[1]]
		## remove missing and duplicated geneids from the gene list
		rm.ix <- is.na(gid1) | duplicated(gid1)
		gid1 <- gid1[!rm.ix]
		## mqpping
		rr <- geneid.map(geneid1=gid2, data1=data, geneid2=gid1, verbose=FALSE)
		gm <- length(rr$geneid2)
		mymapping <- c("mapped"=gm, "total"=gt)
		if(length(rr$geneid1) != gt) { ## some genes are missing
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			if(verbose) { message(sprintf("probe candidates: %i/%i", gm, gt)) }
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=NA))
		}
		gid1 <- rr$geneid2
		gid2 <- rr$geneid1
		data <- rr$data1
		myprobe <- cbind("probe"=names(gid1), "EntrezGene.ID"=gid1, "new.probe"=names(gid2))
		## change the names of probes in the data
		dimnames(data)[[2]] <- names(gid2) <- names(gid1)
	} else {
		myprobe <- NA
		data <- data[ ,intersect(dimnames(sig2)[[1]], dimnames(data)[[2]])]
		gm <- ncol(data)
		mymapping <- c("mapped"=gm, "total"=gt)
		if(nrow(sig2) != ncol(data)) { ## some genes are missing
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			if(verbose) { message(sprintf("probe candidates: %i/%i", ncol(data), gt)) }
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=myprobe))
		}
	}
	## rename gene names by the gene symbols
	dimnames(data)[[2]] <- dimnames(sig2)[[1]] <- sig2[ , "symbol"]

	if (do.scaling) {
		## scaling between 0 and 15
		data <- apply(data, 2, function(x) { xx <- (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE)); return(xx * 15) })
	} else if (max(data, na.rm=TRUE) > 20 || min(data, na.rm=TRUE) < -5) {
		## check that each gene expression lies approximately in [0, 15]
		warning("The max and min values of your data suggest it is not already scaled...
			If it is please ignore this message, otherwise set `do.scaling=TRUE` to scale in the function call."
			)
	}

	## OcotypeDX recurrence score
	## GRB7 group score = 0.9 * GRB7 + 0.1 * HER2 if result < 8, then result = 8
	## ER group score = (0.8 * ER + 1.2 * PGR + BCL2 + SCUBE2) / 4
	## proliferation group score = ( survivin + KI67 + MYBL2 + CCNB1 + STK15) / 5 if result < 6.5, then result = 6.5
	## invasion group score = (CTSL2 + MMP11) / 2
	## RSU = + 0.47 * GRB7 group score - 0.34 * ER group score + 1.04 * proliferation group score + 0.10 * invasion group score + 0.05 * CD68 - 0.08 GSTM1 - 0.07 * BAG1

	cc.ix <- complete.cases(data)
	rs <- rs.unscaled <- rsrisk <- NULL
	for (i in 1:nrow(data)) {
		if(cc.ix[i]) {
			grb7.gs <- 0.9 * data[i, "GRB7"] + 0.1 * data[i, "ERBB2"]
			if (grb7.gs < 8) { grb7.gs <- 8 }

			er.gs <- (0.8 * data[i, "ESR1"] + 1.2 * data[i, "PGR"] + data[i, "BCL2"] + data[i, "SCUBE2"]) / 4

			proliferation.gs <- (data[i, "BIRC5"] + data[i, "MKI67"] + data[i, "MYBL2"] + data[i, "CCNB1"] + data[i, "AURKA"]) / 5
			if (proliferation.gs < 6.5) { proliferation.gs <- 6.5 }

			invasion.gs <- (data[i, "CTSL2"] + data[i, "MMP11"]) / 2

			rsu <- 0.47 * (grb7.gs) - 0.34 * (er.gs) + 1.04 * (proliferation.gs) + 0.1 * (invasion.gs) + 0.05 * data[i, "CD68"] - 0.08 * data[i, "GSTM1"] - 0.07 * data[i, "BAG1"]
			## rescale the score
			rsu2 <- rsu
			if(rsu >= 0 & rsu <= 100) { rsu <- 20 * (rsu - 6.7) }
			if(rsu < 0) { rsu <- 0 }
			if(rsu > 100) { rsu <- 100 }
			## use of the official curoffs
			if(rsu < 18) { rsr <- 0 }
			if(rsu >= 18 & rsu < 31) { rsr <- 0.5 }
			if(rsu >= 31) { rsr <- 1 }
		}
		else { rsu <- rsr <- rsu2 <- NA }
		rs.unscaled <- c(rs.unscaled, rsu2)
		rs <- c(rs, rsu)
		rsrisk <- c(rsrisk, rsr)
	}
	names(rs) <- names(rs.unscaled) <- names(rsrisk) <- dimnames(data)[[1]]
	return(list("score"=rs, "risk"=rsrisk, "mapping"=mymapping, "probe"=myprobe))
}