pepStat: Statistical analysis of peptide microarrays

#' Make antibody binding positivity calls
#'
#' After normalization and data smoothing, this last step makes the call for each
#' peptide of the peptideSet after baseline correcting the peptide intenstities.
#'
#' @param peptideSet A \code{peptideSet} object. The peptides, after normalization
#' and possibly data smoothing.
#' @param cutoff A \code{numeric}. If FDR, the FDR threshold. Otherwise, a cutoff
#' for the background corrected intensities.
#' @param method A \code{character}. The method used to make positivity calls.
#' "absolute" and "FDR" are available. See details below.
#' @param freq A \code{logical}. If set to TRUE, return the percentage of slides
#' calling a peptide positive. Otherwise, return a \code{logical} indicating binding
#' events.
#' @param group A \code{character}. Only used when freq is set to TRUE. A character indicating
#' a variable by which to group slides. If non-null the percentage is calculated by group.
#' @param verbose A \code{logical}. If set to TRUE, progress information will be displayed.
#'
#' @details
#' This function requires specific variables ptid and visit in pData(peptideSet).
#' The variable \code{ptid} should indicate subjects, and the variable \code{visit}
#' should be a factor with levels pre and post.
#'
#' If slides are paired for subjects, intensities corresponding to post-visit are
#' substracted from pre. If slides are not paired, slides with pre have intensities
#' averaged by peptides, and averaged peptide intensities are subtracted from slides
#' that have entry post. Calls are made on these baseline corrected intensities.
#'
#' When method = FDR, a left-tail method is used to generate a threshold controlling
#' the False Discovery Rate at level \code{cutoff}. When method = absolute, Intensities
#' exceeding the threshold are labelled as positive.
#'
#' When freq = TRUE a group variable may be specified. The argument group indicates
#' the name of a variable in pData(peptideSet) by which positive calls should be grouped.
#' The call frequency for each peptide is calculated within groups.
#'
#' @return If freq = TRUE, a \code{numeric} \code{matrix} with peptides as rows and
#' groups as columns where the values are the frequency of response in the group. If
#' freq = FALSE, a \code{logical} \code{matrix} indicating binding events for each
#' peptide in each subject.
#'
#' @rdname makeCalls
#' @aliases makeCalls
#' @author Greg Imholte
#' @export
#' @example examples/pipeline.R
makeCalls <- function(peptideSet, cutoff=1.2, method="absolute", freq=TRUE, group=NULL, verbose=FALSE){
	if (class(peptideSet)!="peptideSet") {
		stop("peptideSet must be an object of class peptideSet")
	}

	if (preproc(peptideSet@experimentData)$transformation!="log") {
		warning("The probe measurements should be log transformed.")
	}

	if (preproc(peptideSet@experimentData)$normalization=="none") {
		warning("You should probably normalize your data before using this function.")
	}

	I <- baselineCorrect.pSet(peptideSet, verbose=verbose)

	if (method == "FDR") {
		Calls<-.findFDR(I, cutoff, position(peptideSet), verbose=verbose)
	} else if(method == "absolute") {
		Calls <- I > cutoff
	}

	if (!is.null(group) && freq) {
		#parse the grouping variable

		# Only select the Post and remove empty levels
		t1 <- grepl("post", tolower(pData(peptideSet)$visit))
		pd <- pData(peptideSet)[t1, ]

    if(!group%in%colnames(pd)){
      stop("The grouping variable is not part of the pData object.")
      } else {
        factor<-factor(pd[,group])
      }
		if(nlevels(factor) > 1){
			#split the ptid into groups
			ptidGroups <- split(pd$ptid,factor)
			#apply the rowMeans to each group
			res <- lapply(ptidGroups, function(curPtid,Calls,ptid){rowMeans(Calls[,ptid%in%curPtid, drop=FALSE])}, Calls, pd$ptid)
			res <- do.call(cbind, res)
		} else {
			return(rowMeans(Calls)*100)
		}
    return(res*100)
	} else if(freq) {
        return(rowMeans(Calls)*100)
    } else {
        return(Calls)
    }
}

.findFDR <- function(I, cutoff, position, verbose=FALSE){
    seqY <- seq(min(abs(I)), max(abs(I)),.05)
    # Split the data by unique positions
    tmp <- split(as.data.frame(I), position)
    # Compute the mean over unique positions
    D <- sapply(tmp,apply, 2, mean)

    #Calculate F(T)
    FDR <- sapply(seqY, function(x, D){median(
                  #Fs(T)
                  apply(D, 1, function(D,x){min(sum(D < -x)/sum(D > x), 1)}, x), na.rm=TRUE)}, D)

    # Did not find anything below the cutoff or everything is NA
    if (all(round(FDR,2)>cutoff, na.rm=TRUE) | all(is.na(FDR))) {
        return(I > max(I)) # Return all FALSE
    } else {
        Dmin <- seqY[which.min(abs(FDR-cutoff))]
        if(verbose){
          cat("The selected threshold T is ", Dmin, "\n")
        }
        return(I > Dmin)
    }
}


#' Substract baseline intensities
#'
#' Correct intensities by substracting PRE visit sample intensities.
#'
#' @param pSet A \code{peptideSet} with sample PRE and POST visits.
#' @param verbose A \code{logical}. If TRUE, information regarding the
#' pairedness of the data will be displayed.
#'
#' @return A \code{matrix} of the baseline corrected intensities, with as many
#' columns as there are samples POST visit
#'
#' @details
#' If samples are not PAIRED (One PRE and POST for each ptid), then the average
#' expression of all PRE visit samples is substracted from each sample.
#'
#' @author Raphael Gottardo, Gregory Imholte
#'
baselineCorrect.pSet <- function(pSet, verbose=FALSE){
  y <- exprs(pSet)
  ptid <- pData(pSet)$ptid
  t0 <- grep("pre", tolower(pData(pSet)$visit))
  t1 <- grep("post", tolower(pData(pSet)$visit))
  ### Paired?
  if (length(ptid[t0])==0||length(ptid[t1])==0) {
    I<-as.matrix(y[,t1])
  } else {
    if (isTRUE(all.equal(sort(ptid[t0]), sort(ptid[t1])))) {
      if (verbose) {
        message("You have paired PRE/POST samples\n")
      }
      I <- as.matrix(y[,t1])-as.matrix(y[,t0])
    } else {
      if(verbose) {
        message("You don't have paired PRE/POST samples\n")
      }
      I <- as.matrix(y[,t1])-rowMeans(y[, t0, drop=FALSE], na.rm=TRUE)#the vector to be subtracted from matrix need to be the same length as nrow of the matrix
    }
  }
  colnames(I) <- ptid[t1]
  if(isTRUE(preproc(pSet)$split.by.clade)){
    rownames(I) <- rownames(pSet)
  } else{
    rownames(I) <- peptide(pSet)
  }
  return(I)
}

#' Substract baseline intensities
#'
#' Correct intensities by substracting PRE visit sample intensities.
#'
#' @param pSet A \code{peptideSet} with sample PRE and POST visits.
#' @param verbose A \code{logical}. If TRUE, information regarding the
#' pairedness of the data will be displayed.
#'
#' @return A \code{matrix} of the baseline corrected intensities, with as many
#' columns as there are samples POST visit
#'
#' @details
#' The function will try to pair as many sample as possible. The remaining subjects
#' with a POST and no PRE will use the average expression of all baseline samples.
#' Subjects with baseline only will not be represented in the resulting matrix.
#'
#' @author Renan Sauteraud
baseline_correct <- function(pSet, verbose=FALSE){
  exprs <- exprs(pSet)
  pd <- pData(pSet)
  t0 <- grep("pre", tolower(pd$visit))
  t1 <- grep("post", tolower(pd$visit))
  ## All paired
  if (isTRUE(all.equal(sort(pd$ptid[t0]), sort(pd$ptid[t1])))){
    if (verbose) {
      message("You have paired PRE/POST samples\n")
    }
    I <- as.matrix(exprs[,t1])-as.matrix(exprs[,t0])
  } else {
    ## Deal with paired samples first
    paired <- pd$ptid[t1][pd$ptid[t1] %in% pd$ptid[t0]]
    pre_paired <- rownames(pd[ pd$ptid %in% paired & tolower(pd$visit)=="pre",])
    post_paired <- rownames(pd[ pd$ptid %in% paired & tolower(pd$visit)=="post",])
    I <- exprs[, post_paired] - exprs[, pre_paired]
    colnames(I) <- paired
    ## Cbind the rest
    if (verbose) {
      cat(length(non_paired), "subjects don't have a baseline sample.\n")
    }
    non_paired <- unique(pd$ptid[!( pd$ptid %in% paired) & tolower(pd$visit)=="post"])
    if(length(non_paired) > 0){
      post_only <- rownames(pd[ pd$ptid %in% non_paired, ])
      I_no_pairs <- exprs[, post_only] - rowMeans(exprs[, t0, drop=FALSE], na.rm=TRUE)
      colnames(I_no_pairs) <- non_paired
      I <- cbind(I, I_no_pairs)
    }
  }
  rownames(I) <- peptide(pSet)
  return(I)
}