R/Impulse_DE_fin.R
In ImpulseDE: Detection of DE genes in time series data using impulse models

Documented in calc_impulse impulse_DE plot_impulse

################################################################################
########################     Impulse DE package     ############################
################################################################################

### Version:  0.99.0
### Date:     2016-06-02
### Author:   Jil Sander

################################################################################


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#                             Functions to call                                #
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#



################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++     Annotation preparation    +++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

#Prepares the annotation table for internal use

annotation_preparation <- function(data_annotation, data_table = NULL,
  colname_time = NULL, colname_condition = NULL, control_timecourse = FALSE,
  control_name = NULL, case_name = NULL){

   if (is.null(data_table)) {
      stop("ERROR: table of data values must be specified")
   }
   if ((nrow(data_annotation) != ncol(data_table)) ||
      (FALSE %in% (sort(rownames(data_annotation)) ==
            sort(colnames(data_table))))) {
                 stop("ERROR: column names of data table must be the same as
                    row names of annotation table!")
   }
   if (is.null(colname_time)) {
      stop("ERROR: name of the column containing the timepoints must be
           specified as character!")
   }
   if (is.null(colname_condition)) {
      stop("ERROR: name of the column containing the condition(s) must be
           specified as character!")
   }
   if (is.numeric(data_annotation[,colname_time]) == FALSE) {
      stop("ERROR: time variable must be numeric and is not allowed to
           contain characters!")
   }
   if (control_timecourse == FALSE &
      length(summary(as.factor(data_annotation[,colname_condition]))) >= 2) {
        stop("ERROR: if you have only one time course do not provide more than
             one condition!")
   }
   if (control_timecourse == TRUE & (length(summary(as.factor(data_annotation[,
        colname_condition]))) > 2) & is.null(case_name)) {
            stop("ERROR: please specify case and control names of interest
                 since you provide more than three conditions!")
   }
   annot <- data_annotation[,c(colname_time,colname_condition)]
   annot <- annot[colnames(data_table),]
   colnames(annot) <- c("Time","Condition")
   annot$Condition <- as.character(annot$Condition)

   if (length(summary(as.factor(data_annotation[,colname_condition]))) > 2) {
      print(paste("Case condition: ", case_name, sep  = ""))
   } else {
      print(paste("Case condition: ",  data_annotation[!(data_annotation[,
        colname_condition] %in% control_name),colname_condition][1], sep = ""))
   }
   if (control_timecourse == TRUE) {
      print(paste("Control condition: ", control_name, sep  = ""))
   }
   if (control_timecourse == TRUE & (length(summary(as.factor(data_annotation[,
        colname_condition]))) > 2)) {
             annot <- annot[annot$Condition %in% c(control_name, case_name),]
   }
   return(annot)
}

################################################################################

#' Impulse model value prediction
#'
#' Calculates impulse model values for given timepoints and predicted
#' impulse parameters.
#' @aliases calc_impulse
#' @param theta numerical vector of impulse parameters with the order
#' beta, h0, h1, h2, t1, t2.
#' @param timepoints numercial vector of time point(s).
#' @return The predicted impulse model values for the given time point(s).
#' @seealso \code{\link{impulse_DE}}, \code{\link{plot_impulse}}.
#' @author Jil Sander
#' @references Chechik, G. and Koller, D. (2009) Timing of Gene Expression
#' Responses to Envi-ronmental Changes. J. Comput. Biol., 16, 279-290.
#' @examples
#' #' theta vector in the order beta, h0, h1, h2, t1, t2
#' theta <- c(9.9, 14.7, 17.0, 16.9, -0.1, 37.0)
#' #' time points
#' timepoints <- c(0, 2, 4, 6, 8, 18, 24, 32, 48, 72)
#' #' calculate impulse values
#' impulse_values <- calc_impulse(theta, timepoints)
#' @export
calc_impulse <- function(theta,timepoints){
  beta1 = theta[1]
  h0 = theta[2]
  h1 = theta[3]
  h2 = theta[4]
  t1 = theta[5]
  t2 = theta[6]
  res = NULL
   for (i in 1:length(timepoints)) {
    res[i] = (1/h1) * (h0 + (h1 - h0) * (1/(1 + exp(-beta1*(timepoints[i] - t1))))) *
        (h2 + (h1 - h2) * (1/(1 + exp(beta1*(timepoints[i] - t2)))))
  }
  res = unlist(res)
  return(res)
}

################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++     Calculation of input for Optimization    ++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### specifies the function of squared errors which is minimized to fit
### parameters
two_impulses <- function(theta,x_vec,y_vec){
  beta1 = theta[1]
  h0 = theta[2]
  h1 = theta[3]
  h2 = theta[4]
  t1 = theta[5]
  t2 = theta[6]
   f = sum((unlist(lapply(x_vec, function(x) {(1/h1) * (h0 + (h1 - h0) *
        (1/(1 + exp(-beta1*(x - t1))))) * (h2 + (h1 - h2) *
        (1/(1 + exp(beta1*(x - t2)))))})) - y_vec)^2)
   return(f)
}

################################################################################

## compile simple functions to make them quicker
calc_impulse_comp <- cmpfun(calc_impulse)
two_impulses_comp <- cmpfun(two_impulses)


################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++++++     Clustering    +++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### performs 2-step clustering on expression data or background data
### 1. Step: clustering genes based on correlation to compare the shape
### 2. Step: split the clusters from step 1 into finer clusters based on the
###          eucledian distance to compare the magnitude
cluster_genes_for_impulse <- function(data_table, data_annotation,
    control_timecourse = FALSE, control_name = NULL, plot_clusters = FALSE,
    no_of_clusters = NULL, n_genes = NULL, n_device = TRUE){

  #' @import amap

  corr_thres = 0.8
  eucl_thres = 1.5

  ### assume only one timecourse as default and perform only 1 run
  results = list()
  runs = 1
  label = "case"

  ### perform 3 runs if control timecourse data is present
  if (control_timecourse == TRUE & is.null(control_name) == FALSE) {
     runs = 3
     label = c("combined","case","control")
  }
  ### clustering for different runs
  for (c_runs in 1:runs) {
    if (c_runs == 1) {                # combined or only case if no control
      print(paste("Clustering of ",label[c_runs]," data set", sep = ""))
      dat_pre = data_table
      dat_pre = dat_pre[which(apply(dat_pre,1,function(x){100*
              sd(x)/(mean(x))}) >= 2.5),]
      dat = t(apply(dat_pre,1,function(y){unlist(lapply(as.list(unique(
          as.numeric(as.character(data_annotation$Time)))),function(x){
          mean(y[data_annotation$Time == x])}))}))
      colnames(dat) <- unique(as.numeric(as.character(data_annotation$Time)))
    } else if(c_runs == 2) {         # case
      print(paste("Clustering of ",label[c_runs]," data set", sep = ""))
      dat_pre = data_table[,!(data_annotation$Condition %in% control_name)]
      dat_pre = dat_pre[which(apply(dat_pre,1,function(x){100*
              sd(x)/(mean(x))}) >= 2.5),]
      dat = t(apply(dat_pre,1,function(y){unlist(lapply(as.list(unique(
          as.numeric(as.character(data_annotation[!(data_annotation$Condition
          %in% control_name),"Time"])))),function(x){mean(
          y[data_annotation[!(data_annotation$Condition %in% control_name),
          "Time"] == x])}))}))
      colnames(dat) <- unique(as.numeric(as.character(data_annotation[
          !(data_annotation$Condition %in% control_name),"Time"])))
    } else {                        # control
      print(paste("Clustering of ",label[c_runs]," data set", sep = ""))
      dat_pre = data_table[,data_annotation$Condition %in% control_name]
      dat_pre = dat_pre[which(apply(dat_pre,1,function(x){100*
              sd(x)/(mean(x))}) >= 2.5),]
      dat = t(apply(dat_pre,1,function(y){unlist(lapply(as.list(unique(
          as.numeric(as.character(data_annotation[data_annotation$Condition
          %in% control_name,"Time"])))),function(x){mean(
          y[data_annotation[data_annotation$Condition %in% control_name,
         "Time"] == x])}))}))
      colnames(dat) <- unique(as.numeric(as.character(data_annotation[
          (data_annotation$Condition %in% control_name),"Time"])))
    }
    print(paste("- ", (nrow(data_table) - nrow(dat_pre)),
            " genes were excluded due to very low variation", sep = ""))

   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~   Step 1: clustering based on correlation   ~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    max_n_pre_clus = 25   # restrict maximum number of pre-clusters to 25
    ### z-transformation of data for clustering
    dat_ztrans <- t(scale(t(dat)))

    ### ---> clustering of expression data
    if (is.null(no_of_clusters)) {
      file_part = "genes"
      n_clus = 0
      for (i in 1:max_n_pre_clus) {

        kmeans_clus <- Kmeans(dat_ztrans,i,200,method = "correlation")

    ### accept clustering if mean correlation between genes of the cluster and
    ### centroid of the cluster is higher than 'corr_thres' (default: 0.75)
        for (j in 1:i) {
          tmp <- dat_ztrans[kmeans_clus$cluster %in% j,]
          indt1 <- NULL
          if (i > 1 & sort(summary(as.factor(kmeans_clus$cluster)))[1]  < 10) {
               indt1 <- "I"
               n_clus = i - 1
               kmeans_clus <- kmeans_clus_pre
               break
          }
          if (is.null(nrow(tmp))) {    # if cluster contains only 1 gene
             average_pearson <- cor(tmp, kmeans_clus$centers[j,])
          } else {     # if cluster contains > 1 genes
             average_pearson <- mean(apply(tmp,1,function(x){cor(x,
                    kmeans_clus$centers[j,])}))
          }
        ### if one cluster does not fufill the correlation condition leave loop,
        ### increase numbers of potential clusters and try again
          if (average_pearson < corr_thres) {
            break
          }
          ### accept clustering and leave loop if all clusters fufill condition
          if (j == i) {
            n_clus = i
            break
          }
        }
        if (is.null(indt1) == FALSE) {
           break
        }
        if (n_clus != 0) {
          break
        ### if maximum number of allowed clusters is reached accept this number
        } else if ((n_clus == 0) & (i == max_n_pre_clus)) {
          n_clus = i
          break
        }
        kmeans_clus_pre = kmeans_clus
      }
      print(paste("--- Number of correlation-based pre-clusters: ",
            n_clus,sep = ""))

    ### clustering of background data based on clusters from expression data
    } else {
      file_part = "bg"
      n_clus <- max(round(no_of_clusters[[c_runs*2 - 1]]*nrow(data_table)/
            n_genes),1)
      kmeans_clus <- Kmeans(dat_ztrans,n_clus,200,method = "correlation")
      n_fine_clust_rand <- round(sample( no_of_clusters[[c_runs*2]] ,n_clus,
            replace = TRUE)*nrow(data_table)/n_genes)
      n_fine_clust_rand[n_fine_clust_rand < 1] = 1
    }
    kmeans_pre_clus_corr <- kmeans_clus
    n_pre_clus <- n_clus
    kmeans_clus_final <- kmeans_pre_clus_corr$cluster
    print(paste("------ Number of genes in pre-clusters: ",
      paste(paste("C",names(summary(as.factor(kmeans_clus_final))),": ",
      summary(as.factor(kmeans_clus_final)), sep = ""),collapse = ", "),sep = ""))


   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~   Step 2: clustering based on eucledian distance   ~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#


    n_fine_clusters <- NULL
    cluster_means <-  NULL
    # restrict maximum number of fine clusters to 5 per pre-cluster
    max_n_fine_clus <- 10
    kmeans_clus <- NULL

     tempo2 <- do.call(c,as.data.frame(dat))
     tempo3 <- (tempo2 - median(tempo2))/mad(tempo2)
     dat_median_ztrans <- matrix(tempo3, nrow(dat), ncol(dat))
     rownames(dat_median_ztrans) = rownames(dat)
     colnames(dat_median_ztrans) = colnames(dat)

    ### cluster each pre-cluster into fine clusters
    for (cl in 1:n_pre_clus) {


     ### ---> clustering of expression data
       if (is.null(no_of_clusters)) {
        if (length(kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster %in%
                cl]) >  max_n_fine_clus) {
            n_clus = 0

            for (i in 1:max_n_fine_clus) {

              kmeans_clus <- kmeans(dat_median_ztrans[names(
                  kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster
                  %in% cl]),],i,200)
            ### accept clustering if mean eucledian distance between genes of
            ### the cluster and centroid of the cluster is lower than
            ### 'eucl_thres' (default: 5)
              for (j in 1:i) {
              indt2 <- NULL
              tmp <- dat_median_ztrans[names(kmeans_pre_clus_corr$cluster[
                  kmeans_pre_clus_corr$cluster %in% cl]),][kmeans_clus$cluster
                  %in% j,]
               # if smallest cluster would have less than 5 genes take previous
               # number of clusters
              if (i > 1 & sort(summary(as.factor(kmeans_clus$cluster)))[1] < 5) {
                  indt2 = "I"
                  n_clus = i - 1
                  kmeans_clus <- kmeans_clus_pre
                  break
              }

                if (is.null(nrow(tmp))) {    # if cluster contains only 1 gene
                   average_eucledian <- 0
                } else {  # if cluster contains > 1 genes
                   average_eucledian <- sd(apply(tmp,1,function(x){dist(rbind(x,
                        kmeans_clus$centers[j,]))}))
                }

                ### if one cluster does not fufill the eucledian condition leave
                ### loop,increase numbers of potential clusters and try again
                if (average_eucledian > eucl_thres) {
                  break
                }
                ### accept clustering and leave loop if all clusters fulfill
                ### condition
                if (j == i) {
                  n_clus = i
                  break
                }
              }
              if (is.null(indt2) == FALSE) {
                 break
              }
              if (n_clus != 0) {
                break
              ### if maximum number of allowed clusters is reached accept this
              ### number
              } else if ((n_clus == 0) & (i == max_n_fine_clus)) {
                n_clus = i
                break
              }
              kmeans_clus_pre = kmeans_clus
            }
            n_fine_clusters <- c(n_fine_clusters,n_clus)
          } else {
            n_clus = 1
            n_fine_clusters <- c(n_fine_clusters,n_clus)
            kmeans_clus$cluster <- rep(1,length(kmeans_pre_clus_corr$cluster[
                kmeans_pre_clus_corr$cluster %in% cl]))
            names(kmeans_clus$cluster) <- names(kmeans_pre_clus_corr$cluster[
                kmeans_pre_clus_corr$cluster %in% cl])
          }
        ### clustering of background data based on clusters from expression data
        } else {
          if (floor(length(kmeans_pre_clus_corr$cluster[
                kmeans_pre_clus_corr$cluster %in% cl]) / n_fine_clust_rand[cl])
                >= 2) {
            n_clus <- n_fine_clust_rand[cl]
            kmeans_clus <- kmeans(dat_median_ztrans[names(
                  kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster
                  %in% cl]),], n_clus,200)
          } else if (n_fine_clust_rand[cl] > 1 & floor(length(
                kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster %in%
                cl]) / (n_fine_clust_rand[cl] - 1)) >= 2) {
            n_clus <- n_fine_clust_rand[cl] - 1
            kmeans_clus <- kmeans(dat_median_ztrans[names(
                kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster
                %in% cl]),], n_clus,200)
          } else {
            n_clus = 1
            kmeans_clus$cluster <- rep(1,length(
                kmeans_pre_clus_corr$cluster[kmeans_pre_clus_corr$cluster
                %in% cl]))
            names(kmeans_clus$cluster) <- names(kmeans_pre_clus_corr$cluster[
                kmeans_pre_clus_corr$cluster %in% cl])
          }
          n_fine_clusters <- c(n_fine_clusters,n_clus)
        }

#      ### calculate mean values for each final cluster
      for (i in 1:n_clus) {
         tmp <- dat_pre[names(kmeans_clus$cluster)[kmeans_clus$cluster %in% i],]
      if (is.null(dim(tmp))) {       # if cluster contains only 1 gene
          cluster_means <- rbind(cluster_means,tmp)
        } else { # if cluster contains > 1 genes
          cluster_means <- rbind(cluster_means,colMeans(tmp))
        }
      }
      ### overwrite pre-clusters by fine clusters to have cluster information
      ### for each gene
      if (cl == 1) {
        kmeans_clus_final[names(kmeans_pre_clus_corr$cluster[
            kmeans_pre_clus_corr$cluster %in% cl])] <-  kmeans_clus$cluster
      } else {
        kmeans_clus_final[names(kmeans_pre_clus_corr$cluster[
            kmeans_pre_clus_corr$cluster %in% cl])] <-  kmeans_clus$cluster +
            sum(n_fine_clusters[1:(cl - 1)])
      }
    }
    rownames(cluster_means)  <- 1:sum(n_fine_clusters)
    n_clus = sum(n_fine_clusters)

    print(paste("--- Final number of clusters: ",n_clus,sep = ""))
    print(paste("------ Number of genes in final clusters: ",
      paste(paste("C",names(summary(as.factor(kmeans_clus_final))),": ",
      summary(as.factor(kmeans_clus_final)), sep = ""),collapse = ", "),sep = ""))


   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~~~~   Plot clusters if plot_clusters == TRUE ~~~~~~~~~~~~~~~~~#
   #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

    if (plot_clusters == TRUE & file_part != "bg") {
      for (i in 1:n_clus) {

        if (i == 1){
            if(n_device == TRUE){
                dev.new()
            }
            if (n_clus == 1) {
                par(mfrow = c(1,1))
            }
            else if (n_clus == 2) {
                par(mfrow = c(1,2))
            } else if (n_clus <= 4) {
                par(mfrow = c(2,2))
            } else if (n_clus <= 12) {
                par(mfrow = c(3,4))
            } else if (n_clus > 12) {
                par(mfrow = c(4,5))
            }
        } else if ((i - 1) %% 20 == 0) {
            if (n_device == TRUE) {
                dev.new()
            }
            par(mfrow = c(4,5))
        }
        ### expression values of genes in cluster
        tmp <- dat_ztrans[kmeans_clus_final %in% i,]
        # if cluster contains only 1 gene
        if (length(kmeans_clus_final[kmeans_clus_final %in% i]) == 1) {
          n_g = 1
        } else {   # if cluster contains > 1 genes
          n_g = nrow(tmp)
        }

        ### order gene expression data according to timepoints
        for (j in 1:n_g) {
          # if cluster contains only 1 gene
          if (length(kmeans_clus_final[kmeans_clus_final %in% i]) == 1) {
            tmp_plot = tmp
          } else {       # if cluster contains > 1 genes
            tmp_plot = tmp[j,]
          }
          ### plot clusters
          if (j == 1) {                     # if cluster contains only 1 gene
            plot(tmp_plot,type = "l",xlab = "Time points", ylab = "Z-score",
                    main = paste("Cluster ",i,", ",label[c_runs],sep = ""),
                    ylim = c(min(tmp),max(tmp)),xaxt = "n")
          axis(1, at = 1:length(colnames(tmp)), labels = colnames(tmp), las = 3)
          } else {   # if cluster contains > 1 genes
            points(colnames(tmp),tmp_plot,type = "l")
          }
        }
      }
    }
    results[[c_runs*4 - 3]] <- kmeans_clus_final
    results[[c_runs*4 - 2]] <- cluster_means
    results[[c_runs*4 - 1]] <- n_pre_clus
    results[[c_runs*4]] <- n_fine_clusters
    names(results)[c((c_runs*4 - 3):(c_runs*4))] <- paste(c("kmeans_clus",
         "cluster_means", "pre_clus","fine_clus"),label[c_runs],sep = "_")
  }

  return(results)
}


################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++     Impulse model fit    ++++++++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### fits impulse model to a timecourse dataset
### genes are suppposed to be in rows, samples in columns
### timepoints need to be specified as numbers (integers or float)
### function is splitted into 3 parts
###### 1. fitting impulse model to a single gene
###### 2. fitting impulse model to matrix of genes, which calls 1.
###### 3. prepare data and fit model by calling 2.
### This separation was chosen to be able to use 'apply' to replace for-loops

impulse_fit <- function(data_input, data_annotation, n_iter = 100,
    control_timecourse = FALSE, control_name = NULL, cluster_results = NULL,
    start_values = NULL, fit_backg = FALSE, n_proc = 4, ...){

  ### check dataset for consistency
  if (ncol(data_annotation) != 2 || FALSE %in% (colnames(data_annotation) ==
      c("Time","Condition"))) {
            stop("Please use function 'annotation_preparation' to prepare
                 the annotation table")
  }
  if (control_timecourse == TRUE & is.null(control_name)) {
     stop("Please specify the name of the control in the column 'Condition'")
  }
  if (control_timecourse == TRUE & (is.null(start_values) == FALSE)) {
    start_values <- start_values[c(1,3,5)]
  }
  if (control_timecourse == FALSE & (is.null(control_name) == FALSE)) {
     start_values <- start_values[c(1,3)]
  }

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~  Fit impulse model to gene ~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

  ### fits an impulse model to expression values of a single gene

  impulse_fit_gene_wise <- function(vec, timepoints, NPARAM, n_iters = 100,
      fit_to_clusters = FALSE, start_val = NULL, fit_bg = FALSE, ...){

      mat1 = c(0,abs(vec[2:length(vec)] - vec[1:(length(vec) - 1)]) *
            vec[2:length(vec)])
      mn_beta = 0
      mx_beta = 10
      middle_ind = which(timepoints > min(timepoints) & timepoints <
            max(timepoints))
      min_ind = which(timepoints == min(timepoints))[1]
      max_ind = which(timepoints == max(timepoints))[1]
      mx_tm = max(timepoints)
      mn_tm = min(timepoints)
      tmp_ind = which(mat1 == max(mat1))[1]
      peaktime = timepoints[tmp_ind]
      peak = min(which(timepoints == peaktime))
      beta1 = abs(log(abs((vec[peak] - vec[min_ind])/(peaktime -
            timepoints[min_ind]))));

      ### set beta to 1 if calculcated value is infinite
      if (is.finite(beta1) == FALSE || is.na(beta1)) { beta1 = 1 }

      ### define start value theta based on the gene's data
      orig_theta = c(beta1,
         vec[min_ind],                    # h0
         vec[peak],                       # h1
         vec[max_ind],                    # h2
         (peaktime - timepoints[min_ind])/2,            # t1
         peaktime + (timepoints[max_ind] - peaktime)/2) # t2
      names(orig_theta) = c("beta","h0","h1","h2","t1","t2")
      theta = orig_theta

      ### replace theta estimates of 0 by small value, because optimization
      ### function has problems with 0 as input
      rand_num <- runif(length(which(theta == 0)),0,0.0001)
      theta[names(which(theta == 0))] <- rand_num
      names(theta) = c("beta","h0","h1","h2","t1","t2")

      ### ---> fit model to clusters
      if (fit_to_clusters == TRUE & is.null(start_val)) {

          if (fit_bg == FALSE) {
            thetas <- cbind(as.numeric(theta),apply(matrix(rep(runif((n_iters -
                    1)*NPARAM)),NPARAM,(n_iters - 1)),2,
              function(x){c(mn_beta + (mx_beta - mn_beta)*x[1],
                            min(vec) + (max(vec) - min(vec))*x[2:4],
                            mn_tm + (mx_tm - mn_tm)*x[5],
                            (mn_tm + (mx_tm - mn_tm)*x[5]) + (mx_tm - (mn_tm +
                            (mx_tm - mn_tm)*x[5]))*x[6])}))
           } else {
              thetas <- apply(matrix(rep(runif((n_iters)*NPARAM)),NPARAM,
                    (n_iters)),2,
              function(x){c(mn_beta + (mx_beta - mn_beta)*x[1],
                            min(vec) + (max(vec) - min(vec))*x[2:4],
                            mn_tm + (mx_tm - mn_tm)*x[5],
                            (mn_tm + (mx_tm - mn_tm)*x[5]) + (mx_tm - (mn_tm +
                            (mx_tm - mn_tm)*x[5]))*x[6])})
           }
        tmm1 <- system.time({
        fmin_outs <- cbind(
        # fitting via quasi-Newton method (optim, "BFGS")
           apply(thetas[,1:(floor(n_iters/2))],2,function(x){unlist(optim(x,
             two_impulses, x_vec = timepoints,
             y_vec = vec,method = "BFGS")[c("par","value")])}),
        # fitting via PORT routines (nlminb)
           apply(thetas[, c(1,(floor(n_iters/2) + 1):n_iters)],2,function(x){
             unlist(nlminb(x, two_impulses, x_vec = timepoints,
             y_vec = vec)[c("par","objective")])})
           )
        })

        ### return results of 3 best fits, which will serve as start values for
        ### the fits to the genes
        pvec_and_SSE = as.vector(fmin_outs[,order(fmin_outs["value",])][,1:3])

      ### ---> fit model to genes
      } else if (fit_to_clusters == FALSE & is.null(start_val) == FALSE) {
        tmm2 <- system.time({

      ### use first 3 best hits from fit to clusters plus initial guess 'theta'
      ### as start values
        if (fit_bg == TRUE) {
          toll <- matrix(start_val,7,3)[1:NPARAM,]
        } else {
          toll <- cbind(theta,matrix(start_val,7,3)[1:NPARAM,][,1:3])
        }

        fmin_outs <- cbind(
        # fitting via quasi-Newton method (optim, "BFGS")
        apply(toll,2,
          function(x){unlist(optim(x, two_impulses, x_vec = timepoints,
          y_vec = as.numeric(vec),method = "BFGS" )[c("par","value")])}),
        # again fitting via PORT routines (nlminb)
        apply(toll,2,
          function(x){unlist(nlminb(x, two_impulses, x_vec = timepoints,
          y_vec = as.numeric(vec) )[c("par","objective")])})
        )
        })
        if (is.null(dim(fmin_outs[ ,fmin_outs["value",] == min(fmin_outs["value",
            ])])) == TRUE) {
          pvec_and_SSE = fmin_outs[ ,fmin_outs["value",] ==
                min(fmin_outs["value",])]
       } else {
          ### if two or more randomization have the same minimum Sum of
          ### Squared Errors (SSE), choose the first one
          pvec_and_SSE = fmin_outs[ ,fmin_outs["value",] ==
                min(fmin_outs["value",])][,1]
       }
      }
     return(pvec_and_SSE)
  }

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~  Fit impulse model to matrix ~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

  ### fits an impulse model to all genes of a dataset
  impulse_fit_matrix <- function(data_tab, timepoints, n_it = 100,
        ctrl_tc = FALSE, ctrl = NULL, fit_to_clus = FALSE, start_val = NULL,
        fit_bg = FALSE, n_process = 4, ...){

    NPARAM = 6
    mat1 = NULL
    res <- list()
    for (j in 2:ncol(data_tab)) {
        mat1 = cbind(mat1, abs(data_tab[,j] - data_tab[,j - 1]) * data_tab[,j])
    }

        mc <- min(detectCores() - 1, n_process)
     if (fit_bg == FALSE && nrow(data_tab) > max(2*mc,10)) {

    ## do not split if less than 10 genes
        ind_list = list()
        bord = floor(nrow(data_tab)/mc)
        for (i in 1:mc) {
           ind_list[[i]] <- ((i - 1)*bord + 1):(i*bord)
        }
        if (mc*bord < nrow(data_tab)) {
             ind_list[[mc]] <-  c(ind_list[[mc]],(mc*bord + 1):nrow(data_tab))
        }

        cl <- makeCluster(mc)
        my.env <- new.env()
        assign("data_tab", data_tab, envir = my.env)
        assign("calc_impulse_comp", calc_impulse_comp, envir = my.env)
        assign("n_it", n_it, envir = my.env)
        assign("cluster_genes_for_impulse", cluster_genes_for_impulse,
             envir = my.env)
        assign("impulse_fit", impulse_fit, envir = my.env)
        assign("two_impulses", two_impulses, envir = my.env)
        assign("impulse_fit_gene_wise",impulse_fit_gene_wise, envir = my.env)
        assign("impulse_fit_matrix",impulse_fit_matrix, envir = my.env)
        assign("timepoints", timepoints, envir = my.env)
        assign("fit_to_clus", fit_to_clus, envir = my.env)
        assign("start_val", start_val, envir = my.env)
        assign("fit_bg", fit_bg, envir = my.env)
        assign("NPARAM", NPARAM, envir = my.env)

        clusterExport(cl = cl, varlist = c("data_tab",
         "calc_impulse_comp","n_it","cluster_genes_for_impulse","impulse_fit",
         "two_impulses", "impulse_fit_gene_wise",
         "impulse_fit_matrix", "timepoints","fit_to_clus","start_val",
         "fit_bg","NPARAM"), envir = my.env)

        ### fit impulse model to each gene of matrix and get impulse parameters
        resi <- clusterApply(cl, ind_list, function(z){t(apply(data_tab[z,],1,
            function(x){impulse_fit_gene_wise(x,timepoints,
            NPARAM,n_it,fit_to_clus, start_val, fit_bg)}))})
        resmat = do.call(rbind,resi)
        resmat <- resmat[rownames(data_tab),]
        stopCluster(cl)
      } else {
      ### fit impulse model to each gene of matrix and get impulse parameters
        resmat <- t(apply(data_tab,1,function(x){impulse_fit_gene_wise(x,
            timepoints, NPARAM,n_it,fit_to_clus, start_val, fit_bg)}))
      }


    ### use obtained impulse parameters to calculate impulse fit values
    ### ---> if fit to genes
    if (fit_to_clus == FALSE) {

     colnames(resmat) <- c("beta","h0","h1","h2","t1","t2","SSE")
     if (nrow(resmat) == 1) {      # if matrix contains only one gene
       resmat2 <- as.data.frame(t(calc_impulse_comp(resmat[,1:NPARAM],
          unique(sort(timepoints)))),
          row.names = rownames(resmat))
       colnames(resmat2) = unique(sort(timepoints))
     } else {   # if matrix contains > 1 genes
        resmat2 <- t(apply(resmat[,1:NPARAM],1,function(x){calc_impulse_comp(x,
                unique(sort(timepoints)))}))
        colnames(resmat2) = unique(sort(timepoints))
     }

    ### ---> if fit to clusters
    ### more complex because results from 3 best fits need to be saved and later
    ### used for the fit to the genes
    } else {
     colnames(resmat) <- c(paste(c("beta","h0","h1","h2","t1","t2","SSE"),1,
          sep = "_"), paste(c("beta","h0","h1","h2","t1","t2","SSE"),2,sep = "_"),
          paste(c("beta","h0","h1","h2","t1","t2","SSE"),3,sep = "_"))
     resmat2 <- t(apply(resmat,1,function(x){apply(matrix(x,7,3)[1:NPARAM,],2,
          function(y){calc_impulse_comp(y,unique(sort(timepoints)))})}))
     colnames(resmat2) <- c(paste(unique(sort(timepoints)),1, sep = "_"),
          paste(unique(sort(timepoints)),2, sep = "_"),
          paste(unique(sort(timepoints)),3, sep = "_"))
    }
    res[[1]] <- resmat
    res[[2]] <- resmat2
    names(res) <- c("impulse_parameters","impulse_fits")
    return(res)
  }

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~  Prepare data for impulse model fit ~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

  ### assume only one timecourse as default and perform only 1 run
  g_names = rownames(data_input)
  runs = 1
  label = "case"
  results = list()
  if (control_timecourse == TRUE) {
    runs = 3
    label = c("combined","case","control")
  }

  ### ---> fit to genes
  if (is.null(cluster_results) == FALSE & is.null(start_values) == FALSE) {
    file_ext = "genes"
    c1 = "genes"

    ### perform 3 runs if control timecourse data is present
    if (control_timecourse == TRUE) {
      ### generate a data list containing the combined, case and control data if
      ### control timecourse is present
      dat1 = as.data.frame(data_input)[names(cluster_results[[1]]),]
      dat1_n = as.data.frame(data_input)[!(rownames(data_input) %in%
            rownames(dat1)),]
      dat1_n = dat1_n[,colnames(dat1)]
      dat2 = as.data.frame(data_input)[,!(data_annotation$Condition %in%
            control_name)]
      dat2 = dat2[names(cluster_results[[5]]),]
      dat2_n = as.data.frame(data_input)[!(rownames(data_input) %in%
            rownames(dat2)),]
      dat2_n = dat2_n[,colnames(dat2)]
      dat3 = as.data.frame(data_input)[,(data_annotation$Condition %in%
            control_name)]
      dat3 = dat3[names(cluster_results[[9]]),]
      dat3_n = as.data.frame(data_input)[!(rownames(data_input) %in%
            rownames(dat3)),]
      dat3_n = dat3_n[,colnames(dat3)]
      dat_ind <- c("dat1_n","dat2_n", "dat3_n")
      data_input <- list(dat1, dat2, dat3)

    } else if (control_timecourse == FALSE) {
      dat1 = as.data.frame(data_input)[names(cluster_results[[1]]),]
      dat1_n = as.data.frame(data_input)[!(rownames(data_input) %in%
            rownames(dat1)),]
      dat_ind <- c("dat1_n")
      data_input <- list(dat1)
    }


  ### ---> fit to clusters
  }  else if (is.null(cluster_results) & is.null(start_values)) {
    file_ext = "clusters"
    c1 = "cluster"
    ### input are the means of the gene expression values of the clusters
    ### 2 is combined, 6 is case and 10 is control
    ### if runs == 1 or runs == 2, then 6 and 10 or just 10 are empty
    data_input <- data_input[c(2,6,10)[1:runs]]
  }

  ### fitting for different runs
  for (c_runs in 1:runs) {
    imp_res <- NULL

    ### ---> fit to genes
    if (is.null(cluster_results) == FALSE & is.null(start_values) == FALSE) {

      ### split genes into the clusters and fit impulse model to the
      ### genes of a cluster
      cluster_res_list <- split(data_input[[c_runs]],
            cluster_results[[(c_runs - 1)*4 + 1]])
      ind <- split(1:max(cluster_results[[(c_runs - 1)*4 + 1]]),
            1:max(cluster_results[[(c_runs - 1)*4 + 1]]))

      imp_res_list <- lapply(ind,
          function(x){impulse_fit_matrix(cluster_res_list[[x]],
          as.numeric(as.character(data_annotation[colnames(data_input[[c_runs]])
            ,"Time"])), n_it = n_iter, ctrl_tc = control_timecourse,
            ctrl = control_name, fit_to_clus = FALSE,
            start_val = start_values[[c_runs]][x,],
            fit_bg = fit_backg, n_process = n_proc)})

       if (nrow(get(dat_ind[c_runs])) != 0) {
        tump1  <- do.call(rbind,lapply(1:(length(imp_res_list)),
            function(x) imp_res_list[[x]][[1]]))
        tump2  <- do.call(rbind,lapply(1:(length(imp_res_list)),
            function(x) imp_res_list[[x]][[2]]))

        tmpp1 <- t(apply(get(dat_ind[c_runs]),1,function(x){rep(mean(x),
            ncol(tump2))}))
        colnames(tmpp1) <- colnames(tump2)
        tump2 <- rbind(tump2, tmpp1)
        tmpp2 <-  cbind( matrix(NA,nrow(get(dat_ind[c_runs])),ncol(tump1) - 1),
            apply(cbind(get(dat_ind[c_runs]), tmpp1[,1]),1,
            function(x){sum((x[1:(length(x) - 1)] - x[length(x)])^2)}))
        rownames(tmpp2) <- rownames(get(dat_ind[c_runs]))
        tump1 <- rbind(tump1, tmpp2)

        tempi1 <- tump1
        tempi2 <- tump2
      } else {
         tempi1  <- do.call(rbind,lapply(1:(length(imp_res_list)),
             function(x) imp_res_list[[x]][[1]]))
         tempi2  <- do.call(rbind,lapply(1:(length(imp_res_list)),
             function(x) imp_res_list[[x]][[2]]))
      }
      imp_res$impulse_parameters <- tempi1[g_names,]
      imp_res$impulse_fits <- tempi2[g_names,]

    ### ---> fit to clusters
    } else if (is.null(cluster_results) & is.null(start_values)) {
      ### fit impulse model to the mean of the cluster
      imp_res <-  impulse_fit_matrix(data_input[[c_runs]],
        as.numeric(as.character(data_annotation[colnames(data_input[[c_runs]]),
        "Time"])),
          n_it = n_iter, ctrl_tc = control_timecourse, ctrl = control_name,
                fit_to_clus = TRUE, fit_bg = fit_backg, n_process = n_proc)
    }
    names(imp_res) <- paste(c("impulse_parameters","impulse_fits"),
        label[c_runs], sep = "_")
    results[[names(imp_res)[1]]] <- imp_res[[1]]
    results[[names(imp_res)[2]]] <- imp_res[[2]]
  }

  ### for naming of the output files
  if (fit_backg == TRUE) { c2 = "bg" } else {c2 = "data"}
  return(results)
}

################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++     Plot impulse fits    ++++++++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### plots impulse fits to timecourse data and also the control data if present

#' Plot impulse model fits
#'
#' Plots impulse model fits for the specified gene IDs. In the case of two
#' time courses, the fits for the combined, case and control data are plotted.
#' @aliases plot_impulse
#' @param gene_IDs character vector of gene names to be plotted; must be part
#' of the \code{rownames} of \code{data_table}
#' @param data_table numeric matrix of expression values; genes should be in
#' rows, samples in columns. Data should be properly normalized and
#' log2-transformed as well as filtered for present or variable genes.
#' @param data_annotation table providing co-variables for the samples including
#' condition and time points. Time points must be numeric numbers.
#' @param imp_fit_genes list of  fitted impulse model values and parameters as
#' produced by \code{impulse_DE} (list element \code{impulse_fit_results}).
#' @param control_timecourse logical indicating whether a control time
#' timecourse is part of the data set (\code{TRUE}) or not (\code{FALSE}).
#' Default is \code{FALSE}.
#' @param control_name character string specifying the name of the control
#' condition in \code{annotation_table}.
#' @param case_name character string specifying the name of the case
#' condition in \code{annotation_table}. Should be set if more than two
#' conditions are present in \code{data_annotation}.
#' @param file_name_part character string to be used as file extention.
#' @param title_line character string to be used as title for each plot.
#' @param sub_line character string to be used as subtitle for each plot.
#' @param new_device logical indicating whether each plot should be plotted
#' into a new device (\code{TRUE}) or not (\code{FALSE}). Default is \code{TRUE}.
#' @return Plots of the impulse model fits for the specified gene IDs.
#' @seealso \code{\link{impulse_DE}}, \code{\link{calc_impulse}}.
#' @author Jil Sander
#' @references Chechik, G. and Koller, D. (2009) Timing of Gene Expression
#' Responses to Envi-ronmental Changes. J. Comput. Biol., 16, 279-290.
#' @examples
#' #' Install package longitudinal and load it
#' library(longitudinal)
#' #' Attach datasets
#' data(tcell)
#' #' check dimension of data matrix of interest
#' dim(tcell.10)
#' #' generate a proper annotation table
#' annot <- as.data.frame(cbind("Time" =
#'      sort(rep(get.time.repeats(tcell.10)$time,10)),
#'      "Condition" = "activated"), stringsAsFactors = FALSE)
#' #' Time columns must be numeric
#' annot$Time <- as.numeric(annot$Time)
#' #' rownames of annotation table must appear in data table
#' rownames(annot) = rownames(tcell.10)
#' #' since genes must be in rows, transpose data matrix using t()
#' #' consider 6 genes for now only
#' genes <- c("SIVA","CD69","ZNFN1A1","IL4R","MAP2K4","JUND")
#' tcell.10.filtered <- t(tcell.10[,genes])
#' #' generate a list object having the form of the output of impulse_DE
#' #' first the parameter fits and SSEs
#' impulse_parameters_case <- matrix(
#'     c(0.6,	18.6,	17.2,	17.4,	5.1,	40.2,	3.5,
#'       0.3,	-464.9,	18.3,	17.3,	-17.2,	35.3,	17.5,
#'       23.2,	18,	18.8,	18.5,	3,	37,	13.2,
#'       NA,	NA,	NA,	NA,	NA,	NA,	3.1,
#'       NA,	NA,	NA,	NA,	NA,	NA,	9.6,
#'       9.5,	17.5,	18.7,	17.5,	8,	48,	46.7),length(genes),7, byrow = TRUE)
#' rownames(impulse_parameters_case) <- genes
#' colnames(impulse_parameters_case) <- c("beta", "h0", "h1", "h2", "t1", "t2", "SSE")
#' #' then the fitted values for the time points
#' impulse_fits_case <- matrix(c(
#'     18.55,	18.43,	18.15,	17.73,	17.43,	17.24,	17.24,	17.24,	17.38,	17.38,
#'     16.22,	17.18,	17.7,	17.97,	18.12,	18.27,	18.26,	18.03,	17.3,	17.28,
#'     18,	18,	18.82,	18.82,	18.82,	18.82,	18.82,	18.82,	18.48,	18.48,
#'     15.93,	15.93,	15.93,	15.93,	15.93,	15.93,	15.93,	15.93,	15.93,	15.93,
#'     17.62,	17.62,	17.62,	17.62,	17.62,	17.62,	17.62,	17.62,	17.62,	17.62,
#'     17.5,	17.5,	17.5,	17.5,	18.18,	18.67,	18.67,	18.67,	17.98,	17.53)
#'     ,length(genes),length(unique(annot$Time)), byrow = TRUE)
#' rownames(impulse_fits_case) <- genes
#' colnames(impulse_fits_case) <- unique(annot$Time)
#' #' finalize list object
#' impulse_fit_genes <- list("impulse_parameters_case" = impulse_parameters_case,
#'                           "impulse_fits_case" = impulse_fits_case)
#' #' Plot expression values
#' plot_impulse(genes, tcell.10.filtered, annot, impulse_fit_genes)
#' @export
plot_impulse <- function(gene_IDs, data_table, data_annotation,imp_fit_genes,
    control_timecourse = FALSE, control_name = NULL, case_name = NULL,
    file_name_part = "", title_line = "", sub_line = "", new_device = TRUE){

  print("---Plotting genes")

  if (length(grep("[a-zA-Z]",rownames(data_table))) == 0) {
       rownames(data_table) <- paste(rownames(data_table),"G", sep = "_")
       gene_IDs <- paste(gene_IDs, "G", sep = "_")
  }
  print(gene_IDs)

  ### if control timecourse is present split data into case and control data
  if (control_timecourse == TRUE) {
      fmat_case <- data_table[,!(data_annotation$Condition %in% control_name)]
      fmat_ctrl <- data_table[,data_annotation$Condition %in% control_name]
      timep_case <- as.numeric(as.character(data_annotation[colnames(fmat_case),
            "Time"]))
      timep_ctrl <- as.numeric(as.character(data_annotation[colnames(fmat_ctrl),
            "Time"]))
  }
  timep <- as.numeric(as.character(data_annotation[colnames(data_table),"Time"]))
  x_vec <- seq(0,max(timep),0.1)

   for (i in 1:length(gene_IDs)) {
       if (i == 1) {
           if(new_device == TRUE){
               dev.new()
           }
           if (length(gene_IDs) == 1) {
               par(mfrow = c(1,1))
           } else if (length(gene_IDs) == 2) {
               par(mfrow = c(1,2))
           } else if (length(gene_IDs) <= 4) {
               par(mfrow = c(2,2))
           } else if (length(gene_IDs) <= 6) {
               par(mfrow = c(2,3))
           } else if (length(gene_IDs) > 6) {
               par(mfrow = c(3,3))
           }
        } else if ((i - 1) %% 9 == 0) {
            if(new_device == TRUE){
                dev.new()
            }
            par(mfrow = c(3,3))
        }

     ### if there is no control data plot only case time course data and fit
     if (control_timecourse == FALSE) {
         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_case[gene_IDs[i],])) {
             calc_case = imp_fit_genes$impulse_fits_case[gene_IDs[i],1]
         } else {
             calc_case = calc_impulse_comp(imp_fit_genes$impulse_parameters_case[gene_IDs[i],1:6],x_vec)
         }
         plot(timep,data_table[gene_IDs[i],],col = "blue",pch = 3,xlim = c(0,max(timep)),
            ylim = c(min(c(as.numeric(data_table[gene_IDs[i],]),as.numeric(calc_case))) - 0.5,
               max(c(as.numeric(data_table[gene_IDs[i],]),as.numeric(calc_case))) + 0.5),
            xlab = "Time", ylab = "Impulse fit or log2 expression value",
            main = paste(gene_IDs[i]," ",title_line, sep = ""),sub = sub_line)
         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_case[gene_IDs[i],])) {
            abline(h = calc_case , col = "blue")
         } else {
           points(x_vec, calc_case, col = "blue", type = "l")
         }
         legend(x = "bottomright",as.character(data_annotation[1,"Condition"]),
                fill = c("blue"), cex = 0.6)

     ### if there is control data
     } else if (control_timecourse == TRUE) {

         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_case[gene_IDs[i],])) {
            calc_case = imp_fit_genes$impulse_fits_case[gene_IDs[i],1]
            status_case = FALSE
         } else {
            calc_case = calc_impulse_comp(imp_fit_genes$impulse_parameters_case[gene_IDs[i],1:6],x_vec)
            status_case = TRUE
         }
         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_control[gene_IDs[i],])) {
            calc_ctrl = imp_fit_genes$impulse_fits_control[gene_IDs[i],1]
            status_ctrl = FALSE
         } else {
            calc_ctrl = calc_impulse_comp(imp_fit_genes$impulse_parameters_control[gene_IDs[i],1:6],x_vec)
            status_ctrl = TRUE
         }
         if (TRUE %in% is.na(imp_fit_genes$impulse_parameters_combined[gene_IDs[i],])) {
            calc_comb = imp_fit_genes$impulse_fits_combined[gene_IDs[i],1]
            status_comb = FALSE
         } else {
            calc_comb = calc_impulse_comp(imp_fit_genes$impulse_parameters_combined[gene_IDs[i],1:6],x_vec)
            status_comb = TRUE
         }

         plot(timep_case,fmat_case[gene_IDs[i],],col = "blue",pch = 3,
              xlim = c(0,max(timep)),
              ylim = c(min(c(as.numeric(data_table[gene_IDs[i],]),
               as.numeric(calc_case), as.numeric(calc_ctrl),
               as.numeric(calc_comb))) - 0.5,
               max(c(as.numeric(data_table[gene_IDs[i],]),as.numeric(calc_case),
               as.numeric(calc_ctrl), as.numeric(calc_comb))) + 0.5),
            xlab = "Time", ylab = "Impulse fit or log2 expression value",
            main = paste(gene_IDs[i]," ",title_line,sep = ""),sub = sub_line)

         points(timep_ctrl,fmat_ctrl[gene_IDs[i],],col = "red",pch = 4)

         if (status_case == FALSE) {
           abline(h = calc_case , col = "blue")
         } else {
           points(x_vec,calc_case, col = "blue", type = "l")
         }
         if (status_ctrl == FALSE) {
            abline(h = calc_ctrl , col = "red")
         } else {
           points(x_vec,calc_ctrl, col = "red", type = "l")
         }
         if (status_comb == FALSE) {
            abline(h = calc_comb , col = "grey")
         } else {
           points(x_vec,calc_comb, col = "grey", type = "l")
         }
         legend(x = "bottomright",
            c(as.character(data_annotation$Condition[data_annotation$Condition !=
            control_name][1]),control_name,"combined"),
            fill = c("blue","red","grey"), cex = 0.6)
     }
   }
}


#################################################################################

#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#
#++++++++++++++++++++++     Background generation   +++++++++++++++++++++++++++#
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++#

### generates the background for the DE analysis
### can be parallelized onto 'n_proc' nodes

generate_background <- function(data_table, data_annotation, n_iter = 100,
    imp_fit_genes, control_timecourse = FALSE, control_name = NULL,
    no_of_clus = NULL, n_rands = 50000, n_proc = 4){

  ### get number of pre- and fine clusters as input from the genes
  ### also exclude genes for which the model was not fitted due to almost
  ### no variation
  if (control_timecourse == TRUE) {
    no_of_clus = no_of_clus[c(3,4,7,8,11,12)]
    data_table <- data_table[!(is.na(imp_fit_genes[[1]][,"beta"]) |
        is.na(imp_fit_genes[[3]][,"beta"]) |
        is.na(imp_fit_genes[[5]][,"beta"])),]
  } else if (control_timecourse == FALSE & is.null(control_name)) {
    no_of_clus = no_of_clus[c(3,4)]
    data_table <- data_table[!(is.na(imp_fit_genes[[1]][,"beta"])),]
  }
    imp_fit_genes <- lapply(imp_fit_genes,
        function(x){x <- x[rownames(data_table),]})

  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~  Fit background ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
  #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

  impulse_bg <- function(data_tab, data_annot,  n_it = 100, imp_fit,
        ctrl_tk = FALSE, ctrl_name = NULL,
        n_clust = NULL, n_randoms_tot = 50000, no_proc = 4){

    ### divide randomizations by no. of nodes
    n_rand <- round(n_randoms_tot/no_proc)

    background = rep(0,n_rand)
    timep <- as.numeric(as.character(data_annot[colnames(data_tab),"Time"]))
    NPARAM = 6

    ### take 'n_rand' random entries from data table
    i_s <- sample(1:nrow(data_tab),n_rand, replace = TRUE)
    data_tab_rand <- data_tab[i_s,]
    rownames(data_tab_rand) <- paste("bg_",c(1:length(i_s)))

    ### ---> if control data is present
    if (ctrl_tk == TRUE) {
      data_list <- list(data_tab_rand, data_tab_rand[,
          !(data_annot$Condition %in% ctrl_name)],
          data_tab_rand[,data_annot$Condition %in% ctrl_name])

      ### null model: fits to the combined dataset
      y_i_null_s = (imp_fit[[2]][rownames(data_tab),])[i_s,
            as.character(data_annot[colnames(data_list[[1]]),"Time"])]
      calc_impulse_comp_data_tab_rand_case <- (imp_fit[[4]][rownames(data_tab),
            ])[i_s,as.character(data_annot[colnames(data_list[[2]]),"Time"])]

      ### if real control timecourse is present
        calc_impulse_comp_data_tab_rand_ctrl <- (imp_fit[[6]][rownames(data_tab),
            ])[i_s, as.character(data_annot[colnames(data_list[[3]]),"Time"])]
        RESD_1_s = cbind((data_list[[2]] - calc_impulse_comp_data_tab_rand_case),
          (data_list[[3]] - calc_impulse_comp_data_tab_rand_ctrl))
        RESD_1_s = RESD_1_s[,colnames(data_list[[1]])]

      ### draw random resdiuals and add them to the null model
      Y_star_s = y_i_null_s + t(apply(RESD_1_s,1,
          function(x){x[sample(1:ncol(RESD_1_s),
          ncol(data_tab_rand),replace = TRUE)]}))
      colnames(Y_star_s) <- colnames(data_tab_rand)
      rownames(Y_star_s) <- paste("bg_",c(1:length(i_s)))

      ### cluster Y_star_s data by using the clusters from the genes
      tm_c <- system.time({
        clustering_results_background <- cluster_genes_for_impulse(Y_star_s,
            data_annot, ctrl_tk, ctrl_name, plot_clusters = TRUE,
            no_of_clusters = n_clust, n_genes = nrow(data_tab))
      })
      print(paste("Consumed time bg clustering: ",
            round(tm_c["elapsed"]/60,2)," min", sep = ""))

      ### fit impulse model to the clusters of Y_star_s
      tm_f <- system.time({
        impulse_fit_clusters_bg <- impulse_fit(clustering_results_background,
            data_annot, n_it, control_timecourse = ctrl_tk,
            ctrl_name, fit_backg = TRUE)
      })
      print(paste("Consumed time bg clus fit: ",
            round(tm_f["elapsed"]/60,2)," min", sep = ""))

      ### fit impulse model to each Y_star
      tm_fg <- system.time({
        impulse_fit_genes_bg <- impulse_fit(Y_star_s, data_annot, 1, ctrl_tk,
            ctrl_name, clustering_results_background, impulse_fit_clusters_bg,
            fit_backg = TRUE )
      })

      print(paste("Consumed time bg gene fit: ",
            round(tm_fg["elapsed"]/60,2)," min", sep = ""))

      ### calculate background
        RESD_1_s_b = cbind((Y_star_s[rownames(data_list[[1]]),
            colnames(data_list[[2]])] -
            impulse_fit_genes_bg[[4]][rownames(data_list[[1]]),
            as.character(data_annot[colnames(data_list[[2]]),"Time"])]),
            (Y_star_s[rownames(data_list[[1]]),colnames(data_list[[3]])] -
            impulse_fit_genes_bg[[6]][rownames(data_list[[1]]),
            as.character(data_annot[colnames(data_list[[3]]),"Time"])]))

        colnames(RESD_1_s_b) <- c(colnames(data_list[[2]]),colnames(data_list[[3]]))
        RESD_1_s_b = RESD_1_s_b[,colnames(data_list[[1]])]
        RESD_0_s_b = Y_star_s[rownames(data_list[[1]]),
            colnames(data_list[[1]])] -
            impulse_fit_genes_bg[[2]][rownames(data_list[[1]]),
            as.character(data_annot[colnames(data_list[[1]]),"Time"])]

       SS_1_s_b = apply(RESD_1_s_b,1,function(x){sum(x^2)})
       SS_0_s_b = apply(RESD_0_s_b,1,function(x){sum(x^2)})

    ### ---> if no control data is present
    } else {
      y_i_null_s = rowMeans(data_tab_rand)
      calc_impulse_comp_data_tab_rand <- (imp_fit[[2]][rownames(data_tab),])[i_s,
            as.character(data_annot[,"Time"])]
      RESD_1_s = data_tab_rand - calc_impulse_comp_data_tab_rand

      ### draw random resdiuals and add them to the null model
      Y_star_s = t(y_i_null_s + apply(RESD_1_s,1,function(x){x[sample(1:ncol(RESD_1_s),
          ncol(data_tab_rand),replace = TRUE)]}))
      colnames(Y_star_s) <- colnames(data_tab_rand)

      ### cluster Y_star_s data by using the clusters from the genes
      tm_c <- system.time({
         clustering_results_background <- cluster_genes_for_impulse(Y_star_s,
            data_annot, ctrl_tk, ctrl_name, plot_clusters = TRUE,
            no_of_clusters = n_clust, n_genes = nrow(data_tab))
      })
      print(paste("Consumed time bg clustering: ",
                  round(tm_c["elapsed"]/60,2)," min",sep = ""))

      ### fit impulse model to the clusters of Y_star_s
      tm_f <- system.time({
       impulse_fit_clusters_bg <- impulse_fit(clustering_results_background,
            data_annot, n_it, control_timecourse = ctrl_tk, ctrl_name,
            fit_backg = TRUE)
      })
      print(paste("Consumed time bg clust fit: ",
                  round(tm_f["elapsed"]/60,2)," min",sep = ""))

      ### fit impulse model to each Y_star
      tm_fg <- system.time({
        impulse_fit_genes_bg <- impulse_fit(Y_star_s, data_annot, 1, ctrl_tk, ctrl_name,
            clustering_results_background, impulse_fit_clusters_bg, fit_backg = TRUE )
      })
      print(paste("Consumed time bg gene fit: ",
        round(tm_fg["elapsed"]/60,2)," min", sep = ""))

      ### calculate background
       RESD_1_s_b = Y_star_s - impulse_fit_genes_bg[[2]][rownames(Y_star_s),
            as.character(data_annot[colnames(Y_star_s),"Time"])]
       SS_1_s_b = apply(RESD_1_s_b,1,function(x){sum(x^2)})
       RESD_0_s_b = Y_star_s - rowMeans(Y_star_s)
       SS_0_s_b = apply(RESD_0_s_b,1,function(x){sum(x^2)})
    }
    background = (SS_0_s_b - SS_1_s_b) / SS_1_s_b
    return(background)
  }

#   define a new environment, which is used for parallelization
  my.env <- new.env()
  assign("n_rands", n_rands, envir = my.env)
  assign("n_proc", n_proc, envir = my.env)
  assign("data_table", data_table, envir = my.env)
  assign("data_annotation", data_annotation, envir = my.env)
  assign("n_iter",n_iter,envir = my.env)
  assign("control_timecourse", control_timecourse, envir = my.env)
  assign("control_name", control_name, envir = my.env)
  assign("imp_fit_genes", imp_fit_genes, envir = my.env)
  assign("no_of_clus", no_of_clus, envir = my.env)
  assign("calc_impulse_comp", calc_impulse_comp, envir = my.env)
  assign("cluster_genes_for_impulse", cluster_genes_for_impulse, envir = my.env)
  assign("impulse_fit", impulse_fit, envir = my.env)
  assign("two_impulses", two_impulses, envir = my.env)
  environment(impulse_bg) <- my.env

#' @import parallel
#' @import boot

#  ### set number of nodes to maximum - 1
  mc <- min(detectCores() - 1, n_proc)
  assign("mc", mc, envir = my.env)
  cl <- makeCluster(mc)
  clusterExport(cl = cl, varlist = c("impulse_bg","n_rands","n_proc","data_table",
    "data_annotation","n_iter","control_timecourse","control_name",
    "imp_fit_genes","mc", "no_of_clus","calc_impulse_comp",
    "cluster_genes_for_impulse","impulse_fit", "two_impulses"), envir = my.env)
  junk <- clusterEvalQ(cl,library(boot))
  clusterSetRNGStream(cl,123)
  res <- clusterEvalQ(cl,impulse_bg(data_table, data_annotation, n_iter,
     imp_fit_genes, control_timecourse, control_name, no_of_clus, n_rands, mc))
  environment(res) <- my.env
  res2 <- do.call(c,res)
  stopCluster(cl)
  return(res2)
}

################################################################################


DE_analysis <- function(data_table,data_annotation,impulse_fit_results,
    background, control_timecourse = FALSE, control_name = NULL,
    e_type = "Array", Q = 0.01){

  ### if control timecourse is present split data into case and control data
  if (control_timecourse == TRUE) {
      fmat_case <- as.matrix(data_table[,!(data_annotation$Condition %in%
            control_name)])
      fmat_ctrl <- as.matrix(data_table[,data_annotation$Condition %in%
            control_name])
     timep_case <-  as.numeric(as.character(data_annotation[colnames(fmat_case),
            "Time"]))
      timep_ctrl <- as.numeric(as.character(data_annotation[colnames(fmat_ctrl),
            "Time"]))
  }
  timep <- as.numeric(as.character(data_annotation[colnames(data_table),
        "Time"]))

  ### correct background values
  background[background < 0] = 0
  background = background[background < 100]

  if (control_timecourse == TRUE) {
    y_i_null = impulse_fit_results[[2]][rownames(fmat_case),as.character(timep)]
     RESD_1_s = cbind(fmat_case - impulse_fit_results[[4]][rownames(fmat_case),
        as.character(timep_case)], (fmat_ctrl[rownames(fmat_case),] -
        impulse_fit_results[[6]][rownames(fmat_case),as.character(timep_ctrl)]))
     RESD_1_s = RESD_1_s[,colnames(data_table)]

    ### if there is no control data
  } else if (control_timecourse == FALSE) {
      y_i_null = rowMeans(data_table)
      RESD_1_s = data_table - impulse_fit_results[[2]][rownames(data_table),
            as.character(timep)]
  }
  RESD_0_s = data_table - y_i_null
  SS_0_s = apply(RESD_0_s, 1, function(x){sum(x^2)})
  SS_1_s = apply(RESD_1_s, 1, function(x){sum(x^2)})

  F_stats =  (SS_0_s - SS_1_s) / SS_1_s
  ### avoid cases where SS_1 = 0  & SS_0 = 0--> Quotient will be NaN
  F_stats[ which(SS_0_s == 0 & SS_1_s == 0)] = 0

  ### bootstrap the p-values
  p  =  unlist(lapply(F_stats,function(x){length(which(background >=
        x))/length(background)}))
  p_scaled = p.adjust(p, method = "BH")
  p_scaled_orig = p_scaled


  # calculate FCs

  # FCs timepoint vs earlier
  if (control_timecourse == FALSE) {
       means <- t(apply(data_table, 1,function(x){
       tmp = NULL
       for (i in 1:length(unique(timep))) {
       tmp = c(tmp,mean(2^x[which(data_annotation[colnames(data_table),
            "Time"] == sort(unique(timep))[i])]))
       }
       return(tmp)
       }))
       colnames(means) <- sort(unique(timep))
       Ratios_TPs <- matrix(rep(0,nrow(data_table)*(length(unique(timep)) - 1)),
            nrow(data_table),(length(unique(timep)) - 1))
       for (tt in 1:(length(unique(timep)) - 1)) {
          Ratios_TPs[,tt] <- means[,tt + 1]/means[,tt]
       }

       FC_TP_DEindex <- apply(Ratios_TPs,1,function(x){
          if (length(which(x > 2 || x < 0.5)) >= 2) { "DE" } else {"not_DE"}
       })
  }


  if (control_timecourse == TRUE) {

      # FCs case vs control
      means_case <- t(apply(fmat_case, 1,function(x){
      tmp = NULL
      for (i in 1:length(unique(timep_case))) {
      tmp = c(tmp, mean(2^x[which(data_annotation[colnames(fmat_case),
        "Time"] == sort(unique(timep_case))[i])]))
      }
      return(tmp)
      }))
      colnames(means_case) <- sort(unique(timep_case))

      means_control <- t(apply(fmat_ctrl, 1,function(x){
      tmp = NULL
      for (i in 1:length(unique(timep_ctrl))) {
      tmp = c(tmp, mean(2^x[which(data_annotation[colnames(fmat_ctrl),
            "Time"] == sort(unique(timep_ctrl))[i])]))
      }
      return(tmp)
      }))
      colnames(means_control) <- sort(unique(timep_ctrl))

      common_timepoints = sort(unique(timep_case))[sort(unique(timep_case)) %in%
            unique(timep_ctrl)]
      if (length(common_timepoints) == 0) {
         FC_DEindex = rep("not_detectable",nrow(means_case))
      } else {
       meanRatios <- t(apply(cbind(means_case[,as.character(common_timepoints)],
          means_control[,as.character(common_timepoints)]),1,function(x){
          tmp <- x[1:length(unique(common_timepoints))]/
              (x[(length(unique(common_timepoints)) + 1):(2*
              length(unique(common_timepoints)))])
          return(tmp)
        }))
        FCs <- t(apply(meanRatios,1,function(x){
          y = x
          y[x < 1] = -1/x[x < 1]
          return(y)
        }))
        FC_DEindex <- apply(FCs,1,function(x){
          if (length(which(abs(x) > 2)) >= 2) { "DE" } else {"not_DE"}
        })
      }

      # ANOVA
      anovas <- t(apply(data_table,1,function(x){summary(aov(as.numeric(x) ~
            data_annotation$Time + data_annotation$Condition +
            data_annotation$Time * data_annotation$Condition))[[1]][2:3,
            "Pr(>F)"]}))
      anovas_FDR =  cbind(p.adjust(anovas[,1], method = "BH"),
            p.adjust(anovas[,2], method = "BH"))
      anovas_pre = anovas_FDR[,1] < Q | anovas_FDR[,2] < Q
      anovas_index <- unlist(lapply(anovas_pre, function(x){if (x == TRUE) {"DE"
          } else {"not_DE"}}))
  }

  # SS1 Flag
  SS1_flag = SS_1_s < 20
  error_index = unlist(lapply(SS1_flag,function(x){if (x == FALSE) {"fit might be
      unstable due to large variation"} else {""}}))

  ### exit the function without error if no DE genes are detected
  if (!(TRUE %in% (p_scaled <= Q))) {
    warning("No DE genes were detected. Maybe amount of background
            randomizations is too low.")
    return(NULL)

  ### if DE genes are detected, finish FDR correction by using the cutoff
  } else {

    ### if control data is present but not as a timecourse, add t-test
    ### p-values to the results
    result =   as.data.frame(cbind("Gene" = row.names(data_table),
            "adj.p" = as.numeric(p_scaled_orig), stringsAsFactors = FALSE))
    result$adj.p <- as.numeric(as.character(result$adj.p))
    result = result[order(result$adj.p),]
    print(paste("Found ",nrow(result[result$adj.p <= Q,])," DE genes",sep = ""))

    pvals_and_flags <- NULL
    if (control_timecourse == TRUE) {
       pvals_and_flags <- as.data.frame(cbind("Gene" = row.names(data_table),
            "adj.p" = p_scaled_orig, "at least 2 TPs for case vs. control" =
            FC_DEindex, "ANOVA condition or time*condition" = anovas_index,
            "prediction error" = error_index))
    } else {
       if (e_type == "Array") {
           pvals_and_flags <- as.data.frame(cbind("Gene" = row.names(data_table),
            "adj.p" = p_scaled_orig, "at least 2 consecutive TP Ratios" =
            FC_TP_DEindex, "prediction error" = error_index))
       } else {
           pvals_and_flags <- as.data.frame(cbind("Gene" = row.names(data_table),
            "adj.p" = p_scaled_orig,"prediction error" = error_index))
       }
    }
    return(list("DE_genes" = result[as.numeric(result$adj.p) <= Q,],
            "pvals_and_flags" = pvals_and_flags))
    }
}


################################################################################
################################################################################
################################################################################


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#                     Final function calling all others
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

#' Differential expression analysis using impulse models
#'
#' Fits an impulse model to time course data and uses this model as a basis
#' to detect differentially expressed (DE) genes. If a single time course
#' data set is given, DE genes are detected over time, whereas if an
#' additional control time course data set is present, DE genes are
#' detected between both datasets.
#' @aliases impulse_DE impulseDE
#' @param expression_table numeric matrix of expression values; genes should
#' be in rows, samples in columns. Data should be properly normalized and
#' log2-transformed as well as filtered for present or variable genes.
#' @param annotation_table table providing co-variables for the samples
#' including condition and time points. Time points must be numeric numbers.
#' @param colname_time character string specifying the column name of the
#' co-variable "\code{Time}" in \code{annotation_table}
#' @param colname_condition character string specifying the column name of
#' the co-variable "\code{Condition}" in \code{annotation_table}
#' @param control_timecourse logical indicating whether a control time
#' timecourse is part of the data set (\code{TRUE}) or not (\code{FALSE}).
#' Default is \code{FALSE}.
#' @param control_name character string specifying the name of the control
#' condition in \code{annotation_table}.
#' @param case_name character string specifying the name of the case
#' condition in \code{annotation_table}. Should be set if more than two
#' conditions are present in \code{annotation_table}.
#' @param expr_type character string with allowed values "\code{Array}" or
#' "\code{Seq}". Default is "\code{Array}".
#' @param plot_clusters logical indicating whether to plot the clusters
#' (\code{TRUE}) or not (\code{FALSE}). Default is \code{TRUE}.
#' @param n_iter numeric value specifying the number of iterations, which are
#' performed to fit the impulse model to the clusters. Default is \code{100}.
#' @param n_randoms numeric value specifying the number of generated randomized
#' background iterations, which are used for differential expression analysis.
#' Default is \code{50000} and this value should not be decreased.
#' @param n_process numeric value indicating the number of processes, which can
#' be used on the machine to run calculations in parallel. Default
#' is \code{4}. The specified value is internally changed to
#' \code{min(detectCores() - 1, n_process)} using the \code{detectCores}
#' function from the package \code{parallel} to avoid overload.
#' @param Q_value numeric value specifying the cutoff to call genes
#' significantly differentially expressed after FDR correction (adjusted
#' p-value). Default is \code{0.01}.
#' @param new_device logical indicating whether each plot should be plotted
#' into a new device (\code{TRUE}) or not (\code{FALSE}). Default is \code{TRUE}.
#' @return List containing the following elements:
#' \itemize{
#' \item \code{impulse_fit_results} List containing fitted values and model
#' parameters:
#' \itemize{
#' \item \code{impulse_parameters_case} Matrix of fitted impulse model
#' parameters and sum of squared fitting errors for the case dataset. If a
#' control time course is present, corresponding list entries will exist for the
#' control and the combined dataset as well (named
#' \code{impulse_parameters_control} and \code{impulse_parameters_combined},
#' respectively).
#' \item \code{impulse_fits_case} Matrix of impulse values calculated based
#' on the analyzed time points and the fitted model parameters for the combined
#' dataset. If a control time course is present, corresponding list entries will
#' exist for the control and the combined dataset as well (named
#' \code{impulse_fits_control} and \code{impulse_fits_combined},
#' respectively).
#' }
#' \item \code{DE_results} List containg the results from the differential
#' expression analysis:
#' \itemize{
#' \item \code{DE_genes} data.frame containing the names of genes being called
#' as differentially expressed according to the specified cutoff \code{Q_value}
#' together with the adjusted p-values.
#' \item \code{pvals_and_flags} data.frame containing all gene names
#' together with the adjusted p-values and flags for differential expression
#' according to additional tests.
#' }
#' \item \code{clustering_results} List containing the clustering results:
#' \itemize{
#' \item \code{kmeans_clus_case} Numeric vector of clusters IDs, to which the
#' genes were finally assigned.
#' \item \code{cluster_means_case} Matrix containing the mean expression values
#' for each cluster (taken over all genes assigned to a cluster).
#' \item \code{pre_clus_case} Numeric number of clusters determined after the
#' first (preliminary) clustering step.
#' \item \code{fine_clus_case} Numeric number of final clusters determined after
#'  the second clustering step.
#' }
#' If a control time course is present, those four list entries will
#' exist correspondingly for the control and the combined dataset as well
#' (ending with \code{_control} and \code{_combined} instead of \code{_case},
#' respectively).
#' }
#' @details \code{ImpulseDE} is based on the impulse model proposed by
#' Chechik and Koller, which reflects a two-step behavior of genes within a cell
#' responding to environmental changes (Chechik and Koller, 2009). To detect
#' differentially expressed genes, a five-step workflow is followed:
#' \enumerate{
#' \item The genes are clustered into a limited number of groups using k-means
#' clustering. If \code{plot_clusters} = \code{TRUE}, the clusters are plotted.
#' \item The impulse model is fitted to the mean expression profiles of the
#' clusters. The best parameter sets are then used for the next step.
#' \item The impulse model is fitted to each gene separately using the parameter
#' sets from step 2 as optimal start point guesses.
#' \item The impulse model is fitted to a randomized dataset (bootstrap), which
#' is essential to detect significantly differentially expressed genes
#' (Storey et al., 2005).
#' \item Detection of differentially expressed genes utilizing the fits to the
#' real and randomized data sets. FDR-correction is performed to obtain adjusted
#' p-values (Benjamini and Hochberg, 1995).
#' }
#' @examples
#' #' Install package longitudinal and load it
#' library(longitudinal)
#' #' Attach datasets
#' data(tcell)
#' #' check dimension of data matrix of interest
#' dim(tcell.10)
#' #' generate a proper annotation table
#' annot <- as.data.frame(cbind("Time" =
#'    sort(rep(get.time.repeats(tcell.10)$time,10)),
#'    "Condition" = "activated"), stringsAsFactors = FALSE)
#' #' Time columns must be numeric
#' annot$Time <- as.numeric(annot$Time)
#' #' rownames of annotation table must appear in data table
#' rownames(annot) = rownames(tcell.10)
#' #' apply ImpulseDE in single time course mode
#' #' since genes must be in rows, transpose data matrix using t()
#' #' For the example, reduce iterations to 10, randomizations to 50, number of
#' #' genes to 20 and number of used processors to 1:
#' impulse_results <- impulse_DE(t(tcell.10)[1:20,], annot, "Time", "Condition",
#'    n_iter = 10, n_randoms = 50, n_process = 1)
#' @seealso \code{\link{plot_impulse}}, \code{\link{calc_impulse}}.
#' @author Jil Sander
#' @references Benjamini, Y. and Hochberg, Y. (1995) Controlling the false
#' discovery rate: a practical and powerful approach to multiple testing.
#' J. R. Stat. Soc. Series B Stat. Methodol., 57, 289-300.
#' @references Storey, J.D. et al. (2005) Significance analysis of time course
#' microarray experiments. Proc. Natl. Acad. Sci. USA, 102, 12837-12841.
#' @references Rangel, C., Angus, J., Ghahramani, Z., Lioumi, M., Sotheran, E.,
#' Gaiba, A., Wild, D.L., Falciani, F. (2004) Modeling T-cell activation using
#' gene expression profiling and state-space models. Bioinformatics, 20(9),
#' 1361-72.
#' @references Chechik, G. and Koller, D. (2009) Timing of Gene Expression
#' Responses to Envi-ronmental Changes. J. Comput. Biol., 16, 279-290.
#' @references Yosef, N. et al. (2013) Dynamic regulatory network controlling
#' TH17 cell differentiation. Nature, 496, 461-468.
#' @export
impulse_DE <- function(expression_table = NULL, annotation_table = NULL,
    colname_time = NULL, colname_condition = NULL, control_timecourse = FALSE,
    control_name = NULL, case_name = NULL, expr_type = "Array",
    plot_clusters = TRUE, n_iter = 100, n_randoms = 50000, n_process = 4,
    Q_value = 0.01, new_device = TRUE){

  tm_tot <- system.time({

    #' @import compiler
    #' @importFrom grDevices dev.new
    #' @importFrom graphics abline axis legend par plot points frame
    #' @importFrom stats aov cor dist kmeans mad median nlminb optim p.adjust
    #' @importFrom stats runif sd
    #' @importFrom utils head write.table

    ## prepare annotation table for internal usage
    print("START: Prepare annotation table for internal usage")
    print("-------------------------------------------------------------------")

    prepared_annotation <- annotation_preparation(annotation_table,
          expression_table,
          colname_time ,colname_condition, control_timecourse, control_name,
          case_name)
    prepared_annotation <- prepared_annotation[order(
            prepared_annotation$Condition),]
    prepared_annotation <- prepared_annotation[order(prepared_annotation$Time),]
    print("DONE")
    print("###################################################################")

    expression_table <- as.matrix(expression_table)    # to have numeric values
    expression_table <- expression_table[,rownames(prepared_annotation)]

    # exclude genes with missing values(NAs)
    indx <- apply(expression_table,1,function(x){TRUE %in% is.na(x)})
    expression_table <- expression_table[!(indx),]

    # if rownames are just 1,2,3 or if there are no rownames
    if (is.null(rownames(expression_table))) {
       rownames(expression_table) <- paste("G", 1:nrow(expression_table),
            sep = "_")
    } else if (length(grep("[a-zA-Z]",rownames(expression_table))) == 0) {
       rownames(expression_table) <- paste(rownames(expression_table),"G",
            sep = "_")
    }

    ### cluster genes to reduce efforts for fitting the impulse model
    print("START: Clustering genes for Impulse model fit")
    print("-------------------------------------------------------------------")
    tm_clust <- system.time({
      clustering_results <- cluster_genes_for_impulse(expression_table,
          prepared_annotation, control_timecourse, control_name, plot_clusters,
          n_device = new_device)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_clust["elapsed"]/60,2)," min",sep = ""))
    print("##################################################################")

    # fit Impulse model to the clusters
    print("START: Fitting Impulse model to the clusters")
    print("-------------------------------------------------------------------")
    tm_imp_fit_clus <- system.time({
      impulse_fit_clusters <- impulse_fit(clustering_results,prepared_annotation,
          n_iter, control_timecourse, control_name, n_proc = n_process)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_imp_fit_clus["elapsed"]/60,2)," min",
        sep = ""))
    print("###################################################################")

    ###  fit Impule model to each gene by using the cluster fits as start values
    print("START: Fitting Impulse model to the genes")
    print("-------------------------------------------------------------------")
    tm_imp_fit_gen <- system.time({
      impulse_fit_genes <- impulse_fit(expression_table, prepared_annotation,
        n_iter, control_timecourse, control_name, clustering_results,
        impulse_fit_clusters, n_proc = n_process)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_imp_fit_gen["elapsed"]/60,2),
        " min",sep = ""))
    print("###################################################################")

    ### generate background for the DE analysis
    print("START: Generate background")
    print("-------------------------------------------------------------------")
    tm_bg <- system.time({
      background_results <-  generate_background(expression_table,
        prepared_annotation, n_iter, impulse_fit_genes, control_timecourse,
        control_name, clustering_results, n_randoms, n_process)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_bg["elapsed"]/60,2)," min",sep = ""))
    print("###################################################################")

    ## detect differentially expressed genes
    print("START: DE analysis")
    print("-------------------------------------------------------------------")
    tm_DE <- system.time({
      impulse_DE_genes <- DE_analysis(expression_table,prepared_annotation,
            impulse_fit_genes, background_results, control_timecourse,
            control_name, expr_type, Q_value)
    })
    print("DONE")
    print(paste("Consumed time: ",round(tm_DE["elapsed"]/60,2)," min",sep = ""))
    print("##################################################################")
  })
  print(paste("TOTAL consumed time: ",round(tm_tot["elapsed"]/60,2),
        " min",sep = ""))
  return(list("impulse_fit_results" = impulse_fit_genes,
        "DE_results" = impulse_DE_genes, "clustering_results" = clustering_results))
}
Any scripts or data that you put into this service are public.
ImpulseDE documentation built on April 28, 2020, 9:05 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
ImpulseDE
Detection of DE genes in time series data using impulse models

R/Impulse_DE_fin.R
In ImpulseDE: Detection of DE genes in time series data using impulse models

Defines functions impulse_DE DE_analysis generate_background plot_impulse impulse_fit cluster_genes_for_impulse two_impulses calc_impulse annotation_preparation

Documented in calc_impulse impulse_DE plot_impulse

Try the ImpulseDE package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ImpulseDE Detection of DE genes in time series data using impulse models

R/Impulse_DE_fin.R In ImpulseDE: Detection of DE genes in time series data using impulse models

Defines functions impulse_DE DE_analysis generate_background plot_impulse impulse_fit cluster_genes_for_impulse two_impulses calc_impulse annotation_preparation

Documented in calc_impulse impulse_DE plot_impulse

Try the ImpulseDE package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ImpulseDE
Detection of DE genes in time series data using impulse models

R/Impulse_DE_fin.R
In ImpulseDE: Detection of DE genes in time series data using impulse models