R/missingValuesFilter.R

Defines functions mvFilterGetIndices deleteLinesFromIndices mvFilterFromIndices mvFilter getIndicesOfLinesToRemove StringBasedFiltering2 StringBasedFiltering removeLines proportionConRev_HC NumericalgetIndicesOfLinesToRemove NumericalFiltering getNumberOf getPourcentageOfMV

Documented in deleteLinesFromIndices getIndicesOfLinesToRemove getNumberOf getPourcentageOfMV mvFilter mvFilterFromIndices mvFilterGetIndices NumericalFiltering NumericalgetIndicesOfLinesToRemove proportionConRev_HC removeLines StringBasedFiltering StringBasedFiltering2

#' Returns the percentage of missing values in the quantitative
#' data (\code{exprs()} table of the dataset).
#' 
#' @title Percentage of missing values
#' 
#' @param obj An object of class \code{MSnSet}.
#' 
#' @return A floating number
#' 
#' @author Florence Combes, Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' getPourcentageOfMV(Exp1_R25_pept)
#' 
#' @export
#' 
#' @importFrom Biobase exprs fData pData
#' 
getPourcentageOfMV <- function(obj){

  df <- data.frame(Biobase::exprs(obj))

NA.count<-apply(df, 2,
                function(x) length(which(is.na(data.frame(x))==TRUE)) )


pourcentage <- 100 * round(sum(NA.count) /(nrow(df)* ncol(df)), digits=4)

return(pourcentage)
}


#' Returns the number of lines, in a given column, where content matches 
#' the prefix.
#' 
#' @title Number of lines with prefix
#' 
#' @param obj An object of class \code{MSnSet}.
#' 
#' @param name The name of a column.
#' 
#' @param prefix A string
#' 
#' @return An integer
#' 
#' @author Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' getNumberOf(Exp1_R25_pept, "Potential_contaminant", "+")
#' 
#' @export
#' 
#' @importFrom Biobase exprs fData pData
#' 
getNumberOf <- function(obj, name=NULL, prefix=NULL){
if (is.null(name) || is.null(prefix) || (name=="") || (prefix=="")){
    return(0)}
if (!(is.null(name) || !is.null(name==""))
    && (is.null(prefix) || (prefix==""))){return(0)}

if(nchar(prefix) > 0){
    count <- length(which(substr(Biobase::fData(obj)[,name], 0, 1) == prefix))
} else { count <- 0}

return(count)
}



#' This function removes lines in the dataset based on numerical conditions.
#' 
#' @title Removes lines in the dataset based on numerical conditions.
#' 
#' @param obj An object of class \code{MSnSet}.
#' 
#' @param name The name of the column that correspond to the line to filter
#' 
#' @param value A number 
#' 
#' @param operator A string
#' 
#' @return An list of 2 items :
#' obj : an object of class \code{MSnSet} in which the lines have been deleted
#' deleted : an object of class \code{MSnSet} which contains the deleted lines 
#' 
#' @author Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' NumericalFiltering(Exp1_R25_pept, 'A_Count', '6', '==')
#' 
#' @export
#' 
NumericalFiltering <- function(obj, name=NULL, value=NULL, operator=NULL){
  if ((is.null(name) || (name == ""))) {return(NULL)}

  deleted <- NULL
  ind <- NULL
  ind <- NumericalgetIndicesOfLinesToRemove(obj,name, value, operator)

  if (!is.null(ind) && (length(ind) > 0)){
    deleted <- obj[ind]

    obj <- deleteLinesFromIndices(obj, ind,
                                  paste("\"",
                                        length(ind),
                                        " lines were removed from dataset.\"",
                                        sep="")
    )

  }

  return(list(obj=obj, deleted=deleted))
}




#' This function returns the indice of the lines to delete, based on a 
#' prefix string
#' 
#' @title Get the indices of the lines to delete, based on a prefix string
#' 
#' @param obj An object of class \code{MSnSet}.
#' 
#' @param name The name of the column that correspond to the data to filter
#' 
#' @param value xxxx
#' 
#' @param operator A xxxx
#' 
#' @return A vector of integers.
#' 
#' @author Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' NumericalgetIndicesOfLinesToRemove(Exp1_R25_pept, "A_Count", value="6", operator='==')
#' 
#' @export
#' 
#' @importFrom Biobase exprs fData pData
#' 
NumericalgetIndicesOfLinesToRemove <- function(obj, name=NULL, value=NULL, operator=NULL)
{
  if ((value == "") || is.null(value)|| (operator=="") || is.null(operator)) {
    # warning ("No change was made")
    return (NULL)}

  data <- Biobase::fData(obj)[,name]
  ind <- which(eval(parse(text=paste0("data", operator, value))))

  return(ind)
}



#' Plots a barplot of proportion of contaminants and reverse. Same as the function
#' \code{proportionConRev} but uses the package \code{highcharter}
#' 
#' @title Barplot of proportion of contaminants and reverse
#' 
#' @param nBoth The number of both contaminants and reverse identified in the dataset.
#' 
#' @param nCont The number of contaminants identified in the dataset.
#' 
#' @param nRev The number of reverse entities identified in the dataset.
#' 
#' @param lDataset The total length (number of rows) of the dataset
#' 
#' @return A barplot
#' 
#' @author Samuel Wieczorek
#' 
#' @examples
#' proportionConRev_HC(10, 20, 100)
#' 
#' @export
#' 
proportionConRev_HC <- function(nBoth = 0, nCont=0, nRev=0, lDataset=0){
    if (is.null(nCont) && is.null(nBoth) && is.null(nRev) && is.null(lDataset)){return(NULL)}

    total <- nBoth + nCont + nRev + lDataset
    pctGood <- 100 * round(lDataset/total,  digits=4)
    pctBoth <- 100 * round(nBoth/total,  digits=4)
    pctContaminants <- 100 * round(nCont/total,  digits=4)
    pctReverse <- 100 * round(nRev/total,  digits=4)

    counts <- c(lDataset, nCont, nRev, nBoth)
    slices <- c(pctGood, pctContaminants, pctReverse ,pctBoth)
    lbls <- c("Quantitative data", "Contaminants", "Reverse", "Both contaminants & Reverse")
    #pct <- c(pctGood, pctContaminants, pctReverse  ,pctBoth)
    lbls <- paste(lbls, " (", counts, " lines)", sep="")

    mydata <- data.frame(test=c(pctGood, pctContaminants, pctReverse ,pctBoth))

    highchart() %>%
        my_hc_chart(chartType = "bar") %>%
        hc_yAxis(title = list(text = "Pourcentage")) %>%
        hc_xAxis(categories=lbls) %>%
        hc_legend(enabled = FALSE) %>%
        hc_plotOptions(column = list(
            dataLabels = list(enabled = TRUE),
            stacking = "normal",
            enableMouseTracking = FALSE)
        ) %>%
        hc_add_series(data  = mydata$test,
                      dataLabels = list(enabled = TRUE, format='{point.y}%'),
                  colorByPoint = TRUE) %>%
      my_hc_ExportMenu(filename = "contaminants")


}




#' This function removes lines in the dataset based on a prefix string.
#' 
#' @title Removes lines in the dataset based on a prefix string.
#' 
#' @param obj An object of class \code{MSnSet}.
#' 
#' @param idLine2Delete The name of the column that correspond to the 
#' data to filter
#' 
#' @param prefix A character string that is the prefix to find in the data
#' @return An object of class \code{MSnSet}.
#' 
#' @author Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' removeLines(Exp1_R25_pept, "Potential_contaminant")
#' removeLines(Exp1_R25_pept, "Reverse")
#' 
#' @export
#' 
removeLines <- function(obj, idLine2Delete=NULL, prefix=NULL){
if ((prefix == "") || is.null(prefix)) {
    #warning ("No change was made")
    return (obj)}
    t <- (prefix == substring(Biobase::fData(obj)[,idLine2Delete],1,nchar(prefix)))
    ind <- which( t== TRUE)
    obj <- obj[-ind ]

return(obj)
}



#' This function removes lines in the dataset based on prefix strings (contaminants, reverse or both).
#' 
#' @title Removes lines in the dataset based on a prefix strings (contaminants, reverse or both).
#' 
#' @param obj An object of class \code{MSnSet}.
#' 
#' @param idCont2Delete The name of the column that correspond to the 
#' contaminants to filter
#' 
#' @param prefix_Cont A character string that is the prefix for the contaminants to find in the data
#' 
#' @param idRev2Delete The name of the column that correspond to the 
#' reverse data to filter
#' 
#' @param prefix_Rev A character string that is the prefix for the reverse to find in the data
#' 
#' @return An list of 4 items :
#' obj : an object of class \code{MSnSet} in which the lines have been deleted
#' deleted.both : an object of class \code{MSnSet} which contains the deleted lines 
#' corresponding to both contaminants and reverse, 
#' deleted.contaminants : n object of class \code{MSnSet} which contains the deleted lines 
#' corresponding to contaminants, 
#' deleted.reverse : an object of class \code{MSnSet} which contains the deleted lines 
#' corresponding to reverse,
#' 
#' @author Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' StringBasedFiltering(Exp1_R25_pept, 'Potential_contaminant', '+', 'Reverse', '+')
#' 
#' @export
#' 
StringBasedFiltering <- function(obj, 
                                 idCont2Delete=NULL, prefix_Cont=NULL, 
                                 idRev2Delete=NULL, prefix_Rev=NULL){

    deleted.both <- deleted.contaminants <- deleted.reverse <- NULL

    ##
    ##Search for both
    ##
    if ((!is.null(idCont2Delete) || (idCont2Delete != "")) &&
        (!is.null(idRev2Delete) || (idRev2Delete != ""))) {
        indContaminants <- indReverse <- indBoth <- NULL
        indContaminants <- getIndicesOfLinesToRemove(obj,idCont2Delete,  prefix_Cont)
        indReverse <- getIndicesOfLinesToRemove(obj, idRev2Delete, prefix_Rev)
        indBoth <- intersect(indContaminants, indReverse)

        if (!is.null(indBoth) && (length(indBoth) > 0)){
                deleted.both <- obj[indBoth]
                obj <- deleteLinesFromIndices(obj, indBoth,
                                               paste("\"",
                                                     length(indBoth),
                                                     " both contaminants and reverse were removed from dataset.\"",
                                                     sep="")
                )
            }
    }

    ##
    ##Search for contaminants
    ##
    if ((!is.null(idCont2Delete) || (idCont2Delete != ""))) {
        indContaminants <- NULL
        indContaminants <- getIndicesOfLinesToRemove(obj,idCont2Delete,  prefix_Cont)

        if (!is.null(indContaminants) && (length(indContaminants) > 0)){
                deleted.contaminants <- obj[indContaminants]

                obj <- deleteLinesFromIndices(obj, indContaminants,
                                               paste("\"",
                                                     length(indContaminants),
                                                     " contaminants were removed from dataset.\"",
                                                     sep="")
                )

        }
    }


    ##
    ## Search for reverse
    ##
    if ((!is.null(idRev2Delete) || (idRev2Delete != ""))) {
        indReverse <- getIndicesOfLinesToRemove(obj, idRev2Delete, prefix_Rev)

        if (!is.null(indReverse)){
            if (length(indReverse) > 0)  {
                deleted.reverse <- obj[indReverse]

                obj <- deleteLinesFromIndices(obj, indReverse,
                                               paste("\"",
                                                     length(indReverse),
                                                     " reverse were removed from dataset.\"",
                                                     sep="")
                )

            }
        }
    }


    return(list(obj=obj,
                deleted.both=deleted.both,
                deleted.contaminants=deleted.contaminants,
                deleted.reverse=deleted.reverse))
}






#' This function removes lines in the dataset based on prefix strings.
#' 
#' @title Removes lines in the dataset based on a prefix strings.
#' 
#' @param obj An object of class \code{MSnSet}.
#' 
#' @param cname The name of the column that correspond to the line to filter
#' 
#' @param tag A character string that is the prefix for the contaminants to find in the data
#' 
#' @return An list of 4 items :
#' obj : an object of class \code{MSnSet} in which the lines have been deleted
#' deleted : an object of class \code{MSnSet} which contains the deleted lines 
#' 
#' @author Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' StringBasedFiltering2(Exp1_R25_pept, 'Potential_contaminant', '+')
#' 
#' @export
#' 
StringBasedFiltering2 <- function(obj, cname=NULL, tag=NULL){

  deleted <- NULL

  ##
  ##Search for contaminants
  ##
  if ((!is.null(cname) || (cname != ""))) {
    ind <- NULL
    ind <- getIndicesOfLinesToRemove(obj,cname,  tag)

    if (!is.null(ind) && (length(ind) > 0)){
      deleted <- obj[ind]

      obj <- deleteLinesFromIndices(obj, ind,
                                    paste("\"",
                                          length(ind),
                                          " contaminants were removed from dataset.\"",
                                          sep="")
      )

    }
  }

  return(list(obj=obj, deleted=deleted))
}




#' This function returns the indice of the lines to delete, based on a 
#' prefix string
#' 
#' @title Get the indices of the lines to delete, based on a prefix string
#' 
#' @param obj An object of class \code{MSnSet}.
#' 
#' @param idLine2Delete The name of the column that correspond to the data 
#' to filter
#' 
#' @param prefix A character string that is the prefix to find in the data
#' 
#' @return A vector of integers.
#' 
#' @author Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' getIndicesOfLinesToRemove(Exp1_R25_pept, "Potential_contaminant", prefix="+")
#' 
#' @export
#' 
#' @importFrom Biobase exprs fData pData
#' 
getIndicesOfLinesToRemove <- function(obj, idLine2Delete=NULL, prefix=NULL)
{
if ((prefix == "") || is.null(prefix)) {
   # warning ("No change was made")
    return (NULL)}
t <- (prefix == substring(Biobase::fData(obj)[,idLine2Delete],1,nchar(prefix)))
ind <- which( t== TRUE)
return(ind)
}


#' Filters the lines of \code{exprs()} table with conditions on the number
#' of missing values.
#' The user chooses the minimum amount of intensities that is acceptable and
#' the filter delete lines that do not respect this condition.
#' The condition may be on the whole line or condition by condition.
#' 
#' The different methods are :
#' "WholeMatrix": given a threshold \code{th}, only the lines that contain
#' at least \code{th} values are kept.
#' "AllCond": given a threshold \code{th}, only the lines which contain
#' at least \code{th} values for each of the conditions are kept.
#' "AtLeastOneCond": given a threshold \code{th}, only the lines that contain
#' at least \code{th} values, and for at least one condition, are kept.
#' 
#' @title Filter lines in the matrix of intensities w.r.t. some criteria
#' 
#' @param obj An object of class \code{MSnSet} containing
#' quantitative data.
#' 
#' @param type Method used to choose the lines to delete.
#' Values are : "None", "WholeMatrix", "AllCond", "AtLeastOneCond"
#' 
#' @param th An integer value of the threshold
#' 
#' @param processText A string to be included in the \code{MSnSet}
#' object for log. 
#' 
#' @return An instance of class \code{MSnSet} that have been filtered.
#' 
#' @author Florence Combes, Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' mvFilter(Exp1_R25_pept, "WholeMatrix", 2)
#' 
#' @export
#' 
mvFilter <- function(obj,
                     type, 
                     th, 
                     processText=NULL )
{
    #Check parameters
    paramtype<-c("None", "WholeMatrix", "AllCond", "AtLeastOneCond")
    if (sum(is.na(match(type, paramtype)==TRUE))>0){
        warning("Param type is not correct.")
        return (NULL)
    }

    paramth<-c(seq(0, nrow(Biobase::pData(obj)), 1))
    if (sum(is.na(match(th, paramth)==TRUE))>0){
        warning("Param th is not correct.")
        return (NULL)
    }

    if(!is.integer(th)){th <- as.integer(th)}

    keepThat <- mvFilterGetIndices(obj,
                                   condition = type, 
                                   threshold = th)

obj <- obj[keepThat]

    obj@processingData@processing <-
        c(obj@processingData@processing, processText)
    return(obj)
}



#' Filters the lines of \code{exprs()} table with conditions on the number
#' of missing values.
#' The user chooses the minimum amount of intensities that is acceptable and
#' the filter delete lines that do not respect this condition.
#' The condition may be on the whole line or condition by condition.
#' 
#' The different methods are :
#' "WholeMatrix": given a threshold \code{th}, only the lines that contain
#' at least \code{th} values are kept.
#' "AllCond": given a threshold \code{th}, only the lines which contain
#' at least \code{th} values for each of the conditions are kept.
#' "AtLeastOneCond": given a threshold \code{th}, only the lines that contain
#' at least \code{th} values, and for at least one condition, are kept.
#' 
#' @title Filter lines in the matrix of intensities w.r.t. some criteria
#' 
#' @param obj An object of class \code{MSnSet} containing
#' quantitative data.
#' 
#' @param keepThat A vector of integers which are the indices of lines to 
#' keep.
#' 
#' @param processText A string to be included in the \code{MSnSet}
#' object for log. 
#' 
#' @return An instance of class \code{MSnSet} that have been filtered.
#' 
#' @author Florence Combes, Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' mvFilterFromIndices(Exp1_R25_pept, c(1:10))
#' 
#' @export
#' 
mvFilterFromIndices <- function(obj,keepThat=NULL, processText="" )
{

if (is.null(keepThat)) {return(obj)}
obj <- obj[keepThat]

# if (!is.null(obj@experimentData@other$OriginOfValues)){
#     obj@experimentData@other$OriginOfValues <- obj@experimentData@other$OriginOfValues[keepThat,]
# }
obj@processingData@processing <-
    c(obj@processingData@processing, processText)

return(obj)
}


#' Delete the lines of \code{exprs()} table identified by their indice.
#' 
#' @title Delete the lines in the matrix of intensities and the metadata table
#' given their indice.
#' 
#' @param obj An object of class \code{MSnSet} containing
#' quantitative data.
#' 
#' @param deleteThat A vector of integers which are the indices of lines to 
#' delete.
#' 
#' @param processText A string to be included in the \code{MSnSet}
#' object for log. 
#' 
#' @return An instance of class \code{MSnSet} that have been filtered.
#' 
#' @author Florence Combes, Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' deleteLinesFromIndices(Exp1_R25_pept, c(1:10))
#' 
#' @export
#' 
deleteLinesFromIndices <- function(obj,deleteThat=NULL, processText="" )
{

    if (is.null(deleteThat)) {return(obj)}
    obj <- obj[-deleteThat]

    obj@processingData@processing <-  c(obj@processingData@processing, processText)
    if (grepl("contaminants", processText)){obj@experimentData@other$contaminantsRemoved <- TRUE}
    if (grepl("reverse", processText)){obj@experimentData@other$reverseRemoved <- TRUE }
    return(obj)
}


#' Returns the indices of the lines of \code{exprs()} table to delete w.r.t. 
#' the conditions on the number of missing values.
#' The user chooses the minimum amount of intensities that is acceptable and
#' the filter delete lines that do not respect this condition.
#' The condition may be on the whole line or condition by condition.
#' 
#' The different methods are :
#' "WholeMatrix": given a threshold \code{th}, only the lines that contain
#' at least \code{th} values are kept.
#' "AllCond": given a threshold \code{th}, only the lines which contain
#' at least \code{th} values for each of the conditions are kept.
#' "AtLeastOneCond": given a threshold \code{th}, only the lines that contain
#' at least \code{th} values, and for at least one condition, are kept.
#' 
#' @title Filter lines in the matrix of intensities w.r.t. some criteria
#' 
#' @param obj An object of class \code{MSnSet} containing
#' quantitative data.
#' 
#' @param percent TRUE or FALSE. Default is FALSE..
#' 
#' @param condition Method used to choose the lines to delete.
#' Values are : "None", "EmptyLines", "WholeMatrix", "AllCond", "AtLeastOneCond"
#' 
#' @param threshold An integer value of the threshold if percent is FALSE. Otherwise, a floating
#' number between 0 and 1.
#' 
#' @return An vector of indices that correspond to the lines to keep.
#' 
#' @author Enora Fremy, Samuel Wieczorek
#' 
#' @examples
#' utils::data(Exp1_R25_pept, package='DAPARdata')
#' keepThat <- mvFilterGetIndices(Exp1_R25_pept, condition = "WholeMatrix", threshold=2)
#' keepThat <- mvFilterGetIndices(Exp1_R25_pept, condition = "EmptyLines")
#' keepThat <- mvFilterGetIndices(Exp1_R25_pept, condition = "WholeMatrix", percent=TRUE, threshold=0.5)
#' 
#' @export
#' 
mvFilterGetIndices <- function(obj,
                               percent = FALSE,
                               condition = 'WholeMatrix', 
                               threshold = NULL){
  #Check parameters
  paramtype<-c("None", "EmptyLines", "WholeMatrix", "AllCond", "AtLeastOneCond")
  if (!(condition %in% paramtype)){
    warning("Param `type` is not correct.")
    return (NULL)
  }
  
  if (condition != 'EmptyLines')
    if (!(percent %in% c(T, F))){
      warning("Param `type` is not correct.")
      return (NULL)
    } else {
      if (!isTRUE(percent)){
        paramth <- c(seq(0, nrow(Biobase::pData(obj)), 1))
        if (!(threshold %in% paramth)){
          warning(paste0("Param `threshold` is not correct. It must an integer greater than or equal to 0 and less or equal than ",
                         nrow(Biobase::pData(obj))))
          return (NULL)
        }
      } else {
        if (threshold < 0 || threshold > 1){
          warning("Param `threshold` is not correct. It must be greater than 0 and less than 1.")
          return (NULL)
        }
      }
    }
  
  keepThat <- NULL
  if (is.null(obj@experimentData@other$OriginOfValues)){
    data <- Biobase::exprs(obj)
    warning('The dataset contains no slot OriginOfValues in which to search for indices. The search will
            be proceeded in the intensities tab based on NA values')
  } else {
    data <- dplyr::select(Biobase::fData(obj),
                          obj@experimentData@other$OriginOfValues)
  }
  
  if (condition == "None") {
    keepThat <- seq(1:nrow(data))
  } else if (condition == "EmptyLines") {
    keepThat <- which(apply(!DAPAR::is.MV(data), 1, sum) >= 1)
  } else if (condition == "WholeMatrix") {
    if (isTRUE(percent)) {
      keepThat <- which(rowSums(!DAPAR::is.MV(data))/ncol(data) >= threshold) 
    } else {
      keepThat <- which(apply(!DAPAR::is.MV(data), 1, sum) >= threshold)
    }
  } else if (condition == "AtLeastOneCond" || condition == "AllCond") {
    
    conditions <- unique(Biobase::pData(obj)$Condition)
    nbCond <- length(conditions)
    keepThat <- NULL
    s <- matrix(rep(0, nrow(data)*nbCond),
                nrow=nrow(data),
                ncol=nbCond)
    
    if (isTRUE(percent)) {
      for (c in 1:nbCond) {
        ind <- which(Biobase::pData(obj)$Condition == conditions[c])
        s[,c] <- (rowSums(!DAPAR::is.MV(data[,ind]))/length(ind)) >= threshold
      }
    } else {
      for (c in 1:nbCond) {
        ind <- which(Biobase::pData(obj)$Condition == conditions[c])
        if (length(ind) == 1){
          s[,c] <- (!DAPAR::is.MV(data[,ind]) >= threshold) 
        }
        else {
          s[,c] <- (apply(!DAPAR::is.MV(data[,ind]), 1, sum)) >= threshold
        }
      }
    }
    
    switch(condition,
           AllCond = keepThat <- which(rowSums(s) == nbCond),
           AtLeastOneCond = keepThat <- which(rowSums(s) >= 1)
    )
  }
  
  return(keepThat)
}


#' 
#' 
#' #' Filter missing values by proportion
#' #'
#' #' @description Remove lines in the data according to the proportion of missing
#' #' values. This proportion is calculated differently depending on whether we
#' #' want a certain proportion of missing values (NA) to remain on:
#' #' * the entire matrix, regardless of the conditions: the rows containing a
#' #' proportion of NA equal or below the threshold will be kept.
#' #' * all the conditions: the lines for which all the conditions have a NA
#' #' proportion equal to or less than the fixed proportion will be kept.
#' #' * at least one condition: the lines for which at least one condition is
#' #' equal to or less than the fixed proportion of NA will be kept.
#' #'
#' #' @param obj  An object of class \code{MSnSet} containing quantitative data
#' #' and phenotype data.
#' #' 
#' #' @param intensities_proportion float between 0 and 1 corresponding to the proportion
#' #' of intensities to keep in the lines.
#' #' 
#' #' @param mode character string. Four possibilities corresponding to the
#' #' description above: "None", WholeMatrix", "AllCond" and "AtLeastOneCond".
#' #' 
#' #' @return the object given as input but with the lines not respecting the
#' #' proportion of NA requested in less.
#' #' 
#' #' @author Hélène Borges
#' #' 
#' #' @examples
#' #' utils::data(Exp1_R25_prot, package='DAPARdata')
#' #' filtered <- filterByProportion(obj = Exp1_R25_prot, intensities_proportion = 0.8, mode = "AtLeastOneCond")
#' #' 
#' #' @export
#' #' 
#' #' @importFrom stringr str_glue
#' #' @importFrom Biobase exprs pData fData
#' #' @import dplyr
#' #' @importFrom tidyr pivot_longer
#' #' @importFrom methods is
#' #' 
#' filterByProportion <- function(obj, intensities_proportion, mode = "None"){
#'   # check if mode is valid
#'   if(!(mode %in% c("None","WholeMatrix", "AllCond", "AtLeastOneCond"))){
#'     stop(stringr::str_glue("Wrong mode: {mode} is not a valid string.
#'                      Please choose between 'None', WholeMatrix', 'AllCond' or 'AtLeastOneCond'.",
#'                            call. =FALSE))
#'   }
#'   # check if intensities_proportion is valid
#'   if(!methods::is(intensities_proportion, "numeric" )){
#'     stop(stringr::str_glue("Wrong parameter: intensities_proportion needs to be numeric"))
#'   }else if(!dplyr::between(intensities_proportion,0,1)){
#'     stop(stringr::str_glue("Wrong parameter: intensities_proportion must be between 0 and 1"))
#'   }
#'   
#'   print(stringr::str_glue("chosen proportion of intensities to be present: {intensities_proportion}"))
#'   print(stringr::str_glue("chosen mode: {mode}"))
#'   intensities <- Biobase::exprs(obj)
#'   sTab <- Biobase::pData(obj)
#'   sTab$Condition <- as.factor(sTab$Condition)
#'   
#'   intensities_t <- as.data.frame(t(intensities))
#'   intensities_t <- dplyr::bind_cols(intensities_t,
#'                                     condition = sTab$Condition,
#'                                     sample = rownames(intensities_t))
#'   tbl_intensities <- dplyr::as_tibble(intensities_t, rownames = NA)
#'   longer_intensities <- tbl_intensities %>%
#'     tidyr::pivot_longer(-c(condition,sample), names_to = "feature", values_to = "intensity")
#'   # group_by does not keep the initial order when it is not a factor so to keep
#'   # the protein order, we cheat by transforming feature into a factor.
#'   longer_intensities$feature <- factor(longer_intensities$feature,
#'                                        levels = unique(longer_intensities$feature))
#'   if(mode == "None"){
#'     to_keep <- obj
#'   }else if(mode == "WholeMatrix"){
#'     nb_samples <- ncol(intensities)
#'     threshold <- ceiling(nb_samples*intensities_proportion)
#'     print(stringr::str_glue("missing value threshold {threshold}"))
#'     # for each feature (protein / peptide) we count the number of intensities present
#'     feat_grp <- longer_intensities %>%
#'       dplyr::group_by(feature) %>%
#'       dplyr::summarise(non_na = sum(!is.na(intensity)))
#'     to_keep <- obj[which(feat_grp$non_na >= threshold),]
#'     
#'   }else if(mode == "AllCond" || mode == "AtLeastOneCond"){
#'     workforces <- longer_intensities %>%
#'       dplyr::group_by(feature, condition) %>%
#'       dplyr::count(condition)
#'     # the number of samples per condition
#'     workforces <- workforces$n[seq_len(length(levels(sTab$Condition)))]
#'     
#'     # for each condition of each feature, we count the number of intensities present
#'     feat_grp <- longer_intensities %>%
#'       dplyr::group_by(feature, condition) %>%
#'       dplyr::summarise(non_na = sum(!is.na(intensity)))
#'     # the threshold for each condition
#'     thresholds <- ceiling(workforces*intensities_proportion)
#'     print(stringr::str_glue("for condition {unique(levels(longer_intensities$condition))} number of samples is {workforces}, so missing value threshold is {thresholds} "))
#'     # For each feature, each condition is compared with its respective
#'     # threshold, we put 0 if the protein has a number of intensities lower than
#'     # the threshold of the corresponding condition, and 1 otherwise
#'     check_th <- feat_grp %>%
#'       dplyr::group_by(feature) %>%
#'       dplyr::mutate(non_na = dplyr::case_when(
#'         non_na < thresholds ~ 0,
#'         TRUE ~ 1
#'       )) %>%
#'       dplyr::ungroup()
#'     # if it is AllCond then we must find the features for which all the conditions
#'     # respect the threshold
#'     if(mode == "AllCond"){
#'       all_cond_ok <- check_th %>%
#'         dplyr::group_by(feature) %>%
#'         dplyr::filter(all(non_na ==1)) %>%
#'         dplyr::ungroup() %>%
#'         as.data.frame()
#'       all_cond_ok$feature <- as.character(all_cond_ok$feature)
#'       to_keep <- obj[which(rownames(obj) %in% all_cond_ok$feature),]
#'     }else if(mode == "AtLeastOneCond"){
#'       # if it is AtLeastOneCond then we must find the features for which at
#'       # least one condition that respects the threshold
#'       any_cond_ok <- check_th %>%
#'         dplyr::group_by(feature) %>%
#'         dplyr::filter(any(non_na ==1)) %>%
#'         dplyr::ungroup() %>%
#'         as.data.frame()
#'       any_cond_ok$feature <- as.character(any_cond_ok$feature)
#'       to_keep <- obj[which(rownames(obj) %in% any_cond_ok$feature),]
#'     }
#'   }
#'   print(stringr::str_glue("There were initially {nrow(intensities)} features.
#'                  After filtering out the missing values, {nrow(exprs(to_keep))} remain."))
#'   return(to_keep)
#' }

Try the DAPAR package in your browser

Any scripts or data that you put into this service are public.

DAPAR documentation built on April 11, 2021, 6 p.m.