R/3.2_read_proteingroups.R

Defines functions read_maxquant_phosphosites read_proteingroups read_maxquant_proteingroups is_file label2index dequantify mqdt_to_mat drop_differing_uniprots .read_maxquant_phosphosites .read_maxquant_proteingroups un_int64 guess_maxquant_quantity

Documented in dequantify guess_maxquant_quantity is_file label2index .read_maxquant_phosphosites read_maxquant_phosphosites .read_maxquant_proteingroups read_maxquant_proteingroups read_proteingroups

#---------------------------------------------------------------------------
# 
#                       guess_maxquant_quantity
#
#---------------------------------------------------------------------------


#' maxquant quantity patterns
#' @examples
#' MAXQUANT_PATTERNS
#' @export
MAXQUANT_PATTERNS <- c(
    `normalizedratio`            = '^Ratio ([HM]/[ML]) normalized (.+)$',
    `ratio`                      = '^Ratio ([HM]/[ML]) (?!count|type|variability|iso-count|normalized)(.+)',
    `correctedreporterintensity` = '^Reporter intensity corrected ([0-9]+) (.+)$',
    `reporterintensity`          = '^Reporter intensity ([0-9]+) (.+)$',
    `maxlfq`                     = '^LFQ intensity ([HML] )? ?(.+)$',
    `labeledintensity`           = '^Intensity ([HML]) (.+)$',
    `intensity`                  = '^Intensity (.+)$'
)


#' Guess maxquant quantity from snames
#'
#' @param x character vector
#' @return  string: value from names(MAXQUANT_PATTERNS)
#' @examples
#' # file
#'     file <- system.file('extdata/fukuda20.proteingroups.txt', package = 'autonomics')
#'     guess_maxquant_quantity(file)
#'
#' # character vector
#'     x <- "Ratio M/L normalized STD(L)_E00(M)_E01(H)_R1"
#'     guess_maxquant_quantity(x)
#'
#'     x <- "Ratio M/L STD(L)_E00(M)_E01(H)_R1"
#'     guess_maxquant_quantity(x)
#'
#'     x <- "LFQ intensity E00.R1"
#'     guess_maxquant_quantity(x)
#'
#'     x <- "Reporter intensity corrected 0 STD(0)E00(1)E01(2)_R1"
#'     guess_maxquant_quantity(x)
#'
#'     x <- "Reporter intensity 0 STD(0)E00(1)E01(2)_R1"
#'     guess_maxquant_quantity(x)
#'
#'     x <- "Intensity H STD(L)_E00(M)_E01(H)_R1"
#'     guess_maxquant_quantity(x)
#'
#' @export
guess_maxquant_quantity <- function(x){
# Assert
    assert_is_character(x)
# read if x is filename
    if (is_scalar(x)){
        if (is_existing_file(x)){  
            x <- names(fread(x, header = TRUE, nrows = 1))
        }
    }
# guess from character vector
    for (quantity in names(MAXQUANT_PATTERNS)){
        pattern <- MAXQUANT_PATTERNS[[quantity]]
        if (any(stri_detect_regex(x, pattern)))   return(quantity)
    }
    stop('quantity could not be infered')
}


#---------------------------------------------------------------------------
# 
#                       .read_maxquant_proteingroups
#                       .read_maxquant_phosphosites
#
#---------------------------------------------------------------------------

utils::globalVariables('where')
un_int64 <- function(x) {
    dplyr::mutate(x, dplyr::across(where(bit64::is.integer64), as.numeric))
}

#' Read proteingroups/phosphosites as-is
#' @param file         proteingroups / phosphosites file
#' @param profile  proteingroups file
#' @param quantity     string
#' @param verbose      TRUE / FALSE
#' @return data.table
#' @examples 
#' profile <- system.file('extdata/billing19.proteingroups.txt', package = 'autonomics')
#' fosfile <- system.file('extdata/billing19.phosphosites.txt',  package = 'autonomics')
#' prodt <- .read_maxquant_proteingroups(file = profile)
#' fosdt <- .read_maxquant_phosphosites( file = fosfile, profile = profile)
#' @export
.read_maxquant_proteingroups <- function(file, quantity = guess_maxquant_quantity(file), verbose = TRUE){
# Assert
    assert_maxquant_proteingroups(file)
    assert_is_subset(quantity, names(MAXQUANT_PATTERNS))
# Read
    if (verbose)  cmessage('%sRead%sproteingroups %s%s%s', spaces(8), spaces(2), quantity, spaces(35-nchar(quantity)-nchar('proteingroups ')), file)
    prodt <- fread(file, colClasses = c(id = 'character'), integer64 = 'numeric')
    prodt %<>% un_int64()
  # prodt[Reverse == '+', `Majority protein IDs` := split_extract_fixed(`Majority protein IDs`, ';', 1)]
            # In FOS `Proteins` column is empty for Reverse, so `Protein` (singular!) is copied over
            # In PRO therefore take leading protein only for naming identity with FOS.
            # Upon further insight naming parity is probably not so important
            # So dont unnecessarily modify
# Extract relevant columns
    pattern <- MAXQUANT_PATTERNS[[quantity]]   # the proteins column is empty in (only) reverse
    anncols <- c('id', 'Majority protein IDs', 'Reverse', 
                 'Potential contaminant', 'Contaminant', 'Fasta headers')#, 'Phospho (STY) site IDs')
    anncols %<>% intersect(names(prodt))
    valcols <- grep(pattern, names(prodt), value = TRUE)
    pepcols <- grep('Razor + unique peptides ', names(prodt), fixed = TRUE, value = TRUE)
    prodt %<>% extract(, c(anncols, pepcols, valcols), with = FALSE)
    digits <- ceiling(log10(nrow(prodt)))
  # if (verbose)  message('\t\t', nrow(prodt), ' proteins, contaminants, reverse')
# Return
    names(prodt) %<>% stri_replace_first_fixed(          'Reverse',     'reverse')
    names(prodt) %<>% stri_replace_first_fixed(          'Contaminant', 'contaminant') # older MaxQuant
    names(prodt) %<>% stri_replace_first_fixed('Potential contaminant', 'contaminant') # newer MaxQuant
    setnames(prodt, 'id',                   'proId')
    setnames(prodt, 'Majority protein IDs', 'uniprot')
    prodt[]
}


#' @rdname dot-read_maxquant_proteingroups
#' @export
.read_maxquant_phosphosites <- function(
    file, profile, quantity = guess_maxquant_quantity(file), verbose = TRUE
){
# Assert
    assert_maxquant_proteingroups(profile)
    assert_maxquant_phosphosites(file)
    `Protein group IDs` <- Reverse <- Proteins <- Protein <- NULL
# Read    
    if (verbose)  cmessage('%sphosphosites  %s%s%s', spaces(14), quantity, spaces(35-nchar('phosphosites  ')-nchar(quantity)), file)
    colclasses <- c(id = 'character', `Protein group IDs` = 'character')
    fosdt <- fread(file, colClasses = colclasses, integer64 = 'numeric')
    fosdt %<>% un_int64()
# Extract relevant columns
    fosdt[Reverse == '+', Proteins := Protein]  # the proteins column is empty in (only) reverse
    pattern <- MAXQUANT_PATTERNS[[quantity]]    # this fails the feature_id naming
    anncols <- c('id', 'Protein group IDs', 'Proteins', 
                 'Positions within proteins', 'Amino acid', 
                 'Reverse', 'Contaminant', 'Potential contaminant', 'Fasta headers')
    anncols %<>% intersect(names(fosdt))
    valcols <- grep(pattern, names(fosdt), value = TRUE)
    valcols %<>% extract(stri_detect_fixed(., '___1'))
    fosdt %<>% extract(, c(anncols, valcols), with = FALSE)
    digits <- ceiling(log10(nrow(fosdt)))
    if (verbose)  message(spaces(55), 'Read   ', formatC(nrow(fosdt), digits = digits), 
                          ' phosphosites in proteins, contaminants, reverse')
    names(fosdt) %<>% stri_replace_first_fixed(          'Reverse',     'reverse')
    names(fosdt) %<>% stri_replace_first_fixed(          'Contaminant', 'contaminant') # older MaxQuant
    names(fosdt) %<>% stri_replace_first_fixed('Potential contaminant', 'contaminant') # newer MaxQuant
# Filter
    fosdt <- fosdt[stri_count_fixed(`Protein group IDs`, ';') == 0]
    if (verbose)  message(spaces(55), 'Retain ', formatC(nrow(fosdt), digits = digits), 
                          ' phosphosites in single proteingroup')
    idx <- rowSums(fosdt[, valcols, with = FALSE], na.rm = TRUE) > 0
    fosdt <- fosdt[which(idx)]
    if (verbose)  message(spaces(55), 'Retain ', formatC(nrow(fosdt), digits = digits), 
                          ' phosphosites with signal in some sample')
# Rename 
    names(fosdt) %<>% stri_replace_first_fixed('___1', '')
    setnames(fosdt, 'id',                'fosId')
    setnames(fosdt, 'Protein group IDs', 'proId')
    setnames(fosdt, 'Proteins',          'uniprot')
    fosdt[]
}


#---------------------------------------------------------------------------
# 
#                      drop_differing_uniprots
#
#---------------------------------------------------------------------------


drop_differing_uniprots <- function(fosdt, prodt, verbose){
# Assert
    contaminant <- fosId <- `Positions within proteins` <- proId <- NULL
    reverse <- uniprot <- NULL
# Avoid changing the global env
    prodt %<>% copy()
    fosdt %<>% copy()
# Extract annotation cols
    if (verbose)  message(spaces(55), 'Keep proteingroup uniprots')
    proanncols <- c('proId', 'uniprot', 'reverse', 'contaminant')
    fosanncols <- c('fosId', 'proId', 'uniprot', 'Positions within proteins', 'reverse', 'contaminant')
    proanndt <- prodt[, proanncols, with = FALSE]
    fosanndt <- fosdt[, fosanncols, with = FALSE]
# Separate contaminants-reverse from other proteins.
    conrevdt <- fosanndt[reverse == '+' | contaminant == '+']
    fosanndt <- fosanndt[reverse == ''  & contaminant == '']
    proanndt <- proanndt[reverse == ''  & contaminant == '']
    fosanndt[, c('reverse', 'contaminant') := NULL]
    proanndt[, c('reverse', 'contaminant') := NULL]
    conrevdt[, c('reverse', 'contaminant') := NULL]
# Merge
    fosanndt %<>% uncollapse(uniprot, `Positions within proteins`, sep = ';')
    proanndt %<>% uncollapse(uniprot,                              sep = ';')
    fosanndt %<>% merge(proanndt, by = c('proId', 'uniprot'))
    fosanndt %<>% extract(order(as.integer(fosId)))
    fosanndt %<>% extract(, lapply(.SD, paste0, collapse = ';'), by = 'fosId')
# Add back contaminants/reverse
    fosanndt %<>% rbind(conrevdt)
    fosanndt[, proId := NULL]
    fosdt[, c('uniprot', 'Positions within proteins') := NULL]
    fosdt %<>% merge(fosanndt, by = 'fosId', all.x = TRUE, sort = FALSE)
    fosdt %<>% pull_columns(names(fosanndt))
    fosdt    
}


#---------------------------------------------------------------------------
# 
#                 mqdt_to_mat
#                 dequantify
#                     label2index
#
#---------------------------------------------------------------------------


#' Convert maxquant data.table to matrix
#' @param dt       data.table
#' @param pattern  string
#' @param logbase  base for value logaritmization
#' @param verbose  TRUE / FALSE
#' @return matrix
#' @examples 
#' profile <- system.file('extdata/billing19.proteingroups.txt',  package = 'autonomics')
#' fastafile <- system.file('extdata/uniprot_hsa_20140515.fasta', package = 'autonomics')
#' prodt <- .read_maxquant_proteingroups(file = profile)
#'     uniprothdrs <- read_uniprotdt(fastafile)
#' contaminanthdrs <- read_contaminantdt()
#'    maxquanthdrs <- parse_maxquant_hdrs(prodt$`Fasta headers`)
#' prodt %<>% annotate_maxquant(uniprothdrs, contaminanthdrs, maxquanthdrs)
#' quantity <- guess_maxquant_quantity(profile)
#' pattern <- MAXQUANT_PATTERNS[[quantity]]
#' mqdt_to_mat(prodt, pattern = pattern)[1:2, 1:2]
#' @noRd
mqdt_to_mat <- function(dt, pattern, logbase = 2, verbose = TRUE){
    mat <- dt[, .SD, .SDcols = patterns(pattern)]
    mat %<>% data.matrix()
    rownames(mat) <- dt$feature_id
    mat %<>% zero_to_na(verbose = verbose) 
    mat %<>% nan_to_na( verbose = verbose)
    if (!is.null(logbase)) mat <- log(mat, base = logbase)
    mat
}


#' Dequantify maxquant snames
#' 
#' Drop quantity ('Reporter intensity'). \cr
#' Encode `{channel}` as suffix.
#' 
#' `               Ratio H/L WT(L).KD(H).R1  ->  WT(L).KD(H).R1{H/L}`
#' `                    LFQ intensity WT.R1  ->  WT.R1`
#' `Reporter intensity 0 WT(126).KD(127).R1  ->  WT(1).KD(2).R1{1}`
#' @param x        `character`
#' @param quantity `'ratio',              'normalizedratio'`,  \cr
#'                 `'LFQ intensity'`, \cr
#'                 `'intensity',          'labeledintensity'`
#'                 `'reporterintensity', 'correctedreporterintensity'`
#' @param verbose  `TRUE` or `FALSE`
#' @return `character`
#' @examples
#' dequantify(c('Ratio H/L WT(L).KD(M).OE(H).R1',             # Ratios
#'              'Ratio M/L WT(L).KD(M).OE(H).R1'))
#' dequantify(c('Ratio H/L normalized WT(L).KD(M).OE(H).R1',  # Norm. Ratios
#'              'Ratio M/L normalized WT(L).KD(M).OE(H).R1'))
#' dequantify(c('LFQ intensity WT.R1',                        # LFQ intensity
#'              'LFQ intensity KD.R1'))
#' dequantify(c('Reporter intensity 1 WT(126).KD(127).R1',    # Rep.intensities
#'              'Reporter intensity 2 WT(126).KD(127).R1'))
#' @md
#' @export
dequantify <- function(
    x, quantity = guess_maxquant_quantity(x), verbose  = FALSE
){
# x = multiplex + channel. Return multiplex if single channel.
# Decompose multiplex and channel
    pattern <- MAXQUANT_PATTERNS[[quantity]]
    if (quantity == 'intensity'){
        multiplex <- stri_replace_first_regex(x, pattern, '$1')
        channel   <- rep('', length(multiplex))
    } else {
        multiplex <- stri_replace_first_regex(x, pattern, '$2')
        channel   <- stri_replace_first_regex(x, pattern, '$1')
        channel %<>% stri_replace_first_fixed(' ', '')
    }
# Reporter intensity
    if (quantity %in% c('reporterintensity', 'correctedreporterintensity')){
        multiplex %<>% lapply(label2index)
        channel %<>% as.numeric()
        if (0 %in% channel){                           # pre-2018 mq is 0-based
            channel %<>% as.numeric() %>% add(1) %>% as.character()
        }
    }
# Standardize
    if (all(channel=='')){
        cleanx <- multiplex
    } else {
        cleanx <- sprintf('%s{%s}', multiplex, channel)
    }
    if (verbose)  cmessage('%sStandardize snames: %s  ->  %s', spaces(14), x[1], cleanx[1])
    return(cleanx)
}


#' Convert labels into indices
#' @param x  `character`
#' @examples
#' label2index(x = 'Reporter intensity 0 WT(0).KD(1).OE(2).R1')
#' label2index(x = 'Reporter intensity 1 WT(1).KD(2).OE(3).R1')
#' label2index(x = 'Reporter intensity 0 WT(126).KD(127).OE(128).R1')
#' label2index(x = 'Reporter intensity 1 WT(126).KD(127).OE(128).R1')
#' label2index(x = 'Reporter intensity 1 Mix1')
#' @export
label2index <- function(x){
    labels <- unlist(stri_extract_all_regex(x, '\\(.+?\\)'))
    if (any(is.na(labels)))  return(x)
    for (i in rev(seq_along(labels))){ 
        # important to do this in rev order!!! otherwise ...
        #                 Reporter intensity 0 WT(0).KD(1).OE(2).R1
        # i=1: (0) -> 1 : Reporter intensity 0 WT(1).KD(1).OE(2).R1
        # i=2: (1) -> 2 : Reporter intensity 0 WT(2).KD(1).OE(2).R1
        # not what we want!!!
        x %<>% stri_replace_first_fixed(labels[[i]], paste0('(',i,')'))
    }
    x
}


#---------------------------------------------------------------------------
# 
#                   read_maxquant_proteingroups
#                   read_maxquant_phosphosites
#
#---------------------------------------------------------------------------



#' Is a file?
#'
#' Is a file (and not a dir)
#'
#' This function distinguishies between dir and file.
#' Others dont: is.file, fs::file_exists, assertive::is_existing_file
#' @param file filepath
#' @examples
#' dir  <- tempdir();  dir.create(dir, showWarnings = FALSE)
#' file <- tempfile(); invisible(file.create(file))
#' is_file(dir)
#' is_file(file)
#' @export
is_file <- function(file){
    file.exists(file) & !dir.exists(file)
}


#' Read maxquant proteingroups
#' @param dir           proteingroups directory
#' @param file          proteingroups file
#' @param fastafile     uniprot fastafile
#' @param restapi       TRUE or FALSE : use uniprot restapi to annotate uniprots not in fastadt ?
#' @param quantity     'normalizedratio', 'ratio', 'correctedreporterintensity', 
#'                     'reporterintensity', 'maxlfq', 'labeledintensity', 
#'                     'intensity' or NULL
#' @param subgroups     NULL or string vector : subgroups to retain
#' @param contaminants  TRUE or FALSE : retain contaminants ?
#' @param reverse       TRUE or FALSE : include reverse hits ?
#' @param rm_missing_in_all_samples TRUE or FALSE
#' @param invert        string vector : subgroups which require inversion
#' @param impute        TRUE or FALSE: impute group-specific NA values?
#' @param plot          TRUE or FALSE: plot ?
#' @param label         fvar
#' @param pca           TRUE or FALSE: run pca ?
#' @param pls           TRUE or FALSE: run pls ?
#' @param fit           model engine: 'limma', 'lm', 'lme(r)', 'wilcoxon' or NULL
#' @param formula       model formula
#' @param block         model blockvar: string or NULL
#' @param coefs         model coefficients    of interest: character vector or NULL
#' @param contrasts     coefficient contrasts of interest: character vector or NULL
#' @param palette       color palette : named character vector
#' @param verbose       TRUE or FALSE : message ?
#' @param ...           maintain deprecated functions
#' @return SummarizedExperiment
#' @examples
#' # fukuda20 - LFQ
#'     file <- system.file('extdata/fukuda20.proteingroups.txt', package = 'autonomics')
#'     pro <- read_maxquant_proteingroups(file = file)
#'     
#' # billing19 - Normalized Ratios
#'          file <- system.file('extdata/billing19.proteingroups.txt', package = 'autonomics')
#'     fastafile <- system.file('extdata/uniprot_hsa_20140515.fasta',  package = 'autonomics')
#'     subgroups <- sprintf('%s_STD', c('E00', 'E01', 'E02', 'E05', 'E15', 'E30', 'M00'))
#'     pro <- read_maxquant_proteingroups(file = file, subgroups = subgroups)
#'     pro <- read_maxquant_proteingroups(file = file, fastafile = fastafile, subgroups = subgroups)
#' @export
read_maxquant_proteingroups <- function(
                          dir = getwd(), 
                         file = if (is_file(dir)) dir else file.path(dir, 'proteinGroups.txt'), 
                    fastafile = NULL,
                      restapi = FALSE,
                     quantity = NULL,
                    subgroups = NULL,
                       invert = character(0),
                 contaminants = FALSE,
                      reverse = FALSE,
    rm_missing_in_all_samples = TRUE, 
                       impute = FALSE,
                         plot = FALSE,
                        label = 'feature_id',
                          pca = plot,
                          pls = plot, 
                          fit = if (plot) 'limma' else NULL,
                     groupvar = 'subgroup',
                      formula = as.formula(sprintf('~ %s', groupvar)),
                        block = NULL, 
                        coefs = NULL, 
                    contrasts = NULL,
                      palette = NULL, 
                      verbose = TRUE
){
# Assert
    assert_maxquant_proteingroups(file)
    if (is.null(quantity))  quantity <- guess_maxquant_quantity(file)
    assert_is_a_string(quantity)
    assert_is_subset(quantity, names(MAXQUANT_PATTERNS))
    assert_is_a_bool(verbose)
    `Fasta headers` <- NULL
# Read/Annotate
    prodt <- .read_maxquant_proteingroups(file = file, quantity = quantity, verbose = verbose)
        uniprothdrs <- read_uniprotdt(fastafile)
    contaminanthdrs <- read_contaminantdt()
       maxquanthdrs <- parse_maxquant_hdrs(prodt$`Fasta headers`); prodt[, `Fasta headers` := NULL ]
    prodt %<>% annotate_maxquant( uniprothdrs = uniprothdrs, 
                              contaminanthdrs = contaminanthdrs, 
                                 maxquanthdrs = maxquanthdrs, 
                                      restapi = restapi,
                                      verbose = verbose )
# SumExp
    if (verbose)  cmessage('%sSumExp', spaces(4))
    pattern <- MAXQUANT_PATTERNS[[quantity]]
    promat <- mqdt_to_mat(prodt, pattern, verbose = verbose)
    pepcols <- names(prodt) %>% extract(stri_detect_fixed(., 'eptides'))
    pepdt <- prodt[, pepcols, with = FALSE]
    prodt %<>% extract(, names(prodt) %>% setdiff(colnames(promat)) %>% setdiff(names(pepdt)), with = FALSE)
    object <- list(promat)
    names(object) <- paste0('log2', quantity)
    object %<>% SummarizedExperiment(rowData = prodt)
# Dequantify. Add pepcounts
    object$mqcol <- colnames(object)
    colnames(object) %<>% dequantify(quantity = quantity, verbose = verbose)
    pepcols <- paste0('Razor + unique peptides ', gsub('\\{.+\\}', '', colnames(object)))
    pepmat <- pepdt[, pepcols, with = FALSE ]
    pepmat %<>% data.matrix()
    dimnames(pepmat) <- dimnames(object)
    assays(object)$pepcounts <- pepmat
    object %<>% process_maxquant( subgroups = subgroups,      
                                     invert = invert,
                                    reverse = reverse,  
                               contaminants = contaminants,   
                  rm_missing_in_all_samples = rm_missing_in_all_samples, 
                                     impute = impute,
                                    verbose = verbose )
    assays <- c(assayNames(object)[1], 'pepcounts')
    for (assay in assays)  object %<>% add_assay_means(assay)
    object %<>% analyze(
        pca          = pca,           pls       = pls,
        fit          = fit,           formula   = formula,
        block        = block,         coefs     = coefs,
        contrasts    = contrasts,     plot      = plot,
        label        = label,
        palette      = palette,       verbose   = verbose )
    object
}


#' @rdname read_maxquant_proteingroups
#' @export
read_proteingroups <- function(...){
    .Deprecated('read_maxquant_proteingroups')
    read_maxquant_proteingroups(...)
}


#' Read maxquant phosphosites
#' @param dir           proteingroups directory
#' @param fosfile       phosphosites  file
#' @param profile       proteingroups file
#' @param fastafile     uniprot fastafile
#' @param restapi       TRUE or FALSE : annotate non-fastadt uniprots using uniprot restapi
#' @param quantity     'normalizedratio', 'ratio', 'correctedreporterintensity', 
#'                     'reporterintensity', 'maxlfq', 'labeledintensity', 
#'                     'intensity' or NULL
#' @param subgroups     NULL or string vector : subgroups to retain
#' @param contaminants  TRUE or FALSE: retain contaminants ?
#' @param reverse       TRUE or FALSE: include reverse hits 
#' @param rm_missing_in_all_samples TRUE or FALSE
#' @param localization  number: min localization probability (for phosphosites)
#' @param invert        string vector: subgroups which require inversion
#' @param impute        TRUE or FALSE: impute group-specific NA values?
#' @param plot          TRUE or FALSE
#' @param label         fvar
#' @param pca           TRUE or FALSE: run pca ?
#' @param pls           TRUE or FALSE: run pls ?
#' @param fit           model engine: 'limma', 'lm', 'lme(r)', 'wilcoxon' or NULL
#' @param formula       model formula
#' @param block         model blockvar: string or NULL
#' @param coefs         model coefficients          of interest: string vector or NULL
#' @param contrasts     model coefficient contrasts of interest: string vector or NULL
#' @param palette       color palette: named string vector
#' @param verbose       TRUE or FALSE: message ?
#' @param ...           maintain deprecated functions
#' @return SummarizedExperiment
#' @examples
#'   profile <- system.file('extdata/billing19.proteingroups.txt', package = 'autonomics')
#'   fosfile <- system.file('extdata/billing19.phosphosites.txt',  package = 'autonomics')
#' fastafile <- system.file('extdata/uniprot_hsa_20140515.fasta',  package = 'autonomics')
#' subgroups <- sprintf('%s_STD', c('E00', 'E01', 'E02', 'E05', 'E15', 'E30', 'M00'))
#' pro <- read_maxquant_proteingroups(file = profile, subgroups = subgroups)
#' fos <- read_maxquant_phosphosites(fosfile = fosfile, profile = profile, subgroups = subgroups)
#' fos <- read_maxquant_phosphosites(fosfile = fosfile, profile = profile, fastafile = fastafile, subgroups = subgroups)
#' @export
read_maxquant_phosphosites <- function(
                      dir = getwd(), 
                  fosfile = if (is_file(dir)) dir else file.path(dir, 'phospho (STY)Sites.txt'), 
                  profile = file.path(dirname(fosfile), 'proteinGroups.txt'), 
                fastafile = NULL,
                  restapi = FALSE,
                 quantity = NULL,   
                subgroups = NULL,
                   invert = character(0),
             contaminants = FALSE,
                  reverse = FALSE,
rm_missing_in_all_samples = TRUE,
             localization = 0.75,
                   impute = FALSE,
                     plot = FALSE,
                    label = 'feature_id',
                      pca = plot, 
                      pls = plot,
                      fit = if (plot) 'limma' else NULL,
                 groupvar = 'subgroup',
                  formula = as.formula(sprintf('~ %s', groupvar)),
                    block = NULL,
                    coefs = NULL,
                contrasts = NULL, 
                  palette = NULL,
                  verbose = TRUE
){
# Assert
    assert_all_are_existing_files(c(fosfile, profile))
    if (is.null(quantity))  quantity <- guess_maxquant_quantity(fosfile)
    assert_is_a_string(quantity)
    assert_is_subset(quantity, names(MAXQUANT_PATTERNS))
    assert_is_a_bool(verbose)
    `Fasta headers` <- NULL
# Read
    prodt <- .read_maxquant_proteingroups(file = profile, quantity = quantity, verbose = verbose)
    fosdt <- .read_maxquant_phosphosites( file = fosfile, quantity = quantity, profile = profile, verbose = verbose)
    fosdt %<>% drop_differing_uniprots(prodt, verbose = verbose)
        uniprothdrs <- read_uniprotdt(fastafile)
    contaminanthdrs <- read_contaminantdt()
       maxquanthdrs <- parse_maxquant_hdrs(prodt$`Fasta headers`); fosdt[, `Fasta headers` := NULL ]
    fosdt %<>% annotate_maxquant( uniprothdrs = uniprothdrs, 
                              contaminanthdrs = contaminanthdrs, 
                                 maxquanthdrs = maxquanthdrs, 
                                      restapi = restapi,
                                      verbose = verbose )
    prodt %<>% extract(fosdt$proId, on = 'proId')
# SumExp
    if (verbose)  cmessage('%sSumExp', spaces(4))
    pattern <- MAXQUANT_PATTERNS[[quantity]]
    promat <- mqdt_to_mat(prodt, pattern, verbose = verbose)
    pepcols <- names(prodt) %>% extract(stri_detect_fixed(., 'eptides'))
    pepdt <- prodt[, pepcols, with = FALSE]
    prodt %<>% extract(, names(prodt) %>% setdiff(colnames(promat)) %>% setdiff(names(pepdt)), with = FALSE)
    fosmat <- mqdt_to_mat(fosdt, pattern, verbose = verbose)
    fosdt <- fosdt %>% extract(, setdiff(names(.), colnames(fosmat)), with = FALSE)
    object <- SummarizedExperiment::SummarizedExperiment(
        assays  = list(log2sites    = fosmat,
                       log2proteins = promat,
                       log2diffs = fosmat - promat),
        rowData = fosdt)
# Dequantify. Add pepcounts
    object$mqcol <- colnames(object)
    colnames(object) %<>% dequantify()
    pepcols <- paste0('Razor + unique peptides ', gsub('\\{.+\\}', '', colnames(object)))
    pepmat <- pepdt[, pepcols, with = FALSE ]
    pepmat %<>% data.matrix()
    dimnames(pepmat) <- dimnames(object)
    assays(object)$pepcounts <- pepmat
# Process / Analyze
    object %<>% process_maxquant( subgroups = subgroups,     
                                     invert = invert,      
                                    reverse = reverse, 
                               contaminants = contaminants,  
                  rm_missing_in_all_samples = rm_missing_in_all_samples,
                               localization = localization, 
                                     impute = impute,
                                    verbose = verbose )
    assays <- c('log2sites', 'pepcounts')
    for (assay in assays)  object %<>% add_assay_means(assay)
    object %<>% analyze(
        pca       = pca,           pls       = pls,
        fit       = fit,           formula   = formula,
        block     = block,         coefs     = coefs,
        contrasts = contrasts,     plot      = plot,
        label     = label,
        palette   = palette,       verbose   = verbose )
    object
}


#' @rdname read_maxquant_phosphosites
#' @export
read_phosphosites <- function(...){
    .Deprecated('read_maxquant_phosphosites')
   read_maxquant_phosphosites(...) 
}


#-----------------------------------------------------------------------------
#
#                            invert_subgroups
#
#-----------------------------------------------------------------------------

# x <- c('Ctrl_A', 'Ctrl_B')
# .invert_subgroups(x)
.invert_subgroups <- function(x, sep = guess_sep(x)){
    stri_split_fixed(x, sep)                      %>%
    lapply(rev)                                   %>%
    vapply(paste, character(1), collapse = sep)
}


#' Invert subgroups
#'
#' Invert expressions , subgroups, and sample ids
#'
#' @param  object     SummarizedExperiment
#' @param  subgroups  character vector: subgroup levels to be inversed
#' @param  sep        string: collapsed string separator
#' @return character vector or SummarizedExperiment
#' @examples
#' file <- system.file('extdata/fukuda20.proteingroups.txt', package = 'autonomics')
#' object <- read_maxquant_proteingroups(file)
#' invert_subgroups(object)
#' @export
invert_subgroups <- function(
    object, 
    subgroups = slevels(object, 'subgroup'), 
    sep = guess_sep(object, 'subgroup')
){
# Assert / Msg
    if (length(subgroups)==0) return(object)
    assert_is_all_of(object, 'SummarizedExperiment')
    assert_is_subset('subgroup', svars(object))
    assert_is_subset(subgroups, subgroup_levels(object))
    idx <- which(object$subgroup %in% subgroups)
    first <- values(object)[, idx[1]] %>% (function(y) which(!is.na(y))[[1]])
    oldvalue <- values(object)[first, idx[1]] %>% round(2) %>% as.character()
    cmessage('%sInvert subgroups %s', spaces(14), paste0(subgroups, collapse = ', '))
# Invert (log) ratios
    if (all(values(object)>0, na.rm=TRUE)){ values(object)[, idx] %<>% (function(object){1/object})
    } else {                                values(object)[, idx] %<>% (function(object){ -object})}
    newvalue <- as.character(round(values(object)[first, idx[1]], 2))
    cmessage('%sexprs    : %s -> %s', spaces(21), as.character(oldvalue), 
            as.character(newvalue))
# Invert subgroup and sampleid values
    oldsubgroups <- sdata(object)$subgroup[idx]
    newsubgroups <- vapply(oldsubgroups, .invert_subgroups, character(1),sep=sep)
    oldsampleids <- sdata(object)$sample_id[idx]
    for (i in seq_along(idx)){
        sdata(object)$subgroup[ idx[i]] <- newsubgroups[i]
        sdata(object)$sample_id[idx[i]] %<>% stri_replace_first_fixed(oldsubgroups[i], newsubgroups[i])
        snames(object)[         idx[i]] %<>% stri_replace_first_fixed(oldsubgroups[i], newsubgroups[i])
    }
    newsampleids <- sdata(object)$sample_id[idx]
    cmessage('%ssubgroups: %s -> %s', spaces(21), oldsubgroups[1], newsubgroups[1])
    cmessage('%ssampleids: %s -> %s', spaces(21), oldsampleids[1], newsampleids[1])
# Order on subgroup
    object %<>% arrange_samples_('subgroup')
    cmessage('%sOrder on subgroup', spaces(14))
    object$subgroup %<>% droplevels()

# Return
    return(object)
}

arrange_samples_ <- function(object, svars){
    idx <- do.call(order, sdata(object)[, svars, drop = FALSE])
    object %<>% extract(, idx)
    return(object)
}


#---------------------------------------------------------------------------
#
#                     demultiplex
#                         .is_multiplexed
#                         .demultiplex
#
#---------------------------------------------------------------------------


#' Demultiplex snames
#' 
#' Demultiplex maxquant samplenames
#' 
#' `WT(L).KD(H).R1{H/L}  -> KD_WT.R1`
#' `WT(1).KD(2).R1{1}    -> WT.R1`
#' `         WT.R1       -> WT.R1`
#' @param x       character vector
#' @param verbose TRUE or FALSE
#' @return character
#' @examples
#' # uniplexed / intensity / ratio
#'    demultiplex(c('KD.R1','OE.R1'))
#'    demultiplex(c('WT(L).KD(M).OE(H).R1{M}',  'WT(L).KD(M).OE(H).R1{H}'))
#'    demultiplex(c('WT(L).KD(M).OE(H).R1{M/L}','WT(L).KD(M).OE(H).R1{H/L}'))
#' # run / replicate
#'    demultiplex(c('WT(L).OE(H).R1{L}',    'WT(L).OE(H).R1{H}'))     # run
#'    demultiplex(c('WT.R1(L).OE.R1(H){L}', 'WT.R1(L).OE.R1(H){H}'))  # repl
#' # label / index
#'    demultiplex(c('WT(L).OE(H).R1{L}',    'WT(L).OE(H).R1{H}'))     # label
#'    demultiplex(c('WT(1).OE(2).R1{1}',    'WT(1).OE(2).R1{2}'))     # index
#' # with unused channels
#'    demultiplex('WT(1).KD(2).OE(3).R1{6}')
#' @md
#' @export
demultiplex <- function(x, verbose = FALSE){
    assert_is_character(x)
    assert_is_a_bool(verbose)
    . <- NULL
    if (!.is_multiplexed(x)) return(x)
    y <- unname(vapply(x, .demultiplex, character(1)))
    make.unique(y, sep = '-')
}


.is_multiplexed <- function(x){
# Components
    pattern <- '(.+)\\{(.+)\\}'
    n_open   <- stri_count_fixed(x, '(')
    n_closed <- stri_count_fixed(x, ')')
    channel <- gsub(pattern, '\\2', x)
# Multiplexed consistently ?
    y <- all(stri_detect_regex(x, pattern) &
                 n_open > 0                    &  
                 n_open == n_closed)
# All channels defined (TMT) ? 
    if (all(is_numeric_string(channel))){
        channel %<>% as.numeric()
        y %<>% and(all(as.numeric(channel) <= n_open))
    }
    y
}

.demultiplex <- function(y){
    y0 <- y
    channel <- split_extract_fixed(y, '{', 2) %>% substr(1, nchar(.)-1)
    y %<>% stri_replace_last_fixed(paste0('{', channel, '}'), '')
    
    labels     <- y %>% stri_extract_all_regex('\\([^()]+\\)') %>% unlist() %>% substr(2, nchar(.)-1)
    run        <- y %>% stri_split_regex('\\([^[()]]+\\)')     %>% unlist() %>% extract(length(.))
    biosamples <- y %>% stri_split_regex('\\([^[()]]+\\)')     %>% unlist() %>% extract(-length(.))
    biosamples %<>% stri_replace_first_regex('^[._ ]', '')
    assert_are_same_length(biosamples, labels)
    names(biosamples) <- labels
    channel %<>% stri_split_fixed('/') %>% unlist()
    
    if (!all(channel %in% labels)) return(y0)
    
    result <- paste0(biosamples[channel], collapse = '_')
    result %<>% paste0(run)
    result
}


#----------------------------------------------------------------------------
#
#                 process_maxquant
#
#----------------------------------------------------------------------------

process_maxquant <- function(
    object, subgroups, invert, contaminants, reverse, rm_missing_in_all_samples = TRUE, 
    localization = 0.75, impute, verbose
){
# Demultiplex. Infer Subgroup
    contaminant <- `Localization prob` <- NULL
    colnames(object) %<>% demultiplex(verbose = verbose)
    object$sample_id <- colnames(object)
    object %<>% add_subgroup(verbose = verbose)
    cols <- c('sample_id', 'subgroup', 'replicate', 'mqcol')
    cols %<>% intersect(svars(object))
    sdt(object) %<>% pull_columns(cols)
# Samples
    object %<>% filter_samples_available_for_some_feature(verbose = verbose)
    if (!is.null(subgroups)){
        assert_is_subset(subgroups, as.character(object$subgroup))
        object %<>% filter_samples(subgroup %in% subgroups, verbose = verbose)
    }
    object %<>% invert_subgroups(invert)
# Features
    analysis(object)$nfeatures <- c(nrow(object))
    if (!reverse)             object %<>% filter_features(   reverse == '', verbose = verbose)
    if (!contaminants)        object %<>% filter_features(contaminant== '', verbose = verbose)
    if ({rm_missing_in_all_samples})  object %<>% rm_missing_in_all_samples(verbose = verbose)
    #object %<>% filter_exprs_replicated_in_some_subgroup(verbose = verbose) # doesnt work for single-instance subgroups
    if ('Localization prob' %in% fvars(object)){                             # subgroup could be increasing concentrations or so
        object %<>% filter_features(
            `Localization prob` >= localization, verbose = verbose)  }
# Impute
    if ({{impute}})  object %<>% impute(plot = FALSE)
    object
}


#---------------------------------------------------------------------------
#
#                       log2proteins
#
#---------------------------------------------------------------------------


#' Get/Set log2proteins
#' @param object SummarizedExperiment
#' @param value occupancy matrix (features x samples)
#' @return occpuancy matrix (get) or updated object (set)
#' @examples
#' file <- system.file('extdata/fukuda20.proteingroups.txt', package = 'autonomics')
#' object <- read_maxquant_proteingroups(file)
#' log2proteins(object)[1:3, 1:3]
#' @rdname log2proteins
#' @export
setGeneric('log2proteins', function(object)   standardGeneric("log2proteins"))

#' @rdname log2proteins
setMethod("log2proteins", signature("SummarizedExperiment"),
function(object)   assays(object)$log2proteins)


#' @rdname log2proteins
#' @export
setGeneric('log2proteins<-',
function(object, value) standardGeneric("log2proteins<-"))

#' @rdname log2proteins
setReplaceMethod("log2proteins", signature("SummarizedExperiment", "matrix"),
function(object, value){
    assays(object)$log2proteins <- value
    object })

#' @rdname log2proteins
setReplaceMethod("log2proteins", signature("SummarizedExperiment", "numeric"),
function(object, value){
    assays(object)$log2proteins[] <- value
    object })


#---------------------------------------------------------------------------
#
#                       log2sites
#
#---------------------------------------------------------------------------


#' Get/Set log2sites
#' @param object SummarizedExperiment
#' @param value occupancy matrix (features x samples)
#' @return occpuancy matrix (get) or updated object (set)
#' @examples
#' file <- system.file('extdata/fukuda20.proteingroups.txt', package = 'autonomics')
#' object <- read_maxquant_proteingroups(file)
#' log2sites(object)[1:3, 1:3]
#' @rdname log2sites
#' @export
setGeneric('log2sites', function(object)   standardGeneric("log2sites"))

#' @rdname log2sites
setMethod("log2sites", signature("SummarizedExperiment"),
function(object)   assays(object)$log2sites)


#' @rdname log2sites
#' @export
setGeneric('log2sites<-',
function(object, value) standardGeneric("log2sites<-"))

#' @rdname log2sites
setReplaceMethod("log2sites", signature("SummarizedExperiment", "matrix"),
function(object, value){
    assays(object)$log2sites <- value
    object })

#' @rdname log2sites
setReplaceMethod("log2sites", signature("SummarizedExperiment", "numeric"),
function(object, value){
    assays(object)$log2sites[] <- value
    object })


#---------------------------------------------------------------------------
#
#                       log2diffs
#
#---------------------------------------------------------------------------


#' Get/Set log2diffs
#' @param object SummarizedExperiment
#' @param value occupancy matrix (features x samples)
#' @return occpuancy matrix (get) or updated object (set)
#' @examples
#' file <- system.file('extdata/fukuda20.proteingroups.txt', package = 'autonomics')
#' object <- read_maxquant_proteingroups(file)
#' log2diffs(object)[1:3, 1:3]
#' @rdname log2diffs
#' @export
setGeneric('log2diffs', function(object)   standardGeneric("log2diffs"))

#' @rdname log2diffs
setMethod("log2diffs", signature("SummarizedExperiment"),
function(object)   assays(object)$log2diffs)


#' @rdname log2diffs
#' @export
setGeneric('log2diffs<-',
function(object, value) standardGeneric("log2diffs<-"))

#' @rdname log2diffs
setReplaceMethod("log2diffs", signature("SummarizedExperiment", "matrix"),
function(object, value){
    assays(object)$log2diffs <- value
    object })

#' @rdname log2diffs
setReplaceMethod("log2diffs", signature("SummarizedExperiment", "numeric"),
function(object, value){
    assays(object)$log2diffs[] <- value
    object })
bhagwataditya/importomics documentation built on Oct. 29, 2024, 3:19 p.m.