R/getResult.R

Defines functions .get_bulk_files .mgnify_get_single_analysis_results .mgnify_get_analyses_results .mgnify_get_single_analysis_treese .mgnify_get_analyses_treese .create_TreeSE_from_func_data .convert_results_to_object

#' Get microbial and/or functional profiling data for a list of accessions
#'
#' @details
#' Given a set of analysis accessions and collection of annotation types,
#' the function queries the MGNify API and returns the results. This function
#' is convenient for retrieving highly structured (analysis vs counts) data on
#' certain instances. For example, BIOM files are downloaded automatically.
#' If you want just to retrieve raw data from the database, see \code{getData}.
#'
#' @param x A \code{MgnifyClient} object.
#'
#' @param accession A single character value or a vector of character values
#' specifying accession IDs to return results for.
#'
#' @param output A single character value specifying the format of an output.
#' Must be one of the following options: \code{"TreeSE"}, \code{"list"}, or 
#' \code{"phyloseq"}. (By default: \code{output = "TreeSE"})
#'
#' @param get.taxa A boolean value specifying whether to retrieve taxonomy
#' data (OTU table). See \code{taxa.su} for specifying taxonomy type. The
#' data is retrieved as BIOM files which are subsequently parsed.
#' (By default: \code{get.taxa = TRUE})
#'
#' @param get.func A boolean value or a single character value or a vector
#' character values specifying functional analysis types to retrieve. If
#' \code{get.func = TRUE}, all available functional datatypes are retrieved,
#' and if \code{FALSE}, functional data is not retrieved. The current list of
#' available types is \code{"antismash-gene-clusters"}, \code{"go-slim"},
#' \code{"go-terms"}, \code{"interpro-identifiers"}, \code{"taxonomy"},
#' \code{"taxonomy-itsonedb"}, \code{"taxonomy-itsunite"}, \code{"taxonomy-lsu"},
#' and \code{"taxonomy-ssu"}. Note that depending on the particular analysis
#' type, pipeline version etc., not all functional results will be available.
#' Furthermore, taxonomy is also available via \code{get.func}, and loading
#' the data might be considerable faster if \code{bulk.dl = TRUE}. However,
#' phylogeny is available only via \code{get.taxa}.
#' (By default: \code{get.func = TRUE})
#'
#' @param ... optional arguments:
#' \itemize{
#'   
#'   \item \strong{taxa.su} A single character value specifying which taxa
#'   subunit results should be selected. Currently, taxonomy assignments in the
#'   MGnify pipelines rely on rRNA matches to existing databases
#'   (GreenGenes and SILVA), with later pipelines checking both the SSU and
#'   LSU portions of the rRNA sequence. \code{taxa.su} allows then selection
#'   of either the Small subunit (\code{"SSU"}) or Large subunit (\code{"LSU"})
#'   results in the final \code{TreeSummarizedExperiment} object. Older pipeline
#'   versions do not report results for both subunits, and thus for some
#'   accessions this value will have no effect.
#'
#'   \item \strong{get.tree} A single boolean value specifying whether to
#'   include available phylogenetic trees in the \code{TreeSummarizedExperiment}
#'   object. Available when \code{get.taxa = TRUE}.
#'   (By default: \code{get.tree = TRUE})
#'
#'   \item \strong{as.df} A single boolean value enabled when
#'   \code{output = "list"}. The argument specifies whether return functional
#'   data as a named list (one entry per element in the output list) of
#'   data.frames, with each data.frame containing results for all requested
#'   accessions. If \code{FALSE}, the function returns a list of lists, each
#'   element consisting of results for a single accession. (By default:
#'   \code{as.df = TRUE})
#'
#'   \item \strong{bulk.dl} A single boolean value specifying should
#'   MGnifyR attempt to speed things up by downloading
#'   relevant studies TSV results and only extracting the required columns,
#'   rather than using the JSONAPI interface. When getting results where
#'   multiple accessions share the same study, this option may result in
#'   significantly faster processing. However, there appear to be (quite a few)
#'   cases in the database where the TSV result columns do NOT match the
#'   expected accession names. This will hopefully be fixed in the future,
#'   but for now \code{bulk.dl} defaults to TRUE. When it does work, it can
#'   be orders of magnitude more efficient.
#'   (By default: \code{buld_dl = TRUE})
#'
#' }
#'
#' @return
#' If only taxonomy data is retrieved, the result is returned in
#' \code{TreeSummarizedExperiment} object by default. The result can also be
#' returned as a \code{phyloseq} object or as a list of \code{data.frames}.
#' Note that \code{phyloseq} object can include only one phylogenetic tree
#' meaning that some taxa might be lost when data is subsetted based on tree.
#'
#' When functional data is retrieved in addition to taxonomy data, the result
#' is returned as a \code{MultiAssayExperiment} object. Other options are a list
#' containing \code{phyloseq} object and \code{data.frames} or just
#' \code{data.frames}.
#'
#' Functional data can be returned as a \code{MultiAssayExperiment} object or
#' as a list of \code{data.frames}.
#'
#' @examples
#' # Create a client object
#' mg <- MgnifyClient(useCache = FALSE)
#'
#' # Get OTU tables as TreeSE
#' accession_list <- c("MGYA00377505")
#' tse <- getResult(mg, accession_list, get.func=FALSE, get.taxa=TRUE)
#'
#' \dontrun{
#' # Get functional data along with OTU tables as MAE
#' mae <- getResult(mg, accession_list, get.func=TRUE, get.taxa=TRUE)
#'
#' # Get same data as list
#' list <- getResult(
#'     mg, accession_list, get.func=TRUE, get.taxa=TRUE, output = "list",
#'     as.df = TRUE, use.cache = TRUE)
#' }
#'
#' @seealso
#' \code{\link[MGnifyR:getData]{getData}}
#'
#' @name getResult
NULL

#' @rdname getResult
#' @importFrom plyr llply
#' @importFrom dplyr bind_rows
#' @importFrom reshape2 dcast
#' @include AllClasses.R AllGenerics.R MgnifyClient.R utils.R
#' @export
setMethod("getResult", signature = c(x = "MgnifyClient"), function(
        x, accession, get.taxa = TRUE, get.func = TRUE, output = "TreeSE", ...){
    ############################### INPUT CHECK ################################
    if( !(.is_non_empty_character(accession)) ){
        stop(
            "'accession' must be a single character value or vector of ",
            "character values specifying the MGnify accession identifier.",
            call. = FALSE)
    }
    if( !.is_a_bool(get.taxa) ){
        stop(
            "'get.taxa' must be TRUE or FALSE.",
            call. = FALSE)
    }
    if( !(.is_a_bool(get.func) ||
        (is.character(get.func) &&
        all(get.func %in% names(.analyses_results_type_parsers)))) ){
        stop(
            "'get.func' must be TRUE or FALSE or a single character value ",
            "or a list of character values specifying functional analysis ",
            "types.", call. = FALSE)
    }
    # Get all values, if TRUE
    if(!is.character(get.func) && get.func){
        get.func <- names(.analyses_results_type_parsers)
    }
    if( !(length(output) == 1 && output %in% c("list", "phyloseq", "TreeSE")) ){
        stop(
            "'output' must be a 'TreeSE', 'list' or 'phyloseq'.", call. = FALSE)
    }
    ############################# INPUT CHECK END ##############################
    # Get functional data if user specified
    if( is.character(get.func) ){
        # If single value, create a vector from it
        if( length(get.func) == 1 ){
            get.func <- c(get.func)
        }
        func_res <- .mgnify_get_analyses_results(
            client = x, accession = accession, retrievelist = get.func,
            output = output, ...)
    } else{
        func_res <- NULL
    }
    # Get microbial profiling data
    if( get.taxa ){
        # The fetched BIOM files are parsed with mia::importBIOM, however,
        # mia does not import biomformat, it is only in its "suggests". This is
        # why we have to check that biomformat is available
        .require_package("biomformat")
        #
        taxa_res <- .mgnify_get_analyses_treese(
            client = x, accession = accession, ...)
    } else{
        taxa_res <- NULL
    }
    # Convert results to specified output type
    if( output != "list" ){
        result <- .convert_results_to_object(
            taxa_res, func_res, output)
    } else{
        # Create a final result list, if output is specified to be a list
        result <- append(taxa_res, func_res)
    }
    return(result)
})

################################ HELP FUNCTIONS ################################

# Convert results to TreeSE, MultiAssayExperiment or phyloseq.
#' @importFrom methods is
#' @importFrom dplyr bind_rows
.convert_results_to_object <- function(taxa_res, func_res, output){
    result <- NULL
    # If there are microbial profiling data, convert it to TreeSE or phyloseq
    if( !is.null(taxa_res) ){
        # Get TreeSE objects
        tse_list <- taxa_res$tse_objects
        # Get sample metadata
        col_data <- taxa_res$sample_metadata
        # If some results were not found, remove them
        ind <- !unlist(lapply(tse_list, is.null))
        # If there are samples left after subsetting
        if( any(ind) ){
            tse_list <- tse_list[ind]
            col_data <- col_data[ind]
            # Bind sample metadata to one table
            col_data <- do.call(bind_rows, col_data)
            col_data <- DataFrame(col_data)
            # Merge individual TreeSEs into one
            result <- mergeSEs(
                tse_list, assay.type = "counts", missing_values = 0)
            # Order the sample metadata
            col_data <- col_data[ colnames(result), , drop = FALSE]
            # Add sample metadata to the object
            colData(result) <- col_data
            # If user wants phyloseq, convert TreeSE
            if( output == "phyloseq" ){
                result <- makePhyloseqFromTreeSE(result)
            }
        } else{
            warning(
                "\nNo taxonomy data was found for the dataset.",
                call. = FALSE)
        }
    }
    # If there are functional data
    if( !is.null(func_res) ){
        # Remove data that is NULL
        ind <- !unlist(lapply(func_res, is.null))
        # If there are experiments left after subsetting
        if( any(ind) ){
            # Take only those experiments that are not NULL
            func_res <- func_res[ind]
            # If user wants TreeSE
            if( output == "TreeSE" ){
                # Create TreeSE from functional data
                func_res <- lapply(
                    func_res,
                    .create_TreeSE_from_func_data, tse = result)
                # Get colData from microbial profiling data TreeSE,
                # if it is included
                args <- list()
                # If taxa data is included get all the data. Otherwise, get only
                # functional annotations.
                if( !is.null(result) ){
                    col_data <- colData(result)
                    result <- list(microbiota = result)
                    exp_list <- c(result, func_res)
                } else {
                    exp_list <- func_res
                    col_data <- NULL
                }

                # If there are more than 1 experiments, create MAE
                if( length(exp_list) > 1 ){
                    exp_list <- ExperimentList(exp_list)
                    result <- MultiAssayExperiment(exp_list)
                    # If sample metadata is not empty
                    if( !is.null(col_data) ){
                        # Ensure that colData has all samples present in dataset
                        all_samples <- unique(unlist(colnames(result)))
                        col_data <- col_data[match(
                            all_samples, rownames(col_data)), ]
                        rownames(col_data) <- all_samples
                        # And then add to mae
                        colData(result) <- col_data
                    }
                } else{
                    # If there are only 1 experiment, give it as it is
                    result <- exp_list[[1]]
                }
            } else{
                # If user wants output as a phyloseq, give a list of one
                # phyloseq object and functional data
                result <- list(microbiota = result)
                result <- c(result, func_res)
                # If there are only one experiment, take it out from the list
                if( length(result) == 1 ){
                    result <- result[[1]]
                }
            }
        } else{
            warning("\nNo functional data found for the dataset.", call. = FALSE)
        }
    }
    return(result)
}

# Create a TreeSE from single functional data data.frame
#' @importFrom S4Vectors SimpleList
#' @importFrom dplyr %>% mutate_all na_if
.create_TreeSE_from_func_data <- function(x, tse_microbiota){
    # If data was provided
    if( !is.null(x) ){
        # Get assay
        assay <- dcast(x, index_id ~ analysis, value.var = "count")
        rownames(assay) <- assay[["index_id"]]
        assay[["index_id"]] <- NULL
        # Get row_data
        row_data <- x[ , !colnames(x) %in% c("analysis", "count"), drop = FALSE]
        row_data <- row_data[!duplicated(row_data), , drop = FALSE]
        rownames(row_data) <- row_data[["index_id"]]
        row_data <- row_data[rownames(assay), , drop=FALSE]
        # Get taxonomy from the information (e.g. when the data is taxonomy)
        tax_tab <- mia:::.parse_taxonomy(row_data, column_name = "index_id")
        # If taxonomy information was found
        if( ncol(tax_tab) > 0 ){
            # Remove prefixes
            tax_tab <- mia:::.remove_prefixes_from_taxa(tax_tab)
            # Replace empty cells with NA
            tax_tab <- tax_tab %>% as.data.frame() %>% mutate_all(na_if, "")
            # Add taxonomy info to original rowData
            row_data <- cbind(row_data, tax_tab)
        }
        # If microbiota data exists, order the functional data based on that
        # Do not drop samples that are not found from microbiota data.
        # Order only when samples are shared.
        if( !is.null(tse_microbiota) &&
            all(colnames(assay) %in% colnames(tse_microbiota)) &&
            all(colnames(tse_microbiota) %in% colnames(assay)) ){
            assay <-  assay[ , match(
                colnames(tse_microbiota), colnames(assay)), drop = FALSE]
        }
        # Create a TreeSE
        assay <- as.matrix(assay)
        assays <- SimpleList(counts = assay)
        row_data <- DataFrame(row_data)
        # Get arguments for TreeSE
        args <- list(assays = assays, rowData = row_data)
        # Get sample metadata
        if( !is.null(tse_microbiota) ){
            col_data <- colData(tse_microbiota)
            # Order coldata.
            col_data <- col_data[match(colnames(assay), rownames(col_data)), ]
            # Add colnames to ensure that all rows have name (if some samples
            # were missing from the col_data)
            rownames(col_data) <- colnames(assay)
            args$colData <- col_data
        }
        # Create TreeSE
        tse <- do.call(TreeSummarizedExperiment, args)
    } else{
        tse <- NULL
    }
    return(tse)
}

# Helper function for importing microbial profiling data.
.mgnify_get_analyses_treese <- function(
        client, accession, use.cache = useCache(client),
        show.messages = verbose(client), taxa.su = "SSU", ...){
    ############################### INPUT CHECK ################################
    if( !(.is_non_empty_string(taxa.su)) ){
        stop(
            "'taxa.su' must be a single character value specifying taxa ",
            "subunit.",
            call. = FALSE)
    }
    if( !.is_a_bool(use.cache) ){
        stop(
            "'use.cache' must be a single boolean value.", call. = FALSE)
    }
    if( !.is_a_bool(show.messages) ){
        stop(
            "'show.messages' must be a single boolean value.", call. = FALSE)
    }
    show.messages <- ifelse(show.messages, "text", "none")
    ############################# INPUT CHECK END ##############################
    # Give message about progress
    if( show.messages =="text" ){
        message("Fetching taxonomy data...")
    }
    # Get TreeSE objects
    tse_list <- llply(accession, function(x) {
            .mgnify_get_single_analysis_treese(
                client, x, use.cache = use.cache, taxa.su = taxa.su, ...)
    }, .progress = show.messages)
    # The sample_data has been corrupted by doing the merge (names get messed
    # up and duplicated), so just regrab it with another lapply/rbind
    col_data <- lapply(accession, function(x){
        .mgnify_get_single_analysis_metadata(client, x, use.cache = use.cache)})
    # If user wants result as list
    result <- list(tse_objects=tse_list, sample_metadata = col_data)
    return(result)
}

################################ HELP FUNCTIONS ################################

# Get a single biom file and convert it to TreeSummarizedExperiment format
#
#' @importFrom mia importBIOM
#' @importFrom mia checkTaxonomy
#' @importFrom urltools parameters parameters<-
#' @importFrom httr GET
#' @importFrom httr write_disk
#' @importFrom ape read.tree
#' @importFrom TreeSummarizedExperiment rowTree
#' @importFrom SummarizedExperiment rowData rowData<-
.mgnify_get_single_analysis_treese <- function(
        client = NULL, accession, use.cache = useCache(client),
        downloadDIR = NULL, taxa.su = "SSU", get.tree = TRUE,
        show.warnings = showWarnings(client), cache.dir = cacheDir(client),
        clear.cache = clearCache(client), ...){
    ############################### INPUT CHECK ################################
    if( !.is_a_bool(get.tree) ){
        stop("'get.tree' must be TRUE or FALSE.",
            call. = FALSE)
    }
    if( !.is_a_bool(use.cache) ){
        stop(
            "'use.cache' must be a single boolean value.", call. = FALSE)
    }
    if( !.is_a_bool(show.warnings) ){
        stop(
            "'show.warnings' must be a single boolean value.", call. = FALSE)
    }
    if( !.is_non_empty_string(cache.dir) ){
        stop(
            "'cache.dir' must be a string.", call. = FALSE)
    }
    if( !.is_a_bool(clear.cache) ){
        stop(
            "'clear.cache' must be a single boolean value.", call. = FALSE)
    }
    ############################# INPUT CHECK END ##############################
    # Get the metadata related to analysis
    metadata_df <- .mgnify_get_single_analysis_metadata(
        client, accession, use.cache = use.cache, ...)
    analysis_data <- .mgnify_retrieve_json(
        client, paste("analyses", accession, sep = "/"), use.cache = use.cache,
        ...)
    # Get the metadata related to available files etc
    download_url <- analysis_data[[1]]$relationships$downloads$links$related
    analysis_downloads <- .mgnify_retrieve_json(
        client, complete_url = download_url, use.cache = use.cache, ...)

    # Depending on the pipeline version, there may be more than one OTU table
    # available (LSU/SSU), so try and get the one specified in taxa.su -
    # otherwise spit out a warning and grab the generic (older pipelines)
    formats <- lapply(
        analysis_downloads, function(x){x$attributes$`file-format`$name})
    formats <- unlist(formats)
    available_biom_files <- analysis_downloads[grepl('JSON Biom', formats)]
    # Check if any biom files was found
    if( is.null(available_biom_files) || length(available_biom_files) == 0 ){
        warning(
            "\nNo BIOM data found for accession '", accession, "'.",
            call. = FALSE)
        return(NULL)
    }
    group_type <- lapply(
        available_biom_files, function(x){ x$attributes$`group-type` } )
    group_type <- unlist(group_type)
    biom_position <- grepl(taxa.su, group_type)
    if( sum(biom_position) == 0 ){
        if( show.warnings ){
            warning(
                "\nUnable to locate requested taxonomy type ", taxa.su, ". ",
                "This is likely due to the current analysis having been ",
                "performed on an older version of the MGnify pipeline. ",
                "The available BIOM file will be used instead.",
                call. = FALSE)
        }
        biom_url <- available_biom_files[[1]]$links$self
    } else {
        biom_url <- available_biom_files[biom_position][[1]]$links$self
    }

    # Can specify a separate dir for saving biom files, otherwise they end up
    # in the cacheDir(client) folder, under "bioms"
    if( is.null(downloadDIR) ){
        downloadDIR <- file.path(cache.dir, "biom_files")
        dir.create(
            downloadDIR, recursive = TRUE, showWarnings = show.warnings)
    }
    # Clear out any ?params after the main location - don't need them for this
    parameters(biom_url) <- NULL
    # Get the file name and path to it in local machine
    fname <- utils::tail(strsplit(biom_url, "/")[[1]], n = 1)
    biom_path <- file.path(downloadDIR, fname)

    ## Quick check to see if we should clear the disk cache  for this specific
    # call  - used for debugging and when MGnify breaks
    if( use.cache && clear.cache && file.exists(biom_path) ){
        message("clearCache is TRUE: deleting ", biom_path)
        unlink(biom_path)
    }
    # Download the file from the database to specific file path
    fetched_from_url <- FALSE
    if( !file.exists(biom_path) ){
        res <- GET(biom_url, write_disk(biom_path, overwrite = TRUE))
        fetched_from_url <- TRUE
        # If the file was not successfully downloaded
        if( res$status_code != 200 ){
            warning(
                "\n", biom_url, ": ", content(res, ...)$errors[[1]]$detail,
                " A biom listed in 'accession' is missing from the ",
                "output.", call. = FALSE)
            # Remove the downloaded file, it includes only info on errors
            unlink(biom_path)
            return(NULL)
        }
    }

    # Load in the TreeSummarizedExperiment object
    tse <- importBIOM(
        biom_path, rank.from.prefix = TRUE, set.ranks = TRUE, verbose = FALSE,
        prefix.rm = TRUE, artifact.rm = TRUE)
    # If the file was not in store already but fetched from database, and cache
    # storing is disabled
    if( fetched_from_url && !use.cache ){
        unlink(biom_path)
    }
    # TreeSE has sample ID as its colnames. Rename so that it is the accession
    # ID.
    colData(tse)[["biom_sample_id"]] <- colnames(tse)
    colnames(tse) <- accession

    # If user wants also phylogenetic tree
    if(get.tree){
        # Is there a tree?
        data_types <- lapply(
            analysis_downloads, function(x){x$attributes$`description`$label})
        data_types <- unlist(data_types)
        tvec <- grepl('Phylogenetic tree', data_types)
        if( any(tvec) ){
            # Get the url address of tree
            tree_url <- analysis_downloads[tvec][[1]]$links$self
            # Clear out any ?params after the main location - don't need them
            # for this
            parameters(tree_url) <- NULL
            # Get the file name for the tree
            fname <- utils::tail(strsplit(tree_url, '/')[[1]], n=1)
            # Get the path for the tree
            tree_path <- file.path(downloadDIR, fname)

            # Quick check to see if we should clear the disk cache
            #  for this specific call  - used for debugging and when MGnify
            # breaks
            if(use.cache && clear.cache && file.exists(tree_path) ){
                message("clearCache is TRUE: deleting ", tree_path)
                unlink(tree_path)
            }
            # Download the file from the database to specific file path
            fetched_from_url <- FALSE
            if( !file.exists(tree_path) ){
                res <- GET(tree_url, write_disk(tree_path, overwrite = TRUE))
                fetched_from_url <- TRUE
                # If the file was not successfully downloaded
                if( res$status_code != 200 ){
                    warning(
                        "\n", tree_url, ": ",
                        content(res, ...)$errors[[1]]$detail,
                        " A phylogenetic tree listed in 'accession' is ",
                        "missing from the output.", call. = FALSE)
                }
            }
            # Add the tree to TreeSE object
            if( file.exists(tree_path) ){
                row_tree <- read.tree(tree_path)
                rowTree(tse) <- row_tree
                # If the file was not in store already but fetched from
                # database and cache storing is disabled
                if( fetched_from_url && !use.cache ){
                    unlink(biom_path)
                }
            }
        }
    }
    return(tse)
}

# Helper function for importing functional data
.mgnify_get_analyses_results <- function(
        client, accession, retrievelist, output, use.cache = useCache(client),
        show.messages = verbose(client), as.df = TRUE, bulk.dl = TRUE, ...){
    ############################### INPUT CHECK ################################
    # If output is TreeSE or object, get results as data.frames
    as.df <- ifelse(output != "list", TRUE, as.df)
    if( !.is_a_bool(as.df) ){
        stop("'as.df' must be TRUE or FALSE.", call. = FALSE)
    }
    if( !.is_a_bool(bulk.dl) ){
        stop("'bulk.dl' must be TRUE or FALSE.", call. = FALSE)
    }
    if( !.is_a_bool(use.cache) ){
        stop(
            "'use.cache' must be a single boolean value.", call. = FALSE)
    }
    if( !.is_a_bool(show.messages) ){
        stop(
            "'show.messages' must be a single boolean value.", call. = FALSE)
    }
    show.messages <- ifelse(show.messages, "text", "none")
    ############################# INPUT CHECK END ##############################
    # Give message about progress
    if( show.messages == "text" ){
        message("Fetching functional data...")
    }
    # Get functional results
    all_results <- llply(accession, function(x){
        .mgnify_get_single_analysis_results(
            client, x, use.cache = use.cache, retrievelist = retrievelist,
            bulk.files = bulk.dl, ...)
    }, .progress = show.messages)
    # Add names based on accessions codes
    names(all_results) <- accession

    # Compact the result type dataframes into a single instance. Per
    # accession counts in each column.
    if(as.df){
        # Check which data types are available
        data_types <- names(all_results[[1]])
        # Combine results data type -wise
        all_results <- lapply(data_types, function(data_type){
            # Get only certain type of data of all samples
            temp <- lapply(all_results, function(acc_res) acc_res[[data_type]])
            # Combine data to df
            temp <- bind_rows(temp, .id = "analysis")
            # If there was no data, the df is empty
            if( nrow(temp) > 0 && ncol(temp) > 0 ){
                # Counts are numeric values, convert...
                temp[["count"]] <- as.numeric(temp[["count"]])
                # There are three columns that we want; "analysis"
                # (sample ID), "count" (counts), and "index_id" (feature)
                # columns.

                # If the "index_id" column equals with the "accession" column,
                # drop the latter. They should match in taxonomy data, but
                # functional might have additional columns.
                if( all(temp[["index_id"]] == temp[["accession"]]) ){
                    temp <- temp[ , colnames(temp) != "accession", drop = FALSE]
                }
            } else {
                temp <- NULL
            }
            return(temp)
        })
        # Give names based on data types
        names(all_results) <- data_types
    }
    return(all_results)
}

# types can be found using names(MGnifyR:::.analyses_results_type_parsers).
# Note that not depending on the particular analysis type, pipeline
# Retrieves combined study/sample/analysis metadata - not exported
#' @importFrom urltools parameters parameters<-
#' @importFrom httr GET
#' @importFrom httr write_disk
#' @importFrom dplyr bind_rows
#' @importFrom utils read.csv2
.mgnify_get_single_analysis_results <- function(
        client, accession, retrievelist=c(),
        use.cache = useCache(client), clear.cache = clearCache(client),
        max.hits = NULL, bulk.files = FALSE,
        show.warnings = showWarnings(client), ...){
    # Input check
    if( !.is_a_bool(clear.cache) ){
        stop(
            "'clear.cache' must be a single boolean value.", call. = FALSE)
    }
    if( !.is_a_bool(show.warnings) ){
        stop(
            "'show.warnings' must be a single boolean value.", call. = FALSE)
    }
    #
    # Get the metadata describing the samples
    metadata_df <- .mgnify_get_single_analysis_metadata(
        client, accession, use.cache = use.cache, max.hits = max.hits)

    # Should we try and grab the study's full TSV download rather than parse
    # through the JSON API? Doing so has the potential to use a LOT more disk
    # space, along with potentially increased data download. It should
    # be faster though, except in pathological cases (e.g. only 1 sample per
    # 1000 sample study required). As with everything else, we make use of
    # local caching to speed things along.
    if( bulk.files ){
        # Create a directory for donwloading the file
        downloadDIR <- file.path(cacheDir(client), "tsv")
        if(!dir.exists(downloadDIR)){
            dir.create(
                downloadDIR, recursive = TRUE, showWarnings = show.warnings)
        }
        # Get what files are available in database
        available_downloads_json <- .mgnify_retrieve_json(
            client,
            path = paste(
                "studies", metadata_df$study_accession, "downloads", sep = "/"),
            use.cache = use.cache, ...)
        # Load the files
        parsed_results <- lapply(available_downloads_json, function(r) {
            # Figure out the label mapping
            cur_lab <- r$attributes$description$label
            # Pipe version
            cur_pipeversion <- r$relationships$pipeline$data$id

            # There MUST be a better way to do the line below...
            # Get the type of the data
            cur_type <- names(.analyses_results_bulk_file_names)[
                cur_lab == .analyses_results_bulk_file_names]

            # Check the pipeline versions match and load the file if the
            # datatype is specified to be loaded.
            if(length(cur_type) > 0 &&
                cur_pipeversion == metadata_df$`analysis_pipeline-version` &&
                any(cur_type %in% retrievelist)){
                # If there are "taxonomic assignments ssu", it can match with
                # 2 --> take the first one
                cur_type <- cur_type[[1]]
                # Get the data
                temp <- .get_bulk_files(
                    cur_type, client = client, r = r,
                    metadata_df = metadata_df, downloadDIR = downloadDIR,
                    use.cache = use.cache, ...)
            } else{
                temp <- NULL
            }
            return(list(type=cur_type, data=temp))
        })
        # Add data types to data as names (taxonomy might have 2 data types if
        # NULL because these both 2 data types are tried to fetch)
        cur_type <- unlist(lapply(parsed_results, function(x)
            ifelse( length(x$type) > 1, x$type[[1]], x$type)))
        parsed_results <- lapply(parsed_results, function(x) x$data)
        names(parsed_results) <- cur_type
    }else{
        # If user do not want to fetch bulk files
        # Now (re)load the analysis data:
        analysis_data <- .mgnify_retrieve_json(
            client, paste("analyses", accession, sep = "/"),
            use.cache = use.cache, max.hits = max.hits, ...)
        # For now try and grab all data types that user has specified -
        # just return the list - don't do any processing...
        all_results <- lapply(
            names(.analyses_results_type_parsers), function(r) {
                tmp <- NULL
                if(r %in% retrievelist){
                    tmp <- .mgnify_retrieve_json(
                        client,
                        complete_url = analysis_data[[1]]$relationships[[
                            r]]$links$related,
                        use.cache = use.cache,
                        max.hits = max.hits, ...)
                }
                return(tmp)
        })
        # Add data types as names
        names(all_results) <- names(.analyses_results_type_parsers)
        # Get the data in correct format
        parsed_results <- lapply(names(all_results), function(x){
            # Get the specific type of data
            all_json <- all_results[[x]]
            # If specific type of data can be found
            res_df <- NULL
            if( !is.null(all_json) && length(all_json) > 0 ){
                # Get the specific type of data from all accessions
                all_json <- lapply(
                    all_json, .analyses_results_type_parsers[[x]])
                # Combine
                res_df <- do.call(bind_rows, all_json)
                rownames(res_df) <- res_df$index_id
            }
            return(res_df)
        })
        names(parsed_results) <- names(all_results)
    }
    # Return a list of data types, e.g., taxonomy, GO...
    return(parsed_results)
}

# Helper function for getting bulk file
.get_bulk_files <- function(
        cur_type, client, r, metadata_df, downloadDIR,
        use.cache, clear.cache = clearCache(client),
        use.mem.cache = FALSE, mem.cache.name = "mgnify_memory_cache", ...){
    # Input check
    if( !.is_a_bool(clear.cache) ){
        stop(
            "'clear.cache' must be a single boolean value.", call. = FALSE)
    }
    if( !.is_a_bool(use.mem.cache) ){
        stop(
            "'use.mem.cache' must be a single boolean value.", call. = FALSE)
    }
    # Check mem.cache.name
    if( !( length(mem.cache.name) == 1 && is.character(mem.cache.name) ) ){
      stop("'mem.cache.name' must be a single character value.", call. = FALSE)
    }
    if( exists(mem.cache.name) && use.mem.cache ){
        mgnify_memory_cache <- get(mem.cache.name)
        if( !is.list(mgnify_memory_cache) ){
            stop("'mem.cache.name' must specify a list object.", call. = FALSE)
        }
    } else if(use.mem.cache){
        mgnify_memory_cache <- list()
    }
    #
    # Get the url
    data_url <- r$links$self
    # Clear off extraneous gubbins
    parameters(data_url) <- NULL
    #build the cache filename
    fname <- utils::tail(strsplit(data_url, '/')[[1]], n=1)

    # At this point we might have already got the data we want
    # loaded. Check the memory cache object
    if( (use.mem.cache &&
        cur_type %in% names(mgnify_memory_cache) &&
        mgnify_memory_cache[cur_type]["fname"] == fname) ){
        tmp_df <- mgnify_memory_cache[cur_type][["data"]]
    }else{
        # Nope - gonna have to load it up from disk or grab
        # it from t'interweb
        data_path <- file.path(downloadDIR, fname)
        # Clear cache if specified
        if( use.cache && clear.cache && file.exists(data_path) ){
            message("clearCache is TRUE: deleting ", data_path)
            unlink(data_path)
        }
        # Download the file from the database to specific file path
        if( !file.exists(data_path) ){
            res <- GET(data_url, write_disk(data_path, overwrite = TRUE))
            # If the file was not successfully downloaded
            if( res$status_code != 200 ){
                warning(
                    "\n", data_url, ": ", content(res, ...)$errors[[1]]$detail,
                    " Could not load the file from database. The data ",
                    "from the file is not included in the output.",
                    call. = FALSE)
                # Remove the downloaded file
                unlink(data_path)
                return(NULL)
            }
        }
        # Load the file (might be big so save it in the
        # 1 deep cache)
        tmp_df <- read.csv2(
            data_path, sep="\t", header = TRUE,
            stringsAsFactors = FALSE)
    }
    # Save it in memory using "super assignment" - which
    # I'm not really sure about but it seems to work...
    # thing'd be much easier if R passed objects
    # by reference.
    if( use.mem.cache ){
        mgnify_memory_cache[[cur_type]] <- list(
            data=tmp_df, fname=fname)
        # Assign to global variable so it is visible for user
        assign(mem.cache.name, mgnify_memory_cache, envir = .GlobalEnv)
    }

    # Because there seem to be "mismatches" between the
    # JSON and downloadable files, and maybe some issues
    # with missing downloads, we have to check if we
    # actually got a valid file:
    if( ncol(tmp_df) < 3 ){
        warning("\nInvalid download for ", accession, call. = FALSE)
        return(NULL)
    }

    # Need to figure out how many columns to keep -
    # Get those columns that do not include numeric values but some info about
    # samples etc... First column includes always sample ID.
    info_cols <- lapply(tmp_df, function(x){
        all(is.na(suppressWarnings(as.numeric(x))))})
    info_cols <- unlist(info_cols)
    info_cols <- c(sample_ID = TRUE, info_cols[2:length(info_cols)])
    info_cols <- which(info_cols)

    # Also need the column name for this particular
    # analysis...
    # As far as I can see they could be either assembly IDs
    # or run ids. FFS. Assuming that both assembly and run
    # won't be present...:
    if( "assembly_accession" %in% colnames(metadata_df) ){
        accession <- metadata_df$assembly_accession[[1]]
    }else if( "run_accession" %in% colnames(metadata_df) ){
        accession <- metadata_df$run_accession[[1]]
    } else{
        warning("\nFailed to data on ", accession, call. = FALSE)
        return(NULL)
    }

    # Get the correct sample and subset the data so that it includes only
    # the sample and info columns
    column_position <- match(accession, colnames(tmp_df))
    if( (is.na(column_position) || length(column_position) != 1) ){
        warning("\nFailed to find column ", accession, call. = FALSE)
        return(NULL)
    }
    # Ensure that the counts are numeric
    tmp_df[[column_position]] <- as.numeric(tmp_df[[column_position]])
    # Get columns to keep
    keeper_columns <- c(info_cols, column_position)
    tmp_df <- tmp_df[, keeper_columns, drop = FALSE]
    # Adjust colnames rownames and add column
    colnames(tmp_df)[1] <- "accession"
    colnames(tmp_df)[ ncol(tmp_df) ] <- "count"
    tmp_df$index_id <- tmp_df$accession
    rownames(tmp_df) <- tmp_df$accession
    return(tmp_df)
}

# Which parser do you use for which type of output?
# a list of parsers for each output type. If new output types come, add them to
# here (and below) so that they are fetched from the database.
.analyses_results_type_parsers <- list(
    `taxonomy` = .mgnify_parse_tax,
    `taxonomy-itsonedb` = .mgnify_parse_tax,
    `go-slim`=.mgnify_parse_func,
    `taxonomy-itsunite` = .mgnify_parse_tax,
    `taxonomy-ssu` = .mgnify_parse_tax,
    `taxonomy-lsu` = .mgnify_parse_tax,
    `antismash-gene-clusters` = .mgnify_parse_func,
    `go-terms` = .mgnify_parse_func,
    `interpro-identifiers` = .mgnify_parse_func)

# This maps the json attribute name for retrievelist to the "description ->
# label" attribute in the study downloads section
.analyses_results_bulk_file_names <- list(
    `taxonomy` = "Taxonomic assignments SSU",
    `taxonomy-itsonedb` = "Taxonomic assignments ITS",
    `go-slim` = "GO slim annotation",
    `taxonomy-itsunite` = "Taxonomic assignments Unite",
    `taxonomy-ssu` = "Taxonomic assignments SSU",
    `taxonomy-lsu` = "Taxonomic assignments LSU",
    `antismash-gene-clusters` = .mgnify_parse_func,
    `go-terms` = "Complete GO annotation",
    `interpro-identifiers` = "InterPro matches",
    `phylo-tax-ssu` = "Phylum level taxonomies SSU",
    `phylo-tax-lsu` = "Phylum level taxonomies LSU")
beadyallen/MGnifyR documentation built on Dec. 3, 2024, 8:21 a.m.