recount3: Explore and download data from the recount3 project

Documented in create_rse_manual

#' Internal function for creating a recount3 RangedSummarizedExperiment object
#'
#' This function is used internally by `create_rse()` to construct a `recount3`
#' [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class]
#' object that contains the base-pair coverage counts at the `gene` or `exon`
#' feature level for a given annotation.
#'
#' @param type  A `character(1)` specifying whether you want to access gene,
#' exon, or exon-exon junction counts.
#' @inheritParams locate_url
#' @inheritParams file_retrieve
#'
#' @return A
#'  [RangedSummarizedExperiment-class][SummarizedExperiment::RangedSummarizedExperiment-class]
#'  object.
#' @export
#' @importFrom SummarizedExperiment SummarizedExperiment "assayNames<-"
#' "metadata<-"
#' @importFrom S4Vectors DataFrame
#' @importFrom rtracklayer import.gff
#' @importFrom Matrix readMM
#' @importFrom GenomicRanges GRanges mcols mcols<-
#' @importFrom sessioninfo package_info
#' @references
#'
#' <https://doi.org/10.12688/f1000research.12223.1> for details on the
#' base-pair coverage counts used in recount2 and recount3.
#'
#' @family internal functions for accessing the recount3 data
#' @examples
#'
#' ## Unlike create_rse(), here we create an RSE object by
#' ## fully specifying all the arguments for locating this study
#' rse_gene_SRP009615_manual <- create_rse_manual(
#'     "SRP009615",
#'     "data_sources/sra"
#' )
#' rse_gene_SRP009615_manual
#'
#' ## Check how much memory this RSE object uses
#' pryr::object_size(rse_gene_SRP009615_manual)
#'
#' ## Test with a collection that has a single sample
#' ## NOTE: this requires loading the full data for this study when
#' ## creating the RSE object
#' rse_gene_ERP110066_collection_manual <- create_rse_manual(
#'     "ERP110066",
#'     "collections/geuvadis_smartseq",
#'     recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3"
#' )
#' rse_gene_ERP110066_collection_manual
#'
#' ## Check how much memory this RSE object uses
#' pryr::object_size(rse_gene_ERP110066_collection_manual)
#'
#' ## Mouse example
#' rse_gene_DRP002367_manual <- create_rse_manual(
#'     "DRP002367",
#'     "data_sources/sra",
#'     organism = "mouse"
#' )
#' rse_gene_DRP002367_manual
#'
#' ## Information about how this RSE was made
#' metadata(rse_gene_DRP002367_manual)
#'
#' ## Test with a collection that has one sample, at the exon level
#' ## NOTE: this requires loading the full data for this study (nearly 6GB!)
#' \dontrun{
#' rse_exon_ERP110066_collection_manual <- create_rse_manual(
#'     "ERP110066",
#'     "collections/geuvadis_smartseq",
#'     type = "exon",
#'     recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3"
#' )
#' rse_exon_ERP110066_collection_manual
#'
#'
#' ## Check how much memory this RSE object uses
#' pryr::object_size(rse_exon_ERP110066_collection_manual)
#' # 409 MB
#'
#' ## Test with a collection that has one sample, at the junction level
#' ## NOTE: this requires loading the full data for this study
#' system.time(rse_jxn_ERP110066_collection_manual <- create_rse_manual(
#'     "ERP110066",
#'     "collections/geuvadis_smartseq",
#'     type = "jxn",
#'     recount3_url = "http://snaptron.cs.jhu.edu/data/temp/recount3"
#' ))
#' rse_jxn_ERP110066_collection_manual
#'
#' ## Check how much memory this RSE object uses
#' ## NOTE: this doesn't run since 2 files are missing on the test site!
#' pryr::object_size(rse_jxn_ERP110066_collection_manual)
#' }
#'
#' \dontrun{
#' ## For testing and debugging
#' project <- "ERP110066"
#' project_home <- "collections/geuvadis_smartseq"
#'
#' project <- "SRP009615"
#' project_home <- "data_sources/sra"
#' type <- "gene"
#' organism <- "human"
#' annotation <- "gencode_v26"
#' jxn_format <- "ALL"
#' bfc <- recount3_cache()
#' recount3_url <- "http://idies.jhu.edu/recount3/data"
#' verbose <- TRUE
#' }
create_rse_manual <- function(project,
    project_home = project_homes(
        organism = organism,
        recount3_url = recount3_url
    ),
    type = c("gene", "exon", "jxn"),
    organism = c("human", "mouse"),
    annotation = annotation_options(organism),
    bfc = recount3_cache(),
    jxn_format = c("ALL", "UNIQUE"),
    recount3_url = getOption("recount3_url", "http://duffel.rail.bio/recount3"),
    verbose = getOption("recount3_verbose", TRUE)) {
    project_home <- match.arg(project_home)
    type <- match.arg(type)
    organism <- match.arg(organism)
    annotation <- match.arg(annotation)
    jxn_format <- match.arg(jxn_format)

    ## First the metadata which is the smallest
    if (verbose) {
        message(
            Sys.time(),
            " downloading and reading the metadata."
        )
    }

    metadata <- read_metadata(file_retrieve(
        url = locate_url(
            project = project,
            project_home = project_home,
            type = "metadata",
            organism = organism,
            annotation = annotation,
            recount3_url = recount3_url
        ),
        bfc = bfc,
        verbose = verbose
    ))

    ## Update the project_home based on the metadata
    project_home_original <- project_home
    project_home <- metadata$recount_project.file_source[1]

    ## Add the URLs to the BigWig files
    metadata$BigWigURL <- locate_url(
        project = project,
        project_home = project_home,
        type = "bw",
        organism = organism,
        annotation = annotation,
        recount3_url = recount3_url,
        sample = metadata$external_id
    )

    if (type == "jxn") {
        jxn_files <- locate_url(
            project = project,
            project_home = project_home,
            type = "jxn",
            organism = organism,
            annotation = annotation,
            jxn_format = jxn_format,
            recount3_url = recount3_url
        )
    }

    if (verbose) {
        message(
            Sys.time(),
            " downloading and reading the feature information."
        )
    }
    ## Read the feature information
    if (type %in% c("gene", "exon")) {
        feature_info <-
            rtracklayer::import.gff(file_retrieve(
                url = locate_url_ann(
                    type = type,
                    organism = organism,
                    annotation = annotation,
                    recount3_url = recount3_url
                ),
                bfc = bfc,
                verbose = verbose
            ))
    } else if (type == "jxn") {
        feature_info <- utils::read.delim(file_retrieve(
            url = jxn_files[grep("\\.RR\\.gz$", jxn_files)],
            bfc = bfc,
            verbose = verbose
        ))
        ## Testing with ERP001942 revealed an issue here
        # > table(x$strand)
        #       -       ?       +
        # 1409791    7842 1432569
        feature_info$strand[feature_info$strand == "?"] <- "*"

        feature_info <- GenomicRanges::GRanges(feature_info)
    }

    if (verbose) {
        message(
            Sys.time(),
            " downloading and reading the counts: ",
            nrow(metadata),
            ifelse(nrow(metadata) > 1, " samples", " sample"),
            " across ",
            length(feature_info),
            " features."
        )
    }
    if (type %in% c("gene", "exon")) {
        counts <- read_counts(
            file_retrieve(
                url = locate_url(
                    project = project,
                    project_home = project_home,
                    type = type,
                    organism = organism,
                    annotation = annotation,
                    recount3_url = recount3_url
                ),
                bfc = bfc,
                verbose = verbose
            ),
            samples = metadata$external_id
        )
    } else if (type == "jxn") {
        counts <- Matrix::readMM(file_retrieve(
            url = jxn_files[grep("\\.MM\\.gz$", jxn_files)],
            bfc = bfc,
            verbose = verbose
        ))

        if (verbose) {
            message(
                Sys.time(),
                " matching exon-exon junction counts with the metadata."
            )
        }
        ## The samples in the MM jxn table are not in the same order as the
        ## metadata!
        jxn_rail <- read.delim(file_retrieve(
            url = jxn_files[grep("\\.ID\\.gz$", jxn_files)],
            bfc = bfc,
            verbose = verbose
        ))
        m <- match(metadata$rail_id, jxn_rail$rail_id)
        stopifnot(
            "Metadata rail_id and exon-exon junctions rail_id are not matching." =
                !all(is.na(m))
        )
        counts <- counts[, m, drop = FALSE]
        colnames(counts) <- metadata$external_id
    }

    ## Build the RSE object
    if (verbose) {
        message(
            Sys.time(),
            " constructing the RangedSummarizedExperiment (rse) object."
        )
    }

    stopifnot(
        "Metadata external_id and counts colnames are not matching." =
            identical(metadata$external_id, colnames(counts))
    )

    if (type == "gene") {
        stopifnot(
            "Gene names and count rownames are not matching." =
                identical(feature_info$gene_id, rownames(counts))
        )
    } else if (type == "exon") {
        stopifnot(
            "Exon names and count rownames are not matching." =
                identical(feature_info$recount_exon_id, rownames(counts))
        )
    } else if (type == "jxn") {
        rownames(counts) <- as.character(feature_info)
    }

    ## Make names consistent
    names(feature_info) <- rownames(counts)
    rownames(metadata) <- colnames(counts)
    recount3_pkg <- sessioninfo::package_info(
        pkgs = "recount3",
        include_base = FALSE,
        dependencies = FALSE
    )

    ## Change "score" for "bp_length"
    ## Related to https://github.com/LieberInstitute/recount3/issues/4
    colnames(mcols(feature_info))[colnames(mcols(feature_info)) == "score"] <- "bp_length"

    rse <- SummarizedExperiment::SummarizedExperiment(
        assays = list(counts = counts),
        colData = S4Vectors::DataFrame(metadata, check.names = FALSE),
        rowRanges = feature_info,
        metadata = list(
            time_created = Sys.time(),
            recount3_version = as.data.frame(recount3_pkg),
            project = project,
            project_home = project_home_original,
            type = type,
            organism = organism,
            annotation = annotation,
            jxn_format = jxn_format,
            recount3_url = recount3_url
        )
    )
    if (type %in% c("gene", "exon")) {
        ## Change the name for gene and exons, just to highlight that these
        ## are not read counts
        assayNames(rse) <- "raw_counts"

        ## Remove jxn_format since it has nothing to do with genes/exons
        S4Vectors::metadata(rse)$jxn_format <- NULL
    }
    return(rse)
}