R/methods-GScores.R

Defines functions .which_scores_nonsnrs .rleWhichValues .which_scores_snrs .rgscoresHDF5 .rgscoresRle .pprintnsites .pprintseqs citation.GScores citation.character citation .scores_nonsnrs .fetch_scores_nonsnrs .scores_snrs .filter_ambig_multinuc_ref_alt_args .check_ref_alt_args .fetch_scores_snrs .rleGetMetaValues .rleGetValues .fromBase .toBase GScores

Documented in citation GScores

## GScores constructor
GScores <- function(provider, provider_version, download_url,
                    download_date, reference_genome,
                    data_pkgname, data_dirpath,
                    data_serialized_objnames=character(0),
                    default_pop="default",
                    data_group=sub("\\..*$", "", data_pkgname),
                    data_tag=sub("\\..*$", "", data_pkgname),
                    data_nsites=NA_real_,
                    data_hdf5=FALSE,
                    license="",
                    license_url="",
                    license_reqconsent=FALSE) {
  data_cache <- new.env(hash=TRUE, parent=emptyenv())
  data_pops <- list.files(path=data_dirpath, pattern=data_pkgname)
  data_nonsnrs <- length(grep("nonsnv", data_pops)) > 0
  data_pops <- data_pops[grep("nonsnv", data_pops, invert=TRUE)]
  data_pops <- gsub(paste0(data_pkgname, "."), "", data_pops)
  data_pops <- gsub("\\..+$", "", data_pops)
  data_pops <- sort(unique(data_pops))
  if (all(data_pops %in% seqlevels(reference_genome))) ## no additional score populations
    data_pops <- default_pop
  else {
    if (any(data_pops %in% seqlevels(reference_genome)))
      data_pops <- c(default_pop, setdiff(data_pops, seqlevels(reference_genome)))
  }

  assign(data_pkgname, list(), envir=data_cache)
  assign(sprintf("%s.nonsnvs", data_pkgname), GRangesList(), envir=data_cache)

  if (file.exists(file.path(data_dirpath, "nsites.rds")))
    data_nsites <- as.numeric(readRDS(file.path(data_dirpath, "nsites.rds")))

  new("GScores", provider=provider,
                 provider_version=provider_version,
                 download_url=download_url,
                 download_date=download_date,
                 reference_genome=reference_genome,
                 data_pkgname=data_pkgname,
                 data_dirpath=data_dirpath,
                 data_serialized_objnames=data_serialized_objnames,
                 data_group=data_group,
                 data_tag=data_tag,
                 data_pops=data_pops,
                 default_pop=default_pop,
                 data_nonsnrs=data_nonsnrs,
                 data_nsites=data_nsites,
                 data_hdf5=data_hdf5,
                 license=license,
                 license_url=license_url,
                 license_reqconsent=license_reqconsent,
                 .data_cache=data_cache)
}

## accessors
setMethod("name", "GScores", function(x) x@data_pkgname)

setMethod("type", "GScores", function(x) sub("\\..*$", "", name(x)))

setMethod("provider", "GScores", function(x) x@provider)

setMethod("providerVersion", "GScores", function(x) x@provider_version)

setMethod("genomeDescription", "GScores", function(x) x@reference_genome)

setMethod("organism", "GScores",
          function(object) organism(genomeDescription(object)))

setMethod("seqinfo", "GScores", function(x) seqinfo(genomeDescription(x)))

setMethod("seqnames", "GScores", function(x) seqnames(genomeDescription(x)))

setMethod("seqlengths", "GScores", function(x) seqlengths(genomeDescription(x)))

setMethod("seqlevelsStyle", "GScores",
          function(x) seqlevelsStyle(genomeDescription(x)))

setMethod("gscoresNonSNRs", "GScores",
          function(x) x@data_nonsnrs)

setMethod("populations", "GScores", function(x) x@data_pops)

setMethod("defaultPopulation", "GScores", function(x) x@default_pop)

setReplaceMethod("defaultPopulation", c("GScores", "character"),
                 function(x, value) {
                   if (length(value) > 1)
                     message("more than one default scores population name supplied, using the 1st one only.")
                   value <- value[1]
                   if (any(!value %in% populations(x)))
                     stop(sprintf("scores population %s is not part of the available scores populations. Use 'populations()' to figure out which ones are available.", value))
                   x@default_pop <- value
                   x
                 })

setMethod("gscoresTag", "GScores", function(x) x@data_tag)

setReplaceMethod("gscoresTag", c("GScores", "character"),
                 function(x, value) {
                   if (length(value) > 1)
                     message("more than one genomic scores tag name supplied, using the 1st one only.")
                   x@data_tag <- value[1]
                   x
                 })

setMethod("gscoresCategory", "GScores", function(x) x@data_group)

setReplaceMethod("gscoresCategory", c("GScores", "character"),
                 function(x, value) {
                   if (length(value) > 1)
                     message("more than one genomic scores category name supplied, using the 1st one only.")
                   x@data_group <- value[1]
                   x
                 })

setMethod("nsites", "GScores", function(x) x@data_nsites)

setMethod("hdf5Backend", "GScores", function(x) x@data_hdf5)

## convert digits in vector 'd', grouped by 'g', into numbers in base 'b'
.toBase <- function(d, g=rep(1, length(d)), b) {
  n <- tapply(d, g, function(d, b) sum(d * b^(0:(length(d)-1))), b)
  as.integer(n)
}

## convert each number in 'n' into 'd' digits in base 'b'
.fromBase <- function(n, d, b) {
  totaldigits <- length(n) * d
  digits <- rep(NA_integer_, times=totaldigits)
  i <- 0L
  while (i < d) {
    digits[seq(1L+i, totaldigits, by=d)] <- n %% b
    n <- floor(n / b)
    i <- i + 1L
  }
  digits
}

## this has been improved using RleViews as discussed in
## https://stat.ethz.ch/pipermail/bioconductor/2013-December/056409.html
.rleGetValues <- function(rlelst, gr, mdata, summaryFun,
                          quantized=FALSE, isHDF5=FALSE) {
  summaryFun <- match.fun(summaryFun)
  numericmean <- TRUE
  if (!identical(summaryFun, mean))
    numericmean <- FALSE

  whregions <- which(width(gr) > 1)
  if (quantized && length(whregions) > 0)
    stop("When 'quantized=TRUE' input 'ranges' must have width one because summarization can only be calculated with dequantized values.")

  .dequantizer <- mdata$dqfun
  dqargs <- mdata$dqfun_args
  seqlevels(gr) <- names(rlelst)
  ord <- order(seqnames(gr)) ## store ordering below in 'split()'
  startbyseq <- split(start(gr), seqnames(gr), drop=TRUE)
  x <- ans <- numeric(0)
  if (quantized)
    x <- decode(unlist(rlelst[startbyseq], use.names=FALSE))
  else {
    lappargs <- c(list(X=rlelst[startbyseq], FUN=.dequantizer), dqargs)
    x <- unlist(do.call("lapply", lappargs), use.names=FALSE)
  }
  valxpos <- mdata$valxpos
  if (is.null(valxpos))
    valxpos <- 1
  stopifnot(length(valxpos) == 1 && valxpos[1] > 0) ## QC

  if (valxpos == 1) {
    ans <- vector(mode=class(x), length=length(gr))
    ans[ord] <- x
    if (quantized)
      ans <- Rle(ans)
  } else {
    if (length(whregions) > 0)
      stop("This GScores object returns more than one value per position, and therefore, input ranges must have width one.")
    ans <- matrix(NA_real_, nrow=length(gr), ncol=valxpos)
    ans[ord, ] <- matrix(x, nrow=length(gr), ncol=valxpos, byrow=TRUE)
  }

  if (length(whregions) > 0) { ## regions comprising more than
    tmpans <- NA_real_         ## one position are summarized
    ord2 <- order(seqnames(gr)[whregions]) ## store ordering below in 'split()'
    if (numericmean && !isHDF5) {
      rngbyseq <- split(gr[whregions], seqnames(gr)[whregions], drop=TRUE)
      tmpans <- lapply(names(rngbyseq),
                       function(sname) {
                         coercedrle <- rlelst[[sname]]
                         dqargs2 <- c(list(q=runValue(coercedrle)), dqargs)
                         runValue(coercedrle) <- do.call(".dequantizer", dqargs2)
                         viewMeans(Views(coercedrle,
                                         start=start(rngbyseq[[sname]]),
                                         end=end(rngbyseq[[sname]])))
                       })
      tmpans <- unsplit(tmpans, seqnames(gr)[whregions])
    } else { ## this allows for other summary functions
             ## but it runs about 10-fold slower
      startbyseq <- split(start(gr)[whregions],
                          seqnames(gr)[whregions], drop=TRUE)
      widthbyseq <- split(width(gr)[whregions],
                          seqnames(gr)[whregions], drop=TRUE)
      f <- function(r, p, w)
             sapply(seq_along(p),
                    function(i, r, p, w) {
                      dqargs2 <- c(list(q=r[p[i]:(p[i]+w[i]-1)]), dqargs)
                      summaryFun(do.call(".dequantizer", dqargs2))
                    },
                    r, p, w)

      tmpans <- unlist(mapply(f,
                              rlelst[names(startbyseq)], startbyseq, widthbyseq,
                              SIMPLIFY=FALSE),
                       use.names=FALSE)
    }

    ans[ord[whregions]] <- tmpans[ord2]
  }
  ans
}

.rleGetMetaValues <- function(rlelst, gr, metadataID, isHDF5=FALSE) {
  stopifnot(all(width(gr) == 1)) ## QC
  if (isHDF5)
    rlelst <- endoapply(rlelst,
                        function(x)
                          HDF5Array(path(seed(x)),
                                    name=sprintf("%s/%s",
                                                 sub("/scores", "", seed(x)@name), metadataID)))
  else
    rlelst <- endoapply(rlelst, function(x) metadata(x)[[metadataID]])

  if (all(sapply(rlelst, length) == 0L))
    return(Rle(rep(FALSE, length(gr))))

  seqlevels(gr) <- names(rlelst)
  ord <- order(seqnames(gr)) ## store ordering below in 'split()'
  startbyseq <- split(start(gr), seqnames(gr), drop=TRUE)
  x <- as.logical(decode(unlist(rlelst[startbyseq], use.names=FALSE)))
  ans <- vector(mode=class(x), length=length(gr))
  ans[ord] <- x
  ans <- Rle(ans)

  ans
}

setMethod("score", "GScores",
          function(x, ..., simplify=TRUE) {
            gsco <- gscores(x, ..., scores.only=TRUE)
            if (ncol(gsco) == 1 && simplify)
              gsco <- gsco[[1]]
            gsco
          })

setMethod("gscores", c("GScores", "GenomicRanges"),
          function(x, ranges, ...) {
            ## default non-generic arguments
            paramNames <- c("scores.only", "pop", "type",
                            "summaryFun", "quantized",
                            "ref", "alt", "minoverlap", "caching")
            pop <- defaultPopulation(x)
            type <- "snrs"
            scores.only <- FALSE
            summaryFun <- mean
            quantized <- FALSE
            ref <- character(0)
            alt <- character(0)
            minoverlap <- 1L ## only relevant for genomic scores associated with nonSNRs
            caching <- TRUE

            ## get arguments
            arglist <- list(...)
            if (is.null(names(arglist)))
              names(arglist) <- rep("", length(arglist))

            mask <- nchar(names(arglist)) == 0
            if (any(mask))
              stop(sprintf("optional argument(s) in position(s) %s should be named",
                           paste(2+which(mask), collapse=", ")))

            mask <- names(arglist) %in% paramNames
            if (any(!mask))
                stop(sprintf("unused argument (%s)", names(arglist)[!mask]))
            list2env(arglist, envir=sys.frame(sys.nframe()))

            if (!type %in% c("snrs", "nonsnrs"))
              stop("argument 'type' must be either 'snrs' (default) or 'nonsnrs'.")

            mask <- pop %in% populations(x)
            if (any(!mask))
              stop(sprintf("scores population %s is not present in %s. Please use 'populations()' to find out the available ones.",
                           pop[!mask], name(x)))

            ## adapt to sequence style and genome version from the input
            ## GScores object, thus assuming positions are based on the same
            ## genome even though might be differently specified
            ## (i.e., hg19 vs hs37d5 or hg38 vs GRCh38)
            if (length(intersect(seqlevelsStyle(ranges), seqlevelsStyle(x))) == 0)
              seqlevelsStyle(ranges) <- seqlevelsStyle(x)[1]
            commonSeqs <- intersect(seqlevels(ranges), seqlevels(x))
            if (any(is.na(genome(ranges)))) {
              ## message(sprintf("assuming query ranges genome build is the one of the GScores object (%s).",
              ##                 unique(genome(x)[commonSeqs])))
              genome(ranges) <- genome(x)
            } else if (any(genome(ranges)[commonSeqs] != genome(x)[commonSeqs])) {
              message(sprintf("assuming %s represent the same genome build between query ranges and the GScores object, respectively.",
                              paste(c(unique(genome(ranges)[commonSeqs]),
                                      unique(genome(x)[commonSeqs])),
                                    collapse=" and ")))
              genome(ranges) <- genome(x)
            }

            ord <- 1:length(ranges)
            if (is.unsorted(ranges)) {
              ord <- order(ranges)
              ranges <- ranges[ord]
              if (length(ref) > 0 && length(alt) > 0) {
                ref <- ref[ord]
                alt <- alt[ord]
              }
            }

            ans <- NULL
            if (type == "snrs")
              ans <- .scores_snrs(x, ranges, pop, summaryFun, quantized,
                                  scores.only, ref, alt, caching)
            else
              ans <- .scores_nonsnrs(x, ranges, pop, quantized, scores.only,
                                     ref, alt, minoverlap, caching)
            if (is(ans, "DFrame"))
              ans[ord, ] <- ans
            else ## GRanges
              ans[ord] <- ans

            ans
          })

setMethod("gscores", c("GScores", "character"),
          function(x, ranges, ...) {
            ## default non-generic arguments
            paramNames <- c("scores.only", "pop",
                            "summaryFun", "quantized",
                            "ref", "alt", "minoverlap", "caching")
            pop <- defaultPopulation(x)
            scores.only <- FALSE
            summaryFun <- mean
            quantized <- FALSE
            ref <- character(0)
            alt <- character(0)
            minoverlap <- 1L ## only relevant for genomic scores associated with nonSNRs
            caching <- TRUE
            ids <- ranges

            ## get arguments
            arglist <- list(...)
            mask <- nchar(names(arglist)) == 0
            if (any(mask))
              names(arglist)[mask] <- paste0("X", 1:sum(mask))

            mask <- names(arglist) %in% paramNames
            if (any(!mask))
                stop(sprintf("unused argument (%s)", names(arglist)[!mask]))
            list2env(arglist, envir=sys.frame(sys.nframe()))

            mask <- pop %in% populations(x)
            if (any(!mask))
              stop(sprintf("scores population %s is not present in %s. Please use 'populations()' to find out the available ones.",
                           pop[!mask], name(x)))

            ans <- DataFrame(as.data.frame(matrix(NA_real_, nrow=length(ids),
                                                  ncol=length(pop),
                                                  dimnames=list(NULL, pop))),
                             row.names=ids)
            if (!exists("rsIDs", envir=x@.data_cache)) {
              if (file.exists(file.path(x@data_dirpath, "rsIDs.rds"))) {
                message("Loading first time annotations of identifiers to genomic positions, produced by data provider.")
                rsIDs <- readRDS(file.path(x@data_dirpath, "rsIDs.rds"))
                assign("rsIDs", rsIDs, envir=x@.data_cache)
              } else {
                message("The data provider did not produce annotations of identifiers to genomic positions.")
                return(ans)
              }
            }

            rsIDs <- get("rsIDs", envir=x@.data_cache)
            stopifnot(is.integer(rsIDs)) ## QC
            idsint <- rep(NA_integer_, length(ids))
            rsMask <- regexpr("^rs", ids) == 1
            idsint[rsMask] <- as.integer(sub(pattern="^rs", replacement="", x=ids[rsMask]))
            mt <- rep(NA_integer_, length(idsint))
            if (any(!is.na(idsint))) {
              idsintnoNAs <- idsint[!is.na(idsint)]
              ord <- order(idsintnoNAs)                     ## order ids to speed up
              mtfi <- findInterval(idsintnoNAs[ord], rsIDs) ## call to findInterval()
              mtfi[ord] <- mtfi                             ## put matches into original order
              mt[!is.na(idsint)] <- mtfi                    ## integrate matches into result
            }
            mt[mt == 0] <- 1
            if (any(!is.na(mt))) {
              maskNAs <- idsint != rsIDs[mt]
              mt[maskNAs] <- NA
            }
            if (any(!is.na(mt))) {
              if (!exists("rsIDidx", envir=x@.data_cache)) {
                rsIDidx <- readRDS(file.path(x@data_dirpath, "rsIDidx.rds"))
                assign("rsIDidx", rsIDidx, envir=x@.data_cache)
              }
              rsIDidx <- get("rsIDidx", envir=x@.data_cache)
              mt <- rsIDidx[mt]
            }

            ranges <- GRanges()
            mcols(ranges) <- DataFrame(as.data.frame(matrix(NA_real_, nrow=0,
                                                            ncol=length(pop),
                                                            dimnames=list(NULL, pop))))
            if (any(!is.na(mt))) {
              if (!exists("rsIDgp", envir=x@.data_cache)) {
                rsIDgp <- readRDS(file.path(x@data_dirpath, "rsIDgp.rds"))
                assign("rsIDgp", rsIDgp, envir=x@.data_cache)
              }
              rsIDgp <- get("rsIDgp", envir=x@.data_cache)
              rng <- rsIDgp[mt[!is.na(mt)]]
              mask <- logical(length(mt))
              mask[!is.na(mt)] <- rng$isSNV
              if (any(mask))
                ans[mask, pop] <- gscores(x, rng[rng$isSNV], pop=pop, type="snrs",
                                          summaryFun=summaryFun, quantized=quantized,
                                          scores.only=TRUE, ref=ref, alt=alt,
                                          minoverlap=minoverlap, caching=caching)
              mask <- logical(length(mt))
              mask[!is.na(mt)] <- !rng$isSNV
              if (any(mask))
                ans[mask, pop] <- gscores(x, rng[!rng$isSNV], pop=pop, type="nonsnrs",
                                          summaryFun=summaryFun, quantized=quantized,
                                          scores.only=TRUE, ref=ref, alt=alt,
                                          minoverlap=minoverlap, caching=caching)
              ranges <- as(rng, "GRanges")
              names(ranges) <- rownames(ans)[!is.na(mt)]
              mcols(ranges) <- ans[!is.na(mt), , drop=FALSE]
            }

            if (scores.only)
              return(ans[!is.na(mt), ])

            ranges
          })

## fetch genomic scores stored on disk for specific sequences given in 'snames'
## and add them to the 'gsco1pop'data structure, which is then returned back
.fetch_scores_snrs <- function(object, objectname, gsco1pop, pop, snames) {
  slengths <- seqlengths(object)
  avh5snames <- h5fpath <- character(0)

  if (hdf5Backend(object)) {
    fname <- sprintf("%s.%s.h5", object@data_pkgname, pop)
    h5fpath <- file.path(object@data_dirpath, fname)
    h5str <- h5ls(h5fpath)
    avh5snames <- h5str$name[which(h5str$group == "/")]
  }

  for (sname in snames) {

    snameExists <- FALSE

    if (hdf5Backend(object)) { ## HDF5 backend

      if (sname %in% avh5snames) {
        snameExists <- TRUE
        gsco1pop[[sname]] <- HDF5Array(h5fpath, paste0(sname, "/scores"))
      }

    } else {                   ## RDS-serialized Rle backend

      if (pop == "default")
        fname <- sprintf("%s.%s.rds", object@data_pkgname, sname)
      else
        fname <- sprintf("%s.%s.%s.rds", object@data_pkgname, pop, sname)
      if (length(object@data_serialized_objnames) > 0 &&
          fname %in% names(object@data_serialized_objnames))
        fname <- object@data_serialized_objnames[fname]
      fpath <- file.path(object@data_dirpath, fname)
      if (file.exists(fpath)) {
        snameExists <- TRUE
        gsco1pop[[sname]] <- readRDS(fpath)
      }

    }

    if (!snameExists) {
      message(sprintf("no %s scores for population %s in sequence %s from %s object %s (%s).",
                       type(object), pop, sname, class(object), objectname, name(object)))
      gsco1pop[[sname]] <- Rle(lengths=slengths[sname], values=as.raw(0L))
    }

  }

  gsco1pop
}

.check_ref_alt_args <- function(ref, alt) {
  if (length(ref) != length(alt))
    stop("'ref' and 'alt' arguments have different lengths.")

  if (length(ref) > 0) {
    if (!class(ref) %in% c("character", "DNAStringSet", "DNAStringSetList")) {
      stop("'ref' argument must be either a character vector, a DNAStringSet or a DNAStringSetList object.")
    } else if (is(ref, "DNAStringSetList")) {
      mask <- elementNROWS(ref) > 1
      if (any(mask)) {
        ref <- unstrsplit(CharacterList(ref), sep=",")
        ref[mask] <- NA_character_
        message(sprintf("%d multiallelic 'ref' alleles have been discarded.",
                        sum(mask)))
      }
      ref <- unlist(ref)
    }

    if (!class(alt) %in% c("character", "DNAStringSet", "DNAStringSetList")) {
      stop("'alt' argument must be either a character vector, a DNAStringSet or a DNAStringSetList object.")
    } else if (class(alt) == "DNAStringSetList") {
      mask <- elementNROWS(alt) > 1
      if (any(mask)) {
        alt <- unstrsplit(CharacterList(alt), sep=",")
        alt[mask] <- NA_character_
        message(sprintf("%d multiallelic 'alt' alleles have been discarded.",
                        sum(mask)))
      }
      alt <- unlist(alt)
    }
  }

  list(ref=ref, alt=alt)
}

.filter_ambig_multinuc_ref_alt_args <- function(ref, alt) {
  if (length(ref) != length(alt))
    stop("'ref' and 'alt' arguments have different lengths.")

  if (length(ref) > 0) {
    stopifnot(class(ref) %in% c("character", "DNAStringSet")) ## QC

    if (is(ref, "DNAStringSet"))
      ref <- as.character(ref)
    mask <- nchar(ref) > 1
    if (any(mask)) {
      ref[mask] <- NA_character_
      message(sprintf("%d multinucleotide 'ref' alleles have been discarded.",
                      sum(mask)))
    }
    mask <- !is.na(ref) & (!ref %in% DNA_BASES)
    if (any(mask)) {
      ref[mask] <- NA_character_
      message(sprintf("%d ambiguous 'ref' alleles have been discarded.",
                      sum(mask)))
    }

    if (is(alt, "DNAStringSet"))
      alt <- as.character(alt)
    mask <- nchar(alt) > 1
    if (any(mask)) {
      alt[mask] <- NA_character_
      message(sprintf("%d multinucleotide 'alt' alleles have been discarded.",
                      sum(mask)))
    }
    mask <- !is.na(alt) & (!alt %in% DNA_BASES)
    if (any(mask)) {
      alt[mask] <- NA_character_
      message(sprintf("%d ambiguous 'alt' alleles have been discarded.",
                      sum(mask)))
    }

    mask <- ref == alt
    mask[is.na(mask)] <- FALSE
    if (any(mask)) {
      ref[mask] <- NA_character_
      alt[mask] <- NA_character_
      message(sprintf("%d identical 'ref' and 'alt' alleles have been discarded.",
                      sum(mask)))
    }
  }

  list(ref=ref, alt=alt)
}
## assumes 'object' and 'ranges' have the same sequence "styles"
.scores_snrs <- function(object, ranges, pop, summaryFun=mean, quantized=FALSE,
                         scores.only=FALSE, ref=character(0), alt=character(0),
                         caching=TRUE) {
  objectname <- deparse(substitute(object))
  if (length(ranges) == 0)
    return(numeric(0))

  ra <- .check_ref_alt_args(ref, alt)
  ref <- ra$ref
  alt <- ra$alt
  rm(ra)

  snames <- unique(as.character(runValue(seqnames(ranges))))
  if (any(!snames %in% seqnames(object)))
    stop(sprintf("Sequence names %s in 'ranges' not present in reference genome %s.",
                 paste(snames[!snames %in% seqnames(object)], collapse=", "),
                 providerVersion(genomeDescription(object))))

  gscopops <- get(object@data_pkgname, envir=object@.data_cache)
  if (all(names(gscopops) %in% seqlevels(object))) { ## temporary fix for annotations w/o populations
    tmp <- gscopops
    gscopops <- list()
    gscopops[[defaultPopulation(object)]] <- List() ## RleList(tmp, compress=FALSE)
    assign(object@data_pkgname, gscopops, envir=object@.data_cache)
  }

  missingMask <- !pop %in% names(gscopops)
  for (popname in pop[missingMask]) {
    gscopops[[popname]] <- List() ## RleList(compress=FALSE)
    if (hdf5Backend(object)) ## HDF5 backend, fetch common metadata from first population
      metadata(gscopops[[popname]]) <- metadata(gscopops[[1]])
  }
  anyMissing <- any(missingMask)

  ans <- DataFrame(as.data.frame(matrix(NA_real_, nrow=length(ranges),
                                        ncol=length(pop), dimnames=list(NULL, pop))))
  ans2 <- NULL

  for (popname in pop) {
    missingMask <- !snames %in% names(gscopops[[popname]])
    anyMissing <- anyMissing || any(missingMask)
    if (any(missingMask))
      gscopops[[popname]] <- .fetch_scores_snrs(object, objectname, gscopops[[popname]],
                                                popname, snames[missingMask])

    sco <- rep(NA_real_, length(ranges))
    mdata <- metadata(gscopops[[popname]]) ## metadata in List object of HDF5 pkgs
    if ("Rle" %in% class(gscopops[[popname]][[1]]))
      mdata <- metadata(gscopops[[popname]][[1]])
    if (length(mdata) > 0 && class(mdata$dqfun) == "function")
      sco <- .rleGetValues(gscopops[[popname]], ranges, mdata,
                           summaryFun=summaryFun, quantized=quantized,
                           isHDF5=hdf5Backend(object))
    else
      warning(sprintf("No dequantization function available for scores population %s. Scores will be all NA for this population.",
                      popname))

  if (length(ref) > 0 && length(alt) > 0) {
    if (is.matrix(sco)) { ## if it's a matrix, then we have multiple scores per position
      ra <- .filter_ambig_multinuc_ref_alt_args(ref, alt)
      ref <- ra$ref
      alt <- ra$alt
      rm(ra)
      sco2 <- rep(NA_real_, length(ranges))
      mask1 <- !is.na(ref) & !is.na(alt)
      mt.r <- match(ref[mask1], DNA_BASES)
      mt.a <- match(alt[mask1], DNA_BASES)
      mask2 <- mt.r < mt.a
      idxcol <- mt.a
      idxcol[mask2] <- mt.a[mask2] - 1L
      sco2[mask1] <- sco[cbind(which(mask1), idxcol)]
      sco <- sco2
      rm(sco2)
    } else { # if it's not a matrix, then assume we have metadata associated with the scores
      maskREF <- as.logical(.rleGetMetaValues(gscopops[[popname]],
                                              ranges, "maskREF",
                                              isHDF5=hdf5Backend(object)))
      if (is.null(ans2) && length(maskREF) > 0) {
        ans2 <- ans
        colnames(ans) <- paste0(colnames(ans), "_REF")
        colnames(ans2) <- paste0(colnames(ans2), "_ALT")
      }
      popnameALT <- paste0(popname, "_ALT")
      popname <- paste0(popname, "_REF")
      ans2[[popnameALT]] <- sco
      ans2[[popnameALT]][maskREF] <- 1 - sco[maskREF]
      sco[!maskREF] <- 1 - sco[!maskREF]
    }
  }

  ans[[popname]] <- sco
  }
  if (anyMissing && caching)
    assign(object@data_pkgname, gscopops, envir=object@.data_cache)
  rm(gscopops)

  if (!is.null(ans2)) {
    ans <- cbind(ans, ans2)
    rm(ans2)
  }

  if (scores.only)
    return(ans)

  mcols(ranges) <- cbind(mcols(ranges), ans)
  seqlevels(ranges) <- seqlevels(object)
  seqinfo(ranges) <- seqinfo(object)

  ranges
}

## fetch genomic scores stored on disk for specific sequences given in 'snames'
## and add them to the 'gscononsnrs'data structure, which is then returned back
.fetch_scores_nonsnrs <- function(object, objectname, gscononsnrs, pop, snames) {
  popnames <- character(0)
  if (length(gscononsnrs) > 0)
    popnames <- colnames(mcols(gscononsnrs[[1]]))

  if (length(snames) > 0) {
    md <- list()
    slengths <- seqlengths(object)
    for (sname in snames) {
      fname <- file.path(object@data_dirpath,
                         sprintf("%s.GRnonsnv.%s.rds", name(object), sname))
      obj <- GRanges()
      if (file.exists(fname)) {
        obj <- readRDS(fname)
        for (popname in popnames) {
          fname <- file.path(object@data_dirpath,
                             sprintf("%s.RLEnonsnv.%s.%s.rds", name(object), popname, sname))
          if (file.exists(fname)) {
            rleobj <- readRDS(fname)
            md <- metadata(rleobj)
            mcols(obj)[[popname]] <- rleobj
          } else {
            message(sprintf("no %s scores for population %s nonSNRs in sequence %s from %s object %s.",
                            name(object), popname, sname, class(object), objectname))
            ## mcols(obj)[[popname]] <- rep(NA_real_, length(obj))
            mcols(obj)[[popname]] <- Rle(lengths=length(obj), values=as.raw(0L)) ## should this be NA?
          }
        }
      } else {
        message(sprintf("no %s scores for nonSNRs in sequence %s from %s object %s.",
                        name(object), sname, class(object), objectname))
        mcols(obj) <- DataFrame(as.data.frame(matrix(raw(0), nrow=0, ncol=length(popnames),
                                                     dimnames=list(NULL, popnames))))
      }
      gscononsnrs[[sname]] <- obj
    }
    metadata(gscononsnrs) <- md ## is this really necessary??
  }

  if (length(pop) > 0) {
    md <- list()
    tmp <- unlist(gscononsnrs)
    names(tmp) <- NULL
    for (popname in pop) {
      scovalues <- Rle(raw(length(tmp)))
      i <- 1
      ## b/c we're storing MAF values as metadata columns of GRanges
      ## populations need to be loaded for all already loaded chromosomes
      for (sname in names(gscononsnrs)) {
        fname <- file.path(object@data_dirpath,
                           sprintf("%s.RLEnonsnv.%s.%s.rds", name(object), popname, sname))
        if (file.exists(fname)) {
          obj <- readRDS(fname)
          md <- metadata(obj)
          scovalues[i:(i+length(gscononsnrs[[sname]])-1)] <- obj
        }
        i <- i + length(gscononsnrs)
      }
      mcols(tmp)[[popname]] <- scovalues
      ## add maskREF metadata to each population, if available
      metadata(mcols(tmp)[[popname]])$maskREF <- md$maskREF
    }
    gscononsnrs <- split(tmp, seqnames(tmp), drop=TRUE)
    metadata(gscononsnrs) <- md
    rm(tmp)
  }

  gscononsnrs
}

.scores_nonsnrs <- function(object, ranges, pop, quantized=FALSE,
                            scores.only=FALSE, ref=character(0), alt=character(0),
                            minoverlap=1L, caching=TRUE) {
  objectname <- deparse(substitute(object))
  gscononsnrs<- get(sprintf("%s.nonsnvs", name(object)), envir=object@.data_cache)

  ra <- .check_ref_alt_args(ref, alt)
  ref <- ra$ref
  alt <- ra$alt
  rm(ra)

  ## ## by now, only multiple SNR alleles considered
  ## if (length(ref) > 0 && length(alt) > 0)
  ##   message("arguments 'ref' and 'alt' are given but there is only one score per genomic position.")

  if (length(intersect(seqlevelsStyle(ranges), seqlevelsStyle(object))) == 0)
    seqlevelsStyle(ranges) <- seqlevelsStyle(object)[1]

  snames <- unique(as.character(runValue(seqnames(ranges))))
  if (any(!snames %in% seqnames(object)))
    stop(sprintf("Sequence names %s in 'ranges' not present in reference genome %s.",
                 paste(snames[!snames %in% seqnames(object)], collapse=", "),
                 providerVersion(genomeDescription(object))))

  missingSeqMask <- !snames %in% names(gscononsnrs)
  anyMissing <- any(missingSeqMask)
  allpopnames <- character(0)
  if (length(gscononsnrs) > 0)
    allpopnames <- colnames(mcols(gscononsnrs[[1]]))
  missingPopMask <- !pop %in% allpopnames
  anyMissing <- anyMissing || any(missingPopMask)

  if (any(missingSeqMask) || any(missingPopMask)) ## if genomic scores belong to queried sequences that are not cached, load them
    gscononsnrs <- .fetch_scores_nonsnrs(object, objectname, gscononsnrs,
                                         pop[missingPopMask], snames[missingSeqMask])

  ans <- ans2 <- NULL
  if (quantized)
    ans <- DataFrame(as.data.frame(matrix(raw(0), nrow=length(ranges), ncol=length(pop),
                                        dimnames=list(NULL, pop))))
  else
    ans <- DataFrame(as.data.frame(matrix(NA_real_, nrow=length(ranges), ncol=length(pop),
                                        dimnames=list(NULL, pop))))

  ## the default value of 'minoverlap=1L' assumes that the sought nonsnrs are
  ## stored as in VCF files, using the nucleotide composition of the reference sequence
  ov <- findOverlaps(ranges, unlist(gscononsnrs),
                     minoverlap=minoverlap, type="equal")
  qHits <- queryHits(ov)
  sHits <- subjectHits(ov)
  if (length(ov) > 0) {
    q <- mcols(unlist(gscononsnrs))[sHits, pop, drop=FALSE]
    if (quantized)
      ans[qHits, pop] <- DataFrame(lapply(q, decode))
    else {
      .dequantizer <- metadata(gscononsnrs)$dqfun
      dqargs <- metadata(gscononsnrs)$dqfun_args
      lappargs <- c(list(X=q, FUN=.dequantizer), dqargs)
      ans[qHits, pop] <- DataFrame(do.call("lapply", lappargs))
    }
    if (any(duplicated(qHits)))
      message("gscores: more than one genomic score overlapping queried positions, reporting only the first hit.")

    if (length(ref) > 0 && length(alt) > 0) {
      for (popname in pop) {
        maskREF <- as.logical(metadata(mcols(unlist(gscononsnrs))[[popname]])$maskREF[sHits])
        if (is.null(ans2) && length(maskREF) > 0) {
          ans2 <- ans
          colnames(ans) <- paste0(colnames(ans), "_REF")
          colnames(ans2) <- paste0(colnames(ans2), "_ALT")
        }
        popnameALT <- paste0(popname, "_ALT")
        popnameREF <- paste0(popname, "_REF")
        ans2[qHits, popnameALT] <- ans[qHits, popnameREF]
        ans2[qHits, popnameALT][maskREF] <- 1 - ans[qHits, popnameREF][maskREF]
        ans[qHits, popnameREF][!maskREF] <- 1 - ans[qHits, popnameREF][!maskREF]
      }
    }
  }

  if (anyMissing && caching)
    assign(sprintf("%s.nonsnvs", name(object)), gscononsnrs, envir=object@.data_cache)
  rm(gscononsnrs)

  if (!is.null(ans2)) {
    ans <- cbind(ans, ans2)
    rm(ans2)
  }

  if (scores.only)
    return(ans)

  mcols(ranges) <- cbind(mcols(ranges), ans)
  seqlevels(ranges) <- seqlevels(object)
  seqinfo(ranges) <- seqinfo(object)

  ranges
}

## getters qfun and dqfun
setMethod("qfun", "GScores",
          function(object, pop=defaultPopulation(object)) {
            if (!pop %in% populations(object))
              stop(sprintf("There is no score population %s in this GScores object. Use populations() to find out what score populations are available.", pop))
            obj <- get(object@data_pkgname, envir=object@.data_cache)
            fun <- NA
            if (hdf5Backend(object))
              fun <- metadata(obj[[pop]])$qfun
            else
              fun <- metadata(obj[[pop]][[1]])$qfun
            fun
          })

setMethod("dqfun", "GScores",
          function(object, pop=defaultPopulation(object)) {
            if (!pop %in% populations(object))
              stop(sprintf("There is no score population %s in this GScores object. Use populations() to find out what score populations are available.", pop))
            obj <- get(object@data_pkgname, envir=object@.data_cache)
            fun <- NA
            if (hdf5Backend(object))
              fun <- metadata(obj[[pop]])$dqfun
            else
              fun <- metadata(obj[[pop]][[1]])$dqfun
          })

citation <- function(package, ...) UseMethod("citation")
citation.character <- function(package, ...) {
  if (missing(package)) package <- "base"
  utils::citation(package, ...)
}
setMethod("citation", signature="missing", citation.character)
setMethod("citation", signature="character", citation.character)
citation.GScores <- function(package, ...) {
  obj <- get(package@data_pkgname, envir=package@.data_cache)
  cit <- bibentry()
  if ("Rle" %in% class(obj[[1]]) || length(obj[[1]]) == 0 || hdf5Backend(package))
    cit <- metadata(obj[[1]])$citation
  else
    cit <- metadata(obj[[1]][[1]])$citation
  if (is.null(cit))
    cit <- bibentry()
  cit
}
setMethod("citation", signature="GScores", citation.GScores)

.pprintseqs <- function(x) {
  y <- x
  if (length(x) > 5)
    y <- c(y[1:2], "...", y[length(y)])
  y <- paste(y, collapse=", ")
  y
}

.pprintnsites <- function(x) {
  y <- x
  if (x > 10e6)
    y <- sprintf("%.0f millions", x/1e6)
  else if (x > 1e6)
    y <- sprintf("%.1f millions", x/1e6)
  y
}

## show method
setMethod("show", "GScores",
          function(object) {
            snrobj <- get(name(object), envir=object@.data_cache) ## single-nucleotide ranges
            if (all(names(snrobj) %in% seqlevels(object))) { ## temporary fix will working w/ outdated annotations
              tmp <- snrobj
              snrobj <- list()
              snrobj[[defaultPopulation(object)]] <- RleList(tmp, compress=FALSE)
              assign(object@data_pkgname, snrobj, envir=object@.data_cache)
            }

            nonsnrobj <- get(paste0(object@data_pkgname, ".nonsnvs"),
                             envir=object@.data_cache)
            loadedsnrpops <- loadedsnrseqs <- "none"
            loadednonsnrpops <- loadednonsnrseqs <- "none"
            if (length(snrobj) > 0) {
              loadedsnrseqs <- names(snrobj)
              if (length(populations(object)) > 1) {
                loadedsnrseqs <- names(snrobj[[1]])
                loadedsnrpops <- names(snrobj)
              } else if (length(populations(object)) == 1 && populations(object) == "default")
                loadedsnrseqs <- names(snrobj[[1]])
            }
            if (ncol(mcols(nonsnrobj)) > 0)
              loadednonsnrpops <- colnames(mcols(nonsnrobj))
            if (length(nonsnrobj) > 0)
              loadednonsnrseqs <- unique(seqnames(nonsnrobj))
              
            max.abs.error <- NA
            if (length(length(loadedsnrseqs)) > 0) {
              if (!hdf5Backend(object))
                max.abs.error <- max(unlist(sapply(lapply(snrobj[[defaultPopulation(object)]],
                                                          metadata), "[[", "max_abs_error"),
                                            use.names = FALSE))
              else
                max.abs.error <- max(unlist(lapply(snrobj[[defaultPopulation(object)]],
                                                   function(x)
                                                     as.numeric(HDF5Array(path(seed(x)),
                                                               name=sprintf("%s/max_abs_error",
                                                                            sub("/scores", "", seed(x)@name))))),
                                            use.names=FALSE))
            }

            cat(class(object), " object \n",
                "# organism: ", organism(object), " (", provider(genomeDescription(object)), ", ",
                                providerVersion(genomeDescription(object)), ")\n",
                "# provider: ", provider(object), "\n",
                "# provider version: ", providerVersion(object), "\n",
                "# download date: ", object@download_date, "\n", sep="")
            if (!hdf5Backend(object)) {
              if (gscoresNonSNRs(object)) {
                cat("# loaded sequences (SNRs): ", .pprintseqs(loadedsnrseqs), "\n",
                    "# loaded sequences (nonSNRs): ", .pprintseqs(loadednonsnrseqs), "\n", sep="")
                if (loadedsnrpops[1] != "none" && length(loadedsnrpops) > 1)
                  cat("# loaded populations (SNRs): ", .pprintseqs(loadedsnrpops), "\n",
                      "# loaded populations (nonSNRs): ", .pprintseqs(loadednonsnrpops), "\n", sep="")
              } else {
                cat("# loaded sequences: ", .pprintseqs(loadedsnrseqs), "\n", sep="")
                if (loadedsnrpops[1] != "none" && length(loadedsnrpops) > 1)
                  cat("# loaded populations: ", .pprintseqs(loadedsnrpops), "\n", sep="")
              }
            }
            if (defaultPopulation(object) != "default")
              cat("# default scores population: ", defaultPopulation(object), "\n", sep="")
            if (!is.na(nsites(object)) && nsites(object) > 0)
              cat("# number of sites: ", .pprintnsites(nsites(object)), "\n", sep="")
            if (!is.na(max.abs.error)) {
              if (defaultPopulation(object) != "default")
                cat("# maximum abs. error (def. pop.): ", signif(max.abs.error, 3), "\n", sep="")
              else
                cat("# maximum abs. error: ", signif(max.abs.error, 3), "\n", sep="")
            }
            if (nchar(object@license) > 0) {
              licensestr <- object@license
              if (nchar(object@license_url) > 0)
                licensestr <- sprintf("%s, see %s", object@license, object@license_url)
              cat(sprintf("# license: %s\n", licensestr))
            }
            if (length(citation(object)) > 0)
              cat("# use 'citation()' to cite these data in publications\n")
          })

.rgscoresRle <- function(object, gscopops, snames, n, pop, ranges) {
  nscores <- sapply(gscopops[[pop]][snames], function(x) {
                      mask <- runValue(x) != 0
                      sum(runLength(x)[mask])
                    })
  f <- factor(sample(snames, size=n, replace=TRUE, prob=nscores/sum(nscores)),
              levels=snames)
  nxs <- table(f)
  dqf <- dqfun(object)
  pxs <- lapply(setNames(snames, snames), function(s, l, n) {
                  gr <- GRanges(seqinfo=seqinfo(object))
                  if (n[s] > 0) {
                    mask <- runValue(l[[s]]) != 0
                    whmask <- which(mask)
                    rlcsum <- cumsum(runLength(l[[s]])[whmask])
                    rndpos <- sample(rlcsum[length(rlcsum)], size=n[s], replace=FALSE)
                    rndpos <- sapply(rndpos, function(x) sum(rlcsum <= x))
                    rndpos <- sapply(whmask[rndpos], function(j) sum(runLength(l[[s]])[1:j]))
                    rndsco <- l[[s]][rndpos]
                    gr <- GRanges(seqnames=rep(s, n[s]),
                                  ranges=IRanges(start=rndpos, width=1),
                                  seqinfo=seqinfo(object))
                    mcols(gr)[[pop]] <- dqf(rndsco)
                  }
                  gr
                }, gscopops[[pop]], nxs)

  unlist(do.call("GRangesList", pxs))
}

.rgscoresHDF5 <- function(object, gscopops, snames, n, pop, ranges) {
  nscores <- sapply(gscopops[[pop]][snames], function(x) {
                      mask <- x != 0
                      sum(mask)
                    })
  f <- factor(sample(snames, size=n, replace=TRUE, prob=nscores/sum(nscores)),
              levels=snames)
  nxs <- table(f)
  dqf <- dqfun(object)
  pxs <- lapply(setNames(snames, snames), function(s, l, n) {
                  gr <- GRanges(seqinfo=seqinfo(object))
                  if (n[s] > 0) {
                    mask <- l[[s]] != 0
                    whmask <- which(mask)
                    rndpos <- sample(whmask, size=n[s], replace=FALSE)
                    rndsco <- l[[s]][rndpos]
                    gr <- GRanges(seqnames=rep(s, n[s]),
                                  ranges=IRanges(start=rndpos, width=1),
                                  seqinfo=seqinfo(object))
                    mcols(gr)[[pop]] <- dqf(rndsco)
                  }
                  gr
                }, gscopops[[pop]], nxs)

  unlist(do.call("GRangesList", pxs))
}

setMethod("rgscores", signature(n="GScores", object="missing"),
          function(n, object, ...) {
            rgscores(n=1L, object=n, ...)
          })

setMethod("rgscores", signature(n="missing", object="GScores"),
          function(n, object, ...) {
            rgscores(n=1L, object, ...)
          })

setMethod("rgscores", signature(n="numeric", object="GScores"),
          function(n=1, object, ...) {
            rgscores(n=as.integer(n), object, ...)
          })

setMethod("rgscores", signature(n="integer", object="GScores"),
          function(n=1L, object, ...) {
            ## default non-generic arguments
            paramNames <- c("pop", "ranges", "scores.only")
            pop <- defaultPopulation(object)
            ranges <- keepStandardChromosomes(GRanges(seqinfo=seqinfo(object)))
            scores.only <- FALSE

            ## get arguments
            arglist <- list(...)
            mask <- nchar(names(arglist)) == 0
            if (any(mask))
              names(arglist)[mask] <- paste0("X", 1:sum(mask))

            mask <- names(arglist) %in% paramNames
            if (any(!mask))
                stop(sprintf("unused argument (%s)", names(arglist)[!mask]))
            list2env(arglist, envir=sys.frame(sys.nframe()))

            objectname <- deparse(substitute(object))
            stopifnot(length(pop) == 1) ## QC
            gscopops <- get(object@data_pkgname, envir=object@.data_cache)

            if (!is.logical(scores.only) || length(scores.only) > 1)
              stop("'scores.only' must be one logical value (TRUE or FALSE).")

            if (!is(ranges, "GRanges") && !is.character(ranges))
              stop("'ranges' must be either a 'GRanges' object or a character vector of sequence names.")
            if (is.character(ranges)) {
              mask <- !ranges %in% seqlevels(object)
              if (any(mask))
                stop(sprintf("Sequence names %s not in %s.",
                             paste(ranges[mask], collapse=", "), objectname))
              slen <- seqlengths(object)[ranges]
              ranges <- GRanges(seqnames=ranges, IRanges(rep(1, length(ranges)), slen))
            }

            snames <- seqlevels(keepStandardChromosomes(seqinfo(object)))
            if (length(ranges) > 0) {
              snames <- unique(as.character(runValue(seqnames(ranges))))
              if (any(!snames %in% seqnames(object)))
                stop(sprintf("Sequence names %s in 'ranges' not present in reference genome %s.",
                             paste(snames[!snames %in% seqnames(object)], collapse=", "),
                             providerVersion(genomeDescription(object))))
            }

            if (!pop %in% populations(object))
              stop(sprintf("Population %s is not in 'GScores' object %s. Use 'populations()' to find out the available ones.",
                           pop, objectname))

            if (!pop %in% names(gscopops))
              gscopops[[pop]] <- List() ## RleList(compress=FALSE)

            missingMask <- !snames %in% names(gscopops[[pop]])
            if (any(missingMask)) {
              gscopops[[pop]] <- .fetch_scores_snrs(object, objectname, gscopops[[pop]],
                                                    pop, snames[missingMask])
            }

            gsco <- GRanges(seqinfo=seqinfo(object))
            if (hdf5Backend(object))
              gsco <- .rgscoresHDF5(object, gscopops, snames, n, pop, ranges)
            else
              gsco <- .rgscoresRle(object, gscopops, snames, n, pop, ranges)

            if (scores.only)
              gsco <- mcols(gsco)[[pop]]

            gsco
          })


## functions to find out genomic scores within genomic ranges
## feature request by https://github.com/rcastelo/GenomicScores/issues/20
setMethod("wgscores", c("GScores", "GenomicRanges"),
          function(x, ranges, ...) {
            ## default non-generic arguments
            paramNames <- c("pop", "type", "caching")
            pop <- defaultPopulation(x)
            type <- "snrs"
            caching <- TRUE

            ## get arguments
            arglist <- list(...)
            mask <- nchar(names(arglist)) == 0
            if (any(mask))
              names(arglist)[mask] <- paste0("X", 1:sum(mask))

            mask <- names(arglist) %in% paramNames
            if (any(!mask))
                stop(sprintf("unused argument (%s)", names(arglist)[!mask]))
            list2env(arglist, envir=sys.frame(sys.nframe()))

            if (!type %in% c("snrs", "nonsnrs"))
              stop("argument 'type' must be either 'snrs' (default) or 'nonsnrs'.")

            mask <- pop %in% populations(x)
            if (any(!mask))
              stop(sprintf("scores population %s is not present in %s. Please use 'populations()' to find out the available ones.",
                           pop[!mask], name(x)))

            ## adapt to sequence style and genome version from the input
            ## GScores object, thus assuming positions are based on the same
            ## genome even though might be differently specified
            ## (i.e., hg19 vs hs37d5 or hg38 vs GRCh38)
            if (length(intersect(seqlevelsStyle(ranges), seqlevelsStyle(x))) == 0)
              seqlevelsStyle(ranges) <- seqlevelsStyle(x)[1]
            commonSeqs <- intersect(seqlevels(ranges), seqlevels(x))
            if (any(is.na(genome(ranges)))) {
              genome(ranges) <- genome(x)
            } else if (any(genome(ranges)[commonSeqs] != genome(x)[commonSeqs])) {
              message(sprintf("assuming %s represent the same genome build between query ranges and the GScores object, respectively.",
                              paste(c(unique(genome(ranges)[commonSeqs]),
                                      unique(genome(x)[commonSeqs])),
                                    collapse=" and ")))
              genome(ranges) <- genome(x)
            }

            if (is.unsorted(ranges))
              ranges <- sort(ranges)

            ans <- NULL
            if (type == "snrs")
              ans <- .which_scores_snrs(x, ranges, pop, caching)
            else
              ans <- .which_scores_nonsnrs(x, ranges, pop, caching)

            if (is.unsorted(ans))
              ans <- sort(ans)

            ans
          })

.which_scores_snrs <- function(object, ranges, pop, caching=TRUE) {
  objectname <- deparse(substitute(object))
  if (length(ranges) == 0)
    return(numeric(0))

  snames <- unique(as.character(runValue(seqnames(ranges))))
  if (any(!snames %in% seqnames(object)))
    stop(sprintf("Sequence names %s in 'ranges' not present in reference genome %s.",
                 paste(snames[!snames %in% seqnames(object)], collapse=", "),
                 genome(object)[1]))

  gscopops <- get(object@data_pkgname, envir=object@.data_cache)
  if (all(names(gscopops) %in% seqlevels(object))) { ## temporary fix for annotations w/o populations
    tmp <- gscopops
    gscopops <- list()
    gscopops[[defaultPopulation(object)]] <- List() ## RleList(tmp, compress=FALSE)
    assign(object@data_pkgname, gscopops, envir=object@.data_cache)
  }

  missingMask <- !pop %in% names(gscopops)
  for (popname in pop[missingMask]) {
    gscopops[[popname]] <- List() ## RleList(compress=FALSE)
    if (GenomicScores:::hdf5Backend(object)) ## HDF5 backend, fetch common metadata from first population
      metadata(gscopops[[popname]]) <- metadata(gscopops[[1]])
  }
  anyMissing <- any(missingMask)

  ans <- vector(mode="list", length=length(pop))
  names(ans) <- pop
  for (popname in pop) {
    missingMask <- !snames %in% names(gscopops[[popname]])
    anyMissing <- anyMissing || any(missingMask)
    if (any(missingMask))
      gscopops[[popname]] <- .fetch_scores_snrs(object, objectname, gscopops[[popname]],
                                                popname, snames[missingMask])

    mdata <- metadata(gscopops[[popname]]) ## metadata in List object of HDF5 pkgs
    if ("Rle" %in% class(gscopops[[popname]][[1]]))
      mdata <- metadata(gscopops[[popname]][[1]])

    sco <- .rleWhichValues(gscopops[[popname]], ranges, mdata)
    mcols(sco)[[popname]] <- sco$score
    mcols(sco)[["score"]] <- NULL

    ans[[popname]] <- sco
  }

  if (anyMissing && caching)
    assign(object@data_pkgname, gscopops, envir=object@.data_cache)
  rm(gscopops)

  ans <- lapply(ans, function(x) {
                  seqlevels(x) <- seqlevels(object)
                  seqinfo(x) <- seqinfo(object)
                  x
                })
  ans <- Reduce(merge, ans)
  ans
}

.rleWhichValues <- function(rlelst, gr, mdata) {

  seqlevels(gr) <- names(rlelst)
  ans <- list()

  tmpans <- NA_real_
  ord <- order(seqnames(gr)) ## store ordering below in 'split()'
  startbyseq <- split(start(gr), seqnames(gr), drop=TRUE)
  widthbyseq <- split(width(gr), seqnames(gr), drop=TRUE)
  f <- function(r, p, w)
         sapply(seq_along(p),
                function(i, r, p, w) {
                  p[i] + which(r[p[i]:(p[i]+w[i]-1)] != raw(w[i])) - 1
                },
                r, p, w)

  tmpans <- lapply(mapply(f, rlelst[names(startbyseq)], startbyseq, widthbyseq,
                   SIMPLIFY=FALSE), as.vector)
  tmpans <- lapply(tmpans, unlist)

  ans <- tmpans[ord]
  ans <- GRanges(rep(names(ans), lengths(ans)),
                 IRanges(start=unlist(ans, use.names=FALSE), width=1))

  if (length(ans) > 0)
    ans$score <- .rleGetValues(rlelst, ans, mdata, mean)
  else
    ans$score <- numeric(0)
  ans
}

.which_scores_nonsnrs <- function(object, ranges, pop, caching=TRUE) {
  objectname <- deparse(substitute(object))
  gscononsnrs<- get(sprintf("%s.nonsnvs", name(object)), envir=object@.data_cache)

  if (length(intersect(seqlevelsStyle(ranges), seqlevelsStyle(object))) == 0)
    seqlevelsStyle(ranges) <- seqlevelsStyle(object)[1]

  snames <- unique(as.character(runValue(seqnames(ranges))))
  if (any(!snames %in% seqnames(object)))
    stop(sprintf("Sequence names %s in 'ranges' not present in reference genome %s.",
                 paste(snames[!snames %in% seqnames(object)], collapse=", "),
                 providerVersion(genomeDescription(object))))

  missingSeqMask <- !snames %in% names(gscononsnrs)
  anyMissing <- any(missingSeqMask)
  allpopnames <- character(0)
  if (length(gscononsnrs) > 0)
    allpopnames <- colnames(mcols(gscononsnrs[[1]]))
  missingPopMask <- !pop %in% allpopnames
  anyMissing <- anyMissing || any(missingPopMask)

  if (any(missingSeqMask) || any(missingPopMask)) ## if genomic scores belong to queried sequences that are not cached, load them
    gscononsnrs <- .fetch_scores_nonsnrs(object, objectname, gscononsnrs,
                                         pop[missingPopMask], snames[missingSeqMask])

  ans <- list()

  ov <- findOverlaps(ranges, unlist(gscononsnrs), minoverlap=0L, type="any")
  sHits <- subjectHits(ov)

  if (length(ov) > 0) {
    ans <- unlist(gscononsnrs)[sHits]
    mcols(ans)[setdiff(allpopnames, pop)] <- NULL
    q <- mcols(ans)
    .dequantizer <- metadata(gscononsnrs)$dqfun
    dqargs <- metadata(gscononsnrs)$dqfun_args
    lappargs <- c(list(X=q, FUN=.dequantizer), dqargs)
    mcols(ans) <- DataFrame(do.call("lapply", lappargs))
  }

  if (anyMissing && caching)
    assign(sprintf("%s.nonsnvs", name(object)), gscononsnrs, envir=object@.data_cache)
  rm(gscononsnrs)

  ## mcols(ranges) <- cbind(mcols(ranges), ans)
  ## seqlevels(ranges) <- seqlevels(object)
  ## seqinfo(ranges) <- seqinfo(object)

  ans
}
rcastelo/GenomicScores documentation built on May 5, 2024, 11:38 a.m.