R/functions_fetch_signal.R

Defines functions .get_unique_names .get_file_attribs quantileGRangesWidth prepare_fetch_GRanges shift_anchor viewGRangesWinSample_dt .check_qgr ssvFetchSignal

Documented in prepare_fetch_GRanges quantileGRangesWidth shift_anchor ssvFetchSignal viewGRangesWinSample_dt

# functions useful for fetching signal data regardless of source:
# bam, bigwig, etc.

#' signal loading framework
#'
#' Does nothing unless load_signal is overridden to carry out reading data from
#' file_paths (likely via the appropriate ssvFetch* function, ie.
#' \code{\link{ssvFetchBigwig}} or \code{\link{ssvFetchBam}}
#'
#' @param file_paths character vector of file_paths to load from. Alternatively,
#'   file_paths can be a data.frame or data.table whose first column is a
#'   character vector of paths and additial columns will be used as metadata.
#' @param qgr GRanges of intervals to return from each file
#' @param unique_names unique file ids for each file in file_paths.  Default is
#'   names of file_paths vector
#' @param names_variable character, variable name for column containing
#'   unique_names entries.  Default is "sample"
#' @param file_attribs optional data.frame/data.table with one row per item in
#' file paths.  Each column will be a variable added to final tidy output.
#' @param win_size numeric/integer window size resolution to load signal at.
#'   Default is 50.
#' @param win_method character.  one of c("sample", "summary").  Determines if
#'   \code{\link{viewGRangesWinSample_dt}} or
#'   \code{\link{viewGRangesWinSummary_dt}} is used to represent each region in
#'   qgr.
#' @param return_data.table logical. If TRUE data.table is returned instead of
#'   GRanges, the default.
#' @param load_signal function taking f, nam, and qgr arguments.  f is from
#'   file_paths, nam is from unique_names, and qgr is qgr. See details.
#' @param n_cores integer number of cores to use. Uses mc.cores option if not
#'   supplied.
#' @param n_region_splits integer number of splits to apply to qgr. The query
#'   GRanges will be split into this many roughly equal parts for increased
#'   parallelization. Default is 1, no split.
#' @param force_skip_centerFix boolean, if TRUE all query ranges will be used
#'   "as is".  This is already the case by default if win_method == "summary"
#'   but may have applications where win_method == "sample".
#' @details load_signal is passed f, nam, and qgr and is executed in the
#'   environment where load_signal is defined. See \code{\link{ssvFetchBigwig}}
#'   and \code{\link{ssvFetchBam}} for examples.
#' @return A GRanges with values read from file_paths at intervals of win_size.
#'   Originating file is coded by unique_names and assigned to column of name
#'   names_variable.  Output is data.table is return_data.table is TRUE.
#' @export
#' @import pbmcapply
#' @examples
#' library(GenomicRanges)
#' bam_f = system.file("extdata/test.bam",
#'     package = "seqsetvis", mustWork = TRUE)
#' bam_files = c("a" = bam_f, "b" = bam_f)
#' qgr = CTCF_in_10a_overlaps_gr[1:2]
#' qgr = resize(qgr, 500, "center")
#' load_bam = function(f, nam, qgr) {
#'     message("loading ", f, " ...")
#'     dt = seqsetvis:::ssvFetchBam.single(bam_f = f,
#'                       qgr = qgr,
#'                       win_size = 50,
#'                       fragLen = NULL,
#'                       target_strand = "*",
#'                       return_data.table = TRUE)
#'     dt[["sample"]] = nam
#'     message("finished loading ", nam, ".")
#'     dt
#' }
#' ssvFetchSignal(bam_files, qgr, load_signal = load_bam)
ssvFetchSignal = function(file_paths,
                          qgr,
                          unique_names = NULL,
                          names_variable = "sample",
                          file_attribs = NULL,
                          win_size = 50,
                          win_method = c("sample", "summary")[1],
                          return_data.table = FALSE,
                          load_signal = function(f, nam, qgr) {
                              warning("nothing happened, ",
                                      "supply a function to",
                                      "load_signal parameter.")
                          },
                          n_cores = getOption("mc.cores", 1),
                          n_region_splits = 1,
                          force_skip_centerFix = FALSE) {



    tmp = .get_file_attribs(file_paths, file_attribs)
    file_paths = tmp$file_paths
    file_attribs = tmp$file_attribs
    remove(tmp)
    unique_names = .get_unique_names(unique_names, file_paths, file_attribs, names_variable)
    file_attribs[[names_variable]] = unique_names
    if (is.null(file_attribs[[names_variable]])) {
        file_attribs[[names_variable]] = unique_names
    }
    if (is.null(names(file_paths))) {
        names(file_paths) = unique_names
    }
    stopifnot(is.character(file_paths))
    stopifnot(is(qgr, "GRanges"))
    stopifnot(is.character(unique_names))
    stopifnot(is.character(names_variable))
    stopifnot(is.numeric(win_size))
    if (any(duplicated(unique_names))) {
        stop("some unique_names are duplicated:\n",
             paste(collapse = "\n",
                   unique(unique_names[duplicated(unique_names)])), "\n",
             "If you haven't manually supplied uninque_names, this is derived from",
             " file paths supplied.  \nTry supplying unique_names manually if you ",
             "intended to load duplicate files.")
    }
    stopifnot(file.exists(file_paths))
    #if (win_method == "sample") {
    qgr = prepare_fetch_GRanges(qgr = qgr,
                                win_size = win_size,
                                target_size = NULL,
                                skip_centerFix = win_method != "sample" | force_skip_centerFix)
    #}
    qgr = .check_qgr(qgr)
    stopifnot(is.numeric(n_region_splits))
    if(n_region_splits > length(qgr)){
        n_region_splits = length(qgr)
    }
    n_region_splits = round(n_region_splits)
    stopifnot(n_region_splits >= 1)

    if(n_region_splits == 1){
        nam_load_signal = function(nam) {
            f = file_paths[nam]
            load_signal(f, nam, qgr)
        }
        if(n_cores == 1 | length(unique_names) == 1){
            bw_list = lapply(unique_names,
                             nam_load_signal)
        }else{
            bw_list = pbmcapply::pbmclapply(unique_names,
                                            nam_load_signal,
                                            mc.cores = n_cores)
        }

    }else{
        split_qgr = split(qgr, ceiling(seq_along(qgr)/length(qgr)*n_region_splits))
        file_attribs = file_attribs[rep(seq_len(nrow(file_attribs)), each = n_region_splits),, drop = FALSE]

        task_df = expand.grid(seq_along(split_qgr), unique_names)
        colnames(task_df) = c("region_var", "file_var")
        rownames(task_df) = paste(task_df$file_var, task_df$region_var)
        nam_load_signal = function(nam) {
            f = file_paths[task_df[nam,]$file_var]
            sub_qgr = split_qgr[[task_df[nam,]$region_var]]
            load_signal(f, task_df[nam,]$file_var, sub_qgr)
        }
        if(n_cores == 1){
            bw_list = lapply(rownames(task_df),
                             nam_load_signal)
        }else{
            bw_list = pbmcapply::pbmclapply(rownames(task_df),
                                            nam_load_signal,
                                            mc.cores = n_cores)
        }

    }


    for (i in seq_along(bw_list)) {
        for (attrib in colnames(file_attribs)) {
            if(nrow(bw_list[[i]]) == 0){
                bw_list[[i]][[attrib]] = file_attribs[[attrib]][0]
            }else{
                bw_list[[i]][[attrib]] = file_attribs[[attrib]][i]
            }
        }
    }
    out = data.table::rbindlist(bw_list)
    out[[names_variable]] = factor(out[[names_variable]], levels = unique_names)
    if (!return_data.table) {
        out = GRanges(out)
    }
    return(out)
}

.check_qgr = function(qgr){
    if(is.null(qgr$id)){#need id
        if (!is.null(qgr$name)) {
            qgr$id = qgr$name
        }else if (!is.null(names(qgr))) {
            qgr$id = names(qgr)
        }else{
            qgr$id = paste0("region_", seq_along(qgr))
        }
    }
    if(any(duplicated(qgr$id))) stop("qgr$id must be unique. qgr$id is either directly supplied or taken from qgr$name or names(qgr).")
    names(qgr) = qgr$id
    qgr
}

#' get a windowed sampling of score_gr
#'
#' This method is appropriate when all GRanges in qgr are identical width and
#' when it is practical to use a window_size smaller than features in genomic
#' signal.  For instance, when retrieving signal around peaks or promoters this
#' method maintains a fixed genomic scale across regions.  This allows meaingful
#' comparison of peak widths can be made.
#'
#' Summarizes score_gr by grabbing value of "score" every window_size bp.
#' Columns in output data.table are: standard GRanges columns: seqnames, start,
#' end, width, strand id - matched to names(score_gr). if names(score_gr) is
#' missing, added as 1:length(score_gr). y - value of score from score_gr. x -
#' relative bp position.
#'
#' @param score_gr GRanges with a "score" metadata column.
#' @param qgr regions to view by window.
#' @param window_size qgr will be represented by value from score_gr every
#'   window_size bp.
#' @param attrib_var character name of attribute to pull data from. Default is
#'   "score", compatible with with bigWigs or bam coverage.
#' @param fill_value numeric or character value to use where queried regions are
#'   empty.  Default is 0 and appropriate for both calculated coverage and
#'   bedgraph/bigwig like files.  Will automatically switch to "MISSING" if data
#'   is guessed to be qualitative.
#' @param anchor character. controls how x value is derived from position for
#'   each region in qgr.  0 may be the left side or center.  If not unstranded,
#'   x coordinates are flipped for (-) strand. One of c("center",
#'   "center_unstranded", "left", "left_unstranded"). Default is "center".
#' @return data.table that is GRanges compatible
#' @export
#' @examples
#' bam_file = system.file("extdata/test.bam",
#'     package = "seqsetvis")
#' qgr = CTCF_in_10a_overlaps_gr[seq_len(5)]
#' qgr = GenomicRanges::resize(qgr, width = 500, fix = "center")
#' bam_gr = seqsetvis:::fetchBam(bam_file, qgr)
#' bam_dt = viewGRangesWinSample_dt(bam_gr, qgr, 50)
#'
#' if(Sys.info()['sysname'] != "Windows"){
#'     bw_file = system.file("extdata/MCF10A_CTCF_FE_random100.bw",
#'         package = "seqsetvis")
#'     bw_gr = rtracklayer::import.bw(bw_file, which = qgr)
#'     bw_dt = viewGRangesWinSample_dt(bw_gr, qgr, 50)
#' }
viewGRangesWinSample_dt = function(score_gr,
                                   qgr,
                                   window_size,
                                   attrib_var = "score",
                                   fill_value = 0,
                                   anchor = c("center", "center_unstranded",
                                              "left", "left_unstranded")[1]) {
    #reserve bindings for data.table
    x = id = NULL
    stopifnot(is(score_gr, "GRanges"))
    stopifnot(!is.null(mcols(score_gr)[[attrib_var]]))
    stopifnot(is(qgr, "GRanges"))
    if(is.null(qgr$id)) qgr = .check_qgr(qgr)
    stopifnot(is.numeric(window_size))
    stopifnot(window_size >= 1)
    stopifnot(window_size %% 1 == 0)
    stopifnot(anchor %in% c("center", "center_unstranded",
                            "left", "left_unstranded"))
    windows = slidingWindows(qgr, width = window_size, step = window_size)

    # names(windows) = qgr$id
    # windows's handling of names seems to have changed and now every nest
    # GRanges has parent's name
    names(windows) = NULL
    windows = unlist(windows)
    windows$id = names(windows)
    windows = resize(windows, width = 1, fix = "center")
    olaps = suppressWarnings(data.table::as.data.table(
        findOverlaps(
            query = windows,
            subject = score_gr,
            ignore.strand = TRUE
        )
    ))
    # patch up missing/out of bound data with 0
    missing_idx = setdiff(seq_along(windows), olaps$queryHits)
    if (length(missing_idx) > 0) {
        olaps = rbind(
            olaps,
            data.table::data.table(
                queryHits = missing_idx,
                subjectHits = length(score_gr) + 1
            )
        )[order(queryHits)]
        suppressWarnings({
            patch_gr = GRanges("chrPatchZero",
                               IRanges::IRanges(1, 1))
            mcols(patch_gr)[[attrib_var]] = fill_value
            score_gr = c(score_gr,
                         patch_gr)
        })
    }
    # set y and output windows = windows[olaps$queryHits]
    windows$y = fill_value
    windows[olaps$queryHits]$y = mcols(score_gr[olaps$subjectHits])[[attrib_var]]
    score_dt = data.table::as.data.table(windows)

    return(shift_anchor(score_dt, window_size, anchor))
}


#' Summarizes signal in bins.  The same number of bins per region in qgr is used
#' and widths can vary in qgr, in contrast to
#' \code{\link{viewGRangesWinSample_dt}} where width must be constant across
#' regions.
#'
#' This function is most appropriate where features are expected to vary greatly
#' in size and feature boundaries are important, ie. gene bodies, enhancers or
#' TADs.
#'
#' Columns in output data.table are: standard GRanges columns: seqnames, start,
#' end, width, strand id - matched to names(score_gr). if names(score_gr) is
#' missing, added as seq_along(score_gr). y - value of score from score_gr x -
#' relative bp position
#' @param score_gr GRanges with a "score" metadata column.
#' @param qgr regions to view by window.
#' @param n_tiles numeric >= 1, the number of tiles to use for every region in
#'   qgr.
#' @param attrib_var character name of attribute to pull data from. Default is
#'   "score", compatible with with bigWigs or bam coverage.
#' @param attrib_type one of NULL, qualitative or quantitative.  If NULL will
#'   attempt to guess by casting attrib_var attribute to character or factor.
#'   Default is NULL.
#' @param fill_value numeric or character value to use where queried regions are
#'   empty.  Default is 0 and appropriate for both calculated coverage and
#'   bedgraph/bigwig like files.  Will automatically switch to "MISSING" if data
#'   is guessed to be qualitative.
#' @param anchor character. controls how x value is derived from position for
#'   each region in qgr.  0 may be the left side or center.  If not unstranded,
#'   x coordinates are flipped for (-) strand. One of c("center",
#'   "center_unstranded", "left", "left_unstranded"). Default is "center".
#' @param summary_FUN function. used to aggregate score by tile.  must accept
#'   x=score and w=width numeric vectors as only arguments. default is
#'   weighted.mean.  limma::weighted.median is a good alternative.
#' @return data.table that is GRanges compatible
#' @export
#' @importFrom stats weighted.mean
#' @examples
#' bam_file = system.file("extdata/test.bam",
#'     package = "seqsetvis")
#' qgr = CTCF_in_10a_overlaps_gr[1:5]
#' # unlike viewGRangesWinSample_dt, width is not fixed
#' # qgr = GenomicRanges::resize(qgr, width = 500, fix = "center")
#' bam_gr = seqsetvis:::fetchBam(bam_file, qgr)
#' bam_dt = viewGRangesWinSummary_dt(bam_gr, qgr, 50)
#'
#' if(Sys.info()['sysname'] != "Windows"){
#'     bw_file = system.file("extdata/MCF10A_CTCF_FE_random100.bw",
#'         package = "seqsetvis")
#'     bw_gr = rtracklayer::import.bw(bw_file, which = qgr)
#'     bw_dt = viewGRangesWinSummary_dt(bw_gr, qgr, 50)
#' }
viewGRangesWinSummary_dt = function (score_gr,
                                     qgr,
                                     n_tiles = 100,
                                     attrib_var = "score",
                                     attrib_type = NULL,
                                     fill_value = 0,
                                     anchor = c("center", "center_unstranded",
                                                "left", "left_unstranded")[1],
                                     summary_FUN = stats::weighted.mean) {
    #reserve bindings for data.table
    x = id = tile_start = tile_end = tile_id =
        tile_width = scored_width = tile_density = score_width = NULL
    stopifnot(is(score_gr, "GRanges"))
    stopifnot(!is.null(mcols(score_gr)[[attrib_var]]))
    stopifnot(is(qgr, "GRanges"))
    if(is.null(qgr$id)) qgr = .check_qgr(qgr)
    stopifnot(is.numeric(n_tiles))
    stopifnot(n_tiles >= 1)
    stopifnot(n_tiles %% 1 == 0)
    stopifnot(anchor %in% c("center", "center_unstranded", "left",
                            "left_unstranded"))
    tiles = tile(qgr, n_tiles)
    names(tiles) = NULL
    tiles = unlist(tiles)
    tiles$id = names(tiles)
    tiles$tile_id = seq_along(tiles)


    olaps = suppressWarnings(data.table::as.data.table(
        findOverlaps(
            query = tiles,
            subject = score_gr,
            ignore.strand = TRUE
        )
    ))
    #fill in gaps with zeroes
    missing_idx = setdiff(seq_along(tiles), olaps$queryHits)
    if (length(missing_idx) > 0) {
        olaps = rbind(
            olaps,
            data.table::data.table(
                queryHits = missing_idx,
                subjectHits = length(score_gr) + 1
            )
        )[order(queryHits)]
        suppressWarnings({
            patch_gr = GRanges("chrPatchZero",
                               IRanges::IRanges(1, 1))
            mcols(patch_gr)[[attrib_var]] = NA
            score_gr = c(score_gr,
                         patch_gr
            )
        })
    }
    mcols(score_gr) = mcols(score_gr)[attrib_var]
    cov_dt = cbind(as.data.table(score_gr[olaps$subjectHits])[, -c(1, 4:5)],
                   as.data.table(tiles[olaps$queryHits])[, -c(1, 4:5)])
    colnames(cov_dt)[4:5] = c("tile_start", "tile_end")

    cov_dt[start == 1 &
               end == 1, c("start", "end") := list(tile_start, tile_end)]
    #trim score regions to fit in tiles so score weighting is accurate

    cov_dt[start < tile_start, start := tile_start]
    cov_dt[end > tile_end, end := tile_end]
    cov_dt[, score_width := end - start + 1]
    cov_dt[, tile_width := tile_end - tile_start + 1]

    if(all(is.na(cov_dt[[attrib_var]]))){
        cov_dt[[attrib_var]] = fill_value
    }
    set(cov_dt, i = which(is.na(cov_dt[[attrib_var]])), j = attrib_var, value = fill_value)

    if(is.null(attrib_type)){
        if(any(class(cov_dt[[attrib_var]]) %in% c("character", "factor"))){
            attrib_type = "qualitative"
            if(fill_value == 0){
                fill_value = "MISSING"
            }
        }else{
            attrib_type = "quantitative"
        }
    }

    # check for incompletely retrieved regions (zeroes omitted for instance)
    check_dt = cov_dt[, list(scored_width = sum(score_width)), by = list(tile_id, id, tile_width)]
    # check_dt$tile_width = width(tiles)
    #add a dummy interval of score zero to correct width
    repair_dt = check_dt[tile_width != scored_width, list(
        start = -1,
        end = -1,
        score = fill_value,
        tile_start = -1,
        tile_end = -1,
        id = id,
        tile_id = tile_id,
        score_width = tile_width - scored_width,
        tile_width
    )]
    setnames(repair_dt, "score", attrib_var)
    cov_dt = rbind(cov_dt, repair_dt)




    if(attrib_type == "qualitative"){
        density_dt = cov_dt[, list(tile_density = summary_FUN(score_width / tile_width, rep(1, length(width)))),
                            by = c("tile_id", "id", attrib_var)]
    }else if(attrib_type == "quantitative"){
        density_dt = cov_dt[, list(tile_density = summary_FUN(get(attrib_var), score_width)),
                            by = list(tile_id, id)]
    }else{
        stop("attrib_type must be one of qualitative or quantitative")
    }

    tiles = as.data.table(tiles)
    tiles[, x := (seq_len(n_tiles) - .5) / n_tiles, by = id]

    # density_dt[, x := (seq_len(n_tiles) - .5) / n_tiles, by = id]
    if (!all(tiles$id %in% density_dt$id))
        stop("something bad happened when merging density ",
             "data.table back to tiles GRanges.")

    if(attrib_type == "qualitative"){
        score_dt = merge(tiles, density_dt[, list(y = tile_density, id, tile_id, ATTRIB_VAR = get(attrib_var))], by = c("tile_id", "id"))
        setnames(score_dt, "ATTRIB_VAR", attrib_var)
        #score_dt[is.na(quality), quality := fill_value]
    }else{
        score_dt = merge(tiles, density_dt[, list(y = tile_density, id, tile_id)], by = c("tile_id", "id"))
        #score_dt[is.na(y), y := fill_value]
    }
    score_dt$tile_id = NULL
    #slightly different than summary,
    #x is already set and regions are already contiguous.
    #just need to flip x or center as needed.
    switch(
        anchor,
        center = {
            center_val = mean((seq_len(n_tiles) - .5) / n_tiles)
            score_dt[, x := x - center_val, by = id]
            score_dt[strand == "-", x := (-1 * x)]
        },
        center_unstranded = {
            center_val = mean((seq_len(n_tiles) - .5) / n_tiles)
            score_dt[, x := x - center_val, by = id]
        },
        left = {
            score_dt[strand == "-", x := (1 - 1 * x), by = id]
        },
        left_unstranded = {
            #do nothing
        }
    )

    #ensure x is rounded and easily flippable
    score_dt$x = round(score_dt$x, 9)

    score_dt
}

#' orients the relative position of x's zero value and
#' extends ranges to be contiguous
#'
#' @param score_dt data.table, GRanges() sufficient
#' @param window_size numeric, window size used to generate score_dt
#' @param anchor character, one of c("center", "center_unstranded",
#' "left", "left_unstranded")
#' @return score_dt with x values shifted appropriately and start and end
#' extended to make ranges contiguous
shift_anchor = function(score_dt, window_size, anchor) {
    x = id = NULL
    shift = round(window_size / 2)
    fudge = 1
    if(window_size == 1) fudge = 0
    switch(
        anchor,
        center = {
            score_dt[, x := start - min(start) + shift, by = id]
            score_dt[, x := x - round(mean(x)), by = id]
            if(window_size %% 2 == 0){
                score_dt[strand == "-", x := -1 * x]
            }else{
                score_dt[strand == "-", x := -1 * x + fudge]
            }

        },
        center_unstranded = {
            score_dt[, x := start - min(start) + shift, by = id]
            score_dt[, x := x - round(mean(x)), by = id]
        },
        left = {
            score_dt[, x := -1]
            score_dt[strand != "-", x := start - min(start) + shift,
                     by = id]
            #flip negative

            score_dt[strand == "-", x := -1 * (end - max(end) - shift),
                     by = id]

        },
        left_unstranded = {
            score_dt[, x := start - min(start) + shift, by = id]
        }
    )

    score_dt[, start := start - shift]
    score_dt[, end := end + window_size - shift - 1]
    return(score_dt)
}

#' prepares GRanges for windowed fetching.
#'
#' output GRanges parallels input with consistent width evenly divisible by
#' win_size.  Has warning if GRanges needed resizing, otherwise no warning
#' and input GRanges is returned unchanged.
#'
#' @param qgr GRanges to prepare
#' @param win_size numeric window size for fetch
#' @param min_quantile numeric [0,1], lowest possible quantile value.  Only
#' relevant if target_size is not specified.
#' @param target_size numeric final width of qgr if known. Default of NULL
#' leads to quantile based determination of target_size.
#' @param skip_centerFix boolean, if FALSE (default) all regions will be resized
#' GenomicRanges::resize(x, w, fix = "center") to a uniform size based on
#' min_quantile to a width divisible by win_size.
#' @return GRanges, either identical to qgr or with suitable consistent width
#' applied.
#' @export
#' @examples
#' qgr = prepare_fetch_GRanges(CTCF_in_10a_overlaps_gr, win_size = 50)
#' #no warning if qgr is already valid for windowed fetching
#' prepare_fetch_GRanges(qgr, win_size = 50)
prepare_fetch_GRanges = function(qgr,
                                 win_size,
                                 min_quantile = .75,
                                 target_size = NULL,
                                 skip_centerFix = FALSE) {
    if (!skip_centerFix &&
        (length(unique(width(qgr))) > 1 || width(qgr)[1] %% win_size != 0)) {
        if (is.null(target_size)) {
            target_size = quantileGRangesWidth(
                qgr = qgr,
                min_quantile = min_quantile,
                win_size = win_size
            )
        }
        if (target_size %% win_size != 0) {
            stop(
                "target_size: ",
                target_size,
                " not evenly divisible by win_size: ",
                win_size
            )
        }

        qgr = centerFixedSizeGRanges(qgr, fixed_size = target_size)
        message(
            "widths of qgr were not ",
            "identical and evenly divisible by win_size.",
            "\nA fixed width of ",
            target_size,
            " was applied based on the data provided."
        )
    }
    if (any(start(qgr) < 1)) {
        warning("Some out of bounds GRanges had to be shifted back to start >= 1.")
        fix_shift = 1 - start(qgr)
        fix_shift = vapply(fix_shift, function(x)
            max(x, 0), FUN.VALUE = 1)
        qgr = IRanges::shift(qgr, fix_shift)
    }
    return(qgr)
}

#' Quantile width determination strategy
#'
#' Returns the lowest multiple of win_size greater than
#' min_quantile quantile of width(qgr)
#'
#' @param qgr GRanges to calculate quantile width for
#' @param min_quantile numeric [0,1] the minimum quantile of width in qgr
#' @param win_size numeric/integer >=1, returned value will be a multiple of
#' this
#' @return numeric that is >= min_quantile and evenly divisible by win_size
#' @export
#' @examples
#' gr = CTCF_in_10a_overlaps_gr
#' quantileGRangesWidth(gr)
#' quantileGRangesWidth(gr, min_quantile = .5, win_size = 100)
quantileGRangesWidth = function(qgr,
                                min_quantile = .75,
                                win_size = 1) {
    stopifnot(is(qgr, "GRanges"))
    stopifnot(is.numeric(min_quantile), is.numeric(win_size))
    stopifnot(min_quantile >= 0 && min_quantile <= 1)
    stopifnot(length(min_quantile) == 1 && length(win_size) == 1)
    stopifnot(win_size %% 1 == 0)
    stopifnot(win_size >= 1)
    qwidth = quantile(width(qgr), min_quantile)
    fwidth = ceiling(qwidth / win_size) * win_size
    return(fwidth)
}

.get_file_attribs = function(file_paths, file_attribs){
    if(is.data.table(file_paths)){
        file_paths = as.data.frame(file_paths)
    }
    if(is.data.table(file_attribs)){
        file_attribs = as.data.frame(file_attribs)
    }
    if(is.null(file_attribs)){
        if (is.data.frame(file_paths)) {
            if (ncol(file_paths) == 1) {
                file_attribs = data.frame(matrix(
                    0, nrow = nrow(file_paths), ncol = 0
                ))
            } else{
                k = which(grepl("file", colnames(file_paths)))[1]
                if(is.na(k)) k = 1
                file_attribs = file_paths[,-k, drop = FALSE]
            }

        } else{
            #file_paths is assumed to be a character vector
            file_attribs = data.frame(data.frame(matrix(
                0, nrow = length(file_paths), ncol = 0
            )))
        }
    }
    if (is.data.frame(file_paths)) {
        if(any(grepl("file", colnames(file_paths)))){
            k = which(grepl("file", colnames(file_paths)))[1]
            file_paths = file_paths[[k]]
        }else{
            file_paths = file_paths[[1]]
        }
    }

    if (is.list(file_paths)) {
        file_paths = unlist(file_paths)
    }
    if (is.factor(file_paths)){
        file_paths = as.character(file_paths)
    }
    return(list(file_paths = file_paths, file_attribs = file_attribs))
}

.get_unique_names = function(unique_names, file_paths, file_attribs, names_variable){
    if(is.null(unique_names)){
        if(!is.null(file_attribs[[names_variable]])){
            unique_names = file_attribs[[names_variable]]
        }else if (is.data.frame(file_paths) || is.data.table(file_paths)) {
            if(is.null(colnames(file_paths))){
                unique_names = file_paths[[1]]
            }else{
                if(any(grepl("file", colnames(file_paths)))){
                    k = which(grepl("file", colnames(file_paths)))[1]
                    unique_names = file_paths[[k]]
                }else{
                    unique_names = file_paths[[1]]
                }
            }
        }else{
            if(is.null(names(file_paths))){
                unique_names = file_paths
            }else{
                unique_names = names(file_paths)
            }

        }
    }
    if (is.factor(unique_names)){
        unique_names = levels(droplevels(unique_names))
    }
    if(any(duplicated(unique_names))){
        stop("some unique_names are duplicated:\n",
             paste(collapse = "\n",
                   unique(unique_names[duplicated(unique_names)])), "\n",
             "If you haven't manually supplied uninque_names, this is derived from",
             " file paths supplied.  \nTry supplying unique_names manually if you ",
             "intended to load duplicate files.")
    }
    unique_names
}

Try the seqsetvis package in your browser

Any scripts or data that you put into this service are public.

seqsetvis documentation built on Nov. 8, 2020, 5:57 p.m.