recoup: Create genomic signal profiles in predefined or custom areas...
In pmoulos/recoup: An R package for the creation of complex genomic profile plots

recoup

R Documentation

Create genomic signal profiles in predefined or custom areas using short sequence reads

Description

This function calculates and plots signal profiles created from short sequence reads derived from Next Generation Sequencing technologies. The profiles provided are either sumarized curve profiles or heatmap profiles. Currently, recoup supports genomic profile plots for reads derived from ChIP-Seq and RNA-Seq experiments. The function uses ggplot2 and ComplexHeatmap graphics facilities for curve and heatmap coverage profiles respectively. The output list object can be reused as input to this function which will automatically recognize which profile elements needto be recalculated, saving time.

Usage

    recoup(
        input,
        design = NULL,
        region = c("genebody", "tss", "tes", "utr3", "custom"),
        type = c("chipseq", "rnaseq"),
        signal = c("coverage", "rpm"),
        genome = c("hg18", "hg19", "hg38", "mm9" ,"mm10",
            "rn5", "rn6", "dm3", "dm6", "danrer7", "danrer10",
            "pantro4", "pantro5", "susscr3", "susscr11", 
            "ecucab2", "tair10"),
        version = "auto",
        refdb = c("ensembl", "ucsc", "refseq"),
        flank = c(2000, 2000),
        onFlankFail = c("drop","trim"),
        fraction = 1,
        orderBy = list(
            what = c("none", "suma", "sumn", 
            "maxa","maxn", "avga", "avgn", "hcn"), 
            order = c("descending", "ascending"),
            custom = NULL
        ),
        binParams = list(
            flankBinSize = 0,
            regionBinSize = 0,
            sumStat = c("mean", "median"),
            interpolation = c("auto", "spline", "linear", 
                "neighborhood"),
            binType = c("variable", "fixed"),
            forceHeatmapBinning = TRUE,
            forcedBinSize = c(50, 200),
            chunking = FALSE
        ),
        selector = NULL,
        preprocessParams = list(
            fragLen = NA,
            cleanLevel = c(0, 1, 2, 3),
            normalize = c("none", "linear", 
                "downsample","sampleto"),
            sampleTo = 1e+6,
            spliceAction = c("split", "keep", "remove"),
            spliceRemoveQ = 0.75,
            bedGenome = NA
        ),
        plotParams = list(
            plot = TRUE,
            profile = TRUE,
            heatmap = TRUE,
            correlation = TRUE,
            signalScale = c("natural", "log2"),
            heatmapScale = c("common", "each" ),
            heatmapFactor = 1,
            corrScale = c("normalized", "each"),
            sumStat = c("mean", "median"),
            smooth = TRUE,
            corrSmoothPar = ifelse(is.null(design), 0.1, 
                0.5),
            singleFacet = c("none", "wrap", "grid"),
            multiFacet = c("wrap", "grid"),
            singleFacetDirection = c("horizontal", "vertical"),
            conf = TRUE,
            device = c("x11", "png", "jpg", "tiff", "bmp",
                "pdf", "ps"),
            outputDir = ".",
            outputBase = NULL
        ),
        saveParams = list(
            ranges = TRUE,
            coverage = TRUE,
            profile = TRUE,
            profilePlot = TRUE,
            heatmapPlot = TRUE,
            correlationPlot = TRUE
        ),
        kmParams = list(
            k = 0,
            nstart = 20,
            algorithm = c("Hartigan-Wong", 
                "Lloyd", "Forgy", "MacQueen"),
            iterMax = 20,
            reference = NULL
        ),
        strandedParams = list(
            strand = NULL, 
            ignoreStrand = TRUE
        ),
        ggplotParams = list(
            title = element_text(size = 12),
            axis.title.x = element_text(size = 10, 
                face = "bold"),
            axis.title.y = element_text(size = 10, 
                face = "bold"),
            axis.text.x = element_text(size = 9, 
                face = "bold"),
            axis.text.y = element_text(size = 10, 
                face = "bold"),
            strip.text.x = element_text(size = 10, 
                face = "bold"),
            strip.text.y = element_text(size = 10, 
                face = "bold"),
            legend.position = "bottom",
            panel.spacing = grid::unit(1, "lines")
        ),
        complexHeatmapParams = list(
            main = list(
                cluster_rows = ifelse(length(grep(
                    "hc", orderBy$what)) > 0, TRUE, FALSE),
                cluster_columns = FALSE,
                column_title_gp = grid::gpar(fontsize = 10, 
                    font = 2),
                show_row_names = FALSE,
                show_column_names = FALSE,
                heatmap_legend_param = list(
                    color_bar = "continuous"
                )
            ),
            group=list(
                cluster_rows = ifelse(length(grep(
                    "hc", orderBy$what)) > 0, TRUE, FALSE),
                cluster_columns = FALSE,
                column_title_gp = grid::gpar(fontsize = 10, 
                    font = 2),
                show_row_names = FALSE,
                show_column_names = FALSE,
                row_title_gp = grid::gpar(fontsize = 8,
                    font = 2),
                gap = unit(5, "mm"),
                heatmap_legend_param = list(
                    color_bar = "continuous"
                )
            )
        ),
        bamParams = NULL,
        onTheFly = FALSE,
        localDb = file.path(system.file(package = "recoup"),
            "annotation.sqlite"),
        rc = NULL
    )

Arguments

`input`	the main input to `recoup` can be either a list or a configuration file (with essentially the same contents as the list). In case of list input, it is a list of n lists, where n the number of samples. See Details for the inner list contents. Alternatively, `input` can be a text tab delimited file with a specific header (the same fields as each inner list when `input` is a list) and one row for each sample. Again, see Details section for the field specifications.
`design`	either a data frame with grouping factors as columns (e.g. two grouping factors can be strand, and Ensembl biotype) or a tab delimited text file with the same content (grouping factors in columns). If a data frame, the `row.names` attribute must correspond to the names (e.g. rownames) of the `genome` argument, or be a superset or subset of them. If a file, the first column must correspond to the names (e.g. rownames) of the `genome` argument or be a superset or subset of them.
`region`	one of `"tss"`, `"tes"`, `"genebody"`, `"custom"`.
`type`	one of `"chipseq"`, `"rnaseq"`.
`signal`	plots signal based on coverage (`"coverage"` default) or reads per million `"rpm"` (experimental!).
`genome`	when `region` is `"tss"`, `"tes"` or `"genebody"`, `genome` can be one of `"hg38"`, `"hg19"`, `"hg18"`, `"mm10"`, `"mm9"`, `"dm3"`, `"rn5"`, `"danrer7"`, `"pantro4"`, `"susscr3"` for human, mouse, fruitfly, rat, zebrafish and chimpanzee genomes respectively. When when `region` is one of the above or `"custom"`, `genome` can be a tab delimited BED-like text file or a data frame.
`version`	the version of `genome` to use as related to `source`. Either `"auto"` (default) or a numeric value representing an Ensembl version or the creation date of UCSC or RefSeq annotations.
`refdb`	one of `"ensembl"`, `"ucsc"` or `"refseq"`. It will be used to retrieve genomic reference regions when `genome` argument is one of the supported organisms.
`flank`	a vector of length two with the number of base pairs to flank upstream and downstream the `region`. Minimum flank is 0bp and maximum is 50kb. It is always expressed in bp.
`onFlankFail`	action to be taken when flanking causes the requested plot genomic coordinates to go beyond the lengths of reference sequences (e.g. chromosomes). It can be `"drop"` (default) or `"trim"`. Note that trimming will cause the flanking and main regions to be merged in genebody plots and therefore possibly reducing plot resolution.

regionMinimum flank is 0bp and maximum is 50kb. It is always expressed in bp.

`fraction`	a number from 0 to 1 (default) denoting the fraction of total data to be used. See Details for further information.
`orderBy`	a named list whose members control the order of the genomic regions (related to the `genome` and `region` arguments as they appear in heatmap profiles. The list has the following fields: `what`: one of `"none"` (default), `"suma"`, `"sumn"`, `"maxa"`, `"maxn"`, `"avga"`, `"avgn"`, `"hcn"`, where `n` in `"sumn"`, `"maxn"`, `"hcn"` is the index of the profile which sould be used as reference. See Details for further information. `order`: either `"descending"` (default) for ordering coverages from highest to lowest, or `"ascending"` for the opposite. `custom`: a numeric vector of custom values (e.g. RNA abundance) that will be used to sort all the profiles. If provided, `what` will be ignored. Defaults to `NULL`.
`binParams`	a named list whose members control the resolution of the coverage profiles. The list has the following fields: `flankBinSize`: the number of intervals (bins) into which the upstream and downstream regions are split and the per-base coverage is averaged across. If `0` (default), no binning is performed and the profiles are calculated based at the base-pair level (the highest possible resolution). `regionBinSize`: the number of intervals (bins) into which the main region is split and the per-base coverage is averaged across. If `0` (default), no binning is performed and the profiles are calculated based at the base-pair level (the highest possible resolution). `sumStat`: the statistic which is used to summarize the bin coverage. Can be `"mean"` (default) or `"median"`. `interpolation`: the interpolation method to be used for coverage interpolation when the reference regions are of unequal lengths (e.g. gene bodies) and the `regionBinSize` is larger than some of the former. Can be `"auto"` (default), `"spline"`, `"linear"` or `"neighborhood"`. See Details for further explanations of each option. `binType`: the type of bins (variable or fixed) when `signal="rpm"`. It defaults to `"variable"`. See Details for further info. `forceHeatmapBinning`: if `TRUE` (default) and the profile resolution is very high (see `flankBinSize` and `regionBinSize` above), binning is applied prior to heatmap profile generation, otherwise the heatmaps will be oversized and will take a lot of time to render. Set to `FALSE` if both `flankBinSize` and `regionBinSize` are not zero so as to avoid unecessary profile recalculations. `forcedBinSize`: a vector with two integers representing the `flankBinSize` and `regionBinSize` to be used with `forceHeatmapBinning` above. `chunking`: `TRUE` (default) or `FALSE`. When `TRUE`, recoup will try to chunk the data for profile matrix calculation. See Details for a few further notes in the usage of `binParams`.
`selector`	a named list whose members control some subsetting abilities regarding the input reference genomic regions (`genome` argument) or NULL (default) when the `genome` argument is/may be custom. When list, it has the following fields: `id`: a vector of ids of the same type as those present in the `genome` file or organism type/version. `bioype`: a vector of Ensembl biotypes that will be used to filter the `genome` when the latter is one of the supported organisms. Not used when `genome` is a custom file. `exonType`: currently not used.
`preprocessParams`	a named list whose members control certain preprocessing steps applied to the `GRanges` objects obtained while or after reading the input BAM/BED files with short reads or BigWig files with processed signals. The list has the following fields: `fragLen`: the expected DNA fragment length. Reads will be extended (or truncated) to this size. Not used for RNA-Seq type plots. `cleanLevel`: integer from 0 (default) to 3, controlling read filtering level prior to profile generation. See details for further information. `normalize`: one of `"none"` (default), `"linear", "downsample", "sampleto"`. Controls how the coverages are normalized across samples. See Details for explanation of these options. `sampleTo`: a fixed library size for downsampling to be used with `"sampleto"` option above. If a sample has less reads than this fixed size, it is silently reported as is. `spliceAction`: one of `"split"` (default), `"keep", "remove"`. Controls the action to be performed with spliced reads in the case of RNA-Seq samples. See Details for explanation of these options. `spliceRemoveQ`: the quantile of putative joint spliced read length to be used for read filtering when `spliceAction` is `"remove"`. See Details for further explanations. `bedGenome`: one of the supported genomes, as when reading from bed files, chromosomal lenghts are not available and must be retrieved with another way.
`plotParams`	a named list whose members control profile (curve and heatmap) plotting parameters. The list has the following fields: `plot`: if set to `TRUE` (default), the plots created with the calculated profiles with `recoup` are displayed. Set to `FALSE` to plot later using the output object. `profile`: if set to `TRUE` (default), the average coverage profile across the genomic regions of preference is calculated. Set to `FALSE` to suppress this. `heatmap`: if set to `TRUE` (default), the coverage heatmap profile across the genomic regions of preference is calculated. Set to `FALSE` to suppress this. `correlation`: if set to `TRUE` (default), the plots created with the calculated coverage correlations are displayed. Set to `FALSE` to plot later using the output object. `signalScale`: one of `"natural"` (default) or `"log2"` to control the signal scale of the final coverage plots. Hint: use `log2` scale for RNA-Seq profiles as it produces much smoother plots. `heatmapScale`: one of `"common"` (default) or `"each"`. When `"common"`, a common heatmap color scale is calculated for all samples. When `"each"`, each heatmap has its own color scale. `heatmapFactor`: a positive numeric value by which the upper color scale limit of the heatmap profile is multiplied. Defaults to `1`. See Details for further information. `corrScale`: either `"normalized"` (default) or `"each"`. Controls the scale display in coverage correlation plots. See Details for further information. `sumStat`: the statistic which is used to summarize coverage matrices. Can be `"mean"` (default) or `"median"`. `smooth`: if `TRUE` (default), the final curve profiles are smoothed using splines. Set to `FALSE` for no smoothing. If the reference genomic regions are many, the differences are minimal. `corrSmoothPar`: a numeric value between 0 and 1 which controls the smoothing of correlation plots. Its default value is controlled by the presence of `design`. See Details for further information. `corrScale`: either `"normalized"` (default) or `"each"`. See Details for further information. `singleFacet`: how should `ggplot2` should facet the profiles with 1-factor design and only one sample whose profile will be plotted. Can be `"none"` (default), `"wrap"` or `"grid"`. When `"none"`, no gridding is applied and design factors are distinguished by colour. With more than one design factors, the `multiFacet` option below is used. `multiFacet`: how should `ggplot2` should facet the profiles with 1-factor design and more than one samples whose profile will be plotted. Can be `"wrap"` (default) or `"grid"`. 2 or 3 (3rd would be colour) factor designs are faceted with `"grid"`. `singleFacetDirection`: if single facetting is requested, how should the panels be arranged, horizontally (default, `"horizontal"`) or vertically (`"vertical"`). `conf`: plot also confidence intervals using `geom_ribbon` in profile or correlation plots. `device`: the R plotting device to redirect the plots to. Can be `"x11"` (default), `"png", "jpg", "tiff", "bmp", "pdf", "ps"`. `outputDir`: the directory to place profiles when the plotting device is not `"x11"`. Defaults to `"."`. `outputBase`: the naming template for output files when the plotting device is not `"x11"`. The extensions `"_profile"` and `"_heatmap"` will be appended to distinguish each plot type. Leave `NULL` (default) for automatic filename generation.
`saveParams`	a named list which controls the information to be stored in the `recoup` output list object. The list has the following fields: `ranges`: set to `TRUE` (default) to store the `GRanges` object obtained from the BAM/BED files. Set to `FALSE` for not saving. Not applicable when input is of type BigWig. `coverage`: set to `TRUE` (default) to store the `Rle` list object obtained from the coverage calculations. Set to `FALSE` for not saving. `profile`: set to `TRUE` (default) to store the profile matrices exracted from coverage summarizations. Set to `FALSE` for not saving. It must be present when using the `recoup` output in the plotting functions `recoupProfile` and `recoupHeatmap`. `profilePlot`: set to `TRUE` (default) to store `ggplot` object containing the average coverage plot. Set to `FALSE` for not saving. Must be `TRUE` if you wish to use the `recoup` output later with `recoupPlot`. `heatmapPlot`: set to `TRUE` (default) to store `ComplexHeatmap` object containing the coverage heatmap plot. Set to `FALSE` for not saving. Must be `TRUE` if you wish to use the `recoup` output later with `recoupPlot`. `correlationPlot`: set to `TRUE` (default) to store `ggplot` object containing coverage correlation plot. Set to `FALSE` for not saving. Must be `TRUE` if you wish to use the `recoup` output later with `recoupPlot`. See the Details section for some additional information.
`kmParams`	a named list which controls the execution of k-means clustering using standard R base function `kmeans` otherwise. The list has the following fields: `k`: the number of clusters for k-means clustering. When `0` (default), no k-means clustering is performed. `nstart`: See `kmeans`. `algorithm`: See `kmeans`. `iterMax`: See the `iter.max` parameter in `kmeans`. `reference`: which profile to use as reference for the determination of clusters and ordering. The rest of the heatmaps will be ordered according to the reference clustering. It can be either a sample id or `NULL` (default). If the latter, all the profile matrices are merged into one big matrix and k-means clustering is performed on that matrix. The result of k-means clustering will be appended to `design` as an additional field. If `design` is `NULL`, it will be created and passed to the plotting functions.
`strandedParams`	a named list which controls how strand information will be treated (if present). The list has the following fields: `strand`: if set to `NULL` (default) then reads from both strands are used from the input BAM/BED files. If `"+"` or `"-"`, then only the respective strands are used. Not applicable for input of type BigWig. `ignoreStrand`: `TRUE` (default) or `FALSE`. Passed to the `ignore.strand` argument in the `findOverlaps` function used during coverage calculations.
`ggplotParams`	a named list with theme parameters passed to the `ggplot` function of the `ggplot2` package. See the documentation of `ggplot2` for further details. Only the parameters mentioned in the function call are used.
`complexHeatmapParams`	a named list with groups of parameters passed to the `Heatmap` function of the `ComplexHeatmap` package. The list has the following fields: `main`: `Heatmap` parameters applied to each non-split (according to `design`) heatmap. See the `recoup` function call for supported parameters and `Heatmap` for further details. `main`: `Heatmap` parameters applied to each split (according to `design`) heatmap. See the `recoup` function call for supported parameters and `Heatmap` for further details.
`bamParams`	BAM file read parameters passed to `BamFile`. See the related function. Currently this is not used.
`onTheFly`	Read short reads directly from BAM files when input contains paths to BAM files. In this case the storage of short reads in the output list object as a GRanges object is not possible and the final object becomes less reusable but the memory footprint is lower. Defaults to `FALSE`.
`localDb`	local path with the annotation database. See also `buildAnnotationDatabase`.
`rc`	fraction (0-1) of cores to use in a multicore system. It defaults to `NULL` (no parallelization).

Details

When input is a list, is should contain as many sublists as the number of samples. Each sublist must have at least the following fields:

id: a unique identifier for each sample which should not contain whitespaces and preferably no special characters.
file: the full path to the BAM, BED of BigWig file. If the path to the BAM is a hyperlink, the BAM file must be indexed. BigWigs are already indexed.
format: one of "bam", "bed" or "bigwig".

Additionally, each sublist may also contain the following fields:

name: a sample name which will appear in plots.
color: either an R color (see the colors function) or a hexadecimal color (e.g. "#FF0000").

When input is a text file, this should be strictly tab-delimited (no other delimiters like comma), should contain a header with the same names (case sensitive) as the sublist fields above (id, file, format are mandatory and name, color are optional).

When genome is not one of the supported organisms, it should be a text tab delimited file (only tabs supported) with a header line, or a data frame, where the basic BED field must be present, that means that at least "chromosome", "start", "end", a unique identifier column and "strand" must be present, \ preferably in this order. This file is read in a data frame and then passed to the makeGRangesFromDataFrame function from the GenomicRanges package which takes care of the rest. See also the makeGRangesFromDataFrame's documentation. When genome is one of the supported organisms, recoup takes care of the rest.

The version argument controls what annotation version is used (when using local annotation after having built a store with buildAnnotationStore or when downloading on the fly). When "auto", it will use the latest annotation version for the selected source. So, if source="ensembl", it will use the latest installed or available version for the specified organism based on information retrieved from the biomaRt package. For example, for organism="hg19", it will be 91 at the point where this manual is written. If source="refseq", recoup will either use the latest downloaded annotation according to a timestamp in the directory structure or download and use the latest tables from UCSC on the fly. If an annotation version does not exist, recoup will throw an error and exit.

When region is "tss", the curve and heatmap profiles are centered around the TSS of the (gene) regions provided with the genome argument, flanked accordind to the flank argument. The same applied for region="tes" where the plots are centered around the transcription end site. When region is "genebody", the profiles consist of two flanking parts (upstream of the TSS and downstream of the TES) and a middle part consisting of the gene body coverage profile. The latter is constructed by creating a fixed number of intervals (bins) along each gene and averaging the coverage of each interval. In some extreme cases (e.g. for small genes), the number of bins may be larger than the gene length. In these cases, a few zeros are distributed randomly across the number of bins to reach the predefined number of gene body intervals. When region is "custom" the behavior depends on the custom regions length. If it contains single-base intervals (e.g. ChIP-Seq peak centers), then the behavior is similar to the TSS behavior above. If it contains genomic intervals of equal or unequal size, the behavior is similar to the gene body case.

The fraction parameter controls the total fraction of both total reads and genomic regions to be used for profile creation. This means that the total reads for each sample are randomly downsampled to fraction*100% of the original reads and the same applied to the input genomic regions. This practice is followed by similar packages (like ngs.plot) and serves the purpose of a quick overview of how the actual profiles look before profiling the whole genome.

Regarding the orderBy parameters, for the options of the what parameter "sum" type of options order profiles according to i) the sum of coverages of all samples in each genomic region when orderBy$what="suma" or ii) the sum of coverages of sample n (e.g. 2) in each genomic region when orderBy$what="sumn" (e.g. orderBy$what="sum1"). The same apply for the "max" type of options but this time the ordering is performed according to the position of the highest coverage in each genomic profile. Ties in the position of highest coverage are broken randomly and sorting is performed with the default R sort. Similarly for "avg" type of options, the ordering is performed according to the average total coverage of a reference region. For the "hc" type of options, hierarchical clustering is performed on the selected (n) reference profile (e.g. orderBy$what="hc1") and this ordering is applied to the rest of the sample profiles. When what="none", no ordering is performed and the input order is used (genome argument). If any design is present through the design argument or k-means clustering is also performed (through the kmParams argument), the orderBy directives are applied to each sub-profile created by design or k-means clustering.

Regarding the flankBinSize field of binParams, it is used only when region="genebody" or region="custom" and the custom regions are not single-base regions. This happens as when the genomic regions to be profiled are single-base regions (e.g. TSSs or ChIP-Seq peak centers), these regions are merged with the flanking areas and alltogether form the main genomic region. In these cases, only the regionBinSize field value is used. Note that when type="rnaseq" or region="genebody" or region="custom" with non single-base regions the values of flankBinSize and regionBinSize offer a fine control over how the flanking and the main regions are presented in the profiles. For example, when flankBinSize=100 and regionBinSize=100 with a gene body profile plot, the outcome will look kind of "unrealistic" as the e.g. 2kb flanking regions will look very similar to the usually larger gene bodies. On the other hand if flankBinSize=50 and regionBinSize=200, this setting will create more "realistic" gene body profiles as the flanking regions will be squished and the gene body area will look expanded. Within the same parameter group is also interpolation. When working with reference regions of different lengths (e.g. gene bodies), it happens very often that their lengths are a little to a lot smaller than the number of bins into which they should be split and averaged in order to be able to create the average curve and heatmap profiles. recoup allows for dynamic resolutions by permitting to the user to set the number of bins into which genomic areas will be binned or by allowing a per-base resolution where possible. The interpolation parameter controls what happens in such cases. When "spline", the R function spline is used, with the default method, to produce a spline interpolation of the same size as the regionBinSize option and is used as the coverage for that region. When "linear", the procedure is the same as above but using approx. When "neighborhood", a number of NA values are distributed randomly across the small area coverage vector, excluding the first and the last two positions, in order to reach regionBinSize. Then, each NA position is filled with the mean value of the two values before and the two values after the NA position, with na.rm=TRUE. This method should be avoided when >20% of the values of the extended vector are NA's as it may cause a crash. However, it should be the most accurate one in the opposite case (few NA's). When "auto" (the default), a hybrid of "spline" and "neighborhood" is applied. If the NA's constitute more than 20% of the extended vector, "spline" is used, otherwise "neighborhood". None of the above is applied to regions of equal length as there is no need for that. Furthermore, the parameter binType within the same parameter group controls the type of bins that a genomic interval should be split to in order to effectively calculate realistic signals when signal="rpm". When "variable", the number of bins that each genomic interval is split to is proportional to the square root of its size (the square root smooths the region length distribution, otherwise many regions e.g. in the set of human genes/transcripts will end up in unit-size bins even though they can support larger resolutions). The final signal is interpolated to a length of regionBinSize or flankBinSize to produce the final plots. When "fixed", the genomic intervals are "pushed" to have regionBinSize or flankBinSize bins, but if the areas are not large enought, they may end-up to many unit-size bins which will inflate and oversmooth the signal. It may give better results if the regions where the profile is to be created are all large enough.

Regarding the usage of selector$id field, this requires some careful usage, as if the ids present there and the ids of the genome areas do not match, there will be no genomic regions left to calculate coverage profiles on and the program will crash.

Regarding the usage of the preprocessParams argument, the normalize field controls how the GRanges representing the reads extracted from BAM/BED files or the signals extracted from BigWig files will be normalized. When "none", no normalization is applied and external normalization is assumed. When "linear", all the library sizes are divided by the maximum one and a normalization factor is calculated for each sample. The coverage of this sample across the input genomic regions is then multiplied by this factor. When "downsample", all libraries are downsampled to the minimum library size among samples. When "sampleto", all libraries are downsampled to a fixed number of reads. The sampleTo field of preprocessParams tells recoup the fixed number of reads to downsample all libraries when preprocessParams$normalize="sampleTo". It defaults to 1 million reads (1e+6). The spliceAction field of preprocessParams is used to control the action to be taken in the presence of RNA-Seq spliced reads (implies type="rnaseq"). When "keep", no action is performed regarding the spliced reads (represented as very long reads spanning intronic regions in the GRanges object). When "remove", these reads are excluded from coverage calculations according to their length as follows: firstly the length distribution of all reads lengths (using the width function for GRanges) is calculated. Then the quantile defined by the field spliceRemoveQ of preprocessParams is calculated and reads above the length corresponding to this quantile are excluded. When "splice", then splice junction information inferred from CIGAR strings (if) present in the BAM files is used to splice the longer reads and calculate real coverages. This option is not available with BED files, however, BED files can contain pre-spliced reads using for example BEDTools for conversion. It should also be noted that in the case of BigWig files, only linear normalization is supported as there is no information on raw reads. The cleanLevel field controls what filtering will be applied to the raw reads read from BAM/BED files prior to producing the signal track. It can have four values: 0 for no read processing/filtering, (use reads as they are, no uniqueness and no removal of unlocalized regions and mitochondrial DNA reads, unless filtered by the user before using recoup), 1 for removing unlocalized regions (e.g. chrU, hap, random etc.), 2 for removing reads of level 1 plus mitochondrial reads (chrM) and 3 for removing reads of level 2 plus using unique reads only. The default is level 0 (no filtering).

Regarding the heatmapFactor option of plotParams, it controls the color scale of the heatmap as follows: the default value (1) causes the extremes of the heatmap colors to be linearly and equally distributed across the actual coverage profile values. If set smaller than 1, the the upper extreme of the coverage values (which by default maps to the upper color point) is multiplied by this factor and this new value is set as the upper color break (limit). This has the effect of decreasing the brightness of the heatmap as color is saturated before reaching the maximum coverage value. If set greater than 1, then the heatmap brightness is increased. Regarding the correlation option of plotParams, if TRUE then recoup calculates average coverage values for each reference region (row-wise in the profile matrices) instead of the average coverage in each base of the reference regions (column-wise in the profile matrices). This is particularly useful for checking whether total genome profiles for some biological factor/condition correlate with each other. This potential correlation is becoming even clearer when orderBy$what is not "none". Regarding the corrScale option of plotParams, it controls whether the average coverage curves over the set of reference genomic regions (one average coverage vale per genomic region, note the difference with the profiles where the coverage is calculated over the genomic locations themselves) should be normalized to a 0-1 scale or not. This is particularly useful when plotting data from different libraries (e.g. PolII and H3K27me1 occupancy over gene bodies) where other types of normalization (e.g. read downsampling cannot be applied). Regarding the corrSmoothPar option of plotParams, it controls the smoothing parameter for coverage correlation curves. If design is present, spline smoothing is applied (smooth.spline) with spar=0.5 else lowess smoothing is applied (lowess) with f=0.1. corrSmoothPar controls the spar and f respectively.

Regarding the usage of saveParams argument, this is useful for several purposes: one is for re-using recoup without re-reading BAM/BED/BigWig files. If the ranges are present in the input object to recoup, they are not re-calculated. If not stored, the memory/storage usage is reduced but the object can be used only for simply replotting the profiles using recoupProfile and/or recoupHeatmap functions.

As a note regarding parallel calculations, the number of cores assigned to recoup depends both on the number of cores and the available RAM in your system. The most RAM expensive part of recoup is currently the construction of binned profile matrices. If you have a lot of cores (e.g. 16) but less than 128Gb of RAM for this number of cores, you should avoid using all cores, especially with large BAM files. Half of them would be more appropriate.

Finally, the output list of recoup can be provided as input again to recoup with some input parameters changed. recoup will then automatically recognize what has been changed and recalculate some, all or none of the genomic region profiles, depending on what input parameters have changed. For example, if any of the ordering options change (e.g. from no profile ordering to k-means clustering), then no recalculations are performed and the process is very fast. If region binning is changed (binParams$flankBinSize or binParams$regionBinSize), then only profile matrices are recalculated and coverages are maintained. If any of the preprocessParams changes, this causes all object including the short reads to be reimported and profiles recalculated from the beginning.

Value

a named list with five members:

data: the input argument if it was a list or the resulting list from the unexported internal readConfig function, with the ranges, coverage and profile fields filled according to saveParams. This data member can be used again as an argument to recoup. The coverage and profile fields will be recalculated according to recoup parameters but the ranges will be resued if the input files are not changed.
design: the design data frame which is used to facet the profiles.
plots: the ggplot2 and/or Heatmap objects created by recoup.
callopts: the majority of recoup call parameters. Their storage serves the reuse of a recoup list object so that only certain elements of plots are recalculated.

Author(s)

Panagiotis Moulos

Examples

# Load some sample data
data("recoup_test_data",package="recoup")

# Note: the figures that will be produced will not look 
# realistic and will be "bumpy". This is because package
# size limitations posed by Bioconductor guidelines do not
# allow for a full test dataset. Have a look at the
# vignette on how to test with more realistic data.

# TSS high resolution profile with no design
test.tss <- recoup(
    test.input,
    design=NULL,
    region="tss",
    type="chipseq",
    genome=test.genome,
    flank=c(2000,2000),
    selector=NULL,
    rc=0.1
)

# Genebody low resolution profile with 2-factor design, 
# wide genebody and more narrow flanking
test.gb <- recoup(
    test.input,
    design=test.design,
    region="genebody",
    type="chipseq",
    genome=test.genome,
    flank=c(2000,2000),
    binParams=list(flankBinSize=50,regionBinSize=150),
    orderBy=list(what="hc1"),
    selector=NULL,
    rc=0.1
)

pmoulos/recoup documentation built on May 20, 2024, 2:22 a.m.