Nothing
#' Load bed file as GRanges
#'
#' Wraps around \code{\link{import.bed}} and
#' tries to speed up loading with the
#' use of data.table. Supports gzip, gz, bgz and bed formats.
#' Also safer chromosome naming with the argument chrStyle
#' @param filePath The location of the bed file
#' @inheritParams matchSeqStyle
#' @importFrom data.table fread setDF
#' @importFrom tools file_ext
#' @importFrom rtracklayer import.bed
#' @return a \code{\link{GRanges}} object
#' @export
#' @family utils
#' @examples
#' # path to example CageSeq data from hg19 heart sample
#' cageData <- system.file("extdata", "cage-seq-heart.bed.bgz",
#' package = "ORFik")
#' fread.bed(cageData)
#'
fread.bed <- function(filePath, chrStyle = NULL) {
if (.Platform$OS.type == "unix") {
if (file.exists(filePath)) {
if (any(file_ext(filePath) %in% c("gzip", "gz", "bgz"))) {
bed <- bedToGR(setDF(
fread(cmd = paste("gunzip -c", filePath), sep = "\t")))
} else if (file_ext(filePath) == "bed"){
bed <- bedToGR(setDF(fread(filePath, sep = "\t")))
} else {
bed <- import.bed(con = filePath)
}
} else {stop("Filepath specified does not name existing file.")}
} else {
## NB: Windows user will have slower loading
bed <- import.bed(con = filePath)
}
return(matchSeqStyle(bed, chrStyle))
}
#' Custom bam reader
#'
#' Read in Bam file from either single end or paired end.
#' Safer combined version of \code{\link{readGAlignments}} and
#' readGAlignmentPairs that takes care of some common errors.\cr
#' If QNAMES of the aligned reads are from collapsed fasta files
#' (if the names are formated from collapsing in either
#' (ORFik, ribotoolkit or fastx)), the
#' bam file will contain a meta column called collapsed with the counts
#' of duplicates per read.\cr
#'
#' In the future will use a faster .bam loader for big .bam files in R.
#' @param path a character / data.table with path to .bam file. There
#' are 3 input file possibilities.
#' \itemize{
#' \item{single end : }{a character path (length 1)}
#' \item{paired end (1 file) : }{Either a character path (length of 2), where
#' path[2] is "paired-end", or a data.table
#' with 2 columns, forward = path & reverse = "paired-end"}
#' \item{paired end (2 files) : }{Either a character path (length of 2), where
#' path[2] is path to R2, or a data.table
#' with 2 columns, forward = path to R1 & reverse = path to R2.
#' (This one is not used often)}
#' }
#' @inheritParams matchSeqStyle
#' @inheritParams GenomicAlignments::readGAlignments
#' @param strandMode numeric, default 0. Only used for paired end bam files.
#' One of (0: strand = *, 1: first read of pair is +, 2: first read of pair is -).
#' See ?strandMode. Note: Sets default to 0 instead of 1, as readGAlignmentPairs uses 1.
#' This is to guarantee hits, but will also make mismatches of overlapping
#' transcripts in opposite directions.
#' @return a \code{\link{GAlignments}} or \code{\link{GAlignmentPairs}} object of bam file
#' @importFrom Rsamtools scanBam BamFile ScanBamParam
#' @export
#' @family utils
#' @examples
#' bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
#' readBam(bam_file, "UCSC")
readBam <- function(path, chrStyle = NULL, param = NULL, strandMode = 0) {
if (!(length(path) %in% c(1,2))) stop("readBam must have 1 or 2 bam files!")
if (is(path, "factor")) path <- as.character(path)
# If data.table path
if (is(path, "data.table")) {
if (path$reverse == "paired-end") {
message("ORFik reads this paired end bam as readGAlignmentPairs")
message(paste("strandMode =", strandMode, ". Update it if is wrong!"))
bam <- matchSeqStyle(readGAlignmentPairs(path$forward, param = param,
strandMode = strandMode), chrStyle)
if (length(bam) == 0)
stop(paste("File", path$forward,
"was read as paired-end file, but had 0 paired reads!"))
return(bam)
} else {
message("ORFik reads these split paired end bams as readGAlignments combination")
return(matchSeqStyle(c(readGAlignments(path$forward, param = param),
readGAlignments(path$reverse, param = param)), chrStyle))
}
}
# If character path
if (is(path, "character") & length(path) == 2) {
if (path[2] == "paired-end"){
message("ORFik reads paired end bam in as readGAlignmentPairs")
message(paste("strandMode =", strandMode, ". Update it if is wrong!"))
bam <- matchSeqStyle(readGAlignmentPairs(path[1], param = param,
strandMode = strandMode), chrStyle)
if (length(bam) == 0)
stop(paste("File", path[1],
"was read as paired-end file, but had 0 paired reads!"))
return(bam)
} else {
message("ORFik reads these split paired end bams as readGAlignments combination")
return(matchSeqStyle(c(readGAlignments(path[1], param = param),
readGAlignments(path[2], param = param)), chrStyle))
}
}
# else single end bam file
# Check if it is a collapsed reads format bam file
# Get qnames with the data (check first 2 rows)
headers <- unlist(scanBam(BamFile(path, yieldSize=2),
param = ScanBamParam(what = "qname")),
use.names = FALSE)
bam.is.collapsed <- all(seq_along(headers) %in%grep("^(>|>seq)\\d+(-|_x)\\d+$", headers))
if (bam.is.collapsed) {
headers <- unlist(scanBam(path, param = ScanBamParam(what = "qname")),
use.names = FALSE)
format.header <- ifelse(all(seq_along(headers) %in% grep("^>seq\\d+_x\\d+$", headers)),
"ribotoolkit",
"fastx")
if (format == "ribotoolkit") {
scores <- as.integer(gsub(".*_x", "", headers))
} else {
scores <- as.integer(gsub(".*-", "", headers))
}
bam <- matchSeqStyle(readGAlignments(path, param = param), chrStyle)
mcols(bam) <- DataFrame(collapsed = scores)
return(bam)
} else return(matchSeqStyle(readGAlignments(path, param = param), chrStyle))
}
#' Custom wig reader
#'
#' Given 2 wig files, first is forward second is reverse.
#' Merge them and return as GRanges object.
#' If they contain name reverse and forward, first and second order
#' does not matter, it will search for forward and reverse.
#'
#' @param path a character path to two .wig files, or a data.table
#' with 2 columns, (forward, filepath) and reverse, only 1 row.
#' @inheritParams matchSeqStyle
#' @importFrom rtracklayer import.wig
#' @return a \code{\link{GRanges}} object of the file/s
#' @family utils
#'
readWig <- function(path, chrStyle = NULL) {
if (is(path, "character")) {
if (length(path) != 2) stop("readWig must have 2 wig files,
one forward strand and one reverse!")
forwardPath <- grep("forward|fwd", path)
reversePath <- grep("reverse|rev", path)
if (length(forwardPath) == 1 & length(reversePath) == 1){
forwardIndex <- forwardPath
reverseIndex <- reversePath
}
forward <- import.wig(path[forwardIndex])
reverse <- import.wig(path[reverseIndex])
} else if (is(path, "data.table")) {
if (!is.null(path$forward)) {
forward <- import.wig(path$forward)
} else forward <- import.wig(path$filepath)
reverse <- import.wig(path$reverse)
}
strand(forward) <- "+"
strand(reverse) <- "-"
return(matchSeqStyle(c(forward, reverse), chrStyle))
}
#' Load GRanges object from .bedo
#'
#' .bedo is .bed ORFik, an optimized bed format for coverage reads with read lengths
#' .bedo is a text based format with columns (6 maximum):\cr
#' 1. chromosome\cr 2. start\cr 3. end\cr 4. strand\cr
#' 5. ref width (cigar # M's, match/mismatch total)\cr
#' 6. duplicates of that read\cr
#'
#' Positions are 1-based, not 0-based as .bed.
#' export with export.bedo
#' @param path a character, location on disc (full path)
#' @return GRanges object
#' @importFrom tools file_ext
#' @export
import.bedo <- function(path) {
if (file_ext(path) != "bedo") stop("import.bedo can only load .bedo files!")
dt <- fread(input = path)
if (is.null(dt$end)) dt$end <- dt$start
return(makeGRangesFromDataFrame(dt, keep.extra.columns = TRUE))
}
#' Load GAlignments object from .bedoc
#'
#' A much faster way to store, load and use bam files.\cr
#' .bedoc is .bed ORFik, an optimized bed format for coverage reads with
#' cigar and replicate number.\cr
#' .bedoc is a text based format with columns (5 maximum):\cr
#' 1. chromosome\cr 2. cigar: (cigar # M's, match/mismatch total)
#' \cr 3. start (left most position) \cr 4. strand (+, -, *)\cr
#' 5. score: duplicates of that read\cr
#'
#' Positions are 1-based, not 0-based as .bed.
#' export with export.bedo
#' @param path a character, location on disc (full path)
#' @return GAlignments object
#' @importFrom tools file_ext
#' @export
import.bedoc <- function(path) {
if (file_ext(path) != "bedoc")
stop("import.bedoc can only load .bedoc files!")
return(getGAlignments(fread(input = path, stringsAsFactors = TRUE)))
}
#' Load GRanges / GAlignments object from .ofst
#'
#' A much faster way to store, load and use bam files.\cr
#' .ofst is ORFik fast serialized object,
#' an optimized format for coverage reads with
#' cigar and replicate number. It uses the fst format as back-end:
#' \code{\link{fst-package}}.\cr A .ofst ribo seq file can compress the
#' information in a bam file from 5GB down to a few MB. This new files has
#' super fast reading time, only a few seconds, instead of minutes. It also has
#' random index access possibility of the file. \cr
#' .ofst is represented as a data.frane format with minimum 4 columns:\cr
#' 1. chromosome\cr 2. start (left most position) \cr 3. strand (+, -, *)\cr
#' 4. width (not added if cigar exists)\cr
#' 5. cigar (not needed if width exists):
#' (cigar # M's, match/mismatch total) \cr
#' 5. score: duplicates of that read\cr
#' 6. size: qwidth according to reference of read\cr\cr
#' If file is from \code{\link{GAlignmentPairs}},
#' it will contain a cigar1, cigar2 instead
#' of cigar and start1 and start2 instead of start
#'
#' Other columns can be named whatever you want and added to meta columns.
#' Positions are 1-based, not 0-based as .bed.
#' Import with import.ofst
#' @param file a path to a .ofst file
#' @inheritParams readBam
#' @return a GAlignment, GAlignmentPairs or GRanges object,
#' dependent of if cigar/cigar1 is defined in .ofst file.
#' @importFrom fst read_fst
#' @export
#' @examples
#' ## GRanges
#' gr <- GRanges("1:1-3:-")
#' tmp <- file.path(tempdir(), "path.ofst")
#' # export.ofst(gr, file = tmp)
#' # import.ofst(tmp)
#' ## GAlignment
#' # Make input data.frame
#' df <- data.frame(seqnames = "1", cigar = "3M", start = 1L, strand = "+")
#' ga <- ORFik:::getGAlignments(df)
#' # export.ofst(ga, file = tmp)
#' # import.ofst(tmp)
import.ofst <- function(file, strandMode = 0) {
df <- read_fst(file)
if ("cigar" %in% colnames(df)) {
getGAlignments(df)
} else if ("cigar1" %in% colnames(df)) {
getGAlignmentsPairs(df, strandMode)
} else getGRanges(df)
}
#' Load any type of sequencing reads
#'
#' Wraps around rtracklayer::import and tries to speed up loading with the
#' use of data.table. Supports gzip, gz, bgz compression formats.
#' Also safer chromosome naming with the argument chrStyle
#'
#' NOTE: For wig you can send in 2 files, so that it automaticly merges
#' forward and reverse stranded objects. You can also just send 1 wig file,
#' it will then have "*" as strand.
#' @param path a character path to file (1 or 2 files),
#' or data.table with 2 colums(forward&reverse)
#' or a GRanges/Galignment/GAlignmentPairs object etc.
#' If it is ranged object it will presume to be
#' already loaded, so will return the object as it is,
#' updating the seqlevelsStyle if given.
#' @inheritParams readBam
#' @importFrom tools file_ext
#' @importFrom tools file_path_sans_ext
#' @importFrom rtracklayer import
#' @return a \code{\link{GAlignments}}/\code{\link{GRanges}} object,
#' depending on input.
#' @export
#' @family utils
#' @examples
#' bam_file <- system.file("extdata", "ribo-seq.bam", package = "ORFik")
#' fimport(bam_file)
#' # Certain chromosome naming
#' fimport(bam_file, "NCBI")
#' # Paired end bam strandMode 1:
#' fimport(bam_file, strandMode = 1)
#' # (will have no effect in this case, since it is not paired end)
#'
fimport <- function(path, chrStyle = NULL, param = NULL, strandMode = 0) {
if (is(path, "data.table")) {
if (ncol(path) == 2 & colnames(path) == c("forward", "reverse")) {
path <- c(path$forward, path$reverse)
} else stop("When path is data.table,",
"it must have 2 columns (forward&reverse)")
} else if (is(path, "list")) {
if (!(all(lengths(path) %in% c(1,2))) | length(path) != 1) {
stop("When path is list,",
"it must have 1 or 2 elements (forward, reverse)")
}
}
if (is(path, "list") | is(path, "data.table") | is(path, "character")) {
path <- unlist(path)
path <- path[path != ""]
pairedEndBam <- FALSE
if (length(path) == 2) {
if (path[2] == "paired-end") {
pairedEndBam <- TRUE
path <- path[1]
}
}
}
if (is.character(path)) {
if (all(file.exists(path))) {
fext <- file_ext(path)
compressions <- c("gzip", "gz", "bgz", "zip")
areCompressed <- fext %in% compressions
fext[areCompressed] <- file_ext(file_path_sans_ext(path[areCompressed],
compression = FALSE))
if (length(path) == 2) { # Multiple file paths
if (all(fext %in% c("wig"))) {
return(readWig(path, chrStyle))
} else if (all(fext %in% c("bam"))) {
return(readBam(path, chrStyle, param, strandMode))
} else stop("only wig and valid bam format allowed for 2 files input!")
} else if (length(path) == 1) { # Only 1 file path given
if (fext == "bam") {
if (pairedEndBam) path <- c(path, "paired-end")
return(readBam(path, chrStyle, param, strandMode))
} else if (fext == "bed" |
file_ext(file_path_sans_ext(path,
compression = TRUE)) == "bed" |
file_ext(file_path_sans_ext(path, compression = FALSE))
== "bed") {
return(fread.bed(path, chrStyle))
} else if (fext == "bedo") {
return(matchSeqStyle(import.bedo(path), chrStyle))
} else if (fext == "bedoc") {
return(matchSeqStyle(import.bedoc(path), chrStyle))
} else if (fext == "ofst") {
return(matchSeqStyle(import.ofst(path, strandMode), chrStyle))
}else return(matchSeqStyle(import(path), chrStyle))
} else stop("fimport takes either 1 or 2 files!")
} else stop(paste(path, "does not exist as File/Files!"))
} else if (is.gr_or_grl(path) | is(path, "GAlignments") |
is(path, "GAlignmentPairs")) {
return(matchSeqStyle(path, chrStyle))
} else {
stop("path must be either a valid character",
" filepath or ranged object.")
}
}
#' Convenience wrapper for Rsamtools FaFile
#'
#' Get fasta file object, to find sequences in file.\cr
#' Will load and import file if necessarry.
#' @param faFile \code{\link{FaFile}}, BSgenome, fasta/index file path or an
#' ORFik \code{\link{experiment}}. This file is usually used to find the
#' transcript sequences from some GRangesList.
#' @importFrom Rsamtools FaFile
#' @importFrom methods is
#' @return a \code{\link{FaFile}} or BSgenome
#' @family utils
#' @export
#' @examples
#' # Some fasta genome with existing fasta index in same folder
#' path <- system.file("extdata", "genome.fasta", package = "ORFik")
#' findFa(path)
#'
findFa <- function(faFile) {
if (is.character(faFile)) {
if (file.exists(faFile)) {
return(FaFile(faFile))
} else {
stop("faFile does not name a valid fasta/index file")
}
} else if (is(faFile, "FaFile") || is(faFile, "BSgenome")) {
return(faFile)
} else if (is(faFile, "experiment")) {
return(FaFile(faFile@fafile))
}
stop("faFile must be FaFile, BSgenome, valid filePath, or ORFik experiment")
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.