Nothing
### =========================================================================
### Seqinfo objects
### -------------------------------------------------------------------------
###
### A Seqinfo object is a table-like object that contains basic information
### about a set of genomic sequences. The table has 1 row per sequence and
### 1 column per sequence attribute. Currently the only attributes are the
### length, circularity flag, and genome provenance (e.g. hg19) of the
### sequence, but more attributes might be added in the future as the need
### arises.
###
setClass("Seqinfo",
representation(
seqnames="character",
seqlengths="integer",
is_circular="logical",
genome="character"
)
)
### NOTE: Other possible (maybe better) names: GSeqinfo or GenomicSeqinfo
### for Genome/Genomic Sequence info.
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Getters
###
setMethod("seqnames", "Seqinfo", function(x) x@seqnames)
setMethod("names", "Seqinfo", function(x) seqnames(x))
setMethod("length", "Seqinfo", function(x) length(seqnames(x)))
setMethod("seqlevels", "Seqinfo", function(x) seqnames(x))
setMethod("seqlengths", "Seqinfo",
function(x)
{
ans <- x@seqlengths
names(ans) <- seqnames(x)
ans
}
)
setMethod("isCircular", "Seqinfo",
function(x)
{
ans <- x@is_circular
names(ans) <- seqnames(x)
ans
}
)
setMethod("genome", "Seqinfo",
function(x)
{
ans <- x@genome
names(ans) <- seqnames(x)
ans
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Validity
###
.valid.Seqinfo.seqnames <- function(x_seqnames, what="'seqnames(x)'")
{
if (!is.character(x_seqnames))
return(paste0(what, " must be a character vector"))
if (!is.null(names(x_seqnames)))
return(paste0(what, " must be unnamed"))
if (any(x_seqnames %in% c(NA, "")))
return(paste0(what, " cannot contain NAs or empty strings (\"\")"))
if (anyDuplicated(x_seqnames))
return(paste0(what, " cannot contain duplicated sequence names"))
NULL
}
### Not really checking the slot itself but the value returned by the
### slot accessor.
.valid.Seqinfo.seqlengths <- function(x)
{
x_seqlengths <- seqlengths(x)
if (!is.integer(x_seqlengths)
|| length(x_seqlengths) != length(x)
|| !identical(names(x_seqlengths), seqnames(x)))
return("'seqlengths(x)' must be an integer vector of the length of 'x' and with names 'seqnames(x)'")
if (any(x_seqlengths < 0L, na.rm=TRUE))
return("'seqlengths(x)' contains negative values")
NULL
}
### Not really checking the slot itself but the value returned by the
### slot accessor.
.valid.Seqinfo.isCircular <- function(x)
{
x_is_circular <- isCircular(x)
if (!is.logical(x_is_circular)
|| length(x_is_circular) != length(x)
|| !identical(names(x_is_circular), seqnames(x)))
return("'isCircular(x)' must be a logical vector of the length of 'x' and with names 'seqnames(x)'")
NULL
}
### Not really checking the slot itself but the value returned by the
### slot accessor.
.valid.Seqinfo.genome <- function(x)
{
x_genome <- genome(x)
if (!is.character(x_genome)
|| length(x_genome) != length(x)
|| !identical(names(x_genome), seqnames(x)))
return("'genome(x)' must be a character vector of the length of 'x' and with names 'seqnames(x)'")
NULL
}
.valid.Seqinfo <- function(x)
{
c(.valid.Seqinfo.seqnames(seqnames(x)),
.valid.Seqinfo.seqlengths(x),
.valid.Seqinfo.isCircular(x),
.valid.Seqinfo.genome(x))
}
setValidity2("Seqinfo", .valid.Seqinfo)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Constructor
###
### The .normarg_*() helper functions below do only partial checking of the
### arguments. The full validation is performed by the validity method when
### new() is called.
###
.make_Seqinfo_from_genome <- function(genome)
{
if (!isSingleString(genome) || genome == "")
stop("'genome' must be a single non-empty string")
NCBI_assemblies <- registered_NCBI_assemblies()
if (genome %in% NCBI_assemblies[ , "assembly"] ||
genome %in% NCBI_assemblies[ , "assembly_accession"])
return(getChromInfoFromNCBI(genome, as.Seqinfo=TRUE))
UCSC_genomes <- registered_UCSC_genomes()
if (genome %in% UCSC_genomes[ , "genome"])
return(getChromInfoFromUCSC(genome, as.Seqinfo=TRUE))
stop(wmsg("\"", genome, "\" is not a registered NCBI assembly ",
"or UCSC genome (use registered_NCBI_assemblies() or ",
"registered_UCSC_genomes() to list the NCBI or UCSC ",
"assemblies/genomes currently registered in the ",
"GenomeInfoDb package)"))
}
### Make sure this always returns an *unnamed* character vector.
.normarg_seqlevels <- function(seqlevels)
{
if (is.null(seqlevels))
return(character(0))
seqlevels <- unname(seqlevels)
errmsg <- .valid.Seqinfo.seqnames(seqlevels, what="supplied 'seqlevels'")
if (!is.null(errmsg))
stop(errmsg)
seqlevels
}
### Make sure this always returns an *unnamed* integer vector.
.normarg_seqlengths <- function(seqlengths, seqnames)
{
if (identical(seqlengths, NA))
return(rep.int(NA_integer_, length(seqnames)))
if (!is.vector(seqlengths))
stop("supplied 'seqlengths' must be a vector")
if (length(seqlengths) != length(seqnames))
stop("length of supplied 'seqlengths' must equal ",
"the number of sequences")
if (!is.null(names(seqlengths))
&& !identical(names(seqlengths), seqnames))
stop("when the supplied 'seqlengths' vector is named, ",
"the names must match the seqnames")
if (is.logical(seqlengths)) {
if (all(is.na(seqlengths)))
return(as.integer(seqlengths))
stop("bad supplied 'seqlengths' vector")
}
if (!is.numeric(seqlengths))
stop("bad supplied 'seqlengths' vector")
if (is.integer(seqlengths)) {
seqlengths <- unname(seqlengths)
} else {
seqlengths <- as.integer(seqlengths)
}
if (any(seqlengths < 0L, na.rm=TRUE))
stop("supplied 'seqlengths' contains negative values")
seqlengths
}
### Make sure this always returns an *unnamed* logical vector.
.normarg_isCircular <- function(isCircular, seqnames)
{
if (identical(isCircular, NA))
return(rep.int(NA, length(seqnames)))
if (!is.vector(isCircular))
stop("supplied 'isCircular' must be a vector")
if (length(isCircular) != length(seqnames))
stop("length of supplied 'isCircular' must equal ",
"the number of sequences")
if (!is.null(names(isCircular))
&& !identical(names(isCircular), seqnames))
stop("when the supplied circularity flags are named, ",
"the names must match the seqnames")
if (!is.logical(isCircular))
stop("bad supplied 'isCircular' vector")
unname(isCircular)
}
### Make sure this always returns an *unnamed* character vector parallel to
### 'seqnames'.
.normarg_genome <- function(genome, seqnames)
{
if (!(is.vector(genome) || is.factor(genome)))
stop(wmsg("supplied 'genome' must be a vector or factor"))
if (!is.character(genome))
genome <- as.character(genome)
if (length(genome) == 0L) {
if (length(seqnames) == 0L)
return(unname(genome))
stop(wmsg("supplied 'genome' vector is empty"))
}
## The most common situation is that the supplied 'genome' vector contains
## a single (possibly NA) unique value. Note that, in that case, the names
## on 'genome' are ignored.
ugenome <- unique(genome)
if (length(ugenome) == 1L)
return(rep.int(ugenome, length(seqnames)))
if (!is.null(names(genome))) {
if (identical(names(genome), seqnames))
return(unname(genome))
stop(wmsg("when the supplied 'genome' vector contains more ",
"than one distinct value, the names on it must be ",
"identical to the seqlevels of the object"))
}
if (length(genome) == length(seqnames))
return(genome)
if (length(genome) != 1L)
stop(wmsg("when length of supplied 'genome' vector is not 1, ",
"then it must equal the number of sequences"))
rep.int(genome, length(seqnames))
}
Seqinfo <- function(seqnames=NULL, seqlengths=NA, isCircular=NA, genome=NA)
{
if (is.null(seqnames)
&& identical(seqlengths, NA)
&& identical(isCircular, NA)
&& isSingleString(genome))
return(.make_Seqinfo_from_genome(genome))
seqnames <- .normarg_seqlevels(seqnames)
seqlengths <- .normarg_seqlengths(seqlengths, seqnames)
is_circular <- .normarg_isCircular(isCircular, seqnames)
genome <- .normarg_genome(genome, seqnames)
new("Seqinfo", seqnames=seqnames,
seqlengths=seqlengths,
is_circular=is_circular,
genome=genome)
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Updating old Seqinfo objects
###
setMethod("updateObject", "Seqinfo",
function(object, ..., verbose=FALSE)
{
if (verbose)
message("updateObject(object = 'Seqinfo')")
if (!is(try(object@genome, silent=TRUE), "try-error"))
return(genome)
as(Seqinfo(seqnames=object@seqnames,
seqlengths=object@seqlengths,
isCircular=object@is_circular),
class(object))
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Subsetting
###
### Support subsetting only by name.
setMethod("[", "Seqinfo",
function(x, i, j, ..., drop=TRUE)
{
if (!missing(j) || length(list(...)) > 0L)
stop("invalid subsetting")
if (missing(i))
return(x)
if (!is.character(i))
stop("a Seqinfo object can be subsetted only by name")
if (!identical(drop, TRUE))
warning("'drop' argument is ignored when subsetting ",
"a Seqinfo object")
x_names <- names(x)
i2names <- match(i, x_names)
new_seqlengths <- unname(seqlengths(x))[i2names]
new_isCircular <- unname(isCircular(x))[i2names]
new_genome <- unname(genome(x))[i2names]
Seqinfo(seqnames=i, seqlengths=new_seqlengths,
isCircular=new_isCircular, genome=new_genome)
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Setters
###
setReplaceMethod("seqnames", "Seqinfo",
function(x, value)
{
value <- .normarg_seqlevels(value)
if (length(value) != length(x))
stop("length of supplied 'seqnames' vector must equal ",
"the number of sequences")
x@seqnames <- value
x
}
)
setReplaceMethod("names", "Seqinfo",
function(x, value) `seqnames<-`(x, value)
)
setReplaceMethod("seqlevels", "Seqinfo",
function(x,
pruning.mode=c("error", "coarse", "fine", "tidy"),
value)
{
pruning.mode <- match.arg(pruning.mode)
if (pruning.mode != "error")
warning("'pruning.mode' is ignored in \"seqlevels<-\" method ",
"for Seqinfo objects")
new2old <- getSeqlevelsReplacementMode(value, seqlevels(x))
if (identical(new2old, -3L)) {
## "renaming" mode
seqnames(x) <- value
return(x)
}
if (identical(new2old, -2L) || identical(new2old, -1L)) {
## "subsetting" mode
return(x[value])
}
new_seqlengths <- unname(seqlengths(x))[new2old]
new_isCircular <- unname(isCircular(x))[new2old]
new_genome <- unname(genome(x))[new2old]
Seqinfo(seqnames=value, seqlengths=new_seqlengths,
isCircular=new_isCircular, genome=new_genome)
}
)
setReplaceMethod("seqlengths", "Seqinfo",
function(x, value)
{
x@seqlengths <- .normarg_seqlengths(value, seqnames(x))
x
}
)
setReplaceMethod("isCircular", "Seqinfo",
function(x, value)
{
x@is_circular <- .normarg_isCircular(value, seqnames(x))
x
}
)
setReplaceMethod("genome", "Seqinfo",
function(x, value)
{
x@genome <- .normarg_genome(value, seqnames(x))
x
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Coercion
###
### S3/S4 combo for as.data.frame.Vector
as.data.frame.Seqinfo <- function(x, row.names=NULL, optional=FALSE, ...)
{
if (!is.null(row.names))
warning("supplied 'row.names' value was ignored")
if (!identical(optional, FALSE))
warning("supplied 'optional' value was ignored")
if (length(list(...)) != 0L)
warning("extra arguments were ignored")
data.frame(seqlengths=unname(seqlengths(x)),
isCircular=unname(isCircular(x)),
genome=unname(genome(x)),
row.names=seqnames(x),
check.names=FALSE,
stringsAsFactors=FALSE)
}
setMethod("as.data.frame", "Seqinfo", as.data.frame.Seqinfo)
.from_DataFrame_to_Seqinfo <- function(from)
{
if (!is.data.frame(from) && !is(from, "DataFrame"))
stop("'from' must be a data.frame or DataFrame object")
from_colnames <- colnames(from)
## Extract seqnames.
if ("seqnames" %in% from_colnames) {
ans_seqnames <- from[ , "seqnames"]
} else if ("seqlevels" %in% from_colnames) {
ans_seqnames <- from[ , "seqlevels"]
} else {
ans_seqnames <- rownames(from)
seqnames_as_ints <- suppressWarnings(as.integer(ans_seqnames))
if (!any(is.na(seqnames_as_ints))
&& all(seqnames_as_ints == ans_seqnames))
stop("no sequence names found in input")
}
if (!is.character(ans_seqnames))
ans_seqnames <- as.character(ans_seqnames)
## Extract seqlengths.
if ("seqlengths" %in% from_colnames) {
ans_seqlengths <- from[ , "seqlengths"]
} else {
ans_seqlengths <- rep.int(NA_integer_, nrow(from))
}
if (!is.integer(ans_seqlengths))
ans_seqlengths <- as.integer(ans_seqlengths)
## Extract isCircular.
if ("isCircular" %in% from_colnames) {
ans_isCircular <- from[ , "isCircular"]
} else if ("is_circular" %in% from_colnames) {
ans_isCircular <- from[ , "is_circular"]
} else {
ans_isCircular <- rep.int(NA, nrow(from))
}
if (!is.logical(ans_isCircular))
ans_isCircular <- as.logical(ans_isCircular)
## Extract genome.
if ("genome" %in% from_colnames) {
ans_genome <- from[ , "genome"]
} else {
ans_genome <- rep.int(NA_character_, nrow(from))
}
if (!is.character(ans_genome))
ans_genome <- as.character(ans_genome)
Seqinfo(seqnames=ans_seqnames, seqlengths=ans_seqlengths,
isCircular=ans_isCircular, genome=ans_genome)
}
setAs("data.frame", "Seqinfo", .from_DataFrame_to_Seqinfo)
setAs("DataFrame", "Seqinfo", .from_DataFrame_to_Seqinfo)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Display
###
### S3/S4 combo for summary.Seqinfo
summary.Seqinfo <- function(object, ...)
{
## nb of sequences
object_len <- length(object)
if (object_len == 0L)
return("no sequences")
ans <- c(object_len, " sequence")
if (object_len > 1L)
ans <- c(ans, "s")
## circularity
circ_count <- sum(isCircular(object), na.rm=TRUE)
if (circ_count != 0L)
ans <- c(ans, " (", circ_count, " circular)")
## genomes
ugenomes <- unique(genome(object))
genome_count <- length(ugenomes)
if (genome_count == 1L) {
if (is.na(ugenomes))
ans <- c(ans, " from an unspecified genome")
else
ans <- c(ans, " from ", ugenomes, " genome")
} else {
if (genome_count > 3L)
ugenomes <- c(ugenomes[1:2], "...")
genomes_in1string <- paste0(ugenomes, collapse=", ")
ans <- c(ans, " from ", genome_count, " genomes ",
"(", genomes_in1string, ")")
}
## seqlengths
seqlengths <- seqlengths(object)
if (all(is.na(seqlengths)))
ans <- c(ans, "; no seqlengths")
paste0(ans, collapse="")
}
setMethod("summary", "Seqinfo", summary.Seqinfo)
### cat(.showOutputAsCharacter(x), sep="\n") is equivalent to show(x).
.showOutputAsCharacter <- function(x)
{
tmp <- tempfile()
sink(file=tmp, type="output")
show(x)
sink(file=NULL)
readLines(tmp)
}
.compactDataFrame <- function(x)
{
head_nrow <- get_showHeadLines()
tail_nrow <- get_showTailLines()
max_nrow <- head_nrow + tail_nrow + 1L
if (nrow(x) <= max_nrow)
return(x)
head <- head(x, n=head_nrow)
tail <- tail(x, n=tail_nrow)
dotrow <- rep.int("...", ncol(x))
names(dotrow) <- colnames(x)
dotrow <- data.frame(as.list(dotrow),
row.names="...",
check.names=FALSE,
stringsAsFactors=FALSE)
## Won't handle properly the situation where one row in 'head' or 'tail'
## happens to be named "...".
rbind(head, dotrow, tail)
}
### Should work properly on "narrow" data frames. Untested on data frames
### that are wider than the terminal.
showCompactDataFrame <- function(x, rownames.label="", left.margin="")
{
compactdf <- .compactDataFrame(x)
label_nchar <- nchar(rownames.label)
if (label_nchar != 0L)
row.names(compactdf) <- format(row.names(compactdf), width=label_nchar)
showme <- .showOutputAsCharacter(compactdf)
if (label_nchar != 0L)
substr(showme[1L], 1L, label_nchar) <- rownames.label
cat(paste0(left.margin, showme), sep="\n")
}
setMethod("show", "Seqinfo",
function(object)
{
cat(class(object), " object with ", summary(object), ":\n", sep="")
if (length(object) == 0L)
return(NULL)
showCompactDataFrame(as.data.frame(object),
rownames.label="seqnames", left.margin=" ")
}
)
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### Combining
###
### Why no "c" or "rbind" method for Seqinfo objects?
### 'c(x, y)' would be expected to just append the rows in 'y' to the rows in
### 'x' resulting in an object of length 'length(x) + length(y)'. But that
### would tend to break the constraint that the seqnames of a Seqinfo object
### must be unique.
### So what we really need is the ability to "merge" Seqinfo objects, that is,
### if a row in 'x' has the same seqname as a row in 'y', then the 2 rows must
### be merged in a single row before it's put in the result. If 2 rows cannot
### be merged because they contain incompatible information (e.g. different
### seqlengths or different circularity flags), then an error must be raised.
###
.Seqinfo.mergexy <- function(x, y)
{
ans_seqnames <- union(seqnames(x), seqnames(y))
ans_genome <- mergeNamedAtomicVectors(genome(x), genome(y),
what=c("sequence", "genomes"))
ans_seqlengths <- mergeNamedAtomicVectors(seqlengths(x), seqlengths(y),
what=c("sequence", "seqlengths"))
ans_is_circular <- mergeNamedAtomicVectors(isCircular(x), isCircular(y),
what=c("sequence", "circularity flags"))
common_seqnames <- intersect(seqnames(x), seqnames(y))
x_proper_seqnames <- setdiff(seqnames(x), common_seqnames)
y_proper_seqnames <- setdiff(seqnames(y), common_seqnames)
if (length(x_proper_seqnames) != 0L && length(y_proper_seqnames) != 0L) {
if (length(common_seqnames) == 0L) {
msg <- c("The 2 combined objects have no sequence levels in ",
"common. (Use\n suppressWarnings() to suppress this ",
"warning.)")
warning(msg)
} else if (any(is.na(genome(x)[common_seqnames]))
|| any(is.na(genome(y)[common_seqnames]))) {
msg <- c("Each of the 2 combined objects has sequence levels ",
"not in the other:\n",
" - in 'x': ",
paste(x_proper_seqnames, collapse=", "), "\n",
" - in 'y': ",
paste(y_proper_seqnames, collapse=", "), "\n",
" Make sure to always combine/compare objects based on ",
"the same reference\n genome (use suppressWarnings() ",
"to suppress this warning).")
warning(msg)
}
}
Seqinfo(seqnames=ans_seqnames, seqlengths=ans_seqlengths,
isCircular=ans_is_circular, genome=ans_genome)
}
.Seqinfo.merge <- function(...)
{
args <- unname(list(...))
## Remove NULL elements...
arg_is_null <- sapply(args, is.null)
if (any(arg_is_null))
args[arg_is_null] <- NULL # ... by setting them to NULL!
if (length(args) == 0L)
return(Seqinfo())
x <- args[[1L]]
if (length(args) == 1L)
return(x)
args <- args[-1L]
if (!all(sapply(args, is, class(x))))
stop("all arguments in must be ", class(x), " objects (or NULLs)")
for (y in args)
x <- .Seqinfo.mergexy(x, y)
x
}
### These methods should not be called with named arguments: this tends to
### break dispatch!
setMethod("merge", c("Seqinfo", "missing"),
function(x, y, ...) .Seqinfo.merge(x, ...)
)
setMethod("merge", c("missing", "Seqinfo"),
function(x, y, ...) .Seqinfo.merge(y, ...)
)
setMethod("merge", c("Seqinfo", "NULL"),
function(x, y, ...) .Seqinfo.merge(x, ...)
)
setMethod("merge", c("NULL", "Seqinfo"),
function(x, y, ...) .Seqinfo.merge(y, ...)
)
setMethod("merge", c("Seqinfo", "Seqinfo"),
function(x, y, ...) .Seqinfo.merge(x, y, ...)
)
setMethod("intersect", c("Seqinfo", "Seqinfo"), function(x, y) {
merge(x, y)[intersect(seqnames(x), seqnames(y))]
})
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### checkCompatibleSeqinfo()
###
checkCompatibleSeqinfo <- function(x, y) merge(seqinfo(x), seqinfo(y))
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.