Nothing
maxDists <- function(mat, idx=NA, N=1, exclude=rep(FALSE,nrow(mat)), include.center=TRUE){
## Return indices of N objects with a maximal sum of pairwise
## distances.
##
## * mat - square distance matrix
## * idx - starting indices; if is.na, starts with the object with the
## minimum median distance to all other objects.
## * N - total number of selections; length of idx is subtracted.
## * exclude - boolean vector indicating elements to exclude from the calculation.
## * include.center - includes the most central object in the output if TRUE
## TODO: unit tests
N <- min(N, nrow(mat))
m <- mat
m[exclude,] <- NA
if(length(idx) == 0 || is.na(idx)){
idx <- integer(0)
}
## add most central element
if(include.center){
## exclude rows corresponding to already selected objects
m[idx,] <- NA
idx <- c(idx, which.min(apply(m,1,median)))
}
remaining <- N - length(idx)
## accumulate a total of N elements
if(remaining > 0){
for(x in seq(remaining)){
m[idx,] <- NA
cols <- m[,idx,drop=FALSE]
idx <- c(idx, which.max(apply(cols,1,sum)))
}
}
## pairs <- combn(idx, 2)
## print(apply(pairs, 2, function(p) mat[p[1],p[2]]))
return(idx)
}
findOutliers <- function(mat, quant, cutoff){
## Outliers are defined as elements with edge length to the
## centermost element > cutoff.
##
## Input
## -----
##
## * mat - square matrix of distances
## * quant - given all pairwise distances x, calculate distance
## threshold as quantile(x, quant). Values closer to 0 are more
## stringent.
## * cutoff - an absolute cutoff overriding quant
##
## Value
## -----
##
## Returns a boolean vector corresponding to margin of mat; outliers
## have a value of TRUE.
##
if(missing(cutoff)){
cutoff <- quantile(mat[lower.tri(mat)], quant, na.rm=TRUE)
}
## find index of most central element
center <- which.min(apply(mat,1,median))
## distance from each element to most central element
dd <- mat[center,]
return(dd > cutoff)
}
## nodeTable <- function(taxData, tre, tax_ids){
## ## * taxData - list of taxa, nodes, ranks (output of getTaxonomyFromDB)
## ## * tre - ape phylo object read from a placefile (output of placeTree)
## ## * tax_ids - character vector of NCBI tax ids in tree order
## ## path from the root (first element of each vector) to each tip (last element)
## tipsToRoot <- .Call("seq_root2tip",
## tre$edge, length(tre$tip.label),
## tre$Nnode, PACKAGE = "ape")
## ## list of descendants of each node
## nodes <- descendants(tre, tipsToRoot)
## ## functionality of classify16s::nodemap here: generate a list
## ## describing parents, children, and descendant tax_ids at each node
## ## restrict taxTable to a subset of ranks defined elsewhere
## rankNames <- taxData$ranks
## taxTable <- taxData$taxa[tax_ids,rankNames]
## ## add ancestry of the root node
## edges <- rbind(tre$edge, c(NA, length(tre$tip.label)+1))
## ## now the parent of each node i is given by parents[i]
## parents <- edges[order(edges[,2]),1]
## nmap <- lapply(
## seq_along(nodes),
## function(i){
## leaves <- nodes[[i]]
## if(length(leaves)==0){
## leaves <- i
## }
## L1 <- list(
## i=i,
## parent=parents[i],
## children=leaves,
## nChildren=length(leaves),
## tax_id=setdiff(unique(tax_ids[leaves]),NA)
## )
## ## add set of taxa at each rank
## L2 <- lapply(
## rankNames,
## function(r){setdiff(unique(taxTable[[r]][leaves]),NA)}
## )
## names(L2) <- rankNames
## c(L1,L2)
## }
## )
## ## functionality of classify16s:::nodeNames here: define
## ## rank-specific taxonomic names for each node
## ## define ranks to include
## ## singleRankOk lists ranks that are permitted to have a single value;
## ## this is useful to prevent confusion generated by rarely-defined
## ## ranks (such as species group) while allowing higher-level ranks
## ## (such as kingdom) to have the same value in all rows.
## singleRankOk=c('superkingdom','kingdom', 'superphylum','phylum','class')
## ndefined <- apply(taxTable, 2, function(x) length(setdiff(unique(x),NA)))
## excludedRanks <- setdiff(names(ndefined)[ndefined < 2], singleRankOk)
## ## get rid of 'no_rank' while we're at it
## ranks <- setdiff(rankNames, c(excludedRanks,'no_rank'))
## ## taxonomic names corresponding to tax_ids
## taxnames <- taxData$nodes$tax_name
## names(taxnames) <- taxData$nodes$tax_id
## ## name each node and return a data.frame
## nnames <- do.call(rbind, lapply(nmap,
## function(x){
## nodeNamer(x, taxnames, ranks)
## }))
## ## nnames lacks mapping to pplacer node numbers; need to provide
## ## this using pmap
## pmap <- placeMap(tre)
## nnames$at <- as.integer(pmap$pplacer[match(nnames$node, pmap$node)])
## return(nnames)
## }
## nodeNamer <- function(node, taxnames, ranks){
## ## Define a name for the given node.
## ##
## ## Input
## ## -----
## ## * node - a list with named elements ...
## ## * taxnames - named character vector of taxonomic names (names are tax_ids)
## ## * ranks - character vector of ranks to consider in constructing the name
## thisNode <- node[ranks]
## stopifnot(all(unlist(thisNode) %in% names(taxnames)))
## nranks <- length(ranks)
## ## positions along ranks with a single taxon assignment
## oneName <- which(sapply(thisNode,length) == 1)
## tax_ids <- unlist(thisNode)[ranks]
## if(length(oneName) > 0){
## ##if(max(oneName) == nranks){
## ## the most specific rank has only one tax_id - we are essentially done
## tab <- data.frame(
## node=rep(node$i, nranks),
## rank=ranks,
## thisRank=ranks,
## tax_id=tax_ids,
## tax_name=taxnames[tax_ids]
## )
## if(max(oneName) < length(ranks)){
## ## restrict to ranks with a single taxon and assign names for this
## ## subset; fill in lower ranks with the most specific
## ## classification available.
## mostSpecific <- max(oneName)
## ii <- seq(mostSpecific + 1, length(ranks))
## tab$thisRank[ii] <- tab$thisRank[mostSpecific]
## tab$tax_id[ii] <- tab$tax_id[mostSpecific]
## tab$tax_name[ii] <- tab$tax_name[mostSpecific]
## }
## }else{
## ## not even the most general rank is uniquely defined; here we punt
## thisRank <- ranks[1]
## tab <- data.frame(
## node=rep(node$i, nranks),
## rank=ranks,
## thisRank=rep(thisRank, nranks),
## tax_id=rep(paste(node[[thisRank]], collapse=' '), nranks),
## tax_name=rep(paste(node[[thisRank]], collapse=' '), nranks)
## )
## }
## return(tab)
## }
refpkgContents <- function(path, manifest='CONTENTS.json'){
## Read the file `manifest` from a refpackage located at `path` and
## return a list containing the package contents.
## TODO: import(rjson) in NAMESPACE fails - how do I use this
## package properly?
library(rjson)
contents <- fromJSON(file=file.path(path, manifest))
## TODO: validate md5 checksums
files <- lapply(file.path(path, contents$files), normalizePath)
names(files) <- names(contents$files)
contents$files <- files
return(contents)
}
taxonomyFromRefpkg <- function(path, seqnames, lowest_rank=NA){
## Construct a data.frame providing the lineage of each sequence
## described in seq_info.
##
## Input
## -----
##
## * path - path to a refpkg directory
## * seqnames - optional character vector of sequence names. If
## provided, determines the order of $taxTab
## * lowest_rank - name of the most specific (ie, rightmost) rank to
## include. Default is the name of the rightmost column in
## refpkg_contents$taxonomy
##
## Value
## -----
##
## A list with the following elements
## * taxNames - a named character vector of taxonomic names (names
## are tax_ids)
## * taxTab - a data.frame in which each row corresponds to a
## reference sequence and contains a tax_id followed by the
## corresponding lineage (columns are root...lowest_rank)
first_rank <- 'root'
contents <- refpkgContents(path)
## taxonomic assignments of reference sequences
seq_info <- read.csv(contents$files$seq_info, as.is=TRUE)
## TODO: taxtable.py should represent Non/NA as ,, not ,"",
taxonomy <- read.csv(contents$files$taxonomy, colClasses='character', na.strings=c(""))
taxNames <- taxonomy$tax_name
names(taxNames) <- taxonomy$tax_id
## this merge should preserve the order of seq_info
taxTab <- merge(seq_info[,c('seqname','tax_id')], taxonomy, sort=FALSE)
rownames(taxTab) <- taxTab[['seqname']]
## define the columns to be included as those bounded by first_rank
## and lowest_rank
if(is.na(lowest_rank)){
lowest_rank <- colnames(taxTab)[ncol(taxTab)]
}
cols <- do.call(seq,
lapply(c(first_rank,lowest_rank),
function(n) match(n, colnames(taxTab)))
)
taxTab <- cbind(tax_id=taxTab$tax_id,
taxTab[,cols],
stringsAsFactors = FALSE)
if(!missing(seqnames)){
taxTab <- taxTab[match(seqnames, rownames(taxTab)),]
}
## sanity checks for taxTab:
## will fail if not all sequences in seq_info have corresponding
## entries in taxonomy table
if(any(is.na(taxTab$tax_id))){
stop('Every value of tax_id in the reference package seq_info.csv must be represented in taxonomy.csv')
}
return(list(taxNames=taxNames, taxTab=taxTab))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.