## Dec 2, 2015 changes ftp://ftp.ncbi.nlm.nih.gov/genomes/README.txt
## files moved to ftp://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/
read.ncbi.ftp <- function(org, filePattern="ptt$|rnt$", ftp ="genomes/archive/old_refseq/Bacteria", ...){
ftpdir <- paste("ftp://ftp.ncbi.nih.gov", ftp, org, sep="/")
# list files in directory
x <- ftpList( ftpdir )
files <- grep(filePattern, x$name, value=TRUE)
if(length(files) == 0){stop("No files matching ", filePattern, " found")}
# get file endings
fileTypes <- strsplit2(files, ".", 2, fixed=TRUE)
# CANNOT read these files
n0 <- which( fileTypes %in% c("asn", "gbk", "val", "GeneMark-2", "rpt"))
if(length(n0) > 0 ){
files <- files[-n0]
if(length(files) == 0){stop("Cannot read files ending in ", unique(fileTypes) )}
fileTypes <- fileTypes[-n0]
}
n <- length(files)
z <- vector("list", n)
## Coordinates
if(all( fileTypes %in% c("ptt", "rnt", "GeneMarkHMM-2", "Glimmer3", "Prodigal-2", "gff"))){
for(i in 1:n){
print(paste("Downloading", files[i]))
file <- paste(ftpdir, files[i], sep="/")
if( fileTypes[i] == "GeneMarkHMM-2"){
z[[i]] <- read.genemark( file, ... )
}else if(fileTypes[i] == "Glimmer3") {
z[[i]] <- read.glimmer( file, ... )
}else if(fileTypes[i] == "Prodigal-2") {
z[[i]] <- read.prodigal( file, ... )
}else if(fileTypes[i] == "gff") {
z[[i]] <- read.gff( file, ... )
}else{
z[[i]] <- read.ptt( file , ...)
# ADD feature type?
if (fileTypes[[i]] == "ptt") {
values(z[[i]])$feature <- "CDS"
}else{
values(z[[i]])$feature <- "RNA"
}
}
}
## COMBINE into single object
deflines <- unique( sapply(z, function(y) metadata(y)$defline))
## suppress warnings about combining different seq accessions
z <- suppressWarnings( do.call("c", z) )
# Sort by chromosome size AND START
n <- order(seqlengths(z), decreasing=TRUE)
seqlevels(z) <- seqlevels(z)[n]
z <- z[order( match(IRanges::as.vector(seqnames(z)), seqlevels(z)), start(z)), ]
# add new metadata
metadata(z) <- list(ftp = ftpdir, files=files, defline=deflines, date=Sys.Date() )
}
## AA sequences
if(all(fileTypes %in% c("faa"))){
for(i in 1:n){
print(paste("Downloading", files[i]))
file <- paste(ftpdir, files[i], sep="/")
z[[i]] <- readAAStringSet(file, ...)
}
z <- do.call("c", z)
}
## DNA sequences
if(all(fileTypes %in% c("fna", "frn", "ffn"))){
for(i in 1:n){
print(paste("Downloading", files[i]))
file <- paste(ftpdir, files[i], sep="/")
## feb 15, 2013 'read.DNAStringSet' is deprecated
z[[i]] <- readDNAStringSet(file, ...)
}
z <- do.call("c", z)
## check if single sequence (then use DNAString instead of DNAStringSet?)
# if(length(z)==1) z <- z[[1]]
}
## z is empty ? mixed file types???
# if(is.list(z) && all(sapply(z, is.null)) ) stop("Too many different file types: ", paste(unique(fileTypes), collapse=", "))
z
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.