#' Init uORFome pipeline
#'
#' Make directory structure for orf finding, create database
#' assign variables and validate input data.
#'
#' Debug possibility: \cr
#' If you get this error: \cr
#' Error in bmRequest(request = request, verbose = verbose) :\cr
#' Internal Server Error (HTTP 500). \cr
#' Rerun and it should work, this is a port issue.
#' @param mainPath folder for uORFome to put results
#' @param df.rfp ORFik experiment of Ribo-seq
#' @param df.rna ORFik experiment of RNA-seq,
#' set to NULL if you don't have RNA-seq
#' @param df.cage ORFik experiment of CAGE,
#' set to NULL if you don't have CAGE.
#' @param organism scientific name of organism,
#' like Homo sapiens, Danio rerio, etc.
#' @param biomart default "ensembl", get gene symbols and GO terms for uORF genes. Will
#' be automaticly detected by organism name in ensembl database.
#' Set to NULL if you don't want to check Gene symbols and GO terms.
#' @param mode character, default: "uORF". alternative "aCDS". Do you want to predict
#' on uORFs or artificial CDS. if "aCDS" will run twice once for whole length CDS and one for
#' truncated CDS to validate model works for short ORFs. "CDS" is option to predict on
#' whole CDS.
#' @param startCodons.cds.allowed character, default same as startCodons argument.
#' Which start codons can the CDS you train on have ?
#' @param stopCodons.cds.allowed character, default same as stopCodons argument
#' Which stop codons can the CDS you train on have ?
#' @param features features to train model on, any of the features created
#' during ORFik::computeFeatures, default:
#' \code{c("countRFP", "disengagementScores", "entropyRFP", "floss",
#' "fpkmRFP","ioScore", "ORFScores", "RRS", "RSS", "startCodonCoverage",
#' "startRegionCoverage","startRegionRelative")}
#' @importFrom tools file_ext
#' @importFrom BiocParallel registered
#' @export
checkAndInitPipe <- function(mainPath, df.rfp, df.rna, df.cage,
organism, biomart = "ensembl", mode = "uORF",
startCodons.cds.allowed = c("ATG", "CTG", "TTG", "AAG", "AGG"),
stopCodons.cds.allowed = c("TAA", "TGA", "TAG"),
features = c("countRFP", "disengagementScores", "entropyRFP", "floss",
"fpkmRFP","ioScore", "ORFScores", "RRS", "RSS", "startCodonCoverage",
"startRegionCoverage","startRegionRelative")) {
if (!(mode %in% c("uORF", "CDS", "aCDS")))
stop("mode must be uORF or CDS or aCDS (artificial CDS)")
stopifnot(all(features %in% c("countRFP", "disengagementScores", "entropyRFP", "floss",
"fpkmRFP","ioScore", "ORFScores", "RRS", "RSS", "startCodonCoverage",
"startRegionCoverage","startRegionRelative")))
stopifnot(length(features) == length(unique(features)))
dir.create(mainPath, showWarnings = FALSE, recursive = TRUE)
setwd(mainPath)
message("Welcome, setting up uORFome folders\n")
message("--------------------------------------")
message(p("Registered organism is: ", organism))
if (!is.null(biomart)) {
biomart_dataset <- getBiomartFromOrganism(organism, biomart = biomart)
} else biomart_dataset <- NULL
message(paste("Project location: ", mainPath))
message(paste("Run mode:", mode))
message("--------------------------------------")
# Set directory paths
if (mode == "uORF") {
resultsFolder <- p(mainPath, "/results")
regionUORFsFolder = p(resultsFolder,"/uORF_searchRegion/")
leadersFolder = p(resultsFolder,"/New_Cage_Leaders/")
uorfFolder = p(resultsFolder,"/candidate_uORFs/")
idFolder = p(resultsFolder,"/uorfIDs/")
plottingFolder = p(resultsFolder,"/Plotting/")
}
dataFolder <- p(mainPath, "/helper_files")
featuresFolder <- p(mainPath, "/features")
modelFolder <- p(mainPath, "/prediction_model")
# Create directories
if (mode == "uORF") {
dir.create(resultsFolder, showWarnings = FALSE)
dir.create(leadersFolder, showWarnings = FALSE)
dir.create(regionUORFsFolder, showWarnings = FALSE)
dir.create(uorfFolder, showWarnings = FALSE)
dir.create(idFolder, showWarnings = FALSE)
}
dir.create(featuresFolder, showWarnings = FALSE)
dir.create(dataFolder, showWarnings = FALSE)
dir.create(modelFolder, showWarnings = FALSE)
# Create database
dir.create("dataBase", showWarnings = FALSE)
dataBaseFolder <- p(mainPath,"/dataBase")
assign("dataBaseFolder", dataBaseFolder, envir = .GlobalEnv)
uORFomePipe:::createDataBase(p(dataBaseFolder, "/uorfCatalogue.sqlite"))
# Assign variables to global environment
if (mode == "uORF") {
assign("resultsFolder", resultsFolder, envir = .GlobalEnv)
assign("regionUORFsFolder", regionUORFsFolder, envir = .GlobalEnv)
assign("leadersFolder", leadersFolder, envir = .GlobalEnv)
assign("uorfFolder", uorfFolder, envir = .GlobalEnv)
assign("idFolder", idFolder, envir = .GlobalEnv)
assign("plottingFolder", plottingFolder, envir = .GlobalEnv)
}
assign("mainFolder", mainPath, envir = .GlobalEnv)
assign("dataFolder", dataFolder, envir = .GlobalEnv)
assign("featuresFolder", featuresFolder, envir = .GlobalEnv)
assign("organism", organism, envir = .GlobalEnv)
assign("biomart_dataset", biomart_dataset, envir = .GlobalEnv)
# now validate all that directories exist
if(mode == "uORF") {
if (!dir.exists(dataFolder))
stop(p("Could not find directory: ", c(dataFolder)[!file.exists( dataFolder)]))
}
# Set up annotation and save transcript-regions in helper_libraries
gtfdb = df.rfp@txdb
txdb <- NULL
if (!file.exists(p(dataFolder, "/CageFiveUTRs.rds"))) {
seqstyle <- seqlevelsStyle(ORFik:::findFa(df.rfp@fafile))[1]
f <- file_ext(gtfdb)
if (f == "gff" | f == "gff2" | f == "gff3" | f == "gtf") {
if (!file.exists(p(gtfdb, ".db")) | !file.exists(p(gtfdb, "sqlite"))) {
txdb <- GenomicFeatures::makeTxDbFromGFF(gtfdb)
} else {
if (file.exists(p(gtfdb, ".db"))) {
gtfdb <- p(gtfdb, ".db")
txdb <- loadTxdb(gtfdb)
} else {
gtfdb <- p(gtfdb, ".sqlite")
txdb <- loadTxdb(gtfdb)
}
}
txdb <- ORFik:::matchSeqStyle(txdb, seqstyle)
saveDb(txdb, file = p(dataFolder, "/Gtf.db"))
txdb <- loadTxdb(p(dataFolder, "/Gtf.db"))
} else if(f == "db" | f == "sqlite") {
txdb <- loadTxdb(gtfdb, chrStyle = seqstyle)
saveDb(txdb, p(dataFolder, "/Gtf.db"))
} else stop("when txdb is path, must be one of .gff, .gtf and .db")
loadRegions(txdb, c("tx", "cds", "fiveUTRs", "threeUTRs"))
saveRDS(get("tx", mode = "S4"), file = p(dataFolder, "/tx.rds"))
saveRDS(get("cds", mode = "S4"), file = p(dataFolder, "/cds.rds"))
saveRDS(fiveUTRs, file = p(dataFolder, "/fiveUTRs.rds"))
saveRDS(threeUTRs, file = p(dataFolder, "/threeUTRs.rds"))
if (mode == "aCDS") {
orfs <- subset.ORF(cds, startCodons = startCodons.cds.allowed,
stopCodons = stopCodons.cds.allowed, fa = df.rfp@fafile)
saveRDS(orfs, p(dataFolder, "/cds_filtered.rds"))
} else if (mode == "uORF") {
saveRDS(cds[widthPerGroup(cds) > 5], p(dataFolder, "/cds_filtered.rds"))
}
if (mode != "uORF") saveRDS(tx, file = p(dataFolder, "/CageFiveUTRs.rds"))
rm(list = c("tx", "cds", "fiveUTRs", "threeUTRs"), envir = .GlobalEnv)
}
assign("faiName", df.rfp@fafile, envir = .GlobalEnv)
assign("gtfdb", p(dataFolder, "/Gtf.db"), envir = .GlobalEnv)
validateInputs(df.rfp, df.rna, df.cage, mode)
message("--------------------------------------")
message("This is default backend:")
#print(registered()[1])
message(p("Using ", registered()[[1]]$workers, " threads from CPU"))
message(p("Example on how to register default backend to 10 cores:"))
print("register(MulticoreParam(workers = 10), default=TRUE)\n")
message("Initialization finished")
message("--------------------------------------")
return(invisible(NULL))
}
#' Validation of experiments
#' @inheritParams checkAndInitPipe
validateInputs <- function(df.rfp, df.rna, df.cage, mode) {
samples.rfp <- nrow(df.rfp);samples.rna <- nrow(df.rna)
if (is.null(df.rfp)) stop("Ribo-seq must be defined to run!")
if (!is.null(df.rna)) {
if (samples.rfp != samples.rna) stop("Not equal samples in RNA-seq and Ribo-seq")
if (!all(df.rfp$stage %in% df.rna$stage))
stop("Not equal tissues/stages in RNA-seq and Ribo-seq")
if (!all(df.rfp$condition %in% df.rna$condition))
stop("Not equal conditions in RNA-seq and Ribo-seq")
}
if (is.null(df.cage)) {
message("Running without CAGE")
message("Cancel if this was wrong, and input correct ORFik experiment!")
} else if (!all(df.rfp$stage %in% df.cage$stage))
stop("Not equal tissues/stages in CAGE and Ribo-seq")
# Making tissue grouping tables
groups <- bamVarName(df.rfp, skip.replicate = TRUE, skip.libtype = TRUE)
ugroups <- unique(groups)
if (length(ugroups) == 1) {
ugroups <- ifelse(ugroups == "", "SingleGroup", ugroups)
}
if (mode %in% c("CDS", "aCDS")) ugroups <- "combined"
if (tableNotExists("experiment_groups")) insertTable(ugroups, "experiment_groups")
if (tableNotExists("experiment_groups_all")) insertTable(groups, "experiment_groups_all")
if (nrow(df.rfp) == 1) {
message(p("Input validated, will run in single library mode:"))
} else {
message(p("Tissues validated, will run for ", length(ugroups), " groups:"))
message("Replicates per group:")
print(table(groups))
}
return(invisible(NULL))
}
# load cds
# remove random size of middle so length resembles ORF, must still have same frame!
# save these CDS and use them as test uORFs
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.