# NEW
# .searchForAccessionAcrossDBsDF - initial accessionConverter
# with more function arguments (i.e. table columns for sra and geo)
#
# CHANGED
# convertAccession (previously accessionConverter)
# - a wrapper around .searchForAccessionAcrossDBsDF
# Converts a list of accessions (of one type)
# into all possible accessions within SRA and GEO
#'
#' Search for accession across databases
#'
#' \code{.searchForAccessionAcrossDBsDF} classifies accessions,
#' searches for them in their original database (SRA or GEO),
#' then, if conversion is possible, searches for corresponding accessions
#' in the other database (SRA or GEO). If no conversion is possible,
#' the columns from the corresponding database are returned empty (NAs).
#'
#'
#' @param acc_vector A vector of accessions
#' \strong{(all must belong to the same type)}
#' @param sra_columns A character vector with names of the columns
#' to be returned from SRA
#' @param geo_columns A character vector with names of the columns
#' to be returned from GEO
#' @param df (OPTIONAL) a data frame to be used as either SRA or GEO data frame
#' @return A data frame with the results of the query
#'
#' @examples
#' ## Setup SpiderSeqR environment first (please use non-demo version)
#' ## startSpiderSeqRDemo()
#' ## .searchForAccessionAcrossDBsDF("GSE48253", "*", "*", "*")
#' ## df argument is optional
#'
#' ## If using named arguments,
#' ## it is not necessary to provide columns from the original df
#' # .searchForAccessionAcrossDBsDF(acc_vector = df$gsm,sra_columns="*",df=df)
#'
#' @section Using df argument:
#' When data frame argument is provided, corresponding columns
#' (sra_columns for SRA or geo_columns and gse_columns for GEO)
#' will be disregarded (and are not necessary to provide,
#' but only if using named arguments).
#' Accession vector is only used to classify the input data frame.
#'
#'
#' @keywords internal
#'
.searchForAccessionAcrossDBsDF <- function(acc_vector, sra_columns,
geo_columns, gse_columns, df){
accession_class <- classifyAccession(acc_vector)
accepted_sra <- c("run_accession", "experiment_accession",
"sample_accession", "study_accession")
accepted_geo <- c("gsm", "series_id")
accepted_classes <- c(accepted_sra, accepted_geo)
if (!(accession_class %in% accepted_classes)){
stop("Accession needs to belong to one of the supported classes")
}
#=============================================================
# GEO accession as input
#=============================================================
if (accession_class %in% accepted_geo){
#GEO data frame
if (accession_class == "gsm"){
if (!missing(df)){
geo_df <- df # Set original df as geo_df
} else {
geo_df <- .searchGEOForGSM(acc_vector, geo_columns, gse_columns)
}
}
if (accession_class == "series_id"){
if (!missing(df)){
warning("Provided df will be ignored")
}
geo_df <- .searchGEOForGSE(acc_vector, geo_columns, gse_columns)
}
#TEMP
.vex("temp_geo_df", geo_df)
#...
if (dim(geo_df)[1]==0) stop("No matching entries found")
#saveRDS(geo_df, "geo_df.Rda")
#SRR_GSM data frame
srr_gsm_df <- .searchSRR_GSM(geo_df$gsm)
#saveRDS(srr_gsm_df, "srr_gsm_df.Rda")
#TEMP
.vex("temp_srr_gsm_df", srr_gsm_df)
#...
#SRA data frame
if(sum(grepl("SRR\\d\\d\\d+", srr_gsm_df$run_accession)>0)){
# Only search SRA if there is viable GEO/SRA conversion
# if(length(srr_gsm_df$run_accession)!=0){
sra_df <- .searchSRAForAccession(srr_gsm_df$run_accession,
sra_columns)
} else { # Generate an empty data frame if no results in SRR_GSM
if (length(sra_columns) == 1 & sra_columns[1] == "*"){
# Set sql_sra_columns to the list of all
sql_sra_columns <- .listSRAFields()
} else {
# Otherwise keep all of them intact
sql_sra_columns <- sra_columns
}
sra_df <-
stats::setNames(data.frame(matrix(ncol=length(sql_sra_columns),
nrow = 0)), sql_sra_columns)
sra_df <- .renameSRAColumns(sra_df)
}
#saveRDS(sra_df, "sra_df.Rda")
#TEMP
.vex("temp_sra_df", sra_df)
#...
#Merge
#DOUBLE CHECK IF WANT ALL OR ALL.X ===*===
geo_srr_gsm_df <- merge(geo_df, srr_gsm_df,
by.x = "gsm", by.y = "gsm", all = TRUE)
geo_srr_gsm_sra_df <- merge(geo_srr_gsm_df, sra_df,
by.x = "run_accession",
by.y = "run_accession", all = TRUE)
#TEMP
.vex("temp_geo_srr_gsm_df", geo_srr_gsm_df)
.vex("temp_geo_srr_gsm_sra_df", geo_srr_gsm_sra_df)
#...
#Rename data frame
output_df <- geo_srr_gsm_sra_df
}
#=============================================================
#=============================================================
# SRA accession as input
#=============================================================
if (accession_class %in% accepted_sra){
#SRA data frame
if (!missing(df)){
sra_df <- df # Set original df as sra_df
} else {
sra_df <- .searchSRAForAccession(acc_vector, sra_columns)
}
#TEMP
.vex("temp_sra_df", sra_df)
#...
if (dim(sra_df)[1]==0) stop("No matching entries found")
#SRR_GSM data frame
srr_gsm_df <- .searchSRR_GSM(sra_df$run_accession)
#TEMP
.vex("temp_srr_gsm_df", srr_gsm_df)
#...
#GEO data frame
# # Only search GEO if there is viable SRA/GEO conversion
# if(length(srr_gsm_df$gsm)!=0){
# geo_df <- .searchGEOForGSM(srr_gsm_df$gsm, geo_columns)
#} else { # Generate an empty data frame if no results in SRR_GSM
# geo_df <- stats::setNames(data.frame(matrix(ncol =
# length(geo_columns), nrow = 0)), geo_columns)
#}
#GEO data frame
if (sum(grepl("GSM\\d\\d\\d+", srr_gsm_df$gsm)>0)){
# Only search GEO if there is viable SRA/GEO conversion
# if(length(srr_gsm_df$gsm)!=0){
geo_df <- .searchGEOForGSM(srr_gsm_df$gsm, geo_columns, gse_columns)
} else { # Generate an empty data frame if no results in SRR_GSM
# GSM
if (length(geo_columns) == 1 & geo_columns[1] == "*"){
# Set gsm_columns to the list of all
df_gsm_columns <- .listGSMFields()
} else {
# Otherwise keep all of them intact
df_gsm_columns <- geo_columns
}
df_gsm <-
stats::setNames(data.frame(matrix(ncol=length(df_gsm_columns),
nrow = 0)), df_gsm_columns)
df_gsm <- .renameGSMColumns(df_gsm)
# GSE
if (is.null(gse_columns)){
geo_df <- df_gsm
} else {
if (length(gse_columns) == 1 & gse_columns[1] == "*"){
# Set sql_geo_columns to the list of all
df_gse_columns <- .listGSEFields()
} else {
# Otherwise keep all of them intact
df_gse_columns <- gse_columns
}
df_gse <- stats::setNames(data.frame(matrix(
ncol = length(df_gse_columns), nrow = 0)), df_gse_columns)
df_gse <- .renameGSEColumns(df_gse)
geo_df <- cbind(df_gsm, df_gse)
}
}
#Create an empty data frame with appropriate column names
#if(dim(geo_df)[1]==0){
#geo_gsm_columns <- dbListFields(get("geo_con"), "gsm")
#}
#TEMP
.vex("temp_geo_df", geo_df)
#...
#Merge
#DOUBLE CHECK IF WANT ALL OR ALL.X ===*===
sra_srr_gsm_df <- merge(sra_df, srr_gsm_df,
by.x = "run_accession",
by.y = "run_accession", all = TRUE)
#TEMP
.vex("temp_sra_srr_gsm_df", sra_srr_gsm_df)
#...
sra_srr_gsm_geo_df <- merge(sra_srr_gsm_df, geo_df,
by.x = "gsm", by.y = "gsm", all = TRUE)
#TEMP
.vex("temp_sra_srr_gsm_geo_df", sra_srr_gsm_geo_df)
#...
output_df <- sra_srr_gsm_geo_df
}
#=============================================================
return(output_df)
}
#'
#' Convert between accession types
#'
#' \code{convertAccession} converts a vector of accessions
#' (all belonging into the same accession type) into all possible accession
#' types within SRA and GEO. If no SRA/GEO conversion is possible,
#' all the missing accession types are marked as NAs.
#'
#'
#' @param acc_vector A vector of accessions
#' \strong{(all must belong to the same type)}
#' @return A data frame with conversion between all accession types
#' @family Workflow functions
#' @family Core functions
#' @examples
#'
#' # Setup SpiderSeqR environment first (please use non-demo version)
#' startSpiderSeqRDemo()
#'
#' convertAccession("SRP134708")
#'
#' convertAccession("SRR3707942")
#'
#' convertAccession("GSM2027840")
#'
#' # Note that DRP, ERP and SRP are of the same accession type (study level)
#' convertAccession(c("DRP003157", "SRP061795"))
#'
#'
#'
#'
#'
#' @section Accepted Accession Types:
#' \code{convertAccession} accepts any of the 4 SRA or 2 GEO accession types
#' (see section \emph{'Background Information on Accession Types')}.
#' \code{convertAccession} accepts only one accession type at a time.
#'
#' For example, the following queries are NOT allowed:
#'
#'
#' \code{convertAccession("SRR_____", "SRP_____")}
#'
#' \code{convertAccession("GSE_____", "SRP_____")}
#'
#'
#' In order to obtain the above results, it is necessary to run
#' separate queries for each accession type, and, if desirable,
#' bind the data frames together
#' (e.g. \code{rbind(convertAccession("SRR_____"),
#' convertAccession("SRP_____"))}).
#'
#' SRA accessions differing by the first letter belong to the same type,
#' hence it is possible to run: \code{convertAccession("SRP_____", "ERP_____")}
#'
#'
#'
#' @section Output format:
#' The function outputs a data frame with conversion of the input accessions
#' into all possible types.
#'
#' In the best case scenario, i.e. if an accession exists in both SRA and GEO
#' databases, these would include all 6 accession types
#' (SRR, SRX, SRS, SRP, GSM, GSE).
#'
#' If an accession exists only in one of the databases, the conversion will
#' be limited to that one database. For example, if an accession only exists
#' in SRA, only SRA accessions will be provided, whilst the GEO columns will
#' be populated with NAs.
#'
#'
#'
#' @section Troubleshooting:
#' The conversion between SRA and GEO databases is based on a custom database
#' generated by \code{startSpiderSeqR()} function. To ensure best results,
#' make sure that the most up to date versions of the databases.
#' To improve results, you can do the following:
#' \enumerate{
#' \item Download the most up to date versions of SRAmetadb.sqlite
#' and GEOmetadb.sqlite files - this is done by running
#' \code{\link{startSpiderSeqR}}, specifying an appropriate argument
#' for expiry period of database files
#' (e.g. \code{startSpiderSeqR(path = getwd(), general_expiry = 1)})
#' \item Generate a fresh custom database for conversion between
#' accessions (SRR_GSM.sqlite) - this is also done by running
#' \code{\link{startSpiderSeqR}}, specifying an appropriate argument
#' for expiry period of the database file
#' \item As a last resort, manually search for the missing
#' conversions online
#' }
#'
#' \emph{\strong{NOTE:}} because the SRR_GSM.sqlite database
#' is machine-generated,
#' there is some risk that it might not include some conversions in case
#' they have been recorded in the database in a non-standard way.
#' If in doubt, it is worth checking the accession page online.
#' However, users should be aware that the overlap between SRA and GEO
#' is only about 20\% (at the time of writing), so most entries will not have
#' corresponding accession numbers in the other database
#'
#'
#'
#' @section Background Information on Accession Types:
#' The two lists below include accession types within SRA and GEO respectively.
#'
#' All of these are supported by the convertAccession function.
#'
#' \strong{SRA}
#' \enumerate{
#' \item SRP or DRP or ERP - project_accession
#' \item SRS or DRS or ERS - sample_accession
#' \item SRX or DRX or ERX - experiment_accession
#' \item SRR or DRR or ERR - run_accession
#' }
#'
#' \emph{\strong{NOTE:}} depending on the location of the database
#' (NCBI, EBI or DDBJ),
#' these accessions might begin with a different letter (S, E or D),
#' so the accession levels can be either
#' SRP/SRX/SRS/SRR or ERP/ERX/ERS/ERR or DRP/DRX/DRS/ERR.
#' Accessions beginning with 'S' are by far the most common.
#'
#'
#' \strong{GEO}
#' \enumerate{
#' \item GSE - series_id
#' \item GSM - sample
#' }
#'
#' \emph{\strong{NOTE:}} GEO accession system is further
#' complicated by existence of
#' 'superseries', which act as higher level series.
#' In these cases a given GSM would belong to multiple (at least two)
#' GSEs - its series_id and superseries.
#'
#' @export
#'
convertAccession <- function(acc_vector){
geo_columns <- c("gsm", "series_id")
# Not needed (.searchSRR_GSM has defaults)
#srr_gsm_columns <- c("gsm", "gsm_check", "run_accession")
sra_columns <- c("run_accession", "experiment_accession",
"sample_accession", "study_accession")
output_df <- suppressMessages(.searchForAccessionAcrossDBsDF(acc_vector,
geo_columns = geo_columns,
sra_columns = sra_columns, gse_columns = NULL))
# Suppress messages to prevent the message from .appendGSEColumns
# about no columns being needed
# output_df <- .searchForAccessionAcrossDBsDF(acc_vector,
# geo_columns = geo_columns,
# sra_columns = sra_columns,
# gse_columns = NULL)
# NOTE: if gse_columns include ath else, add it to extracted columns below
# Reorder columns (order always the same, regardless of accession type)
# output_df <- output_df[ , c("run_accession",
# "experiment_accession", "sample_accession",
# "study_accession", "gsm", "series_id",
# "gsm_check")]
output_df <- output_df[ , c("run_accession", "experiment_accession",
"sample_accession", "study_accession",
"gsm", "series_id")] # Remove gsm_check
temp_unordered_output_df <- output_df #===*===
order_columns <- list(output_df$study_accession,
output_df$sample_accession,
output_df$experiment_accession,
output_df$run_accession,
#output_df$series_id,
output_df$gsm)
output_df <- output_df[orderAccessions(order_columns), ]
output_df <- .unifyDFFormat(output_df)
return(output_df)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.