#' @title Read the output of dbCAN and extract abundance profile
#' @description read_dbcan3 calculates the abundance of each Gene within the
#' bins based on the dbCAN output from run_dbcan3 V3.0.6.
#' @usage read_dbcan3(dbcan_path, write=FALSE, profile=TRUE)
#' @param dbcan_path a path where dbCAN output data are. They
#' should have the extension overview.txt and all files in the path are the ones that
#' need to be read. Output data should have 6 columns with the bin names
#' followed by the Genes obtained in every algorithm (HMMER,Hotpep,DIAMOND),
#' column 'Signalp' indcating if a Peptide signal is found and a column
#' '#ofTools" indicating the number of algorithms that found this Gene.
#' @param profile a logical value indicating if you want to print a profile
#' or not.
#' @param write a logical value indicating to save the data imported
#' as a formatted table with .tsv extension with a time stamp and it will be
#' located in your current workin directory
#' @import dplyr tidyr readr stringr rlang tidyselect utils
#' @examples
#' \dontrun{
#' read_dbcan3("C:/Users/bins/")
#' }
#' @export
read_dbcan3<-function(dbcan_path, write=FALSE, profile=TRUE){
ruta_dbcan<-dbcan_path
# Load all the data tables results ---------------------------------------####
lapply_read_delim_bind_rows <- function(path, pattern = "overview.txt"){
files = list.files(path, pattern ="overview.txt", full.names = TRUE, recursive = T)
lapply(files, read.delim, check.names=F) %>%
bind_rows()
}
dbcan_df<-suppressWarnings(lapply_read_delim_bind_rows(ruta_dbcan))
# Reading data ----------------------------------------------------------####
dbcan_df_format<- suppressWarnings(
dbcan_df %>%
clean_names() %>%
rename(Number_of_Tools = number_of_tools) %>%
filter(Number_of_Tools > 1) %>%
rename(Bin_name = gene_id) %>%
mutate( hmmer2=str_replace_all(.data$hmmer, "[[:punct:]]", "\t")) %>%
separate(.data$hmmer2, c("dbNamesHMM"), sep="\t") %>%
mutate(diamond2=str_replace_all(.data$diamond, "[[:punct:]]", "\t")) %>%
separate(.data$diamond2, c("dbNamesdiamond"), sep="\t"))
###################################### e_cami ##################################
if (is.null(dbcan_df_format$e_cami)) {
df_format <-dbcan_df_format %>%
unite("dbCAN_names", dbNamesHMM, dbNamesdiamond, sep="_", remove = F) %>%
mutate(dbCAN_names=str_replace_all(.data$dbCAN_names, "^_", "")) %>%
separate(.data$dbCAN_names, c("dbCAN_names"), sep="_") %>%
mutate(dbCAN = case_when(
str_detect(.data$dbCAN_names, "CBM") ~ "carbohydrate-binding module [CBM]",
str_detect(.data$dbCAN_names, "CE") ~ "carbohydrate esterases [CEs]",
str_detect(.data$dbCAN_names, "GH") ~ "glycoside hydrolases [GHs]",
str_detect(.data$dbCAN_names, "GT") ~ "glycosyltransferases [GTs]",
str_detect(.data$dbCAN_names, "PL") ~ "polysaccharide lyases [PLs]",
str_detect(.data$dbCAN_names, "AA") ~ "auxiliary ativities [AAs]")
) %>%
mutate_if(is.character, str_trim) %>%
dplyr::select(.data$Bin_name, .data$dbCAN_names, domain_name = .data$dbCAN,
.data$signalp) %>%
calc_abundance(analysis = "dbCAN", col_rename = "dbCAN_names") %>%
dplyr::select(-.data$Scaffold_name, .data$dbCAN_names) %>%
group_by(.data$Bin_name, dbCAN_family = .data$dbCAN_names,
.data$domain_name, .data$signalp) %>%
summarise_if(is.numeric, sum)
} else {
df_format <-dbcan_df_format %/%
mutate(ecami2=str_replace_all(.data$e_cami, "[[:punct:]]", "\t")) %>%
separate(.data$ecami2, c("dbNameseCAMI"), sep="\t") %>%
unite("dbCAN_names", dbNamesHMM, dbNamesdiamond, dbNameseCAMI,
sep="_", remove = F) %>%
mutate(dbCAN_names=str_replace_all(.data$dbCAN_names, "^_", "")) %>%
separate(.data$dbCAN_names, c("dbCAN_names"), sep="_") %>%
mutate(dbCAN = case_when(
str_detect(.data$dbCAN_names, "CBM") ~ "carbohydrate-binding module [CBM]",
str_detect(.data$dbCAN_names, "CE") ~ "carbohydrate esterases [CEs]",
str_detect(.data$dbCAN_names, "GH") ~ "glycoside hydrolases [GHs]",
str_detect(.data$dbCAN_names, "GT") ~ "glycosyltransferases [GTs]",
str_detect(.data$dbCAN_names, "PL") ~ "polysaccharide lyases [PLs]",
str_detect(.data$dbCAN_names, "AA") ~ "auxiliary ativities [AAs]")
) %>%
mutate_if(is.character, str_trim) %>%
dplyr::select(.data$Bin_name, .data$dbCAN_names,
domain_name = .data$dbCAN, .data$signalp) %>%
calc_abundance(analysis = "dbCAN", col_rename = "dbCAN_names") %>%
dplyr::select(-.data$Scaffold_name, .data$dbCAN_names) %>%
group_by(.data$Bin_name, dbCAN_family = .data$dbCAN_names,
.data$domain_name, .data$signalp) %>%
summarise_if(is.numeric, sum)
}
# Reformating data ----------------------------------------------------------####
dbcan_df_reformat <-df_format %>%
dplyr::select(-.data$signalp, .data$dbCAN_family) %>%
group_by(.data$Bin_name, .data$dbCAN_family, .data$domain_name) %>%
summarize_if(is.numeric, sum) %>%
pivot_wider(names_from = "Bin_name", values_from = "Abundance") %>%
ungroup() %>%
mutate_if(is.numeric, ~replace(., is.na(.), 0))
# Messages --------------------------------------------------------------####
initial<-dim(dbcan_df)
final<-dim(dbcan_df_format %>%
filter(Number_of_Tools >1))
signals<- df_format %>%
group_by(.data$signalp) %>%
count()
signals2<- df_format %>%
group_by(.data$signalp) %>%
count()
print(paste0("Input Genes = " , initial[1]))
print(paste0("Remained Genes after filtering = " , final[1]))
print(paste0("Percentage of genes remained = " , round(final[1]/initial[1]*100), "%"))
print(paste0("Number of genes with signals = " , sum(signals[-1,]$n)))
print(paste0("Number of genes with signals that passed filtering = " , sum(signals2[-1,]$n)))
# Profile or not --------------------------------------------------------------####
if(isTRUE(profile)){
output<-dbcan_df_reformat
} else{
output<-df_format
}
# Write data or not --------------------------------------------------------------####
if(isTRUE(write)){
write_tsv(output, paste0("dbcan_output_", format(Sys.time(), "%b_%d_%X"), ".tsv"))
}
else{
return(output)
}
# Return ----------------------------------------------------------------####
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.