# Chunk options: hide code, suppress warnings/messages in the rendered report.
knitr::opts_chunk$set(echo = FALSE)
knitr::opts_chunk$set(warning = FALSE, collapse = TRUE, message = FALSE, comment = "")

# Load the pipeline dependencies quietly.
suppressMessages(library(OmicSelector))
suppressMessages(library(dplyr))
suppressMessages(library(data.table))
suppressMessages(library(knitr))
suppressMessages(library(rmarkdown))
suppressMessages(library(kableExtra))
suppressMessages(library(naniar))
suppressMessages(library(VIM))
# Fix: load XML like every other dependency; `require()` only returns FALSE
# on failure, which would defer the error to the xmlParse() call below.
suppressMessages(library(XML))

setwd("/OmicSelector")  # pipeline runs inside the OmicSelector container
set.seed(1)             # reproducibility of stochastic steps (e.g. mice)

# Read the pipeline configuration prepared by the pipeline driver.
config <- xmlParse("1_preprocessing.xml")
config <- xmlToList(config)

knitr::opts_chunk$set(fig.width = 13, fig.height = 9)
knitr::opts_knit$set(root.dir = '/OmicSelector')
message(getwd())

# Load the starting dataset; feature columns are miRNAs prefixed "hsa",
# everything else is treated as metadata.
dataset <- fread("data_start.csv")
datasetx <- dplyr::select(dataset, starts_with("hsa"))
metadane <- dplyr::select(dataset, -starts_with("hsa"))
Initial data check:
# Echo the pre-computed initial sanity-check report verbatim into the report.
cat(readLines("initial_check.txt"), sep = "\n")
Names correction:
# Optionally harmonize miRNA feature names to current miRBase nomenclature.
# isTRUE() guards against a missing <correct_names> entry (NULL) in the
# config, which would otherwise make the bare `==` comparison error out.
if (isTRUE(config[["correct_names"]] == "yes")) {
  old <- colnames(dataset)
  dataset <- OmicSelector_correct_miRNA_names(dataset)
  new <- colnames(dataset)
  changed <- old != new
  zmiany <- data.frame(`Old name` = old[changed], `New name` = new[changed])
  if (nrow(zmiany) > 0) {
    # Fix: wrap in print() — a kable produced inside an if-block is not
    # auto-printed by knitr, so the table silently never rendered before.
    print(kable(zmiany, col.names = c("Old feature name", "New feature name")) %>%
            kable_styling(bootstrap_options = c("striped", "hover")))
    # Persist the corrected names and re-derive the working tables.
    fwrite(dataset, "data_start.csv")
    dataset <- fread("data_start.csv")
    datasetx <- dplyr::select(dataset, starts_with("hsa"))
    metadane <- dplyr::select(dataset, -starts_with("hsa"))
  } else {
    cat("Correction was performed, but no names were changed.")
  }
} else {
  cat("No names correction was performed.")
}

# Make all column names syntactically valid and unique for downstream steps.
colnames(dataset) <- make.names(colnames(dataset), unique = TRUE)
colnames(datasetx) <- make.names(colnames(datasetx), unique = TRUE)
colnames(metadane) <- make.names(colnames(metadane), unique = TRUE)
Initial pattern of missing values:
# Visualize the missing-data pattern before any imputation.
print(vis_miss(datasetx, warn_large_data = FALSE))
print(gg_miss_var(datasetx))
# The upset plot is only informative when more than two variables have NAs.
if (n_var_miss(datasetx) > 2) {
  print(gg_miss_upset(datasetx))
}
# Handle missing values according to the <missing_imput> config entry.
# Helper: re-plot the missingness pattern after imputation. print() is
# required here — plots evaluated inside if-branches are not auto-printed
# by knitr, so in the original code these diagnostics never rendered.
show_missingness <- function(dx) {
  cat("\nAfter imputation:")
  print(vis_miss(dx))
  print(gg_miss_var(dx))
  if (n_var_miss(dx) > 2) {
    print(gg_miss_upset(dx))
  }
}

if (is.null(config[["missing_imput"]])) {
  cat("No missing data imputation was performed.")
} else if (config[["missing_imput"]] == "pmm") {
  # Predictive mean matching via mice (single imputation, 50 iterations).
  suppressMessages(library(mice))
  temp1 <- mice(datasetx, m = 1, maxit = 50)
  datasetx <- complete(temp1)
  fwrite(cbind(metadane, datasetx), "data_start.csv")
  show_missingness(datasetx)
} else if (config[["missing_imput"]] == "median") {
  suppressMessages(library(imputeMissings))
  datasetx <- impute(datasetx, method = "median/mode")
  fwrite(cbind(metadane, datasetx), "data_start.csv")
  show_missingness(datasetx)
} else if (config[["missing_imput"]] == "mean") {
  # Column-wise mean imputation. [[i]] extraction works for both data.frame
  # and data.table inputs (with a data.table, `dx[, i]` would evaluate `i`
  # as a j-expression instead of selecting column i).
  for (i in seq_len(ncol(datasetx))) {
    col <- datasetx[[i]]
    col[is.na(col)] <- mean(col, na.rm = TRUE)
    datasetx[[i]] <- col
  }
  fwrite(cbind(metadane, datasetx), "data_start.csv")
  show_missingness(datasetx)
} else if (config[["missing_imput"]] == "delete") {
  # Fix: this branch was previously guarded by a duplicated "mean" condition
  # and therefore unreachable. "delete" is assumed to be the intended config
  # token for complete-case deletion — TODO confirm against the GUI/config.
  do_zostawienia <- complete.cases(datasetx)
  datasetx <- datasetx[do_zostawienia, ]
  # Fix: metadata must be filtered by the same row mask; the original
  # `metadane = datasetx[metadane,]` indexed the wrong object.
  metadane <- metadane[do_zostawienia, ]
  fwrite(cbind(metadane, datasetx), "data_start.csv")
  print(kable(table(do_zostawienia), col.names = c("Remaining", "Number of features")) %>%
          kable_styling(bootstrap_options = c("striped", "hover")))
  cat("Cases with NAs were deleted. FALSE cases were deleted.")
}
# Counts are converted to log10(TPM); other input formats arrive already
# normalized, so nothing is done for them here. identical() is NULL-safe.
if (identical(config[["input_format"]], "counts")) {
  if (identical(config[["filtration"]], "yes")) {
    # Fix: config values parsed from XML are character strings; coerce the
    # filtering thresholds to numeric, as is already done for training_split
    # in the splitting chunk. (Config key "filtration_propotion" is kept
    # as-is — it is the spelling used by the upstream XML.)
    ttpm <- OmicSelector_counts_to_log10tpm(
      datasetx, metadane,
      ids = metadane$sample,
      filtr = TRUE,
      filtr_minimalcounts = as.numeric(config[["filtration_mincounts"]]),
      filtr_howmany = as.numeric(config[["filtration_propotion"]])
    )
  } else {
    ttpm <- OmicSelector_counts_to_log10tpm(
      datasetx, metadane,
      ids = metadane$sample,
      filtr = FALSE
    )
  }
  # Persist the normalized data and re-derive the working tables.
  fwrite(cbind(metadane, ttpm), "data_start.csv")
  dataset <- fread("data_start.csv")
  datasetx <- dplyr::select(dataset, starts_with("hsa"))
  metadane <- dplyr::select(dataset, -starts_with("hsa"))
} else {
  cat("No normalization was performed (per pipeline config).")
}
Download starting (normalized) dataset: data_start.csv
Class frequencies:
# Class distribution: horizontal bar plot plus a styled frequency table.
class_counts <- table(dataset$Class)
barplot(class_counts, horiz = TRUE)
kable(class_counts, col.names = c("Class", "Number of cases")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Check if dataset is balanced using binomial test (vs. p=0.5):
# Two-sided exact binomial test of class balance against p = 0.5.
is_case <- dataset$Class == "Case"
cancer_cases <- sum(is_case)
all_cases <- length(is_case)
binom.test(x = cancer_cases, n = all_cases, p = 0.5, alternative = "two.sided")
# Descriptive statistics per feature; compute once, then show and persist.
desc_stats <- psych::describe(datasetx)
OmicSelector_table(desc_stats)
fwrite(desc_stats, "result_preprocessing_descriptivestats.csv")
Download result_preprocessing_descriptivestats.csv
Correlations between features:
suppressMessages(library(ComplexHeatmap))
suppressMessages(library(circlize))

# Pearson correlation matrix; zero the lower triangle so the heatmap and
# the exported matrix are not redundant.
correlations <- cor(datasetx, method = "pearson")
correlations[lower.tri(correlations)] <- 0

# Fix: the original exported psych::describe(datasetx) here (copy-paste
# from the descriptive-stats chunk); export the actual correlation matrix.
fwrite(as.data.frame(correlations), "result_preprocessing_correlationmatrix.csv",
       row.names = TRUE)

col_fun <- colorRamp2(c(-1, 0, 1), c("blue", "white", "red"), transparency = 0.5)
Heatmap(correlations, name = "corr", col = col_fun,
        cluster_rows = FALSE, cluster_columns = FALSE,
        show_row_names = FALSE, show_column_names = FALSE)

# Turn an rcorr() result into a long table: one row per feature pair with
# its correlation coefficient and p-value.
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor = (cormat)[ut],
    p = pmat[ut]
  )
}

suppressMessages(library(Hmisc))
cor_p <- rcorr(as.matrix(datasetx), type = "pearson")
cor_p2 <- flattenCorrMatrix(cor_p$r, cor_p$P)
fwrite(cor_p2, "result_preprocessing_correlationmatrix_details.csv")
OmicSelector_table(cor_p2)
Correlation matrix (correlation coef.): Download result_preprocessing_correlationmatrix.csv
Details: Download result_preprocessing_correlationmatrix_details.csv
Differential expression:
# Differential expression via t-tests; the test mode depends on the data
# scale declared in the config. `[[` is used for consistency with the rest
# of the script (single-bracket `config['x']` returns a one-element list).
input_format <- config[["input_format"]]
if (input_format == "transformed" || input_format == "counts") {
  # Counts were converted to log10(TPM) in the normalization step above.
  DE <- OmicSelector_differential_expression_ttest(datasetx, metadane$Class, mode = "logtpm")
} else if (input_format == "deltact") {
  DE <- OmicSelector_differential_expression_ttest(datasetx, metadane$Class, mode = "deltact")
} else {
  # Fix: previously an unrecognized format left `DE` undefined and the
  # script crashed at fwrite() with a misleading "object not found" error;
  # fail fast with a clear message instead.
  stop("Unknown input_format in config: ", input_format, call. = FALSE)
}
fwrite(DE, "result_preprocessing_wholedataset_DE.csv")
OmicSelector_table(DE)
Splitting the dataset to training, testing and validation datasets.
# Split into training/testing/validation using the configured training share
# (the XML config stores it as a string, hence the numeric coercion).
train_fraction <- as.numeric(config[["training_split"]])
mixed <- OmicSelector_prepare_split(metadane = metadane, ttpm = datasetx,
                                    train_proc = train_fraction)
# Record package and session details for reproducibility of the report.
print(packageDescription("OmicSelector"))
print(sessionInfo())
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.