#!/usr/bin/Rscript
### SIAMCAT - Statistical Inference of Associations between
### Microbial Communities And host phenoTypes R flavor EMBL
### Heidelberg 2012-2018 GNU GPL 3.0
#' @title Read label file
#'
#' @description Read label information from a file
#'
#' @param fn.in.label name of the tsv file containing labels
#'
#' @return label object containing several entries:\itemize{
#' \item \code{$label} named vector containing the numerical labels from the
#' file;
#' \item \code{$info} information about the classes in the label;
#' \item \code{$type} information about the label type (e.g. \code{BINARY});
#'}
#'
#' @details This function reads in a tsv file with labels and converts it
#' into a label.
#'
#' First row is expected to be
#'
#' \code{#BINARY:1=[label for cases];
#' -1=[label for controls]}.
#'
#' Second row should contain the sample identifiers as tab-separated list
#' (consistent with feature and metadata).
#'
#' Third row is expected to contain the actual class labels (tab-separated):
#' \code{1} for each case and \code{-1} for each control.
#'
#' Note: Labels can take other numeric values (but not characters or strings);
#' importantly, the label for cases has to be greater than the one for controls
#'
#'
#' @export
#'
#' @encoding UTF-8
#'
#' @examples
#' # run with example data
#' fn.label <- system.file('extdata',
#' 'label_crc_zeller_msb_mocat_specI.tsv',
#' package = 'SIAMCAT')
#'
#' crc.zeller.label <- read.label(fn.label)
read.label <- function(fn.in.label) {
if (is.null(fn.in.label))
stop("Filename for labels file not provided!\n")
label <-
read.table(
file = fn.in.label,
sep = "\t",
header = TRUE,
row.names = NULL,
stringsAsFactors = FALSE,
check.names = FALSE,
quote = "",
comment.char = "#",
encoding = "latin1"
)
label <- as.matrix(label)
if (dim(label)[1] > dim(label)[2]) {
temp <- names(label)
names(label) <- NULL
label <- rbind(temp, label)
rownames(label) <- label[, 1]
label[, 1] <- NULL
label <- t(label)
}
namesL <- colnames(label)
label <- as.numeric(label)
names(label) <- namesL
# Check general suitablity of supplied dataset
classes <- unique(label)
for (i in classes) {
if (sum(label == i) <= 5)
stop(
"Data set has only",
sum(label == i),
"training examples of class",
i,
" This is not enough for
SIAMCAT to proceed"
)
if (sum(label == i) < 10) {
msg <- paste("Data set has only", sum(label == i),
"training examples of class", i, " . Note that a dataset",
" this small/skewed is not necessarily suitable for analysis",
" in this pipeline.")
message(msg)
}
}
# Check label header!
con <- file(fn.in.label, "rt")
header <- readLines(con, 1)
if (substring(header, 1, 1) != "#") {
stop("Label header seems to be missing or broken.")
}
close(con)
label <- list(label = label)
label.info <- parse.label.header(header)
label$info <- label.info$class.descr
label$type <- label.info$type
stopifnot(label$type == "BINARY")
invisible(label)
}
##### auxiliary function to trim whitespace from string returns string without
##### leading or trailing whitespace
#' @keywords internal
trim <- function(x) {
gsub("^\\s+|\\s+$", "", x)
}
#' @title Parse label header
#'
#' @description This function parses the header of a label file
#'
#' @param label.header - string in the format: #<TYPE>:<L1>=<class1>;
#' <L2>=<class2>[;<L3>=<class3>] where <TYPE> is a string specifying the type
#' of label variable such as BINARY (for binary classification), CATEGORICAL
#' (for multi-class classification), or CONTINUOUS (for regression)
#' <L1> is a short numeric label for the first class with description <class1>
#' (similarly for the other classes)
#'
#' @return a list with tow items \itemize{
#' \item \code{$type} type of the label: BINARY CONTINUOUS or CATEGORICAL
#' \item \code{$class.descr} lables and information on what do they mean
#'}
#'
#' @keywords internal
parse.label.header <- function(label.header) {
s <- strsplit(label.header, ":")[[1]]
type <- trim(s[1])
if (substr(type, 1, 1) == "#")
type <- trim(substr(type, 2, nchar(type)))
class.descr <-
unlist(strsplit(strsplit(trim(s[2]), ";")[[1]], "="))
l <- class.descr[seq(2, length(class.descr), 2)]
class.descr <-
as.numeric(class.descr[seq(1, length(class.descr) - 1, 2)])
names(class.descr) <- l
label.info <- list()
label.info$type <- type
label.info$class.descr <- sort(class.descr)
return(label.info)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.