R/IO_functions.R

Defines functions retrieve_feature_dictionary allisna

# Function to import pyradiomics csv files --------------------------------------------

#' Import pyradiomics data
#'
#' This functions imports feature values extracted using pyradiomics software
#'
#' @param dir (character) Path to the directory containing tsv files generated by pyradiomics
#'
#' @return A RadAR object (class \code{\link{SummarizedExperiment}})
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
import_pyradiomics <- function (dir = NULL) {

  assertthat::assert_that(length(dir) > 0, msg = paste("[RadAR] Error:", dir, "should be specified"))
  assertthat::assert_that(file.exists(dir), msg = paste("[RadAR] Error:", dir, "does not exist"))
  extractor <- "pyradiomics"
  list_files <- list.files(dir, pattern = "csv$", full.names = T)
  assertthat::assert_that(length(list_files) > 0, msg = paste("[RadAR] Error:", dir, "does not contain csv files"))

  ix_files <- check_files(list_files)
  list_files <- list_files[ix_files]

  tmp <- list.files(dir, pattern = "csv$", full.names = F)
  filenames <-  gsub(".csv", "", tmp[ix_files])

  feature_data <- c()
  mask_id <- c()
  sample_id <- c()
  filename_tot <- c()

  for (i in 1: length(list_files)) {
    message(paste("[RadAR] importing", list_files[i]))
    xx <- read.delim(list_files[i], sep = ",", header = T)
    # check file, i=1
    if (i == 1) {
      ncol_file <- ncol(xx)
      end_general <- rev(grep("general_", colnames(xx)))[1]
      if (is.na(end_general)) {
        end_general <- rev(grep("diagnostics_", colnames(xx)))[1]
      }
      assertthat::assert_that(!is.na(end_general), msg = "[RadAR] Error: Bad or unexpected format")
      xx.data <- xx[1, -c(1: end_general), drop = F]
      #assertthat::assert_that(all(is.numeric(xx.data)), msg = "[RadAR] Error: Bad or unexpected format (data is not numeric)")
      image_types <- gsub("_.*", "", colnames(xx.data))
      feature_names <- gsub(".*_", "", colnames(xx.data))
    } else {
      assertthat::assert_that(ncol(xx) == ncol_file, msg = paste("[RadAR] Error: unexpected number of features in", list_files[i]))
      xx.data <- xx[, -c(1: end_general)]
    }
    mylabels <- unique(xx$Mask)
    mask_id <- c(mask_id, mylabels)
    sample_id <- c(sample_id, rep(filenames[i], length(mylabels)))
    filename_tot <- c(filename_tot, rep(list_files[i], length(mylabels)))

    assertthat::assert_that(!any(duplicated(xx$Mask)), msg = paste("[RadAR] duplicated mask names in", list_files[i]))
    for (j in  1: length(mylabels)) {
      mylabel <- mylabels[j]
      #      mylabel <- gsub("Segmentation_segment_", "", mylabels[j])
      feature_data <- cbind(feature_data,
                            as.numeric(xx.data[which(xx$Mask == mylabel), ]))
    }
  }
  colnames(feature_data) <- mask_id
  if (length(unique(image_types)) > 1) {
    feature_ids <- paste0(feature_names, ".", image_types)
  } else {
    feature_ids <- feature_names
  }
  rownames(feature_data) <- feature_ids

  dict_out <- retrieve_feature_dictionary(features_names = feature_names,
                                          extractor = extractor)
  #  feature_description <- rep(NA, nrow(feature_data))
  rowData <- data.frame(feature_name = feature_names,
                        image_type = image_types,
                        feature_description = dict_out$feature_description,
                        feature_type = dict_out$feature_type
  )
  #rownames(rowData) <- feature_ids
  colData <- data.frame(filename = filename_tot,
                        sample_id = sample_id,
                        mask_id = mask_id)

  metadata <- list()
  metadata$extractor <- extractor
  rdr <- new_radar_dataset(features = feature_data, rowData = rowData, colData = colData, metadata = metadata)
  return(rdr)

}
# Function to import 3D Slicer tabulated files --------------------------------------------

#' Import 3DSlicer data
#'
#' This functions imports feature values extracted using 3DSlicer software
#'
#' @param dir (character) Path to the directory containing tsv files generated by 3DSlicer
#'
#' @return A RadAR object (class \code{\link{SummarizedExperiment}})
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
import_3dslicer <- function (dir) {

  assertthat::assert_that(length(dir) > 0, msg = paste("[RadAR] Error:", dir, "should be specified"))
  assertthat::assert_that(file.exists(dir), msg = paste("[RadAR] Error:", dir, "does not exist."))

  extractor <- "3dslicer"
  list_files <- list.files(dir, pattern = "tsv$", full.names = T)
  assertthat::assert_that(length(list_files) > 0, msg = paste("[RadAR] Error:", dir, "does not contain tsv files"))
  ix_files <- check_files(list_files)
  list_files <- list_files[ix_files]

  tmp <- list.files(dir, pattern = "tsv$", full.names = F)
  filenames <-  gsub(".tsv", "", tmp[ix_files])

  feature_data <- c()
  mask_id <- c()
  sample_id <- c()
  filename_tot <- c()
  for (i in 1: length(list_files)) {
    message(paste("[RadAR] importing", list_files[i]))
    xx <- read.delim(list_files[i], sep = "\t", header = T)
    # identify file format
    if (colnames(xx)[1] == "Label") {
      mylabels <- unique(xx$Label)
      if (i == 1) {
        assertthat::assert_that(length(xx$Feature.Class) > 0, msg = "[RadAR] Error: Bad or unexpected format")
        assertthat::assert_that(length(xx$Label) > 0, msg = "[RadAR] Error: Bad or unexpected format")
        assertthat::assert_that(length(xx$Image.type) > 0, msg = "[RadAR] Error: Bad or unexpected format")
        assertthat::assert_that(length(xx$Feature.Name) > 0, msg = "[RadAR] Error: Bad or unexpected format")
        #assertthat::assert_that(all(is.numeric(xx$Value)), msg = "[RadAR] Error: Bad or unexpected format")
        nfeatures_file0 <- table(xx$Feature.Class)/length(unique(xx$Label))
        ix_data_tmp <- which(xx$Label == mylabels[1] & xx$Image.type %in% c("general", "diagnostics") == F)
        image_types <- xx$Image.type[ix_data_tmp]
        feature_names <- xx$Feature.Name[ix_data_tmp]

      } else {
        nfeatures_file <- table(xx$Feature.Class)/length(unique(xx$Label))
        assertthat::assert_that(assertthat::are_equal(nfeatures_file0, nfeatures_file),
                                msg = paste("[RadAR] Error: unexpected number of features in", list_files[i]))
      }
      ix_data <- which(xx$Image.type %in% c("general", "diagnostics") == F)
      #xx <- xx[which(xx$Image.type %in% image_type),]

      mask_id <- c(mask_id, mylabels)
      sample_id <- c(sample_id, rep(filenames[i], length(mylabels)))
      filename_tot <- c(filename_tot, rep(list_files[i], length(mylabels)))
      assertthat::assert_that(!any(duplicated(xx$Mask)), msg = paste("[RadAR] duplicated mask names in", list_files[i]))
      for (j in  1: length(mylabels)) {
        mylabel <- mylabels[j]
        #      mylabel <- gsub("Segmentation_segment_", "", mylabels[j])
        feature_data <- cbind(feature_data,
                              as.numeric(xx$Value[intersect(which(xx$Label == mylabel),ix_data)]))
      }
    }
    if (colnames(xx)[1] == "Image.type") {
      tmp_header <- read.delim(list_files[i], sep = "\t", header = F, check.names = F, nrows = 1)
      mylabels <- gsub(".* ", "", tmp_header[4: length(tmp_header)])
      if (i == 1) {
        assertthat::assert_that(length(xx$Feature.Class) > 0, msg = "[RadAR] Error: Bad or unexpected format")
        assertthat::assert_that(length(xx$Image.type) > 0, msg = "[RadAR] Error: Bad or unexpected format")
        assertthat::assert_that(length(xx$Feature.Name) > 0, msg = "[RadAR] Error: Bad or unexpected format")
        nfeatures_file0 <- nrow(xx)
        ix_data_tmp <- which(xx$Image.type %in% c("general", "diagnostics") == F)
        image_types <- xx$Image.type[ix_data_tmp]
        feature_names <- xx$Feature.Name[ix_data_tmp]

      } else {
        nfeatures_file <- nrow(xx)
        assertthat::assert_that(assertthat::are_equal(nfeatures_file0, nfeatures_file),
                                msg = paste("[RadAR] Error: unexpected number of features in", list_files[i]))
      }

      ix_data <- which(xx$Image.type %in% c("general", "diagnostics") == F)

      mask_id <- c(mask_id, mylabels)
      sample_id <- c(sample_id, rep(filenames[i], length(mylabels)))
      filename_tot <- c(filename_tot, rep(list_files[i], length(mylabels)))
      #assertthat::assert_that(!any(duplicated(xx$Mask)), msg = paste("[RadAR] duplicated mask names in", list_files[i]))
      for (j in  1: length(mylabels)) {
        mylabel <- mylabels[j]
        #      mylabel <- gsub("Segmentation_segment_", "", mylabels[j])
        feature_data <- cbind(feature_data,
                              as.numeric(xx[ix_data, (3+j)]))
      }

    } else {
      assertthat::assert_that(1 < 0, msg = paste("[RadAR] Error: format unrecognized."))
    }
  }

  colnames(feature_data) <- mask_id
  if (length(unique(image_types)) > 1) {
    feature_ids <- paste0(feature_names, ".", image_types)
  } else {
    feature_ids <- feature_names
  }
  rownames(feature_data) <- feature_ids

  dict_out <- retrieve_feature_dictionary(features_names = feature_names,
                                          extractor = extractor)
  #  feature_description <- rep(NA, nrow(feature_data))
  rowData <- data.frame(feature_name = feature_names,
                        image_type = image_types,
                        feature_description = dict_out$feature_description,
                        feature_type = dict_out$feature_type
  )
  #rownames(rowData) <- feature_ids
  colData <- data.frame(filename = filename_tot,
                        sample_id = sample_id,
                        mask_id = mask_id)

  metadata <- list()
  metadata$extractor <- extractor
  rdr <- new_radar_dataset(features = feature_data, rowData = rowData, colData = colData, metadata = metadata)
  return(rdr)

}

# Function to import LifeX session files --------------------------------------------

#' Import LifeX session data
#'
#' @param session (character) Path to the session file generated by LifeX
#'
#' @return
#' @export
#'
#' @examples
import_lifex_session <- function (session) {

  assertthat::assert_that(length(session) > 0, msg = paste("[RadAR] Error:", session, "should be specified"))
  extractor <- "lifex"

  tmp0 <- scan(file = session, what = "", sep = "\n")
  skiplines <- grep("^INFO", tmp0)[1] - 1
  xx <- read.delim(file = session, sep = ",", header = T, skip = skiplines)
  ncol_file <- ncol(xx)
  prefix_data <- gsub("_.*", "", colnames(xx))
  ix_data <- which(prefix_data %in% c("HISTO",
                                      "CONVENTIONAL",
                                      "DISCRETIZED",
                                      "SHAPE",
                                      "GLCM",
                                      "GLRLM",
                                      "NGLDM",
                                      "GLZLM" ))
  xx.data <- xx[, ix_data, drop = F]
  image_types <- rep(NA, ncol(xx.data))
  feature_names <- colnames(xx.data)


  xx$Mask <- paste0(xx$INFO_PatientID, "_", xx$INFO_NameOfRoi)
  mask_id <- xx$Mask
  sample_id <- mask_id
  filename_tot <- rep(session, nrow(xx))
  assertthat::assert_that(!any(duplicated(xx$Mask)), msg = paste("[RadAR] duplicated mask names in", session))
  mylabels <- mask_id
  feature_data <- c()
  for (j in  1: length(mylabels)) {
    mylabel <- mylabels[j]
    feature_data <- cbind(feature_data,
                          as.numeric(xx.data[which(xx$Mask == mylabel), ]))
  }

  colnames(feature_data) <- mask_id
  if (length(unique(image_types)) > 1) {
    feature_ids <- paste0(feature_names, ".", image_types)
  } else {
    feature_ids <- feature_names
  }
  ## check all NA features
  ix_na <- which(apply(feature_data, 1, allisna))
  if (length(ix_na) > 0) {
    message(paste("[RadAR]", length(ix_na), "not numeric features were removed"))
    feature_data <- feature_data[-ix_na, ]
    feature_ids <- feature_ids[-ix_na]
  }
  rownames(feature_data) <- feature_ids

  ix_notfinite <- which(!is.finite(feature_data), arr.ind = T)
  if (length(ix_notfinite) > 0) {
    feature_data[ix_notfinite] <- NA
  }

  dict_out <- retrieve_feature_dictionary(features_names = feature_names,
                                          extractor = extractor)
  #  feature_description <- rep(NA, nrow(feature_data))
  rowData <- data.frame(feature_name = feature_names,
                        image_type = image_types,
                        feature_description = dict_out$feature_description,
                        feature_type = dict_out$feature_type
  )
  if (length(ix_na) > 0) {
    rowData <- rowData[-ix_na, ]
  }
  colData <- data.frame(filename = filename_tot,
                        sample_id = sample_id,
                        mask_id = mask_id)

  metadata <- list()
  metadata$extractor <- extractor
  rdr <- new_radar_dataset(features = feature_data, rowData = rowData, colData = colData, metadata = metadata)
  return(rdr)

}


# Function to import LifeX files (xls and csv) --------------------------------------------

#' Import LifeX data
#'
#' @param dir (character) Path to the directory containing xls or csv files generated by LifeX
#'
#' @return A RadAR object (class \code{\link{SummarizedExperiment}})
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
import_lifex <- function (dir) {
  assertthat::assert_that(length(dir) > 0, msg = paste("[RadAR] Error:", dir, "should be specified"))
  extractor <- "lifex"

  list_files <- NULL
  list_files_xls <- list.files(dir, pattern = "xls$", full.names = T)
  list_files_csv <- list.files(dir, pattern = "csv$", full.names = T)
  if (length(list_files_xls) > 0 & length(list_files_csv) == 0) {
    list_files <- list_files_xls
    filetype <- "xls"
  }
  if (length(list_files_csv) > 0 & length(list_files_xls) == 0) {
    list_files <- list_files_csv
    filetype <- "csv"
  }

  assertthat::assert_that(length(list_files) > 0, msg = paste("[RadAR] Error:", dir, "no LifeX (xls or csv) file found.
                                                              For session file exported from LifeX, use import_lifex_session function instead."))

  if (filetype == "xls") {
    if (!requireNamespace("gdata", quietly = TRUE)) {
      stop("[RadAR] Error: package \"gdata\" needed for this function to work. Please install it.",
           call. = FALSE)
    }
    tmp <- list.files(dir, pattern = "xls$", full.names = F)
    filenames <-  gsub(".xls", "", tmp)
  }
  if (filetype == "csv") {
    tmp <- list.files(dir, pattern = "csv$", full.names = F)
    filenames <-  gsub(".csv", "", tmp)
  }

  feature_data <- c()
  mask_id <- c()
  sample_id <- c()
  filename_tot <- c()

  for (i in 1: length(list_files)) {
    message(paste("[RadAR] importing", list_files[i]))
    if (filetype == "xls") {
      xx <- gdata::read.xls(list_files[i], header = T)
    } else {
      xx <- read.delim(list_files[i], header = T, sep = ",", skip = 1)

    }
    # check file, i=1
    if (i == 1) {
      ncol_file <- ncol(xx)
      prefix_data <- gsub("_.*", "", colnames(xx))
      ix_data <- which(prefix_data %in% c("HISTO",
                                          "CONVENTIONAL",
                                          "DISCRETIZED",
                                          "SHAPE",
                                          "GLCM",
                                          "GLRLM",
                                          "NGLDM",
                                          "GLZLM" ))
      xx.data <- xx[1, ix_data, drop = F]
      image_types <- rep(NA, ncol(xx.data))
      feature_names <- colnames(xx.data)
    } else {
      assertthat::assert_that(ncol(xx) == ncol_file, msg = paste("[RadAR] Error: unexpected number of features in", list_files[i]))
      xx.data <- xx[, ix_data]
    }
    xx$Mask <- paste0(xx$INFO_PatientID, "_", xx$INFO_NameOfRoi)
    mylabels <- unique( xx$Mask)
    mask_id <- c(mask_id, mylabels)
    sample_id <- c(sample_id, rep(filenames[i], length(mylabels)))
    filename_tot <- c(filename_tot, rep(list_files[i], length(mylabels)))

    assertthat::assert_that(!any(duplicated(xx$Mask)), msg = paste("[RadAR] duplicated mask names in", list_files[i]))
    for (j in  1: length(mylabels)) {
      mylabel <- mylabels[j]
      feature_data <- cbind(feature_data,
                            as.numeric(xx.data[which(xx$Mask == mylabel), ]))
    }
  }
  colnames(feature_data) <- mask_id
  if (length(unique(image_types)) > 1) {
    feature_ids <- paste0(feature_names, ".", image_types)
  } else {
    feature_ids <- feature_names
  }
  ## check all NA features
  ix_na <- which(apply(feature_data, 1, allisna))
  if (length(ix_na) > 0) {
    message(paste("[RadAR]", length(ix_na), "not numeric features were removed"))
    feature_data <- feature_data[-ix_na, ]
    feature_ids <- feature_ids[-ix_na]
  }
  rownames(feature_data) <- feature_ids

  ix_notfinite <- which(!is.finite(feature_data), arr.ind = T)
  if (length(ix_notfinite) > 0) {
    feature_data[ix_notfinite] <- NA
  }

  dict_out <- retrieve_feature_dictionary(features_names = feature_names,
                                          extractor = extractor)
  #  feature_description <- rep(NA, nrow(feature_data))
  rowData <- data.frame(feature_name = feature_names,
                        image_type = image_types,
                        feature_description = dict_out$feature_description,
                        feature_type = dict_out$feature_type
  )
  if (length(ix_na) > 0) {
    rowData <- rowData[-ix_na, ]
  }
  #rownames(rowData) <- feature_ids
  colData <- data.frame(filename = filename_tot,
                        sample_id = sample_id,
                        mask_id = mask_id)

  metadata <- list()
  metadata$extractor <- extractor
  rdr <- new_radar_dataset(features = feature_data, rowData = rowData, colData = colData, metadata = metadata)
  return(rdr)

}
#' Import features
#'
#' This functions imports a table of radiomic features (features in rows, patients in columns)
#'
#' @param file (character) Path to the table of radiomic features. It should be tab delimited.
#' First row should report sample names. First column should report feature names.
#'
#' @return A RadAR object (class \code{\link{SummarizedExperiment}})
#' @author Matteo Benelli (\email{matteo.benelli@uslcentro.toscana.it})
#' @export
#'
#' @examples
import_radiomic_table <- function (file = NULL) {

  assertthat::assert_that(length(file) > 0,
                          msg = paste("[RadAR] Error: file", file, "doesn't exist"))

  extractor <- "unknown / generic table"
  feature_data <- read.delim(file = file, sep = "\t", row.names = 1, header = T)
  feature_data <- as.matrix(feature_data)
  colData <- data.frame (sample_id = colnames(feature_data))
  rowData <- data.frame (feature_name = rownames(feature_data))
  rownames(colData) <- colnames(feature_data)
  rownames(rowData) <- rownames(feature_data)
  metadata <- list()
  metadata$extractor <- extractor
  rdr <- new_radar_dataset(features = feature_data, rowData = rowData, colData = colData, metadata = metadata)
  return(rdr)

}



new_radar_dataset <- function (features, rowData, colData, metadata) {

  rdr <- SummarizedExperiment::SummarizedExperiment(assays=list(values=features),
                                                    rowData=rowData,
                                                    colData=colData,
                                                    metadata=metadata)
  flag_dup <- which(duplicated(rownames(rdr)))
  if (length(flag_dup) > 0) {
    message(paste("[RadAR]", "Removed", length(flag_dup),  "duplicated rows (identical feature names)"))
    keep <- which(!duplicated(rownames(rdr)))
    rdr <- rdr[keep,]
  }
  return(rdr)
}

check_files <- function (list_files) {
  flag <- rep(T, length(list_files))
  for (i in 1: length(list_files)) {
    myfile <- file.path(list_files[i])
    tmp <-  file.size(myfile)
    if (tmp < 2000) {
      message (paste("[RadAR]", myfile, "is empty or corrupted and will be discarded"))
      flag[i] <- F
    }
  }
  return(which(flag))
}

allisna <- function(x) {
  all(is.na(x))
}


retrieve_feature_dictionary <- function(features_names, extractor) {
  unique.feature_names <- unique(features_names)
  feature_type <- rep(NA, length(features_names))
  feature_description <- rep(NA, length(features_names))
  if (extractor %in% c("3dslicer", "pyradiomics")) {
    my_dict <- dict$pyrad
  } else {
    my_dict <- dict$lifex
  }
  for (i in 1: length(unique.feature_names)) {
    ix <- which(my_dict$feature_name == unique.feature_names[i])
    if (length(ix) == 1) {
      feature_description[which(features_names == unique.feature_names[i])] <- my_dict$feature_description[ix]
      feature_type[which(features_names == unique.feature_names[i])] <- my_dict$feature_type[ix]
    }
  }
  out <- list(feature_description = feature_description,
              feature_type = feature_type)
  return (out)
}
cgplab/RadAR documentation built on Nov. 10, 2021, 1:32 a.m.