#' Filter and extract function
#'
#' This function lets user to create a new GRangesList with fixed information:
#' seqnames, ranges and strand, and a variable part made up by the regions
#' defined as input. The metadata and metadata_prefix are used to filter
#' the data and choose only the samples that match at least one metdatata
#' with its prefix. The input regions are shown for each sample obtained
#' from filtering.
#'
#' @import xml2
#' @importFrom dplyr bind_cols
#' @importFrom data.table fread
#' @importFrom rtracklayer import
#'
#' @param data string GMQL dataset folder path or GRangesList
#' object
#' @param metadata vector of strings containing names of metadata attributes
#' to be searched for in metadata files.
#' Data will be extracted if at least one condition is satisfied:
#' this condition is logically "ANDed" with prefix filtering (see below)
#' if NULL no filtering action occures
#' (i.e every sample is taken for region filtering)
#' @param metadata_prefix vector of strings that will support the metadata
#' filtering. If defined, each 'metadata' is concatenated with the
#' corresponding prefix.
#' @param region_attributes vector of strings that extracts only region
#' attributes specified; if NULL no regions attribute is taken and the output
#' is only GRanges made up by the region coordinate attributes
#' (seqnames, start, end, strand);
#' It is also possible to assign the \code{\link{FULL}} with or without
#' its input parameter; in case was without the `except` parameter,
#' all the region attributes are taken, otherwise all the region attributes
#' are taken except the input attribute defined by except.
#' @param suffix name for each metadata column of GRanges. By default it is the
#' value of the metadata attribute named "antibody_target". This string is
#' taken from sample metadata file or from metadata() associated.
#' If not present, the column name is the name of selected regions specified
#' by 'region_attributes' input parameter
#'
#' @details
#' This function works only with dataset or GRangesList all whose samples or
#' Granges have the same region coordinates (chr, ranges, strand) ordered in
#' the same way for each sample
#'
#' In case of GRangesList data input, the function searches for metadata
#' into metadata() function associated to GRangesList.
#'
#' @return GRanges with selected regions
#'
#' @examples
#'
#' ## This statement defines the path to the folder "DATASET" in the
#' ## subdirectory "example" of the package "RGMQL" and filters such folder
#' ## dataset including at output only "pvalue" and "peak" region attributes
#'
#' test_path <- system.file("example", "DATASET", package = "RGMQL")
#' filter_and_extract(test_path, region_attributes = c("pvalue", "peak"))
#'
#' ## This statement imports a GMQL dataset as GRangesList and filters it
#' ## including at output only "pvalue" and "peak" region attributes, the sort
#' ## function makes sure that the region coordinates (chr, ranges, strand)
#' ## of all samples are ordered correctly
#'
#' grl <- import_gmql(test_path, TRUE)
#' sorted_grl <- sort(grl)
#' filter_and_extract(sorted_grl, region_attributes = c("pvalue", "peak"))
#'
#' ## This statement imports a GMQL dataset as GRangesList and filters it
#' ## including all the region attributes
#'
#' sorted_grl_full <- sort(grl)
#' filter_and_extract(sorted_grl_full, region_attributes = FULL())
#'
#' ## This statement imports a GMQL dataset as GRangesList and filters it
#' ## including all the region attributes except "jaccard"
#'
#' sorted_grl_full_except <- sort(grl)
#' filter_and_extract(
#' sorted_grl_full_except,
#' region_attributes = FULL("jaccard")
#' )
#'
#' @export
#'
filter_and_extract <- function(
data,
metadata = NULL,
metadata_prefix = NULL,
region_attributes = NULL,
suffix = "antibody_target"
) {
if (is(data, "GRangesList")) {
.extract_from_GRangesList(
data,
metadata,
metadata_prefix,
region_attributes,
suffix)
} else {
.extract_from_dataset(
data,
metadata,
metadata_prefix,
region_attributes, suffix)
}
}
.extract_from_dataset <- function(
datasetName,
metadata,
metadata_prefix,
regions,
suffix
) {
datasetName <- sub("/*[/]$", "", datasetName)
if (basename(datasetName) != "files") {
datasetName <- file.path(datasetName, "files")
}
if (!dir.exists(datasetName)) {
stop("Directory does not exists")
}
gdm_meta_files <- list.files(
datasetName,
pattern = "*.gdm.meta$",
full.names = TRUE
)
gtf_meta_files <- list.files(
datasetName,
pattern = "*.gtf.meta$",
full.names = TRUE
)
if (!length(gdm_meta_files) && !length(gtf_meta_files)) {
stop("no samples present or no files format supported")
}
if (length(gdm_meta_files) && length(gtf_meta_files)) {
stop("GMQL dataset cannot be mixed dataset: no GTF and GDM together")
}
vector_field <- .schema_header(datasetName)
if (length(gdm_meta_files)) {
samples_file <- .check_metadata_files(
metadata, metadata_prefix,
gdm_meta_files
)
samples_meta_to_read <- unlist(samples_file)
if (length(samples_meta_to_read)) {
samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
} else {
samples_to_read <- gsub(".meta$", "", gdm_meta_files)
samples_meta_to_read <- gtf_meta_files
}
suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
granges <- .parse_gdm_files(
vector_field,
samples_to_read,
regions,
suffix_vec)
} else {
samples_file <- .check_metadata_files(
metadata,
metadata_prefix,
gtf_meta_files
)
samples_meta_to_read <- unlist(samples_file)
if (length(samples_meta_to_read)) {
samples_to_read <- gsub(".meta$", "", samples_meta_to_read)
} else {
samples_to_read <- gsub(".meta$", "", gtf_meta_files)
samples_meta_to_read <- gtf_meta_files
}
suffix_vec <- .get_suffix(suffix, FALSE, samples_meta_to_read)
granges <- .parse_gtf_files(
samples_to_read,
regions,
suffix_vec,
vector_field
)
}
}
.extract_from_GRangesList <- function(
rangesList,
metadata,
metadata_prefix,
regions,
suffix
) {
if (!is(rangesList, "GRangesList")) {
stop("only GrangesList admitted")
}
if (!length(rangesList)) {
stop("rangesList empty")
}
meta_list <- metadata(rangesList)
samples <- .check_metadata_list(metadata, metadata_prefix, meta_list)
if (!length(unlist(samples))) {
samples <- rangesList
} else {
index <- unlist(samples)
samples <- rangesList[c(index)]
}
new_meta_list <- metadata(samples)
suffix_vec <- .get_suffix(suffix, TRUE, new_meta_list)
granges <- .parse_Granges(samples, regions, suffix_vec)
}
.parse_Granges <- function(region_list, regions, suffixes) {
if (is.null(suffixes)) {
suffixes <- ""
}
g1 <- region_list[[1]]
if(is.object(regions) && ("FULL" %in% class(regions))) {
all_values <- names(elementMetadata(g1))
except_values <- regions$values
regions <- if (is.null(except_values))
all_values
else
all_values[!all_values %in% except_values]
names(regions) <- NULL
}
elementMetadata(g1) <- NULL
if (!is.null(regions)) {
DF_list <- mapply(function(g_x, h) {
meta <- elementMetadata(g_x)[regions]
if (h != "") {
names(meta) <- paste(regions, h, sep = ".")
}
data.frame(meta)
}, region_list, suffixes, SIMPLIFY = FALSE)
DF_only_regions <- dplyr::bind_cols(DF_list)
elementMetadata(g1) <- DF_only_regions
}
g1
}
.get_suffix <- function(col_name, from_list, meta_fl) {
suffix <- paste0(col_name, "$")
if (from_list) {
meta_list <- mapply(function(x, index) {
vec_names <- names(x)
s_index <- grep(suffix, vec_names)
first_index <- s_index[1]
suffix <- unlist(x[first_index]) # ne prendo solo uno
names(suffix) <- NULL
# if found retrieve samples that has at least one choosen metadata
if (first_index && !is.na(first_index)) {
suffix
} else {
""
}
}, meta_fl, seq_along(meta_fl))
}
else {
meta_list <- vapply(meta_fl, function(x) {
list <- .add_metadata(x)
vec_names <- names(list)
index <- grep(suffix, vec_names)
first_index <- index[1]
suffix <- unlist(list[first_index]) # ne prendo solo uno
names(suffix) <- NULL
# if found retrieve samples that has at least one choosen metadata
if (first_index && !is.na(first_index)) {
suffix
} else {
""
}
}, character(1))
}
names(meta_list) <- NULL
meta_list
}
.check_metadata_list <- function(metadata, metadata_prefix, meta_list) {
vec_meta <- paste0(metadata_prefix, metadata)
list <- mapply(function(x, index) {
vec_names <- names(x)
a <- lapply(vec_meta, function(y) {
which(y == vec_names)
})
## we would like that manage more index from grep
found <- as.logical(length(unlist(a)))
# if found retrieve samples that has at least one choosen metadata
if (found) {
index
}
}, meta_list, seq_along(meta_list))
}
.check_metadata_files <- function(metadata, metadata_prefix, meta_files) {
vec_meta <- paste0(metadata_prefix, metadata)
meta_list <- lapply(meta_files, function(x) {
list <- .add_metadata(x)
vec_names <- names(list)
a <- lapply(vec_meta, function(y) {
grep(y, vec_names)
})
## we would like that manage more index from grep
found <- as.logical(length(unlist(a)))
# if found retrieve samples that has at least one chosen metadata
if (found) {
x
}
})
}
.parse_gtf_files <- function(
gtf_region_files,
regions,
suffixes,
vector_field
) {
attr_col_names <- vector_field[
!vector_field %in% c("seqname", "seqid", "start", "end", "strand")]
g1 <- rtracklayer::import(
con = gtf_region_files[1],
format = "gtf",
colnames = attr_col_names
)
elementMetadata(g1) <- NULL
if (is.null(suffixes)) {
suffixes <- ""
}
# check if we used a FULL parameter instead of char array containing
# the region parameters
if(is.object(regions) && ("FULL" %in% class(regions))) {
all_values <- vector_field[!vector_field %in% c(
"seqname",
"strand",
"start",
"end")
]
except_values <- regions$values
regions <- if (is.null(except_values))
all_values
else
all_values[!all_values %in% except_values]
names(regions) <- NULL
# since import convert this value from GMQL schema to GTF format
# we need to convert it back
regions <- replace(regions, regions == "feature", "type")
regions <- replace(regions, regions == "frame", "phase")
}
if (!is.null(regions)) {
DF_list <- mapply(function(x, header) {
g_x <- rtracklayer::import(
x,
format = "gtf",
colnames = attr_col_names
)
meta <- elementMetadata(g_x)[regions]
if (header != "") {
names(meta) <- paste(regions, header, sep = ".")
}
data.frame(meta)
}, gtf_region_files, suffixes, SIMPLIFY = FALSE)
DF_only_regions <- dplyr::bind_cols(DF_list)
elementMetadata(g1) <- DF_only_regions
}
g1
}
.parse_gdm_files <- function(
vector_field,
gdm_region_files,
regions,
suffixes
) {
# read first sample cause chromosome regions are the same for all samples
df <- data.table::fread(
gdm_region_files[1],
col.names = vector_field,
header = FALSE,
sep = "\t"
)
col_names <- names(df)
df <- subset(df, TRUE, c("chr", "left", "right", "strand"))
# check if we used a FULL parameter instead of char array containing
# the region parameters
if(is.object(regions) && ("FULL" %in% class(regions))) {
all_values <- vector_field[!vector_field %in% c(
"chr",
"left",
"right",
"strand")
]
except_values <- regions$values
regions <- if (is.null(except_values))
all_values
else
all_values[!all_values %in% except_values]
names(regions) <- NULL
}
if (!is.null(regions)) {
df_list <- lapply(gdm_region_files, function(x, regions, vector_field) {
region_frame <- data.table::fread(
x,
col.names = vector_field,
header = FALSE,
sep = "\t")
col_names <- names(region_frame)
# delete column not choosen by input
if (!is.null(regions)) {
col_names <- col_names[col_names %in% regions]
}
if (length(col_names)) {
r <- subset(region_frame, TRUE, col_names)
}
}, regions, vector_field)
df_only_regions <- dplyr::bind_cols(df_list)
complete_df <- dplyr::bind_cols(df, df_only_regions)
region_names <- names(complete_df)[-(seq_len(4))]
region_names <- gsub("[0-9]+", "", region_names)
region_names <- paste(region_names, suffixes, sep = ".")
region_names <- c(names(complete_df)[(seq_len(4))], region_names)
names(complete_df) <- region_names
g <- GenomicRanges::makeGRangesFromDataFrame(
complete_df,
keep.extra.columns = TRUE,
seqnames.field = c("seqnames", "seqname",
"chromosome", "chrom",
"chr", "chromosome_name"),
start.field = "left",
end.field = "right")
} else {
g <- GenomicRanges::makeGRangesFromDataFrame(
df,
keep.extra.columns = TRUE,
seqnames.field = c("seqnames", "seqname",
"chromosome", "chrom",
"chr", "chromosome_name"),
start.field = "left",
end.field = "right")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.