Nothing
#' Creates a tibble containing the information of which genes/pathways are
#' altered in a patient in which clone.
#'
#' It expects a comma-separated table where the first column is the name
#' of the altered gene or pathway. The other columns are for the clones in
#' the respective tumor. Such a table can be generated with a tool that
#' identifies clones in tumor samples, e.g. \code{Cloe}.
#'
#' The table is expected to be comma-separated and to have the columns
#' 'altered_entity', 'clone1', 'clone2', ..., 'cloneN', depending on how
#' many clones were detected in the respective tumor. Each row then contains
#' in the first column the name of the mutated gene or affected pathway,
#' e.g. "ENSG00000134086", and in the other columns it has either zeros
#' or ones, indicating in which clone the respective gene/pathway is altered.
#'
#' @title Get clone alteration tibble.
#' @param path_to_file The path to the file with the table of altered
#' genes/pathways and their clone affiliation.
#' @param max_num_clones The upper bound for the number of clones that
#' were found per tumor. Default: 7.
#' @author Ariane L. Moore
#' @return The tibble containing the information of which gene/pathway is
#' altered in which clone in a patient. Has the columns 'file_name',
#' 'patient_id', 'altered_entity', 'clone1', 'clone2', ... up to the maximal
#' number of clones (Default: until 'clone7'). Note that the labelling
#' of the clones does not matter and only needs to stay fixed within each
#' patient and tree inference.
#' @export
#' @importFrom tibble tibble add_column
#' @importFrom dplyr is.tbl
#' @importFrom magrittr "%>%"
#' @importFrom utils tail read.csv
#' @examples
#' ext_data_dir <- system.file('extdata', package='GeneAccord')
#' create_tbl_ent_clones(paste(ext_data_dir,
#' "/clonal_genotypes/cloe_seed5/01.csv", sep=""))
create_tbl_ent_clones <- function(path_to_file, max_num_clones=7){
stopifnot(is.character(path_to_file))
stopifnot(is.numeric(max_num_clones))
stopifnot(file.exists(path_to_file))
message("Found the file ", path_to_file)
## create tibble
file_name_tibble <- paste(tail(unlist(strsplit(path_to_file, "/")), n=2),
collapse="/")
this_csv <- read.csv(path_to_file)
all_ents <- as.character(as.vector(this_csv[,1]))
this_clone_tbl <- tibble(file_name=file_name_tibble,
patient_id=sub(".csv", "",
basename(path_to_file)),
altered_entity=all_ents)
num_mutated_ents <- dim(this_clone_tbl)[1]
## minus the columns for file_name, patient_id, and altered_entity
num_clones <- dim(this_csv)[2] - 1
## now add the columns with the clone assignments
for(i in seq_len(num_clones)){
clone_name <- paste("clone", i, sep="")
this_clone_tbl <- this_clone_tbl %>%
add_column(new_col=as.integer(this_csv[,(i+1)]))
names(this_clone_tbl)[names(this_clone_tbl) == "new_col"] <-
clone_name
}
stopifnot(num_clones <= max_num_clones)
## to make sure that all tibbles from all patients contain the same
## number of
## clones, pseudo-columns with only zeros are added if the number of
## clones
## is less than 'max_num_clones'
if(num_clones < max_num_clones){
next_clone <- num_clones+1
for(i in seq(next_clone, max_num_clones)){
clone_name <- paste("clone", i, sep="")
this_clone_tbl <- this_clone_tbl %>%
add_column(new_col=as.integer(rep(0,
num_mutated_ents)))
names(this_clone_tbl)[names(this_clone_tbl) == "new_col"] <-
clone_name
}
}
final_clone_tbl <- this_clone_tbl
stopifnot(is.tbl(final_clone_tbl))
num_entries <- dim(final_clone_tbl)[1]
num_clones <- dim(final_clone_tbl)[2] - 3
stopifnot(num_clones == max_num_clones)
message("Found ", num_entries, " altered genes/pathways in total.")
return(final_clone_tbl)
}
#' Read in the patient's gene-to-clone assignment across a collection of
#' trees
#'
#' Creates a tibble containing the information of which genes/pathways are
#' altered in which clone in a patient across a collection of tree inferences.
#' It expects a list containing the paths to the comma-separated tables where
#' the first column is the name of the altered gene or pathway. The other
#' columns are for the clones in the respective tumor. Such tables can be
#' generated by repeatedly performing the phylogenetic tree inference with
#' e.g. the package \code{Cloe}, or by sampling from the posterior. The
#' tables are expected to be comma-separated and to have the columns
#' 'altered_entity', 'clone1', 'clone2', ..., 'cloneN', depending on how many
#' clones were detected in the respective tumor. Each row then contains in
#' the first column the name of the mutated gene or affected pathway, e.g.
#' "ENSG00000134086", and in the other columns it has either zeros or ones,
#' indicating in which clone the respective gene/pathway is altered.
#'
#' @title Get clone alteration tibble across the collection of trees.
#' @param input_files A vector containing the paths to the files with the
#' tables of altered genes/pathways and their clone affiliation from
#' the collection of tree inferences.
#' @param no_noisy_ents Minimum fraction for genes/pathways of how often they
#' have to occur across the collection of trees in order to be in the
#' tibble. This makes sure that noisy genes, which were not assigned to many
#' trees are excluded. Default: 0.9.
#' @param max_num_clones The upper bound for the number of clones that were
#' found per tumor. Default: 7.
#' @author Ariane L. Moore
#' @return A clean tibble with the information of which gene/pathway is
#' altered in which clone in the patient, and with an entry for each tree
#' inference where it occurred. Has the columns 'file_name', 'patient_id',
#' 'altered_entity', 'clone1', 'clone2', ... up to the maximal number of
#' clones (Default: until 'clone7'), and 'tree_id' as an indication in which
#' tree the assignment was found. Note that the labelling of the clones does
#' not matter and only needs to stay fixed within each patient and
#' tree inference.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr select filter mutate bind_rows tally group_by pull is.tbl
#' @examples
#' ext_data_dir <- system.file('extdata', package='GeneAccord')
#' this_patient <- "01"
#' input_files_01 <- paste(ext_data_dir,
#' "/clonal_genotypes/cloe_seed", seq(5, 100, by=5),
#' "/", this_patient, ".csv", sep="")
#' create_tbl_tree_collection(input_files_01)
create_tbl_tree_collection <- function(input_files, no_noisy_ents=0.9,
max_num_clones=7){
altered_entity <- n <- NULL
stopifnot(is.vector(input_files))
stopifnot(is.numeric(no_noisy_ents))
stopifnot(is.numeric(max_num_clones))
sanity_check <- lapply(input_files,
function(x){stopifnot(is.character(x));
stopifnot(file.exists(x))})
num_files <- length(input_files)
message("Found a collection of tree inferences of ",
num_files, " input files.")
## go over the files, create a tibble with the additional information
## from which tree collection the tbl is
all_tbl_list <- list()
for(i in seq_len(num_files)){
this_tbl <- suppressMessages(create_tbl_ent_clones(input_files[i],
max_num_clones=max_num_clones))
all_tbl_list[[i]] <- this_tbl %>% mutate(tree_id=i)
}
## this is now the clone tibble from just that patient but
## from all tree inferences
all_trees_clone_tbl <- bind_rows(all_tbl_list)
## now exclude genes that occur in less than 90% of the trees,
## e.g. less than 18 of 20
all_ents <- unique(all_trees_clone_tbl$altered_entity)
histogram_ents <- all_trees_clone_tbl %>%
select(altered_entity) %>%
group_by(altered_entity) %>%
tally()
noisy_ent_thresh <- no_noisy_ents*num_files
## get a vector with all genes that are stably assigned to the trees
stable_ents <- histogram_ents %>%
filter(n >= noisy_ent_thresh) %>%
pull(altered_entity)
## filter such that only those are included
all_trees_clone_tbl <- all_trees_clone_tbl %>%
filter(altered_entity %in% stable_ents)
num_genes_filtered_out <- length(all_ents)-length(stable_ents)
if (num_genes_filtered_out == 1){
message("Removed ", num_genes_filtered_out,
" gene that is missing in more than ",
(num_files-noisy_ent_thresh) ," of gene-to-clone assignments.")
} else {
message("Removed ", num_genes_filtered_out,
" genes that are missing in more than ",
(num_files-noisy_ent_thresh) ," of gene-to-clone assignments.")
}
stopifnot(is.tbl(all_trees_clone_tbl))
num_entries <- dim(all_trees_clone_tbl)[1]
num_clones <- dim(all_trees_clone_tbl)[2] - 4
stopifnot(num_clones == max_num_clones)
message("Found ", num_entries,
" altered genes/pathways in total, which are from ",
num_files," gene-to-clone assignments.")
return(all_trees_clone_tbl)
}
#' Compute the clonal exclusivity rates for each gene-to-clone-assignment
#' from the collection of tree inferences.
#'
#' Takes the gene-to-clone assignment tibble as created with
#' \code{\link{create_tbl_tree_collection}} and computes for each instance
#' from the collection of trees the rate of clonal exclusivity. This rate
#' is the fraction of gene/pathway pairs that were on a different branch
#' in the tumor phylogeny, i.e. the fraction of pairs that was clonally
#' exclusive.
#'
#' @title Get rates of clonal exclusivity for each tree inference
#' @param pat_tbl A tibble with the information of which gene/pathway is
#' altered in which clone in the patient, and including this information
#' from the collection of trees. Can be created with with
#' \code{\link{create_tbl_tree_collection}}.
#' @author Ariane L. Moore
#' @return A vector with all rates of clonal exclusivity from all tree
#' inferences.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl filter select
#' @importFrom stats median
#' @examples
#' clone_tbl <- dplyr::tibble(file_name =
#' rep("fn1", 10),
#' "patient_id"=rep("pat1", 10),
#' "altered_entity"=paste0("gene",
#' LETTERS[seq_len(10)]),
#' "clone1"=c(0, 1, 0, 1, 0, 1, 0, 1, 1, 1),
#' "clone2"=c(1, 0, 1, 0, 1, 1, 1, 0, 0, 1),
#' "tree_id"=c(rep(1, 5), rep(2, 5)))
#' compute_rates_clon_excl(clone_tbl)
compute_rates_clon_excl <- function(pat_tbl){
tree_id <- file_name <- patient_id <- NULL
stopifnot(is.tbl(pat_tbl))
stopifnot("patient_id" %in% colnames(pat_tbl))
stopifnot("file_name" %in% colnames(pat_tbl))
stopifnot("altered_entity" %in% colnames(pat_tbl))
stopifnot("clone1" %in% colnames(pat_tbl))
stopifnot("clone2" %in% colnames(pat_tbl))
stopifnot("tree_id" %in% colnames(pat_tbl))
## extract patient id
pat_id <- unique(pat_tbl$patient_id)
stopifnot(length(pat_id) == 1)
## extract how many different instances are in the collection
## of trees
all_trees <- unique(pat_tbl$tree_id)
num_trees <- length(all_trees)
message("Found ", num_trees,
" different tree instances in the gene-to-clone tibble from patient ",
pat_id, ".")
message("Will compute the rate of clonal exclusivity for",
" each of these...")
## compute the rate of clonal exclusivity for each tree inference
all_rates_across_tree_collection<-vapply(all_trees, function(this_tree){
this_tree_pat_tbl_wo_pat_name <- pat_tbl %>%
filter(tree_id == this_tree) %>%
select(-file_name, -patient_id, -tree_id)
this_tree_rate_this_pat <-
suppressMessages(
get_rate_diff_branch_ent_pair(this_tree_pat_tbl_wo_pat_name))
return(this_tree_rate_this_pat)
}, numeric(1))
message("...done")
mean_rate <- mean(all_rates_across_tree_collection)
median_rate <- median(all_rates_across_tree_collection)
rate_is_zero <- length(which(all_rates_across_tree_collection == 0))
message("Current patient ", pat_id, " has mean rate ",
round(mean_rate, digits=3),
", median rate " , round(median_rate, digits=3),
", and the rate is zero in ", rate_is_zero, " out of ",
num_trees, " trees.")
names(all_rates_across_tree_collection) <- rep(pat_id, num_trees)
return(all_rates_across_tree_collection)
}
#' Compute the rate of mutated gene/pathway pairs being in different
#' branches.
#'
#' Given the output of a tool that identifies clones within tumors and their
#' phylogenetic history, this function computes the rate of mutated
#' gene/pathway pairs being in different branches. That is, it will calculate
#' the number of times mutated gene/pathway pairs are in different
#' branches/clones divided by the total number of all mutated gene/pathway
#' pairs.
#'
#' @title Compute rate of being in different branches/clones.
#' @param clone_tbl A tibble containing the columns 'altered_entity', and
#' then a column for each clone
#' @author Ariane L. Moore
#' in the tumor, e.g. 'clone1', 'clone2', 'clone3'. This tibble can be
#' generated e.g. from the \code{Cloe} output.
#' @return The rate of occurrence of mutated gene/pathway pairs being in
#' different clones.
#' @export
#' @importFrom dplyr is.tbl tibble
#' @importFrom caTools combs
#' @examples
#' clone_tbl <- dplyr::tibble(
#' altered_entity=c(paste("gene", seq_len(10), sep="")),
#' clone1=c(rep(0,10)),
#' clone2=c(sample(c(0,1), 10, replace=TRUE)),
#' clone3=c(sample(c(0,1), 10, replace=TRUE)),
#' clone4=c(sample(c(0,1), 10, replace=TRUE)))
#' get_rate_diff_branch_ent_pair(clone_tbl)
get_rate_diff_branch_ent_pair <- function(clone_tbl) {
altered_entity <- NULL
stopifnot(is.tbl(clone_tbl))
stopifnot("altered_entity" %in% colnames(clone_tbl))
stopifnot("clone1" %in% colnames(clone_tbl))
if("patient_id" %in% colnames(clone_tbl) ||
"file_name" %in% colnames(clone_tbl) ||
"tree_id" %in% colnames(clone_tbl)){
stop("The clone tibble is expected to have only the",
" altered_entity\n",
"and clones columns, but not file_name, patient_id, or tree_id!")
}
## get all genes/pathways, here referred to as entity (ent)
all_ents <- unique(as.character(clone_tbl$altered_entity))
num_ents <- length(all_ents)
num_entries <- dim(clone_tbl)[1]
stopifnot(num_entries == num_ents)
if(num_ents == 1){
message("Found only 1 mutated gene/pathway in the provided",
" tibble.",
" The rate of being on a different branch cannot be determined",
", but will be set to 0")
return(0)
} else {
message("Found ", num_ents,
" different mutated genes/pathways in the provided tibble.")
num_clones <- dim(clone_tbl)[2]-1
## one column is the gene/pathway name
message("Found ", num_clones,
" different clones in the provided tibble.")
## get the number of pairs
num_pairs <- num_ents*(num_ents-1)/2
all_pairs <- combs(all_ents, 2)
stopifnot(dim(all_pairs)[1] == num_pairs)
## then use apply on the pairs with a function that takes two
## genes/pathways, and the clone tibble which invokes
## is_diff_branch_ent_pair(ent1, ent2, clone_tbl)
## and returns whether or not the pair is in different
## branches/clones (TRUE or FALSE)
all_pairs_branch_info <- apply(all_pairs, 1, function(x){
suppressMessages(is_diff_branch_ent_pair(x[1],
x[2],
clone_tbl))})
stopifnot(is.logical(all_pairs_branch_info))
## get the number and rate of pairs being in
## different branches/clones
num_pairs_diff_branch <- sum(unlist(all_pairs_branch_info))
rate_pairs_diff_branch <- num_pairs_diff_branch/num_pairs
stopifnot(rate_pairs_diff_branch >= 0 &&
rate_pairs_diff_branch <= 1)
## message to user
message("Of ", num_pairs, " total gene/pathway pairs, ",
num_pairs_diff_branch, " occur in ",
"different branches/clones of the tumor.")
return(rate_pairs_diff_branch)
}
}
#' Check whether a given pair of mutated genes/pathways is in
#' different branches/clones.
#'
#' Given two mutated genes or pathways and the clone tibble as
#' described in \code{\link{get_rate_diff_branch_ent_pair}},
#' this function returns \code{TRUE} or \code{FALSE} for whether
#' the pair is mutated in different branches/clones.
#'
#' @title Check whether pair is in different branches/clones.
#' @param ent1 One mutated gene/pathway from the pair.
#' @param ent2 The other mutated gene/pathway from the pair.
#' @param clone_tbl A tibble containing the columns
#' 'altered_entity', and then a column for each clone
#' in the tumor, e.g. 'clone1', 'clone2', 'clone3'. This tibble
#' can be generated e.g. from the \code{Cloe} output.
#' @author Ariane L. Moore
#' @return \code{TRUE} or \code{FALSE} for whether or not the
#' pair is mutated in different clones/in different
#' branches of the tree.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl filter select
#' @examples
#' clone_tbl <- dplyr::tibble(
#' altered_entity=c(paste("gene", seq_len(10), sep="")),
#' clone1=c(rep(0,10)),
#' clone2=c(sample(c(0,1), 10, replace=TRUE)),
#' clone3=c(sample(c(0,1), 10, replace=TRUE)),
#' clone4=c(sample(c(0,1), 10, replace=TRUE)))
#' is_diff_branch_ent_pair("gene1", "gene2", clone_tbl)
is_diff_branch_ent_pair <- function(ent1, ent2, clone_tbl) {
altered_entity <- NULL
stopifnot(is.character(ent1))
stopifnot(is.character(ent2))
stopifnot(is.tbl(clone_tbl))
stopifnot("altered_entity" %in% colnames(clone_tbl))
stopifnot("clone1" %in% colnames(clone_tbl))
if("patient_id" %in% colnames(clone_tbl) ||
"file_name" %in% colnames(clone_tbl) ||
"tree_id" %in% colnames(clone_tbl)){
stop("The clone tibble is expected to have only",
" the altered_entity and clones columns, but not file_name,",
" patient_id, or tree_id!")
}
## extract the clone profiles of the two entities
ent1_profile_tbl <- clone_tbl %>%
filter(altered_entity == ent1) %>%
select(-altered_entity)
ent2_profile_tbl <- clone_tbl %>%
filter(altered_entity == ent2) %>%
select(-altered_entity)
## turn into a vector
ent1_profile <- as.numeric(as.vector(unlist(ent1_profile_tbl[1,])))
ent2_profile <- as.numeric(as.vector(unlist(ent2_profile_tbl[1,])))
## sanity check: make sure the rows with ent1 and ent2 exist
if(is.na(ent1_profile)[1] || is.na(ent2_profile)[1]){
warning("At least one of the two entities ",
ent1, " and ", ent2,
" is not in the provided tibble.")
return(FALSE)
} else {
## the product of these two profiles will only have ones
## where both occur in the same clone
## i.e. sum of this will count how many clones they share
num_shared_clones <- sum(ent1_profile*ent2_profile)
message("The pair ", ent1, " and ", ent2,
" share ", num_shared_clones,
" clone(s).")
return(ifelse(num_shared_clones > 0, FALSE, TRUE))
}
}
#' Compute all values of how often gene pairs were clonally
#' exclusive/all trees for a patient.
#'
#' It computes a histogram of the following two values: Amomg
#' all gene/pathway pairs in a patient, the number of trees in
#' which the both entities of a pair are assigned to a clone
#' at all, and the number of trees in which the pair is clonally
#' exclusive.
#'
#' @title Compute all values of how often gene pairs were
#' clonally exclusive across all trees for a patient.
#' @param clone_tbl A tibble containing the columns
#' 'altered_entity', and then a column for each clone
#' in the tumor, e.g. 'clone1', 'clone2', 'clone3'. It also
#' contains the column 'tree_id', which specifies
#' which tree of the collection of tree inferences was used.
#' This tibble can be generated e.g. from the \code{Cloe} output.
#' @author Ariane L. Moore
#' @return A list with two vectors: The numbers of how often
#' gene pairs were mutated across trees, and the numbers of
#' how often they were clonally exclusive. The order of these
#' two vectors is matching, i.e. the ith entry in each
#' vector refers to the same gene pair.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom caTools combs
#' @importFrom dplyr is.tbl select tibble
#' @importFrom stats median
#' @examples
#' clone_tbl <- dplyr::tibble(
#' altered_entity=c(paste("gene", seq_len(10), sep="")),
#' clone1=c(rep(0,10)),
#' clone2=c(sample(c(0,1), 10, replace=TRUE)),
#' clone3=c(sample(c(0,1), 10, replace=TRUE)),
#' clone4=c(sample(c(0,1), 10, replace=TRUE)),
#' tree_id=c(rep(5, 5), rep(10, 5)) )
#' get_hist_clon_excl(clone_tbl)
get_hist_clon_excl <- function(clone_tbl) {
altered_entity <- tree_id <- patient_id <-
file_name <- NULL
stopifnot(is.tbl(clone_tbl))
stopifnot("altered_entity" %in% colnames(clone_tbl))
stopifnot("clone1" %in% colnames(clone_tbl))
stopifnot("tree_id" %in% colnames(clone_tbl))
## extract number of tree instances
all_trees <- unique(clone_tbl$tree_id)
num_trees <- length(all_trees)
## get the histogram of how often pairs were clonally
## exclusive/all trees
## get all genes/pathways, here referred to as entity (ent)
all_ents <- unique(as.character(clone_tbl$altered_entity))
num_ents <- length(all_ents)
pat_id<-""
if("patient_id" %in% colnames(clone_tbl)){
## extract patient id
pat_id <- unique(clone_tbl$patient_id)
clone_tbl <- clone_tbl %>% select(-patient_id)
message("Found ", num_trees,
" different tree instances in the gene-to-clone tibble",
" from patient ",
pat_id, ", ",
"and ", num_ents, " different mutated genes/pathways in total.")
stopifnot(length(pat_id) == 1)
} else {
message("Found ", num_trees,
" different tree instances in the provided gene-to-clone tibble, ",
"and ", num_ents, " different mutated genes/pathways in total.")
}
if("file_name" %in% colnames(clone_tbl)){
clone_tbl <- clone_tbl %>% select(-file_name)
}
message("Will compute the histograms for all pairs of how often ",
"they occur across tree instances, ",
"and how often they are clonally exclusive...")
# get the number of pairs
num_pairs <- num_ents*(num_ents-1)/2
all_pairs <- combs(all_ents, 2)
stopifnot(dim(all_pairs)[1] == num_pairs)
## then use apply on the pairs with a function that takes two
## genes/pathways, and the clone tibble which invokes
## get_hist_clon_excl_this_pat_this_pair(ent1, ent2, clone_tbl)
## and returns in how many trees they were both mutated and how
## often clonally exclusive
all_pairs_clon_excl_over_num_trees_info <-
apply(all_pairs, 1, function(x){
this_res_num_trees_num_clon<-
suppressMessages(get_hist_clon_excl_this_pat_this_pair(x[1],
x[2],
clone_tbl))
return(this_res_num_trees_num_clon)}
)
message("...done")
stopifnot(is.matrix(all_pairs_clon_excl_over_num_trees_info))
stopifnot(dim(all_pairs_clon_excl_over_num_trees_info)[1] == 2)
stopifnot(dim(all_pairs_clon_excl_over_num_trees_info)[2] == num_pairs)
num_trees_pair_mutated <- all_pairs_clon_excl_over_num_trees_info[1,]
num_times_clonal_excl <- all_pairs_clon_excl_over_num_trees_info[2,]
## it can be that there are also zeros in the num_trees_pair_mutated,
## because we compute all pairs by taking all ents from all tree
## inferences. It can be that one gene is only in one tree, and
## the other only in a different tree. Then for this pair, the
## num_trees_pair would be zero. We do not want to include
## pairs who are never mutated together in a tree. Hence, we exclude
## those
idx_pairs_zero_times <- which(num_trees_pair_mutated == 0)
if(length(idx_pairs_zero_times) > 0){
num_trees_pair_mutated <-
num_trees_pair_mutated[-idx_pairs_zero_times]
num_times_clonal_excl <-
num_times_clonal_excl[-idx_pairs_zero_times]
}
if(length(num_trees_pair_mutated) == 0 ||
length(num_times_clonal_excl) == 0){
message("It seems like there was no gene/pathway pair at all",
" that occurs in the same gene-to-clone assignment.",
" Please make sure that you have at least two ",
"genes/pathways assigned to clones in at least",
" one of the tree instances.")
return(list(0, 0))
} else {
## message to user
message("The median value for how often a gene pair",
" occurs across the trees is ", median(num_trees_pair_mutated),
", and ",
median(num_times_clonal_excl) ,
" is the median value for the pairs being clonally exclusive.")
return(list(num_trees_pair_mutated, num_times_clonal_excl))
}
}
#' Check for a pair how often it was mutated in the current patient
#' across trees, and how often also clonally exclusive.
#'
#' @title Check for a pair how often it was mutated in the current
#' patient across trees, and how often also clonally exclusive.
#' @param entA One gene/pathway of the pair
#' @param entB The other gene/pathway of the pair
#' @param clone_tbl A tibble containing the columns 'altered_entity',
#' and then a column for each clone
#' in the tumor, e.g. 'clone1', 'clone2', 'clone3'. It also contains
#' the column 'tree_id', which specifies
#' which tree of the collection of tree inferences was used. This
#' tibble can be generated e.g. from the \code{Cloe} output.
#' @author Ariane L. Moore
#' @return A vector with the values of in how many trees the pair
#' was mutated, and in how many of those it was clonally exclusive.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr is.tbl filter select tibble
#' @examples
#' clone_tbl <- dplyr::tibble(
#' altered_entity=c(paste("gene", seq_len(10), sep="")),
#' clone1=c(rep(0,10)),
#' clone2=c(sample(c(0,1), 10, replace=TRUE)),
#' clone3=c(sample(c(0,1), 10, replace=TRUE)),
#' clone4=c(sample(c(0,1), 10, replace=TRUE)),
#' tree_id=c(rep(5, 5), rep(10, 5)) )
#' get_hist_clon_excl_this_pat_this_pair("gene1", "gene2", clone_tbl)
get_hist_clon_excl_this_pat_this_pair<-function(entA, entB, clone_tbl){
altered_entity <- tree_id <- NULL
stopifnot(is.character(entA))
stopifnot(is.character(entB))
stopifnot(is.tbl(clone_tbl))
stopifnot("altered_entity" %in% colnames(clone_tbl))
stopifnot("clone1" %in% colnames(clone_tbl))
stopifnot("tree_id" %in% colnames(clone_tbl))
if("patient_id" %in% colnames(clone_tbl) ||
"file_name" %in% colnames(clone_tbl)){
stop("The clone tibble is expected to have only the","
altered_entity, tree_id, and clones columns, but not file_name, ",
"or patient_id!")
}
for(this_col in colnames(clone_tbl)){ # sanity check
if(this_col != "altered_entity" &&
!grepl("clone", this_col) && this_col != "tree_id"){
## if there still is a column in this_clone_tbl that is not
## either a clone or an altered entity, or the tree_id
## this will be problematic, because it will be counted as
## a clone and hence may
## lead to erroneous numbers of clonal exclusivity
warning("The column ", this_col,
" in the clone tibble may be accidentally counted as a clone.")
}
}
## get the number of trees
all_tree_ids <- unique(as.character(clone_tbl$tree_id))
num_trees <- length(all_tree_ids)
message("Found ", num_trees,
" different tree ids in the clone tbl.")
## get the info of num_trees and clon excl
this_pair_across_trees <- vapply(all_tree_ids, function(x){
this_tree_clone_tbl <- clone_tbl %>%
filter(tree_id == x) %>%
select(-tree_id)
all_ents_this_tree <-
as.character(this_tree_clone_tbl$altered_entity)
if(entA %in% all_ents_this_tree &&
entB %in% all_ents_this_tree){
this_pair_occurs <- 1
this_pair_clon_excl <-
sum(suppressMessages(is_diff_branch_ent_pair(entA,
entB,
this_tree_clone_tbl)))
} else {
this_pair_occurs <- this_pair_clon_excl <- 0
}
return(c(this_pair_occurs, this_pair_clon_excl))
}, numeric(2))
## the columns are the info for each tree_id: occurs?, clon excl?
stopifnot(dim(this_pair_across_trees)[2] == num_trees)
stopifnot(dim(this_pair_across_trees)[1] == 2)
num_trees_clon_excl <- apply(this_pair_across_trees, 1, sum)
num_trees_occurs <- num_trees_clon_excl[1]
num_clon_excl <- num_trees_clon_excl[2]
stopifnot(num_clon_excl <= num_trees_occurs)
stopifnot(num_trees_occurs <= num_trees)
stopifnot(num_trees_occurs >= 0 && num_clon_excl >= 0)
return(c(num_trees_occurs, num_clon_excl))
}
#' Merge clone profile of identical entities in clone tibble from
#' one patient
#'
#' Given a clone tibble as created with \code{\link{create_tbl_ent_clones}}
#' from one patient and where the entities were possibly mapped from genes
#' to pathways, this function checks whether there were several entities
#' mapped to the same new entity. If so, the clone profile will be merged.
#' This can be the case, for instance, of two mutated genes are in the same
#' pathway(s).
#'
#' @title Merge identical entities in clone tibble from one patient
#' @param clone_tbl The clone tibble as generated with
#' \code{\link{create_tbl_ent_clones}} from one patient.
#' @author Ariane L. Moore
#' @return The same tibble but in case there were several identical
#' genes/pathways in the same patient with
#' different clone profiles, their profile will be merged together. This
#' can happen if, e.g. two genes with different clone profiles are in the
#' same pathway. When mapping them to the pathways, there will be two
#' identical 'altered_entities' with different clone profiles. These
#' profiles would be merged by this function because the pathway is
#' affected in the union of clones were the two genes were mutated.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl filter select as.tbl
#' @examples
#' clone_tbl <- dplyr::tibble(
#' file_name=c(rep("fn1", 4)),
#' patient_id=c(rep("pat1", 4)),
#' altered_entity=c("pw1", "pw1",
#' "pw2", "pw3"),
#' clone1=c(1, 0, 1, 0),
#' clone2=c(0, 1, 0, 1),
#' clone3=c(1, 1, 0, 1),
#' clone4=c(0, 1, 0, 0))
#' merge_clones_identical_ents(clone_tbl)
merge_clones_identical_ents <- function(clone_tbl) {
patient_id <- file_name <- altered_entity <- NULL
stopifnot(is.tbl(clone_tbl))
stopifnot("patient_id" %in% colnames(clone_tbl))
stopifnot("file_name" %in% colnames(clone_tbl))
stopifnot("altered_entity" %in% colnames(clone_tbl))
stopifnot("clone1" %in% colnames(clone_tbl))
stopifnot("clone2" %in% colnames(clone_tbl))
if("tree_id" %in% colnames(clone_tbl)){
stop("The clone tibble is expected to have only",
" the altered_entity, patient_id, file_name, ",
"and clones columns, but not tree_id!")
}
## make sure that this is just from one patient
this_pat <- unique(as.character(clone_tbl$patient_id))
this_file_name <- unique(as.character(clone_tbl$file_name))
stopifnot(length(this_pat) == 1)
## extract information from tibble
num_entries <- dim(clone_tbl)[1]
num_clones <- dim(clone_tbl)[2] - 3
## message to user
message("The clone tibble from patient ",
this_pat," contains ", num_entries, " entries.")
message("There are ", num_clones,
" different clones specified.")
## check whether there are identical mutated_entities
all_sorted_ents <- sort(as.character(clone_tbl$altered_entity))
all_sorted_unique_ents <- unique(all_sorted_ents)
num_all_ents <- length(all_sorted_ents)
stopifnot(num_all_ents == num_entries)
num_unique_ents <- length(all_sorted_unique_ents)
message("There are ", num_unique_ents,
" different mutated genes/pathways in the tibble.")
## and if so, merge them together
merged_ents_clone_tbl_list <- lapply(all_sorted_unique_ents,
function(x){
## extract the tibble with just the clone profiles for
## the current entity
this_ent_tbl <- clone_tbl %>%
filter(altered_entity == x) %>%
select(-file_name, -patient_id, -altered_entity)
## make sure that the clone indicators 0 and 1 are
## integers and not factors
this_ent_tbl <- apply(this_ent_tbl[, seq_len(num_clones)], 2,
function(y){as.numeric(as.character(y))})
## this is to make sure that the sum over columns works in
## case there is just one row
this_ent_tbl <- rbind(this_ent_tbl, c(rep(0, num_clones)))
## sum up the rows, i.e. the clone profiles
merged_clone_profile <-
ifelse(apply(this_ent_tbl, 2, sum) > 0, 1, 0)
stopifnot(length(merged_clone_profile) == num_clones)
# convert the merged_clone_profile to a dataframe
merged_clone_profile_df<-as.data.frame(t(merged_clone_profile))
## create the merged tibble
## the clone columns are now double instead of factor
this_ent_merged_tbl <- as.data.frame(cbind(
file_name=this_file_name,
patient_id=this_pat,
altered_entity=x,
merged_clone_profile_df))
## in order to use bind_rows, we have to make sure that
## the altered ent is a factor with all levels from all
## patients
this_ent_merged_tbl$altered_entity <-
factor(this_ent_merged_tbl$altered_entity,
levels = all_sorted_unique_ents)
## return the final merged tibble for the current entity
this_ent_merged_tbl
})
## append the list of tibbles to one big tibble
final_merged_clone_tbl <-
as.tbl(bind_rows(merged_ents_clone_tbl_list))
stopifnot(is.tbl(final_merged_clone_tbl))
num_entries <- dim(final_merged_clone_tbl)[1]
num_cols <- dim(final_merged_clone_tbl)[2]
stopifnot(num_cols == (num_clones + 3))
message("Now there are ", num_entries,
" different mutated genes/pathways from patient ",
this_pat, ".")
stopifnot(num_entries == num_unique_ents)
return(final_merged_clone_tbl)
}
#' Extract number of clones in each patient.
#'
#' Given a clone tibble as created with
#' \code{\link{create_tbl_ent_clones}} this function
#' extracts the information, how many clones there are
#' in each patient. The counted clones will be those
#' with at least one non-zero entry, i.e.
#' at least one gene/pathway assigned to the clone.
#'
#' @title Extract number of clones.
#' @param clone_tbl The tibble generated with
#' \code{\link{create_tbl_ent_clones}}.
#' @author Ariane L. Moore
#' @return A named vector with the number of clones
#' in each patient. The name of each element is the respective
#' patient_id.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl select distinct pull filter
#' @examples
#' clone_tbl <- dplyr::tibble(
#' file_name=c(rep("fn1", 2), rep("fn2", 2)),
#' patient_id=c(rep("pat1", 2), rep("pat2", 2)),
#' altered_entity=c("pw1", "pw2", "pw1", "pw3"),
#' clone1=c(0, 0, 0, 0),
#' clone2=c(0, 1, 0, 1),
#' clone3=c(1, 1, 0, 1),
#' clone4=c(1, 0, 0, 0))
#' extract_num_clones_tbl(clone_tbl)
extract_num_clones_tbl <- function(clone_tbl) {
patient_id <- altered_entity <- file_name <- NULL
stopifnot(is.tbl(clone_tbl))
stopifnot("patient_id" %in% colnames(clone_tbl))
stopifnot("altered_entity" %in% colnames(clone_tbl))
stopifnot("file_name" %in% colnames(clone_tbl))
stopifnot("clone1" %in% colnames(clone_tbl))
stopifnot("clone2" %in% colnames(clone_tbl))
stopifnot(!"tree_id" %in% colnames(clone_tbl))
## this should not be in here because otherwise it will be
## counted as a clone
## message to user
num_entries <- dim(clone_tbl)[1]
message(num_entries,
" alterations in total found in the provided tibble.")
num_clones_upper_bound <- dim(clone_tbl)[2]-3
message(num_clones_upper_bound,
" is assumed to be the upper bound of number of clones.")
## extract all the patient id's
all_pats <- clone_tbl %>%
select(patient_id) %>%
distinct() %>%
pull(patient_id)
num_pats <- length(all_pats)
num_clones <- vapply(all_pats, function(x){
this_pat_clone_tbl <- clone_tbl %>%
filter(patient_id == x) %>%
select(-file_name, -patient_id, -altered_entity)
## make sure that the clone indicators 0 and 1 are integers
## and not factors
this_pat_clone_tbl <-
apply(this_pat_clone_tbl[, seq_len(num_clones_upper_bound)],
2, function(y){as.numeric(as.character(y))})
## this is to make sure that the sum over columns works
## in case there is just one row
this_pat_clone_tbl <- rbind(this_pat_clone_tbl,
c(rep(0, num_clones_upper_bound)))
clones_summary <- apply(this_pat_clone_tbl, 2, sum)
stopifnot(length(clones_summary) == num_clones_upper_bound)
this_num_clones <- sum(ifelse(clones_summary > 0, 1, 0))
stopifnot(this_num_clones > 0)
stopifnot(this_num_clones <= num_clones_upper_bound)
return(this_num_clones)
}, numeric(1))
## message to user
min_num_clones <- min(num_clones)
max_num_clones <- max(num_clones)
message("There are ",
min_num_clones, "-",
max_num_clones,
" clones in each patient.")
return(num_clones)
}
#' Take the final pairs and return the patients id's, in which
#' they are mutated, and the patients id's in which they are
#' clonally exclusive.
#'
#' This function takes the final pairs of interest, and returns a
#' tibble with the information for each gene pair, in which patient
#' the pair was mutated, and in which of these patients the pair was
#' clonally exclusive in the majority of the trees in the tree
#' inference collection.
#'
#' @title Get the patients in which pairs are mutated
#' @param clone_tbl_all_trees The tibble containing the information
#' of which gene/pathway is mutated in which clone from which
#' patient across a collection of trees. Can be generated with
#' \code{\link{create_tbl_tree_collection}}, repeatedly for
#' each patient, and then combining them.
#' @param pairs_of_interest_tbl A tibble containing pairs of
#' mutated genes/pathways. More precisely, it contains the
#' columns 'entity_A' and 'entity_B'.
#' @author Ariane L. Moore
#' @return A tibble similar to the input
#' \code{pairs_of_interest_tbl} but with two additional columns,
#' namely 'mutated_in', and 'clonally_exclusive_in'. The column
#' 'mutated_in' contains the patient id's that the pair is mutated
#' in separated by a semicolon. The column 'clonally_exclusive_in'
#' contains the semicolon separated patient id's of the ones in
#' which the pairs was also clonally exclusive in the majority of
#' the trees in the collection of tree inferences.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble distinct filter select group_by
#' tally pull is.tbl bind_cols
#' @examples
#' clone_tbl <- dplyr::tibble(file_name=rep(c(rep(c("fn1", "fn2"),
#' each=3)), 2),
#' patient_id=rep(c(rep(c("pat1", "pat2"),
#' each=3)), 2),
#' altered_entity=c(rep(c("geneA", "geneB", "geneC"),
#' 4)),
#' clone1=c(0, 1, 0, 1, 0, 1, 0,
#' 1, 1, 1, 0, 0),
#' clone2=c(1, 0, 1, 0, 1, 1, 1,
#' 0, 0, 1, 0, 1),
#' tree_id=c(rep(5, 6), rep(10, 6)))
#' pairs_of_interest <- dplyr::tibble(entity_A=c("geneA", "geneB"),
#' entity_B=c("geneB", "geneC"))
#' take_pairs_and_get_patients(clone_tbl, pairs_of_interest)
take_pairs_and_get_patients <- function(clone_tbl_all_trees,
pairs_of_interest_tbl){
entity_A <- entity_B <- patient_id <- tree_id <- file_name <-
n <- altered_entity <- NULL
stopifnot(is.tbl(clone_tbl_all_trees))
stopifnot("patient_id" %in% colnames(clone_tbl_all_trees))
stopifnot("file_name" %in% colnames(clone_tbl_all_trees))
stopifnot("altered_entity" %in% colnames(clone_tbl_all_trees))
stopifnot("clone1" %in% colnames(clone_tbl_all_trees))
stopifnot("clone2" %in% colnames(clone_tbl_all_trees))
stopifnot("tree_id" %in% colnames(clone_tbl_all_trees))
stopifnot("entity_A" %in% colnames(pairs_of_interest_tbl))
stopifnot("entity_B" %in% colnames(pairs_of_interest_tbl))
## message to user
all_tree_ids <- unique(clone_tbl_all_trees %>% pull(tree_id))
num_pats <-
length(unique(clone_tbl_all_trees %>% pull(patient_id)))
num_trees <- length(all_tree_ids)
num_pairs <- dim(pairs_of_interest_tbl)[1]
message("Found ", num_pairs,
" pairs of interest for which the patients will be extracted.")
message("There are ", num_pats,
" different patients in the clone tibble and ",
"the number of tree inferences is ", num_trees, ".")
num_min_trees_clon_excl <- floor((num_trees/2)+1)
message("A pair is counted as clonally exclusive in a patient,",
" if it clonally exclusive in the majority",
" of tree inferences, i.e. in at least ",
num_min_trees_clon_excl, " trees of a patient.")
## find out for each pair separately in which patients it is
## mutated and clonally exclusive
pairs_res_pats <- apply(pairs_of_interest_tbl, 1, function(x){
entA <- as.character(x[1])
entB <- as.character(x[2])
## get the patients in which they are both mutated
this_pair_pats_and_trees <- clone_tbl_all_trees %>%
filter(altered_entity %in% x) %>%
select(patient_id, tree_id)
pat_ids_all_trees <- lapply(all_tree_ids, function(y){
pat_ids_in_which_both_mutated <-
this_pair_pats_and_trees %>%
filter(tree_id == y) %>%
select(patient_id) %>%
group_by(patient_id) %>%
tally() %>%
filter(n == 2) %>%
pull(patient_id)
return(pat_ids_in_which_both_mutated)
})
these_pats_mut <- unique(as.vector(unlist(pat_ids_all_trees)))
stopifnot(!is.list(these_pats_mut))
## now we extract from the clone tbl only the patients
## in which they are both mutated
this_pair_pats_mut_clon_tbl <- clone_tbl_all_trees %>%
filter(altered_entity %in% x) %>%
filter(patient_id %in% these_pats_mut)
## for each pat in which they are mutated, get number of
## times it was clonally exclusive
## just for the current patient
pair_num_clon_excl <- vapply(these_pats_mut, function(y){
## take the clone tibble
this_clone_tbl <- this_pair_pats_mut_clon_tbl %>%
filter(patient_id == y) %>%
select(-file_name, -patient_id)
sanity_check <- lapply(colnames(this_clone_tbl),
function(this_col){
if(this_col != "altered_entity" &&
!grepl("clone", this_col) && this_col != "tree_id"){
## if there still is a column in this_clone_tbl that
## is not either a clone or an altered entity, or
## the tree_id this will be problematic, because it
## will be counted as a clone and hence may
## lead to erroneous numbers of clonal exclusivity
warning("The column ", this_col,
" in the clone tibble may be accidentally",
" counted as a clone.")
}
})
stopifnot(is.list(sanity_check))
num_trees_num_clon_excl_this_pat <-
suppressMessages(get_hist_clon_excl_this_pat_this_pair(entA,
entB,
this_clone_tbl))
return(num_trees_num_clon_excl_this_pat[2])
}, numeric(1))
stopifnot(length(pair_num_clon_excl) == length(these_pats_mut))
## now we construct the stings to return, that is the string with
## all semicolon separated pat ids in which the pair is mutated
## and the string with all semicolon separated pat ids in which they
## are clonally exlcusive in the majority of trees
this_pair_mutated_in<-
paste(these_pats_mut, sep="", collapse=";")
this_pair_clon_excl_in_vec <-
names(pair_num_clon_excl[which(pair_num_clon_excl >=
num_min_trees_clon_excl)])
this_pair_clon_excl_in <-
paste(this_pair_clon_excl_in_vec, sep="", collapse=";")
## in the case where one or both of these strings is empty:
if(this_pair_mutated_in == ""){
this_pair_mutated_in <- "."
}
if(this_pair_clon_excl_in == ""){
this_pair_clon_excl_in <- "."
}
return(c(this_pair_mutated_in, this_pair_clon_excl_in))
})
stopifnot(dim(pairs_res_pats)[1] == 2)
stopifnot(dim(pairs_res_pats)[2] == num_pairs)
mutated_in <- pairs_res_pats[1,]
clonally_exclusive_in <- pairs_res_pats[2,]
res_tbl<-bind_cols(pairs_of_interest_tbl,
mutated_in=mutated_in,
clonally_exclusive_in=clonally_exclusive_in)
return(res_tbl)
}
#' Map the ensembl gene ids to hgnc symbols from a tibble with pairs.
#'
#' After having extracted the pairs of interest, it is of interest to
#' know the genes hgnc symbols of the pairs. Here, it is assumed that the
#' current gene identifier of the pairs are ensembl gene ids. They will
#' be mapped to the corresponding hgnc symbols.
#'
#' @title Map the ensembl gene ids to hgnc symbols from a tibble
#' @param pairs_of_interest_tbl A tibble containing pairs of mutated
#' genes/pathways. More precisely, it contains the columns
#' 'entity_A' and 'entity_B'.
#' @param all_genes_tbl A tibble with all genes ensembl id's and
#' hgnc symbols. Can be created with
#' \code{\link{create_ensembl_gene_tbl_hg}}.
#' @author Ariane L. Moore
#' @return A tibble similar to the input \code{pairs_of_interest_tbl}
#' but with two additional columns, namely 'hgnc_gene_A', and 'hgnc_gene_B'.
#' The column 'hgnc_gene_A' contains the hgnc gene symbol of 'entity_A',
#' and the column 'hgnc_gene_B' the one of 'entity_B'.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr tibble is.tbl pull bind_cols
#' @examples
#' \dontrun{
#' pairs_of_interest <- dplyr::tibble(
#' entity_A=c("ENSG00000181143", "ENSG00000163939"),
#' entity_B=c("ENSG00000141510", "ENSG00000163930"))
#' all_genes_tbl <- create_ensembl_gene_tbl_hg()
#' map_pairs_to_hgnc_symbols(pairs_of_interest, all_genes_tbl)
#' }
map_pairs_to_hgnc_symbols <- function(pairs_of_interest_tbl,
all_genes_tbl){
entity_A <- entity_B <- ensembl_gene_id <-
hgnc_symbol <- NULL
stopifnot(is.tbl(pairs_of_interest_tbl))
stopifnot(is.tbl(all_genes_tbl))
stopifnot("entity_A" %in% colnames(pairs_of_interest_tbl))
stopifnot("entity_B" %in% colnames(pairs_of_interest_tbl))
stopifnot("ensembl_gene_id" %in% colnames(all_genes_tbl))
stopifnot("hgnc_symbol" %in% colnames(all_genes_tbl))
## map the ensembl gene ID's to the gene names
these_ens_ids_A <- pairs_of_interest_tbl %>%
pull(entity_A)
hgnc_gene_A <- c()
for(this_ens in these_ens_ids_A){
this_hgnc <-
suppressMessages(ensembl_to_hgnc(this_ens,
all_genes_tbl))
hgnc_gene_A <- c(hgnc_gene_A, this_hgnc)
}
these_ens_ids_B <- pairs_of_interest_tbl %>%
pull(entity_B)
hgnc_gene_B <- c()
for(this_ens in these_ens_ids_B){
this_hgnc <-
suppressMessages(ensembl_to_hgnc(this_ens,
all_genes_tbl))
hgnc_gene_B <- c(hgnc_gene_B, this_hgnc)
}
## add the columns to the pairs of interest tibble
pairs_of_interest_tbl_hgnc <-
bind_cols(pairs_of_interest_tbl,
hgnc_gene_A=hgnc_gene_A,
hgnc_gene_B=hgnc_gene_B)
return(pairs_of_interest_tbl_hgnc)
}
#' Write the resulting significant pairs tibble to disk as
#' a tab-separated file.
#'
#' After having extracted the significant pairs. The tibble
#' can be saved as a tab-separated file. It is assumed that the
#' input tibble has the columns 'hgnc_gene_A', 'hgnc_gene_B',
#' 'pval', 'qval', 'mutated_in', 'clonally_exclusive_in'.
#'
#' @title Write resulting significant pairs to disk
#' @param sig_pairs The tibble containing the significant pairs
#' of mutated genes/pathways.
#' @param avg_rates_m The average rates of clonal exclusivity for
#' each patient. The name of each rate is the respective
#' patient identifier.
#' @param tsv_file The path to the tsv-file to which the results
#' should be written.
#' @param num_digits The number of digits after the comma of the
#' average rate m, the p-value and the q-value (adjusted p-value).
#' Default: 2.
#' @author Ariane L. Moore
#' @return The tibble that is written to disk. It has the columns 'Gene A',
#' 'Gene B', 'P-value', 'Adjusted p-value',
#' 'Mutated in (rate)', 'Clonally exclusive in'.
#' @export
#' @importFrom dplyr is.tbl tibble
#' @importFrom utils write.table
#' @examples
#' sig_pairs <- dplyr::tibble(hgnc_gene_A=c("VHL", "BAP1"),
#' hgnc_gene_B=c("PTEN", "KIT"),
#' pval=c(0.001, 0.002),
#' qval=c(0.01, 0.02),
#' mutated_in=c("pat1; pat2", "pat1; pat2"),
#' clonally_exclusive_in=c("pat1; pat2",
#' "pat2"))
#' avg_rates_m <- c(pat1=0.0034, pat2=0.0021)
#' write_res_pairs_to_disk(sig_pairs, avg_rates_m, "test.tsv")
#' file.remove("test.tsv")
write_res_pairs_to_disk <- function(sig_pairs, avg_rates_m,
tsv_file, num_digits=2){
stopifnot(is.tbl(sig_pairs))
stopifnot("hgnc_gene_A" %in% colnames(sig_pairs))
stopifnot("hgnc_gene_B" %in% colnames(sig_pairs))
stopifnot("pval" %in% colnames(sig_pairs))
stopifnot("qval" %in% colnames(sig_pairs))
stopifnot("mutated_in" %in% colnames(sig_pairs))
stopifnot("clonally_exclusive_in" %in% colnames(sig_pairs))
stopifnot(is.numeric(avg_rates_m))
stopifnot(is.character(tsv_file))
stopifnot(is.numeric(num_digits))
stopifnot(num_digits >= 0)
## instead of just the patient id (e.g. "pat1; pat2") we want also
## the string Patient and the rate in parenthesis
## (e.g. "Patient pat1 (0.0034); Patient pat2 (0.0021)")
mutated_in_with_patient_and_rates <- vapply(sig_pairs$mutated_in,
function(x){
these_pats_label <- unlist(strsplit(x, ";"))
these_pats_label_with_rates <- vapply(these_pats_label,
function(this_pat){
this_pat <- trimws(this_pat, which="both")
## make sure that the patients labels are also
## among the named rates
stopifnot(this_pat %in% names(avg_rates_m))
this_new_label <-paste0("Patient ",
this_pat, " (",
round(avg_rates_m[which(names(avg_rates_m) == this_pat)],
digits=num_digits), ")")
return(this_new_label)
}, character(1))
return(paste(sort(these_pats_label_with_rates), collapse="; "))
},
character(1))
## instead of just the patient id (e.g. "pat1; pat2") we want also
## the string Patient (e.g. "Patient pat1; Patient pat2")
clonally_excl_in_with_patient <- vapply(sig_pairs$clonally_exclusive_in,
function(x){
these_pats_label <- unlist(strsplit(x, ";"))
these_pats_label_without_rates<-vapply(these_pats_label,
function(this_pat){
this_pat <- trimws(this_pat, which="both")
this_new_label <-paste0("Patient ", this_pat)
return(this_new_label)
}, character(1))
return(paste(sort(these_pats_label_without_rates), collapse="; "))
},
character(1))
## remove the names, because otherwise the results tibble has
## weird rownames
names(mutated_in_with_patient_and_rates) <- NULL
names(clonally_excl_in_with_patient) <- NULL
## prepare the results table
table_to_save <- data.frame(GeneA=sig_pairs$hgnc_gene_A,
GeneB=sig_pairs$hgnc_gene_B,
Pvalue=round(as.numeric(as.vector(sig_pairs$pval)),
digits=num_digits),
AdjustedPvalue=round(as.numeric(as.vector(sig_pairs$qval)),
digits=num_digits),
MutatedIn=mutated_in_with_patient_and_rates,
ClonallyExclusiveIn=clonally_excl_in_with_patient)
## the order should be without rounding it
orderOfPairs <- order(as.numeric(as.vector(sig_pairs$pval)))
table_to_save <- table_to_save[orderOfPairs, ]
colnames(table_to_save) <- c("Gene A", "Gene B", "P-value",
"Adjusted p-value",
"Mutated in (rate)", "Clonally exclusive in")
## write it to disk
write.table(table_to_save,
file=tsv_file,
quote=FALSE, col.names=TRUE,
row.names=FALSE, sep="\t")
return(table_to_save)
}
#' Check in how many patients pairs are mutated in
#'
#' After having created the tibble with all gene-to-clone
#' assignments from all patients and the whole collection
#' of trees, we're interested in how many patients tha
#' pairs are mutated in. This function creats a histogram
#' that shows in how many patients the pairs are mutated in.
#'
#' @title Pairs in how many patients histogram
#' @param clone_tbl The tibble containing the information of
#' which gene/pathway is mutated in which
#' clone from which patient and in which tree from the collection
#' of trees. Can be generated with \code{\link{create_tbl_tree_collection}}
#' for each patient separately and then appended.
#' @author Ariane L. Moore
#' @return The tibble that summarizes the number of pairs that
#' occur in how many patients.
#' @export
#' @importFrom magrittr "%>%"
#' @importFrom dplyr filter select is.tbl mutate distinct pull
#' tibble bind_rows tally group_by
#' @importFrom caTools combs
#' @examples
#' clone_tbl <- dplyr::tibble("file_name" =
#' rep(c(rep(c("fn1", "fn2"), each=3)), 2),
#' "patient_id"=rep(c(rep(c("pat1", "pat2"), each=3)), 2),
#' "altered_entity"=c(rep(c("geneA", "geneB", "geneC"), 4)),
#' "clone1"=c(0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0),
#' "clone2"=c(1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1),
#' "tree_id"=c(rep(5, 6), rep(10, 6)))
#' pairs_in_patients_hist(clone_tbl)
pairs_in_patients_hist <- function(clone_tbl){
patient_id <- altered_entity <- file_name <-
tree_id <- pairs <- n <- pairs_count <-
patient_count <- NULL
stopifnot(is.tbl(clone_tbl))
stopifnot("file_name" %in% colnames(clone_tbl))
stopifnot("patient_id" %in% colnames(clone_tbl))
stopifnot("altered_entity" %in% colnames(clone_tbl))
stopifnot("clone1" %in% colnames(clone_tbl))
stopifnot("clone2" %in% colnames(clone_tbl))
stopifnot("tree_id" %in% colnames(clone_tbl))
## get all patients
all_pats <- unique(as.character(clone_tbl$patient_id))
num_pats <- length(all_pats)
message("Found ", num_pats,
" different patient id's in the provided tibble.")
## get the pairs from each patient separately, and then check
## which pairs re-occur across at least two patients
list_all_pairs_all_pats <- lapply(all_pats, function(x){
all_ents_this_pat <- clone_tbl %>%
filter(patient_id == x) %>%
select(altered_entity) %>%
distinct() %>%
pull(altered_entity)
num_ents_this_pat <- length(all_ents_this_pat)
num_pairs_this_pat <-
num_ents_this_pat*(num_ents_this_pat-1)/2
## get all pairs of entities of that patient efficiently
all_pairs_this_pat <- combs(all_ents_this_pat, 2)
## to make sure the gene pairs are all in the same order such that
## later, when we check which pairs occur in multiple patients,
## that we do not miss pairs just because they are in a different
## order
all_pairs_this_pat <- as.data.frame(t(apply(all_pairs_this_pat, 1,
function(pair){sort(pair)})))
stopifnot(num_pairs_this_pat == dim(all_pairs_this_pat)[1])
return(all_pairs_this_pat)
})
all_pairs_all_pats <- bind_rows(list_all_pairs_all_pats)
unique_all_pairs_all_pats <- unique(all_pairs_all_pats)
num_pairs <- dim(unique_all_pairs_all_pats)[1]
all_pairs <- unique_all_pairs_all_pats
message("There are in total ",
num_pairs,
" pairs of genes/pathways mutated in at least one patient.")
## get the pairs and patients histogram
all_pairs_all_pats_one_string <- apply(all_pairs_all_pats, 1,
function(x){paste0(x, collapse="__")})
pairs_num_pats_tally <-
tibble(pairs=all_pairs_all_pats_one_string) %>%
group_by(pairs) %>%
tally() %>%
mutate(patient_count=n) %>%
select(pairs, patient_count)
num_pairs_num_pats_tally <-
pairs_num_pats_tally %>%
select(patient_count) %>%
group_by(patient_count) %>%
tally() %>%
mutate(pairs_count=n) %>%
select(pairs_count, patient_count)
pairs_c <- num_pairs_num_pats_tally$pairs_count
patient_c <- num_pairs_num_pats_tally$patient_count
num_pairs_atLeast2 <- sum(pairs_c[which(patient_c >= 2)])
message("There are in total ", num_pairs_atLeast2,
" pairs of genes/pathways mutated in at least 2 patients.")
## check what is the maximum n that pairs are mutated in how many
## patients
max_num_pat_pairs <- max(patient_c)
message("The maximum number of patients that a pair is ",
"mutated in is ",
max_num_pat_pairs, ". Please run the function",
" 'generate_ecdf_test_stat' ",
"with ",
"num_pat_pair_max=", max_num_pat_pairs, ".")
return(num_pairs_num_pats_tally)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.