#' Create a 'BANDITS_data' object
#'
#' \code{create_data} imports the equivalence classes and create a 'BANDITS_data' object.
#'
#'
#' @param salmon_or_kallisto a character string indicating the input data: 'salmon' or 'kallisto'.
#' @param gene_to_transcript a matrix or data.frame with a list of gene-to-transcript correspondances.
#' The first column represents the gene id, while the second one contains the transcript id.
#' @param salmon_path_to_eq_classes (for salmon input only) a vector of length equals to the number of samples:
#' each element indicates the path to the equivalence classes of the respective sample (computed by salmon).
#' @param kallisto_equiv_classes (for kallisto input only) a vector of length equals to the number of samples:
#' each element indicates the path to the equivalence classes ('.ec' files) of the respective sample (computed by kallisto).
#' @param kallisto_equiv_counts (for kallisto input only) a vector of length equals to the number of samples:
#' each element indicates the path to the counts of the equivalence classes ('.tsv' files) of the respective sample (computed by kallisto).
#' @param kallisto_counts (for kallisto input only) a matrix or data.frame,
#' with 1 column per sample and 1 row per transcript,
#' containing the estimated abundances for each transcript in each sample, computed by kallisto.
#' The matrix must be unfiltered and the order or rows must be unchanged.
#' @param eff_len a vector containing the effective length of transcripts; the vector names indicate the transcript ids.
#' Ideally, created via \code{\link{eff_len_compute}}.
#' @param n_cores the number of cores to parallelize the tasks on.
#' It is highly suggested to use at least one core per sample (default if not specificied by the user).
#' @param transcripts_to_keep a vector containing the list of transcripts to keep.
#' Ideally, created via \code{\link{filter_transcripts}}.
#' @param max_genes_per_group an integer number specifying the maximum number of genes that each group can contain.
#' When equivalence classes contain transcripts from distinct genes, these genes are analyzed together.
#' For computational reasons, 'max_genes_per_group' sets a limit to the number of genes that each group can contain.
#'
#' @return A \code{\linkS4class{BANDITS_data}} object.
#' @examples
#' # specify the directory of the internal data:
#' data_dir = system.file("extdata", package = "BANDITS")
#'
#' # load gene_to_transcript matching:
#' data("gene_tr_id", package = "BANDITS")
#'
#' # Specify the directory of the transcript level estimated counts.
#' sample_names = paste0("sample", seq_len(4))
#' quant_files = file.path(data_dir, "STAR-salmon", sample_names, "quant.sf")
#'
#' # Load the transcript level estimated counts via tximport:
#' library(tximport)
#' txi = tximport(files = quant_files, type = "salmon", txOut = TRUE)
#' counts = txi$counts
#'
#' # Optional (recommended): transcript pre-filtering
#' transcripts_to_keep = filter_transcripts(gene_to_transcript = gene_tr_id,
#' transcript_counts = counts,
#' min_transcript_proportion = 0.01,
#' min_transcript_counts = 10,
#' min_gene_counts = 20)
#'
#' # compute the Median estimated effective length for each transcript:
#' eff_len = eff_len_compute(x_eff_len = txi$length)
#'
#' # specify the path to the equivalence classes:
#' equiv_classes_files = file.path(data_dir, "STAR-salmon", sample_names, "aux_info", "eq_classes.txt")
#'
#' # create data from 'salmon' and filter internally lowly abundant transcripts:
#' input_data = create_data(salmon_or_kallisto = "salmon",
#' gene_to_transcript = gene_tr_id,
#' salmon_path_to_eq_classes = equiv_classes_files,
#' eff_len = eff_len,
#' n_cores = 2,
#' transcripts_to_keep = transcripts_to_keep)
#' input_data
#'
#' # create data from 'kallisto' and filter internally lowly abundant transcripts:
#' kallisto_equiv_classes = file.path(data_dir, "kallisto", sample_names, "pseudoalignments.ec")
#' kallisto_equiv_counts = file.path(data_dir, "kallisto", sample_names, "pseudoalignments.tsv")
#'
#' input_data_2 = create_data(salmon_or_kallisto = "kallisto",
#' gene_to_transcript = gene_tr_id,
#' kallisto_equiv_classes = kallisto_equiv_classes,
#' kallisto_equiv_counts = kallisto_equiv_counts,
#' kallisto_counts = counts,
#' eff_len = eff_len, n_cores = 2,
#' transcripts_to_keep = transcripts_to_keep)
#' input_data_2
#'
#' @author Simone Tiberi \email{simone.tiberi@uzh.ch}
#'
#' @seealso \code{\link{eff_len_compute}}, \code{\link{filter_transcripts}}, \code{\link{filter_genes}}, \code{\linkS4class{BANDITS_data}}
#'
#' @export
create_data = function(salmon_or_kallisto,
gene_to_transcript,
salmon_path_to_eq_classes = NULL,
kallisto_equiv_classes = NULL,
kallisto_equiv_counts = NULL,
kallisto_counts = NULL,
eff_len,
n_cores = NULL,
transcripts_to_keep = NULL,
max_genes_per_group = 50){
# wrapper to call the correct function: salmon_create_data (identical to the one already created) or kallisto_create_data (the one below)
cond_salmon_or_kallisto = (length(salmon_or_kallisto) == 1) & is.character(salmon_or_kallisto) & (salmon_or_kallisto %in% c("salmon", "kallisto"))
if( !cond_salmon_or_kallisto ){
message("'salmon_or_kallisto' must be a character string indicating the input data as: 'salmon' or 'kallisto'")
return(NULL)
}
# check that gene_to_transcript is a matrix or data.frame object
if( !is.data.frame(gene_to_transcript) & !is.matrix(gene_to_transcript) ){
message("'gene_to_transcript' must be a matrix or data.frame")
return(NULL)
}
if( ncol(gene_to_transcript) != 2 ){
message("'gene_to_transcript' must be a 2 column matrix or data.frame")
return(NULL)
}
if( !all( names(eff_len) %in% gene_to_transcript[,2]) ){
message("All transcript names in 'names(eff_len)' must be in 'gene_to_transcript[,2]'")
return(NULL)
}
if(max_genes_per_group > 100){
message("'max_genes_per_group' can be at most 100")
return(NULL)
}
# define the number of parellel cores (if left unspecified):
if(is.null(n_cores)){
n_cores = ifelse(salmon_or_kallisto == "salmon", length(salmon_path_to_eq_classes), length(kallisto_equiv_classes))
}
# initialize parallel cores (if n_cores > 1)
if( n_cores > 1.5){ # if n_cores > 1, I use parallel computing tools
suppressWarnings({
cl = makeCluster(n_cores, setup_strategy = "sequential")
})
}
# calculate the separator (1 or more _)
if(is.null(transcripts_to_keep)){
sep = "_"
while(TRUE){
Tr_id = as.character(gene_to_transcript[,2])
cond = sum( vapply(Tr_id, function(x){ grepl(sep, x, fixed=TRUE) }, FUN.VALUE = logical(1)) ) == 0
# sum(sapply(Tr_id, function(x){ grepl(sep, x, fixed=TRUE) })) == 0
if(cond){
break
}
sep = paste0(sep, "_")
}
}else{
sep = "_"
while(TRUE){
cond = sum( vapply(transcripts_to_keep, function(x){ grepl(sep, x, fixed=TRUE) }, FUN.VALUE = logical(1)) ) == 0
# sum(sapply(transcripts_to_keep, function(x){ grepl(sep, x, fixed=TRUE) })) == 0
if(cond){
break
}
sep = paste0(sep, "_")
}
}
# load the data:
if(salmon_or_kallisto == "salmon"){ # load salmon equivalence classes:
x = load_salmon_data(sep = sep,
path_to_eq_classes = salmon_path_to_eq_classes,
n_cores = n_cores, cl = cl,
transcripts_to_keep = transcripts_to_keep)
}else{ # load kallisto equivalence classes:
x = load_kallisto_data(sep = sep,
kallisto_equiv_classes = kallisto_equiv_classes,
kallisto_equiv_counts = kallisto_equiv_counts,
kallisto_counts = kallisto_counts,
n_cores = n_cores, cl = cl,
transcripts_to_keep = transcripts_to_keep)
}
if(is.null(x)){ # if the object is NULL, return it
return(x)
}
message("Data has been loaded")
N = length(x)
# From the truth table I take the Transcript_id and associated Gene_id.
Gene_id = as.character(gene_to_transcript[,1]); Tr_id = as.character(gene_to_transcript[,2])
# merge the ids of all classes here:
all_classes = unique( unlist( lapply(x, function(y){y$class_ids}) ) )
# unique( unlist( sapply(x, function(y){y$class_ids}) ) )
# I turn the transcript id from a single character into a vector
all_classes_vector = vapply( all_classes, strsplit, split = sep, fixed = TRUE, FUN.VALUE = list(1) )
# sapply( all_classes, strsplit, split = sep, fixed = TRUE )
# match the class id of each sample to the long vector containing all classes ids.
match_classes = lapply(x, function(u) match(u$class_ids, all_classes) )
# match_classes is a list, every element of the list refers to a sample.
# match_classes may now have some NAs.
# I find the counts of each equiv class in all the samples.
all_counts = matrix(0, nrow = length(all_classes), ncol = N)
# I set the matrix to 0: in case of no matching (class not present in a sample)
for(i in seq_len(N) ){ # super-fast:
match_NA = is.na(match_classes[[i]])
all_counts[match_classes[[i]][match_NA==FALSE],i] = x[[i]]$counts[match_NA==FALSE]
# here I filter the classes which were filtered out when filtering the transcripts.
}
# nr of transcripts per class:
n = vapply(all_classes_vector, length, FUN.VALUE = integer(1))
# sapply(all_classes_vector, length)
# create a data.frame structure keeping the info of what transcritps belong to what classes:
df_all_classes = data.frame( Class_id=rep(seq_along(n), n), Tr_id = unlist(all_classes_vector) )
rownames(df_all_classes) = c()
# then match equivalence classes to their genes (see readDGE)
# all_classes_vector is a list: every element of the list corresponds to a class.
df_all_classes$Gene_id = Gene_id[match(df_all_classes$Tr_id, Tr_id)]
genes_in_classes = split(df_all_classes$Gene_id, df_all_classes$Class_id)
genes_in_classes = lapply(genes_in_classes, unique)
############################################################################################################################################################
# 1) Consider ALL genes: first separate genes to be modelled alone from genes to be modelled together.
############################################################################################################################################################
genes_SELECTED = unique( df_all_classes$Gene_id ) # All genes
N_genes = length(genes_SELECTED)
n = vapply(genes_in_classes, length, FUN.VALUE = integer(1))
# sapply(genes_in_classes, length)
## I need to consider classes with 1 gene only...easy: "cond = sapply(genes_in_classes, length) == 1"
cond_1_gene_per_class = ( n == 1 )
# sapply(genes_in_classes, length) == 1 # But I also need to make sure that the genes respecting "cond" does not happear in other classes!
More_genes_in_classes = unique(unlist(genes_in_classes[cond_1_gene_per_class ==FALSE])) # list the genes happearing together in at least 1 class
# 1 - mean(cond_1_gene_per_class) # mean of equiv classes with tr from > 1 gene.
# 16 % before filterign, 17% after filtering.
# sum(all_counts[!cond_1_gene_per_class,])/sum(all_counts)
# mean counts from these classes
# 11 % before filtering, 10% after filtering.
# check what classes have at least 1 gene happearing with other genes in at least 1 class:
# in other words, check whether it's on the "More_genes_in_classes" list or not.
df_tmp = data.frame( Class_id=rep(seq_along(n), n), Gene_id = unlist(genes_in_classes))
df_tmp$More_genes_in_classes = df_tmp$Gene_id %in% More_genes_in_classes
cond_1_gene_per_class_FINAL = split(df_tmp$More_genes_in_classes, df_tmp$Class_id)
cond_1_gene_per_class_FINAL = !vapply(cond_1_gene_per_class_FINAL, any, FUN.VALUE = logical(1))
# !sapply(cond_1_gene_per_class_FINAL, any)
# 0.3 seconds
genes_names_Unique = as.character( genes_in_classes[cond_1_gene_per_class_FINAL] )
# genes to be modelled alone:
genes_SELECTED_Unique = unique( genes_names_Unique )
# genes to be modelled together (not in the previous list!):
genes_SELECTED_Together = genes_SELECTED[genes_SELECTED %in% genes_SELECTED_Unique ==FALSE]
N_genes_Unique = length(genes_SELECTED_Unique)
# genes to be modelled together (not in the previous list!):
genes_SELECTED_Together = unique( df_all_classes$Gene_id[df_all_classes$Gene_id %in% genes_SELECTED_Unique ==FALSE] );
N_genes_Together = length(genes_SELECTED_Together)
if(N_genes_Unique + N_genes_Together == 0){
message("0 genes to be analyzed")
return(NULL)
}
############################################################################################################################################################
# 2) Prepare data for the Unique genes:
############################################################################################################################################################
# ADD CONSTRAINT: if(N_genes_Unique > 0)
# Collect, for each gene, the classes associated to it.
classes_split_per_gene_Unique = split(all_classes_vector[cond_1_gene_per_class_FINAL], genes_names_Unique)
# Collect the counts associated to each class in every gene.
counts_split_per_gene_Unique = split(data.frame(all_counts[cond_1_gene_per_class_FINAL,]), genes_names_Unique)
# Finally, for each gene, consider the full list of transcripts (to define ... and ...)
# and make a matrix of 0, 1 indicating, for each class, what transcripts they have.
Transcripts_per_gene_Unique = lapply(classes_split_per_gene_Unique, function(x){ unique(unlist(x)) })
N_transcripts_per_gene_Unique = vapply(Transcripts_per_gene_Unique, length, FUN.VALUE = integer(1))
# sapply(Transcripts_per_gene_Unique, length)
N_genes_Unique = length(counts_split_per_gene_Unique);
#Error in classes_split_per_gene_Unique[[x]] : subscript out of bounds
# length(classes_split_per_gene_Unique) == length(counts_split_per_gene_Unique) # TRUE
classes_Unique = lapply(seq_len(N_genes_Unique), function(x){
m = vapply(classes_split_per_gene_Unique[[x]], function(y){ Transcripts_per_gene_Unique[[x]] %in% y }, FUN.VALUE = logical( length( Transcripts_per_gene_Unique[[x]] ) ))
# sapply(classes_split_per_gene_Unique[[x]], function(y){ Transcripts_per_gene_Unique[[x]] %in% y })
ifelse(m, 1, 0)
})
# 0.7 seconds (Mac)
################################################################################################
# Associate the Median eff length of transcripts:
################################################################################################
n = vapply(Transcripts_per_gene_Unique, length, FUN.VALUE = integer(1))
# sapply(Transcripts_per_gene_Unique, length)
df_eff_len_Unique = data.frame( Gene_id=rep(seq_along(n), n), Tr_id = unlist(Transcripts_per_gene_Unique))
df_eff_len_Unique$eff_len = eff_len[ match(df_eff_len_Unique$Tr_id, names(eff_len)) ]
eff_len_tr_Unique = split(df_eff_len_Unique$eff_len, df_eff_len_Unique$Gene_id)
genes_SELECTED_Unique = names(Transcripts_per_gene_Unique)
######################################################################################################
# TOGETHER Genes:
######################################################################################################
# ADD CONSTRAINT: if(N_genes_Together > 0)
all_counts_Together = all_counts[cond_1_gene_per_class_FINAL==FALSE,]
all_classes_vector_Together = all_classes_vector[cond_1_gene_per_class_FINAL==FALSE]
genes_in_classes_Together = genes_in_classes[cond_1_gene_per_class_FINAL==FALSE]
n_genes_per_class = vapply(genes_in_classes_Together, length, FUN.VALUE = integer(1))
# sapply(genes_in_classes_Together, length)
# First, I need to separate classes corresponding to 1 gene only to classes corresponding to >1 gene...DONE
# Now I need to group together the information about genes modelled together...they could be highly mixed!
# maybe I can use a similar approach to the one used to build the classes from the transcripts, although a 2,000 * 17,000 matrix would be quite big...
######################################################################################################
genes_in_classes_vector_Together = vapply( genes_in_classes_Together, paste, collapse = sep, FUN.VALUE = character(1) )
# sapply( genes_in_classes_Together, paste, collapse = sep )
names( genes_in_classes_vector_Together ) = genes_in_classes_vector_Together
classes_split_per_gene_Together = split( all_classes_vector_Together,
genes_in_classes_vector_Together )
counts_split_per_gene_Together = split( data.frame(all_counts_Together),
genes_in_classes_vector_Together )
# I look for what classes each gene appers in and record whether it happears uniquely or not.
genes_in_classes_split_per_gene_Together = strsplit(names(classes_split_per_gene_Together), split = sep, fixed = TRUE )
n_genes = vapply(genes_in_classes_split_per_gene_Together, length, FUN.VALUE = integer(1))
# sapply(genes_in_classes_split_per_gene_Together, length)
######################################################################################################
# I make group of genes to be modelled together and make a correspondance with the classes in classes_split_per_gene_Together
######################################################################################################
GROUPs_of_genes = list()
classes_associated_to_GROUPs = list()
genes_included = c()
g_id = 0
# it must be a loop (g_id does not always increase).
for(i in seq_len(N_genes_Together) ){
if(genes_SELECTED_Together[i] %in% genes_included == FALSE){
g_id = g_id + 1 # I create a new group of genes.
classes_associated_to_GROUPs[[g_id]] = which( vapply(genes_in_classes_split_per_gene_Together, function(x){ genes_SELECTED_Together[i] %in% x}, FUN.VALUE = logical(1)) )
# which( sapply(genes_in_classes_split_per_gene_Together, function(x){ genes_SELECTED_Together[i] %in% x}) )
GROUPs_of_genes[[g_id]] = unique( unlist(genes_in_classes_split_per_gene_Together[classes_associated_to_GROUPs[[g_id]]]) ) # Genes associated to i-th gene
j = 1
genes_included = c(genes_included, genes_SELECTED_Together[i])
while(j <= length(GROUPs_of_genes[[g_id]]) ){ # I loop on the other genes and repeat the operation
gene = GROUPs_of_genes[[g_id]][j]
if( !(gene %in% genes_included) ){
classes_associated_to_GROUPs[[g_id]] = unique( c(classes_associated_to_GROUPs[[g_id]],
which( vapply(genes_in_classes_split_per_gene_Together,
function(x){ gene %in% x}, FUN.VALUE = logical(1)) ) ))
GROUPs_of_genes[[g_id]] = unique( c(GROUPs_of_genes[[g_id]],
unlist(genes_in_classes_split_per_gene_Together[classes_associated_to_GROUPs[[g_id]]]) ) )
# Genes associated to i-th gene
genes_included = c(genes_included, gene)
}
j = j+1
}
}
}
######################################################################################################
# check if a Group has too many genes and split it START:
######################################################################################################
n_genes_per_group = vapply(GROUPs_of_genes, length, FUN.VALUE = integer(1))
# sapply(GROUPs_of_genes, length)
bigGroup = which( n_genes_per_group > max_genes_per_group)
if(length(bigGroup) > 0){ # if at least 1 group to be split
for(p in bigGroup){
message("One group of genes has ", n_genes_per_group[p], " genes")
message("Splitting the group in ", n_genes_per_group[p], " groups of individual genes")
classes_bigGroup = classes_associated_to_GROUPs[[p]]
################################################################################################
# Split genes and define their classes:
################################################################################################
classes_split_per_gene_Together = split( all_classes_vector_Together,
genes_in_classes_vector_Together )
counts_split_per_gene_Together = split( data.frame(all_counts_Together),
genes_in_classes_vector_Together )
Gene = unlist(genes_in_classes_split_per_gene_Together[classes_bigGroup])
classes_tmp = classes_split_per_gene_Together[classes_bigGroup]
Ngenes = vapply(genes_in_classes_split_per_gene_Together[classes_bigGroup], length, FUN.VALUE = integer(1))
# sapply(genes_in_classes_split_per_gene_Together[classes_bigGroup], length)
Class = rep(classes_tmp, Ngenes)
Class = unname(Class, force = TRUE) # remove names of top lists of Class
Class = lapply(Class, unname, force = TRUE) # remove names of each list in Class
# REMOVE NAMES OF OBJECTS: otherwise it'll crash (names too long!).
Counts = rep(counts_split_per_gene_Together[classes_bigGroup], vapply(genes_in_classes_split_per_gene_Together[classes_bigGroup], length, FUN.VALUE = integer(1)))
# sapply(genes_in_classes_split_per_gene_Together[classes_bigGroup], length))
Counts = unname(Counts, force = TRUE)
# REMOVE NAMES OF OBJECTS: otherwise it'll crash (names too long!).
class = do.call(c, Class)
gene = rep(Gene, vapply(Class, length, FUN.VALUE = integer(1)) )
# sapply(Class, length) )
counts = do.call(rbind, Counts)
# Split counts and classes by their gene name.
class_byGene = split(class, gene)
counts_byGene = split(counts, gene)
genes_bigGroup = names(class_byGene)
################################################################################################
# REMOVE THE TRANSCRIPTS THAT DO NOT BELONG THE GENE WE CONSIDER:
################################################################################################
df = data.frame(Gene_id = Gene_id[Gene_id %in% genes_bigGroup], Tr_id = Tr_id[Gene_id %in% genes_bigGroup] )
Tr_perGene_bigGroup = split(df$Tr_id, df$Gene_id)
Tr_perGene_bigGroup = lapply(Tr_perGene_bigGroup, as.character)
# it contains, for each gene, the list of transcripts associated to it.
# the order of the gene names is identical because split orders them alphabetically in both cases.
classes_split_bigGroup = lapply(seq_along(class_byGene), function(x){
X = class_byGene[[x]]
lapply(X, function(y){
y = unlist(y)
y[y %in% Tr_perGene_bigGroup[[x]] ]
})
})
################################################################################################
# Merge Identical classes and add up corresponding counts:
################################################################################################
# put transcript names of classes together as tr1_tr2
classes_split_bigGroup_num = lapply(classes_split_bigGroup, function(y){
# as.numeric(factor (sapply(y, function(yy) paste(sort(yy), collapse=sep) )) )
as.numeric(factor (vapply(y, function(yy) paste(sort(yy), collapse=sep), FUN.VALUE = character(1) ) ) )
})
DUPS = lapply(classes_split_bigGroup_num, duplicated)
classes_split_bigGroup_Unique = lapply(seq_along(classes_split_bigGroup), function(id){
classes_split_bigGroup[[id]][ DUPS[[id]] == FALSE ]
})
classes_split_bigGroup_Unique_num = lapply(seq_along(classes_split_bigGroup_num), function(id){
classes_split_bigGroup_num[[id]][ DUPS[[id]] == FALSE ]
})
# Select the counts for the unique classes:
counts_byGene_unique = lapply(seq_along(counts_byGene), function(id){
counts_byGene[[id]][ DUPS[[id]] == FALSE, ]
})
# Select the counts for the duplicated classes:
counts_byGene_DUPS = lapply(seq_along(counts_byGene), function(id){
counts_byGene[[id]][ DUPS[[id]], ]
})
counts_byGene_final = lapply(seq_along(counts_byGene), function(id){
X = counts_byGene_unique[[id]]
for(i in seq_len(sum(DUPS[[id]]==FALSE)) ){
match_dups = classes_split_bigGroup_num[[id]][DUPS[[id]]] == classes_split_bigGroup_Unique_num[[id]][i] # I look for the matching between the unique classes and duplicatd ones (eliminated)
if(sum(match_dups) > 0){
X[i,] = X[i,] + colSums(counts_byGene_DUPS[[id]][match_dups,]) # I add the counts of the corresponding duplicated classes
}
}
X
})
################################################################################################
# Finally, for each gene, consider the full list of transcripts (to define ... and ...)
# and make a matrix of 0, 1 indicating, for each class, what transcripts they have.
################################################################################################
Transcripts_per_gene_bigGroup = lapply(classes_split_bigGroup_Unique, function(x){ unique(unlist(x)) })
N_transcripts_per_gene_bigGroup = vapply(Transcripts_per_gene_bigGroup, length, FUN.VALUE = integer(1))
# sapply(Transcripts_per_gene_bigGroup, length)
N_genes_Unique_bigGroup = length(classes_split_bigGroup_Unique);
classes_Unique_bigGroup = lapply(seq_len(N_genes_Unique_bigGroup), function(x){
m = vapply(classes_split_bigGroup_Unique[[x]], function(y){ Transcripts_per_gene_bigGroup[[x]] %in% y }, FUN.VALUE = logical( length( Transcripts_per_gene_bigGroup[[x]] ) ))
# sapply(classes_split_bigGroup_Unique[[x]], function(y){ Transcripts_per_gene_bigGroup[[x]] %in% y })
ifelse(m, 1, 0)
})
# TO DO: REMOVE CLASSES from transcripts which are not present!
# AND merge classes which are identical.
################################################################################################
# Associate the Median eff length of transcripts:
################################################################################################
df_eff_len_bigGroup = data.frame( Gene_id=rep(seq_len(N_genes_Unique_bigGroup), N_transcripts_per_gene_bigGroup),
Tr_id = unlist(Transcripts_per_gene_bigGroup))
df_eff_len_bigGroup$eff_len = eff_len[ match(df_eff_len_bigGroup$Tr_id, names(eff_len)) ]
eff_len_tr_bigGroup = split(df_eff_len_bigGroup$eff_len, df_eff_len_bigGroup$Gene_id)
################################################################################################
# Store the info at the end of the Unique genes:
################################################################################################
genes_SELECTED_Unique = c( genes_SELECTED_Unique, genes_bigGroup)
Transcripts_per_gene_Unique = c( Transcripts_per_gene_Unique, Transcripts_per_gene_bigGroup)
eff_len_tr_Unique = c( eff_len_tr_Unique, eff_len_tr_bigGroup)
classes_Unique = c( classes_Unique, classes_Unique_bigGroup)
counts_split_per_gene_Unique = c( counts_split_per_gene_Unique, counts_byGene_final)
}
# Remove the big groups from the list of groups to be modelled together:
classes_associated_to_GROUPs = classes_associated_to_GROUPs[-bigGroup]
}
######################################################################################################
# Collect the counts associated to each class
######################################################################################################
# I now need to select the classes and counts associated to each "group" of genes
# Then "unlist" the classes and counts of each group: turn them into a single class and cont matrix.
counts_ALL_together_per_GROUP = lapply(classes_associated_to_GROUPs, function(x){
do.call(rbind, counts_split_per_gene_Together[x])
})
#counts_ALL_together_per_GROUP = lapply(classes_associated_to_GROUPs, function(x){
#res = counts_split_per_gene_Together[[x[1]]]
# loop only IF length(x) > 1 !!!
# otherwise it'll create a new record, identical to the previous one
#if(length(x) > 1){
# for(i in 2:length(x)){
# res = rbind(res, counts_split_per_gene_Together[[x[i]]])
# }
# # try to replace the for loop with: do.call(rbind, counts_split_per_gene_Together[x])
#}
#res
#})
classes_ALL_together_per_GROUP = lapply(classes_associated_to_GROUPs, function(x){
res = list()
for(i in seq_along(x)){
res[[i]] = classes_split_per_gene_Together[x[i]]
}
res
})
######################################################################################################
# Finally, for each gene, consider the full list of transcripts (to define ... and ...)
# and make a matrix of 0, 1 indicating, for each class, what transcripts they have.
Transcripts_per_GROUP = lapply(classes_ALL_together_per_GROUP, function(x){ unique(unlist(x)) })
N_transcripts_per_GROUP = vapply(Transcripts_per_GROUP, length, FUN.VALUE = integer(1))
# sapply(Transcripts_per_GROUP, length)
# since classes_ALL_together_per_GROUP[[i]] has a list per gene,
# and then each classes_ALL_together_per_GROUP[[i]][[j]][[1]] has a list per equiv class
# I need to unlist manually classes_ALL_together_per_GROUP
# otherwise unlist messes up the structure
N_GROUPS = length(classes_ALL_together_per_GROUP)
classes_ALL_together_per_GROUP_unlisted = list()
for(i in seq_len(N_GROUPS) ){
classes_ALL_together_per_GROUP_unlisted[[i]] = list(classes_ALL_together_per_GROUP[[i]][[1]][[1]][[1]])
J = length(classes_ALL_together_per_GROUP[[i]])
for(j in seq_len(J) ){
K = length(classes_ALL_together_per_GROUP[[i]][[j]][[1]])
for(k in seq_len(K) ){
if( sum(c(j == 1, k == 1)) < 2 ){ # the first case (j and k both = 1), is considered above to initialize the list.
classes_ALL_together_per_GROUP_unlisted[[i]] = c(classes_ALL_together_per_GROUP_unlisted[[i]],
list(classes_ALL_together_per_GROUP[[i]][[j]][[1]][[k]]))
}
}
}
}
# From the transcripts, I create the classes
classes_Together = lapply(seq_along(classes_ALL_together_per_GROUP_unlisted), function(x){
# m = sapply(classes_ALL_together_per_GROUP_unlisted[[x]], function(y){ Transcripts_per_GROUP[[x]] %in% unlist(y) })
m = vapply(classes_ALL_together_per_GROUP_unlisted[[x]], function(y){ Transcripts_per_GROUP[[x]] %in% unlist(y) }, FUN.VALUE = logical( length( Transcripts_per_GROUP[[x]] ) ))
ifelse(m, 1, 0)
})
################################################################################################
# Associate the Median eff length of transcripts:
################################################################################################
n = vapply(Transcripts_per_GROUP, length, FUN.VALUE = integer(1))
# sapply(Transcripts_per_GROUP, length)
df_eff_len_Unique = data.frame( Class_id=rep(seq_along(n), n), Tr_id = unlist(Transcripts_per_GROUP))
df_eff_len_Unique$eff_len = eff_len[ match(df_eff_len_Unique$Tr_id, names(eff_len)) ]
eff_len_tr_Together = split(df_eff_len_Unique$eff_len, df_eff_len_Unique$Class_id)
######################################################################################################
# I need an "id" telling me what transcripts belong to what gene in each group.
######################################################################################################
# For each Group, I match (on the truth matrix), the tr_id with the gene_to_transcript
# 1): list genes in each Group;
# 2): associate each tr to a gene;
# 3): count nr of transcripts per gene.
# use the gene_to_transcript matrix
n = vapply(Transcripts_per_GROUP, length, FUN.VALUE = integer(1))
# sapply(Transcripts_per_GROUP, length)
df_tmp = data.frame( Class_id=rep(seq_along(n), n), Tr_id = unlist(Transcripts_per_GROUP))
df_tmp$Gene_id = Gene_id[ match(df_tmp$Tr_id, Tr_id) ]
genes_per_GROUP = split(df_tmp$Gene_id, df_tmp$Class_id)
genes_per_GROUP_unique_together = lapply(genes_per_GROUP, unique)
for(i in seq_along(Transcripts_per_GROUP)){ # I associate each transcript to the corresponding gene
names(Transcripts_per_GROUP[[i]]) = genes_per_GROUP[[i]]
}
######################################################################################################
# Filter elements with < 2 transcripts per gene:
######################################################################################################
# Filter Unique genes:
K = vapply(eff_len_tr_Unique, length, FUN.VALUE = integer(1))
# sapply(eff_len_tr_Unique, length)
SEL_tr = K > 1
genes_SELECTED_Unique = genes_SELECTED_Unique[SEL_tr]
Transcripts_per_gene_Unique = Transcripts_per_gene_Unique[SEL_tr]
eff_len_tr_Unique = eff_len_tr_Unique[SEL_tr]
classes_Unique = classes_Unique[SEL_tr]
counts_split_per_gene_Unique = counts_split_per_gene_Unique[SEL_tr]
# store all genes and transcripts that will be analyzed (i.e., in genes with > 2 transcripts):
K = lapply( seq_len(N_GROUPS),
function(id){
# sapply(genes_per_GROUP_unique_together[[id]], function(x) sum(names(Transcripts_per_GROUP[[id]]) == x) )
vapply(genes_per_GROUP_unique_together[[id]], function(x) sum(names(Transcripts_per_GROUP[[id]]) == x), FUN.VALUE = integer(1))
}) # compute the number of transcrips belonging to 1 each gene.
all_genes = c(genes_SELECTED_Unique, unlist( lapply(seq_along(K), function(id){ genes_per_GROUP_unique_together[[id]][ K[[id]] > 1] }) ) )
# Filter Together genes (only if all genes in a group have < 2 transcripts)
SEL_tr = vapply(K, function(x) any(x > 1), FUN.VALUE = logical(1) )
# sapply(K, function(x) any(x > 1) ) # at least 1 gene with > 1 transcript
genes_per_GROUP_unique_together = genes_per_GROUP_unique_together[SEL_tr]
Transcripts_per_GROUP = Transcripts_per_GROUP[SEL_tr]
eff_len_tr_Together = eff_len_tr_Together[SEL_tr]
classes_Together = classes_Together[SEL_tr]
counts_ALL_together_per_GROUP = counts_ALL_together_per_GROUP[SEL_tr]
######################################################################################################
# automatically check the coherence of the matrixed in UNIQUE:
######################################################################################################
if(length(Transcripts_per_gene_Unique) > 0){
cond = vapply(seq_along(Transcripts_per_gene_Unique), function(i){
K = length(Transcripts_per_gene_Unique[[i]]); J = ncol(data.frame(classes_Unique[[i]]))
{length(eff_len_tr_Unique[[i]]) == K} & {nrow(data.frame(classes_Unique[[i]])) == K} & {nrow(counts_split_per_gene_Unique[[i]]) == J} & {ncol(counts_split_per_gene_Unique[[i]]) == N}
}, FUN.VALUE = logical(1))
if( mean(cond) != 1){ # Error!
message("something wrong in Unique classes")
}
}
######################################################################################################
# automatically check the coherence of the matrixed in TOGETHER:
######################################################################################################
if(length(Transcripts_per_GROUP) > 0){
cond = vapply(seq_along(Transcripts_per_GROUP), function(i){
K = nrow(classes_Together[[i]]); J = ncol(classes_Together[[i]])
{length(eff_len_tr_Together[[i]] ) == K} & {length(Transcripts_per_GROUP[[i]]) == K} & {nrow(counts_ALL_together_per_GROUP[[i]]) == J} & {ncol(counts_ALL_together_per_GROUP[[i]]) == N}
}, FUN.VALUE = logical(1))
if( mean(cond) != 1){ # Error!
message("something wrong in Together classes")
}
}
# stop clusters:
if( n_cores > 1.5){ # if n_cores > 1, I use parallel computing tools
stopCluster(cl)
}
######################################################################################################
# Make a class out of the results:
######################################################################################################
# Max number of genes and transcripts per GROUP:
message(paste("Max ", max(c( vapply(eff_len_tr_Unique, length, FUN.VALUE = integer(1)), vapply(eff_len_tr_Together, length, FUN.VALUE = integer(1)) )), " transcripts per group"))
message(paste("Max ", max( vapply(genes_per_GROUP_unique_together, length, FUN.VALUE = integer(1)) ), " genes per group"))
data = new("BANDITS_data",
genes = c(genes_SELECTED_Unique, genes_per_GROUP_unique_together),
transcripts = c(Transcripts_per_gene_Unique, Transcripts_per_GROUP),
effLen = c(eff_len_tr_Unique, eff_len_tr_Together),
classes = c(classes_Unique, classes_Together),
counts = c(counts_split_per_gene_Unique, counts_ALL_together_per_GROUP),
uniqueId = c( rep(TRUE, length(eff_len_tr_Unique)), rep(FALSE, length(eff_len_tr_Together)) ),
all_genes = all_genes )
# return results into a BANDITS_data object:
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.