R/complex_functions.R
In YAPSA: Yet Another Package for Signature Analysis

Documented in aggregate_exposures_by_category compare_to_catalogues compute_comparison_stat_df create_mutation_catalogue_from_df create_mutation_catalogue_from_VR cut_breaks_as_intervals get_extreme_PIDs makeVRangesFromDataFrame normalizeMotifs_otherRownames test_gene_list_in_exposures transform_rownames_deconstructSigs_to_YAPSA transform_rownames_MATLAB_to_R transform_rownames_nature_to_R transform_rownames_R_to_MATLAB transform_rownames_YAPSA_to_deconstructSigs

# Copyright © 2014-2019  The YAPSA package contributors
# This file is part of the YAPSA package. The YAPSA package is licenced under
# GPL-3

#' Construct a VRanges Object from a data frame
#'
#' In this package, big data frames are generated from cohort wide vcf-like
#' files. This function constructs a VRanges object from such a data frame by
#' using \code{\link{makeGRangesFromDataFrame}} from the package
#' \code{\link{GenomicRanges}}
#'
#' @param in_df A big dataframe constructed from a vcf-like file of a whole
#'   cohort. The first columns are those of a standard vcf file, followed by an
#'   arbitrary number of custom or user defined columns. One of these can carry
#'   a PID (patient or sample identifyier) and one can carry subgroup
#'   information.
#' @param in_keep.extra.columns in_seqinfo Argument passed on to
#'   \code{\link{makeGRangesFromDataFrame}}
#' @param in_seqinfo A seqInfo object, referring to the reference genome used.
#'   Argument passed on to \code{\link{makeGRangesFromDataFrame}}
#' @param in_seqnames.field Indicates the name of the column in which the
#'   chromosome is encoded
#' @param in_start.field Indicates the name of the column in which the start
#'   coordinate is encoded
#' @param in_end.field Indicates the name of the column in which the end
#'   coordinate is encoded
#' @param in_PID.field Indicates the name of the column in which the PID
#'   (patient or sample identifier) is encoded
#' @param in_subgroup.field Indicates the name of the column in which the
#'   subgroup information is encoded
#' @param in_strand.field Indicates the name of the column in which the
#'   strandedness is encoded
#' @param verbose_flag Verbose if 1
#'
#' @return The constructed VRanges object
#'
#' @examples
#'  data(lymphoma_test)
#'  temp_vr <- makeVRangesFromDataFrame(lymphoma_test_df,
#'                                      in_seqnames.field="CHROM",
#'                                      in_subgroup.field="SUBGROUP",
#'                                      verbose_flag=1)
#'
#' @seealso \code{\link{makeGRangesFromDataFrame}}
#'
#' @import GenomicRanges
#' @import VariantAnnotation
#' @importFrom GenomeInfoDb seqlengths seqlengths<-
#' @export
#' 
makeVRangesFromDataFrame <- function(in_df,in_keep.extra.columns=TRUE,
                                    in_seqinfo=NULL,
                                    in_seqnames.field="X.CHROM",
                                    in_start.field="POS",
                                    in_end.field="POS",in_PID.field="PID",
                                    in_subgroup.field="subgroup",
                                    in_strand.field="strand",
                                    verbose_flag=1) {
    out_vr <- NULL
    name_list <- tolower(names(in_df))
    match_list <- c("ref","alt",tolower(in_seqnames.field),
                    tolower(in_start.field),tolower(in_end.field))
    if(length(which(match_list %in% name_list)) == length(match_list)) {
        my_gr <- GenomicRanges::makeGRangesFromDataFrame(in_df,
                                keep.extra.columns=in_keep.extra.columns,
                                seqinfo=in_seqinfo,
                                seqnames.field=in_seqnames.field,
                                start.field=in_start.field,
                                end.field=in_end.field,
                                strand.field=in_strand.field)
        if(!("+" %in% unique(strand(my_gr))) & 
                !("-" %in% unique(strand(my_gr)))){
                strand(my_gr) <- "+"
                if(verbose_flag==1){
                    cat("YAPSA:::makeVRangesFromDataFrame::warning:strand ",
                    "information missing, set to \"+\".\n")
        }
    }    
    out_vr <- VRanges(seqnames=seqnames(my_gr),ranges=ranges(my_gr),
                        ref=my_gr$REF,alt=my_gr$ALT)
    if(tolower(in_PID.field) %in% name_list) {
        if(verbose_flag==1){
            cat("YAPSA:::makeVRangesFromDataFrame::in_PID.field found. ",
                "Retrieving PID information.\n")
        }
        column_ind <- 
            min(which(tolower(names(mcols(my_gr)))==tolower(in_PID.field)))
        out_vr$PID <- mcols(my_gr)[,column_ind]
        sampleNames(out_vr) <- mcols(my_gr)[,column_ind]
        } else if("pid" %in% name_list) {
            if(verbose_flag==1){
                cat(paste0(
                "YAPSA:::makeVRangesFromDataFrame::warning:in_PID.field ",
                "not a valid column name,",
                " but default is valid. Retrieving PID information.\n"))
        }
        out_vr$PID <- mcols(my_gr)[,"PID"]
        sampleNames(out_vr) <- mcols(my_gr)[,column_ind]
    } else {
        if(verbose_flag==1){
            cat("YAPSA:::makeVRangesFromDataFrame::warning:PID information ",
            "missing. Filling up with dummy entries.\n");}
            out_vr$PID <- "dummy_PID"
            sampleNames(out_vr) <- "dummyPID"
    }
    if(tolower(in_subgroup.field) %in% name_list) {
        if(verbose_flag==1){
            cat("YAPSA:::makeVRangesFromDataFrame::in_subgroup.field found. ",
                "Retrieving subgroup information.\n")
        }
    column_ind <- 
        min(which(tolower(names(mcols(my_gr)))==tolower(in_subgroup.field)))
        #out_vr$Type <- mcols(my_gr)[,column_ind]
        mcols(out_vr)[,in_subgroup.field] <- mcols(my_gr)[,column_ind]
    } else if("subgroup" %in% name_list) {
        if(verbose_flag==1){
            cat(paste0("YAPSA:::makeVRangesFromDataFrame::warning:",
                "in_subgroup.field not a valid column name, ",
                "but default is valid. Retrieving subgroup information.\n"))
    }
    #out_vr$Type <- my_gr$subgroup
    mcols(out_vr)[,in_subgroup.field] <- my_gr$subgroup
    } else {
        if(verbose_flag==1){
            cat("YAPSA:::makeVRangesFromDataFrame::warning:",
                "subgroup information missing. ",
                "Filling up with dummy entries.\n");}
        #out_vr$Type <- "dummy_subgroup"
        mcols(out_vr)[,in_subgroup.field] <- "dummy_subgroup"
    }
    seqlengths(out_vr) <- seqlengths(my_gr)    
    } else {
        if(verbose_flag==1){
            cat("YAPSA:::makeVRangesFromDataFrame::
                error:mismatch in column names, ","return NULL.\n")
        }
    }
    return(out_vr)
}


#' Create a Mutational Catalogue from a VRanges Object
#'
#' This function creates a mutational catalogue from a VRanges Object by first
#' calling \code{\link[SomaticSignatures]{mutationContext}} to establish the
#' motif context of the variants in the input VRanges and then calling
#' \code{\link[SomaticSignatures]{motifMatrix}} to build the mutational
#' catalogue \code{V}.
#'
#' @param in_vr A VRanges object constructed from a vcf-like file of a whole
#'   cohort. The first columns are those of a standard vcf file, followed by an
#'   arbitrary number of custom or used defined columns. One of these can carry
#'   a PID (patient or sample identifyier) and one can carry subgroup
#'   information.
#' @param in_refGenome The reference genome handed over to
#'   \code{\link[SomaticSignatures]{mutationContext}} and used to extract the
#'   motif context of the variants in \code{in_vr}.
#' @param in_wordLength The size of the motifs to be extracted by
#'   \code{\link[SomaticSignatures]{mutationContext}}
#' @param in_PID.field Indicates the name of the column in which the PID
#'   (patient or sample identifier) is encoded
#' @param in_verbose Verbose if \code{in_verbose=1}
#' @param in_rownames Optional parameter to specify rownames of the mutational
#'   catalogue \code{V} i.e. the names of the features.
#' @param adapt_rownames Rownames of the output \code{matrix} will be adapted if
#'   \code{adapt_rownames=1}
#'
#' @return A list with entries \code{matrix}, \code{frame}, \itemize{ \item
#'   \code{matrix}: The mutational catalogue \code{V} \item \code{frame}:
#'   Additional and meta information on rownames (features), colnames (PIDs) and
#'   subgroup attribution. }
#'
#' @examples
#'  library(BSgenome.Hsapiens.UCSC.hg19)
#'  data(lymphoma_test)
#'  data(sigs)
#'  word_length <- 3
#'  temp_vr <- makeVRangesFromDataFrame(
#'    lymphoma_test_df,in_seqnames.field="CHROM",
#'    in_subgroup.field="SUBGROUP",verbose_flag=1)
#'  temp_list <- create_mutation_catalogue_from_VR(
#'    temp_vr,in_refGenome=BSgenome.Hsapiens.UCSC.hg19,
#'    in_wordLength=word_length,in_PID.field="PID",
#'    in_verbose=1)
#'  dim(temp_list$matrix)
#'  head(temp_list$matrix)
#'  test_list <- split(lymphoma_test_df,f=lymphoma_test_df$PID)
#'  other_list <- list()
#'  for(i in seq_len(length(test_list))){
#'    other_list[[i]] <- test_list[[i]][c(1:80),]
#'  }
#'  other_df <- do.call(rbind,other_list)
#'  other_vr <- makeVRangesFromDataFrame(
#'    other_df,in_seqnames.field="CHROM",
#'    in_subgroup.field="SUBGROUP",verbose_flag=1)
#'  other_list <- create_mutation_catalogue_from_VR(
#'    other_vr,in_refGenome=BSgenome.Hsapiens.UCSC.hg19,
#'    in_wordLength=word_length,in_PID.field="PID",
#'    in_verbose=1,in_rownames=rownames(AlexCosmicValid_sig_df))
#'  dim(other_list$matrix)
#'  head(other_list$matrix)
#'
#' @seealso \code{\link[SomaticSignatures]{mutationContext}}
#' @seealso \code{\link[SomaticSignatures]{motifMatrix}}
#'
#' @importFrom SomaticSignatures mutationContext
#' @importFrom SomaticSignatures motifMatrix
#' @export
#' 
create_mutation_catalogue_from_VR <- function(in_vr,in_refGenome,in_wordLength,
                                            in_PID.field="PID",in_verbose=0,
                                            in_rownames=c(),adapt_rownames=1){
    # adapt levels of in_PID.field
    PID_index <- which(names(mcols(in_vr))==in_PID.field)
    names(mcols(in_vr))[PID_index] <- "PID"
    in_vr$PID <- factor(as.character(in_vr$PID))
    # now start real work of the function
    if ( in_verbose==1 ) {
        cat("YAPSA:::create_mutation_catalogue_from_VR::Extracting mutation ",
        "context for the SNVs from the reference genome...\n");
    }
    my_motifs <- mutationContext(in_vr, in_refGenome, 
                                    k=in_wordLength, unify=TRUE)
    if ( in_verbose==1 ) {
        cat("YAPSA:::create_mutation_catalogue_from_VR::Building mutational ",
        "catalogue...\n");
    }
    ## cave: mutationContextMatrix is the old version of the SomaticSignatures 
    ##       package motifMatrix is the new version, but return a list instead 
    ##       of a matrix when the data is sparse
    #my_matrix = mutationContextMatrix(my_motifs, group=in_PID.field,
    #                                  normalize = FALSE)
    my_matrix = motifMatrix(my_motifs, group=in_PID.field,normalize = FALSE)
    ## check if unallowed nucleotides in matrix
    deselect_ind <- grep("[NX]",rownames(my_matrix))
    if (length(deselect_ind) > 0) {
        my_matrix <- my_matrix[-deselect_ind]
    }
    # account for sparsity of the matrix
    if(length(in_rownames) > 0) {
        if(length(in_rownames) < dim(my_matrix)[1]) {
            if(in_verbose==1) {
                cat("YAPSA:::create_mutation_catalogue_from_VR::
                Warning: supplied ", "rownames too short\n");
            }
        } else if (!is.character(in_rownames)) {
            if(in_verbose==1) {
                cat("YAPSA:::create_mutation_catalogue_from_VR::
                    Warning: supplied ", "rownames not character format\n");
            }
        } else {
            my_rownames <- transform_rownames_MATLAB_to_R(in_rownames,
                                                        in_wordLength)
            temp_matrix <- matrix(0,nrow=length(my_rownames),
                                    ncol=dim(my_matrix)[2])
            match_ind <- match(rownames(my_matrix),my_rownames)
            if( (length(match_ind) < dim(my_matrix)[1]) |
                (any(is.na(match_ind)))){
                if(in_verbose==1) {
                    cat("YAPSA:::create_mutation_catalogue_from_VR::
                        Warning: my_matrix"," contains rownames which are not
                        in supplied rownames.\n");
                }
            } else {
                temp_colnames <- colnames(my_matrix)
                temp_matrix[match_ind,] <- my_matrix
                my_matrix <- temp_matrix
                rownames(my_matrix) <- my_rownames
                colnames(my_matrix) <- temp_colnames
            }
        }
    }
    # string manipulations for export to matlab
    if ( in_verbose==1 ) {
        cat("YAPSA:::create_mutation_catalogue_from_VR::
            Performing string ", "manipulations for output to matlab...\n");
    }
    temp_frame <- data.frame(strsplit(rownames(my_matrix)," "))
    colnames(temp_frame) <- rownames(my_matrix)
    my_frame <- as.data.frame(t(temp_frame[,]))
    colnames(my_frame) <- c("exchange","context")
    my_frame$types <- 
        paste0(substr(my_frame$exchange,1,1),">",substr(my_frame$exchange,2,2))
    ## adapt for word length here
    fraction_length <- floor(in_wordLength/2)
    my_frame$subtypes <- 
        paste0(substr(my_frame$context,1,fraction_length),
                substr(my_frame$exchange,1,1),
                substr(my_frame$context,
                in_wordLength-fraction_length+1,in_wordLength))  
    my_frame$newnames <- paste0(my_frame$types," ",my_frame$subtypes)
    if(adapt_rownames==1) {
        rownames(my_matrix) <- my_frame$newnames
    }
    return(list(matrix=my_matrix,frame=my_frame))
}


#' Create a Mutational Catalogue from a data frame
#'
#' This function creates a mutational catalogue from a data frame. It is a
#' wrapper function for \code{\link{create_mutation_catalogue_from_VR}}: it
#' first creates a VRanges object from the data frame by
#' \code{\link{makeVRangesFromDataFrame}} and then passes this object on to the
#' above mentioned custom function.
#'
#' @param this_df A data frame constructed from a vcf-like file of a whole
#'   cohort. The first columns are those of a standard vcf file, followed by an
#'   arbitrary number of custom or used defined columns. One of these can carry
#'   a PID (patient or sample identifyier) and one can carry subgroup
#'   information.
#' @param this_refGenome_Seqinfo A seqInfo object, referring to the reference
#'   genome used. Argument passed on to \code{\link{makeGRangesFromDataFrame}}
#'   and thus indirectly to \code{\link{makeGRangesFromDataFrame}}.
#' @param this_seqnames.field Indicates the name of the column in which the
#'   chromosome is encoded
#' @param this_start.field Indicates the name of the column in which the start
#'   coordinate is encoded
#' @param this_end.field Indicates the name of the column in which the end
#'   coordinate is encoded
#' @param this_PID.field Indicates the name of the column in which the PID
#'   (patient or sample identifier) is encoded
#' @param this_subgroup.field Indicates the name of the column in which the
#'   subgroup information is encoded
#' @param this_refGenome The reference genome handed over to
#'   \code{\link{create_mutation_catalogue_from_VR}} and indirectly to
#'   \code{\link[SomaticSignatures]{mutationContext}} and used to extract the
#'   motif context of the variants in \code{in_vr}.
#' @param this_wordLength The size of the motifs to be extracted by
#'   \code{\link[SomaticSignatures]{mutationContext}}
#' @param this_verbose Verbose if \code{this_verbose=1}
#' @param this_rownames Optional parameter to specify rownames of the mutational
#'   catalogue \code{V} i.e. the names of the features.
#' @param this_adapt_rownames Rownames of the output \code{matrix} will be
#'   adapted if \code{this_adapt_rownames=1}
#'
#' @return A list with entries \code{matrix} and \code{frame} obtained from
#'   \code{\link{create_mutation_catalogue_from_VR}}: \itemize{ \item
#'   \code{matrix}: The mutational catalogue \code{V} \item \code{frame}:
#'   Additional and meta information on rownames (features), colnames (PIDs) and
#'   subgroup attribution. }
#'
#' @examples
#'  library(BSgenome.Hsapiens.UCSC.hg19)
#'  data(lymphoma_test)
#'  word_length <- 3
#'  temp_list <- create_mutation_catalogue_from_df(
#'    lymphoma_test_df,this_seqnames.field = "CHROM",
#'    this_start.field = "POS",this_end.field = "POS",
#'    this_PID.field = "PID",this_subgroup.field = "SUBGROUP",
#'    this_refGenome = BSgenome.Hsapiens.UCSC.hg19,
#'    this_wordLength = word_length)
#'    dim(temp_list$matrix)
#'    head(temp_list$matrix)
#'
#' @seealso \code{\link{makeVRangesFromDataFrame}}
#' @seealso \code{\link{create_mutation_catalogue_from_VR}}
#'
#' @export
#' 
create_mutation_catalogue_from_df <- function(this_df,
                                        this_refGenome_Seqinfo=NULL,
                                        this_seqnames.field="X.CHROM",
                                        this_start.field="POS",
                                        this_end.field="POS",
                                        this_PID.field="PID",
                                        this_subgroup.field="subgroup",
                                        this_refGenome,this_wordLength,
                                        this_verbose=1,
                                        this_rownames=c(),
                                        this_adapt_rownames=1) {
    if(is.null(this_refGenome_Seqinfo)){
        this_refGenome_Seqinfo <- seqinfo(this_refGenome)
    }
    my_vr <- makeVRangesFromDataFrame(this_df,
                                    in_keep.extra.columns=TRUE,
                                    in_seqinfo=this_refGenome_Seqinfo,
                                    in_seqnames.field=this_seqnames.field,
                                    in_start.field=this_start.field,
                                    in_end.field=this_end.field,
                                    in_PID.field=this_PID.field,
                                    in_subgroup.field=this_subgroup.field,
                                    verbose_flag=this_verbose)
    my_result_list <- create_mutation_catalogue_from_VR(my_vr,
                                                this_refGenome,
                                                this_wordLength,
                                                this_PID.field,
                                                this_verbose,
                                                this_rownames,
                                                this_adapt_rownames)
    return(my_result_list)
}


#' Change rownames from one naming convention to another
#'
#' Rownames or names of the features used differ between the different contexts
#' a signature analysis is carried out in. The function
#' \code{transform_rownames_R_to_MATLAB} changes from the convention used in
#' the YAPSA pacakge to the one used by Alexandrov et al. in the MATLAB
#' framework.
#'
#' @param in_rownames 
#'  Character vector of input rownames
#' @param wordLength 
#'  Size of the considered motif context
#'
#' @return A character vector of the translated rownames.
#' 
#' @examples
#' NULL
#' 
#' @export
#'
transform_rownames_R_to_MATLAB <- function(in_rownames,wordLength=3) {
    out_rownames <- in_rownames
    temp_frame <- data.frame(strsplit(in_rownames," "))
    colnames(temp_frame) <- in_rownames
    my_frame <- as.data.frame(t(temp_frame[,]))
    colnames(my_frame) <- c("exchange","context")
    my_frame$types <- paste0(substr(my_frame$exchange,1,1),">",
                                substr(my_frame$exchange,2,2))
    ## adapt for word length here
    fraction_length <- floor(wordLength/2)
    my_frame$subtypes <- paste0(
        substr(my_frame$context,1,fraction_length),
        substr(my_frame$exchange,1,1),
        substr(my_frame$context,wordLength-fraction_length+1,wordLength))  
    out_rownames <- paste0(my_frame$types," ",my_frame$subtypes)
    return(out_rownames)
}


#' Change rownames from one naming convention to another
#'
#' The function \code{transform_rownames_MATLAB_to_R} changes from the 
#' convention used in Alexandrov et al. in the MATLAB framework to the one used
#' by the YAPSA pacakge.
#' 
#' @export
#' @rdname transform_rownames_R_to_MATLAB
#'
transform_rownames_MATLAB_to_R <- function(in_rownames,wordLength=3) {
    out_rownames <- in_rownames
    temp_frame <- data.frame(strsplit(in_rownames," "))
    colnames(temp_frame) <- in_rownames
    my_frame <- as.data.frame(t(temp_frame[,]))
    colnames(my_frame) <- c("exchange","context")
    my_frame$types <- paste0(substr(my_frame$exchange,1,1),
                                substr(my_frame$exchange,3,3))
    ## adapt for word length here
    fraction_length <- floor(wordLength/2)
    my_frame$subtypes <- paste0(
        substr(my_frame$context,1,fraction_length),".",
        substr(my_frame$context,wordLength-fraction_length+1,wordLength))  
    out_rownames <- paste0(my_frame$types," ",my_frame$subtypes)
    return(out_rownames)
}


#' Change rownames from one naming convention to another
#'
#' The function \code{transform_rownames_MATLAB_to_R} changes from the 
#' convention used in stored mutational catalogues by Alexandrov et al. to the 
#' one used by the YAPSA pacakge.
#'
#' @export
#' @rdname transform_rownames_R_to_MATLAB
#'
transform_rownames_nature_to_R <- function(in_rownames,wordLength=3) {
    out_rownames <- in_rownames
    ## adapt for word length here
    fraction_length <- floor(wordLength/2)
    string_length <- nchar(as.character(in_rownames[1]))
    my_frame <- repeat_df(in_value = "a",in_rows = length(in_rownames),
                            in_cols = 2)
    colnames(my_frame) <- c("types","subtypes")
    my_frame$subtypes <- paste0(
        substr(in_rownames,1,fraction_length),
        substr(in_rownames,fraction_length+2,fraction_length+2),
        substr(in_rownames,string_length-fraction_length+1,string_length))  
    my_frame$types <- substr(in_rownames,fraction_length+2,fraction_length+4)
    out_rownames <- paste0(my_frame$types," ",my_frame$subtypes)
    return(out_rownames)
}


#' Change rownames from one naming convention to another
#'
#' The function \code{transform_rownames_YAPSA_to_deconstructSigs} changes from
#' the convention used in the YAPSA package to the one used by the 
#' deconstructSigs package.
#'
#' @export
#' @rdname transform_rownames_R_to_MATLAB
#'
transform_rownames_YAPSA_to_deconstructSigs <- function(in_rownames,
                                                        wordLength=3) {
    out_rownames <- in_rownames
    ## adapt for word length here
    fraction_length <- floor(wordLength/2)
    string_length <- nchar(as.character(in_rownames[1]))
    my_frame <- repeat_df(in_value = "a",in_rows = length(in_rownames),
                            in_cols = 2)
    colnames(my_frame) <- c("types","subtypes")
    my_frame$subtypes <- lapply(in_rownames,function(x) strsplit(x," ")[[1]][2])
    my_frame$types <- lapply(in_rownames,function(x) strsplit(x," ")[[1]][1])
    out_rownames <- paste0(
        substr(my_frame$subtypes,1,fraction_length),"[",
        my_frame$types,"]",
        substr(my_frame$subtypes,fraction_length+2,2*fraction_length+1))
    return(out_rownames)
}


#' Change rownames from one naming convention to another
#'
#' The function \code{transform_rownames_YAPSA_to_deconstructSigs} changes from
#' the convention used in the deconstructSigs package to the one used by the 
#' YAPSA pacakge.
#'
#' @export
#' @rdname transform_rownames_R_to_MATLAB
#'
transform_rownames_deconstructSigs_to_YAPSA <- function(in_rownames,
                                                        wordLength=3) {
    out_rownames <- in_rownames
    ## adapt for word length here
    fraction_length <- floor(wordLength/2)
    string_length <- nchar(as.character(in_rownames[1]))
    my_frame <- repeat_df(in_value = "a",in_rows = length(in_rownames),
                            in_cols = 2)
    colnames(my_frame) <- c("types","subtypes")
    my_frame$subtypes <- paste0(
        substr(in_rownames,1,fraction_length),
        substr(in_rownames,2+fraction_length,2+fraction_length),
        substr(in_rownames,6+fraction_length,5+2*fraction_length))
    my_frame$types <- substr(in_rownames,2+fraction_length,4+fraction_length)
    out_rownames <- paste0(my_frame$types," ",my_frame$subtypes)  
    return(out_rownames)
}


#' Normalize Somatic Motifs with different rownames
#'
#' This is a wrapper function to
#' \code{\link[SomaticSignatures]{normalizeMotifs}}. The rownames are first
#' transformed to fit the convention of the
#' \code{\link[SomaticSignatures]{SomaticSignatures}} package and then passed on
#' to the above mentioned function.
#'
#' @param in_matrix,in_norms Arguments to
#'   \code{\link[SomaticSignatures]{normalizeMotifs}}
#' @param adjust_counts Whether to rescale the counts after adaption or not.
#'   Default is true.
#'
#' @return The matrix returned by
#' \code{\link[SomaticSignatures]{normalizeMotifs}}, but with rownames
#' transformed back to the convention of the input
#'
#' @examples
#'  NULL
#'
#' @importFrom SomaticSignatures normalizeMotifs
#' @export
#' 
normalizeMotifs_otherRownames <- function(in_matrix,
                                            in_norms,
                                            adjust_counts=TRUE) {
    my_matrix <- in_matrix
    rownames(my_matrix) <- transform_rownames_MATLAB_to_R(rownames(in_matrix))
    out_matrix <- SomaticSignatures::normalizeMotifs(my_matrix,in_norms)
    rownames(out_matrix) <- rownames(in_matrix)
    if(adjust_counts){
        total_counts <- colSums(in_matrix)
        intermediate_counts <- colSums(out_matrix)
        mult_factors <- total_counts/intermediate_counts
        out_matrix <- as.data.frame(t(t(out_matrix)*mult_factors))
    }
    return(out_matrix)
}


adjust_number_of_columns_in_list_of_catalogues <- 
    function(in_all_list, in_merged_results_list) {
    out_list <- list()
    number_of_strata <- length(in_all_list$results_lists_list)
    reference_matrix <- in_merged_results_list$matrix
    zero_df <- as.data.frame(reference_matrix-reference_matrix)
    colname_vector <- colnames(reference_matrix)
    for (i in seq_len(number_of_strata)) {
        temp_df <- as.data.frame(in_all_list$results_lists_list[[i]]$matrix)
        match_ind <- match(names(temp_df),colname_vector)
        new_df <- zero_df
        new_df[,match_ind] <- temp_df
        out_list[[i]] <- new_df
    }
    return(out_list)
}


save_mutation_catalogue <- function(in_table,in_matrix,in_frame,
                                    in_subgroup_flag,in_meta_info,in_subDir,
                                    in_verbose) {
    type_df <- aggregate(data=in_table,subgroup~PID, function(l) return(l[1]))
    if (dim(type_df)[1] != length(colnames(in_matrix))) {
        cat("YAPSA:::save_mutation_catalogue::Warning: 
            mismatch between subgroup",
            " information and colnames(in_matrix).\n");
    }
    subgroup_vector <- rep("dummy",dim(in_matrix)[2])
    if (in_subgroup_flag==1) {
        subgroup_vector <- 
            as.character(in_meta_info$Diagnosis[match(colnames(in_matrix),
                                                in_meta_info$PID)])
    }
    if (in_subgroup_flag==2) {
        subgroup_vector <- 
            as.character(type_df$subgroup[match(colnames(in_matrix),
                                                type_df$PID)])
    }
    # store to text files
    if ( in_verbose==1 ) {cat("Store to specified files...\n");}
    save(in_table,in_matrix,file=file.path(in_subDir,"MotifMatrix.RData"))
    write.table(in_matrix,sep="\t",file=file.path(in_subDir,"MotifMatrix.txt"),
                col.names=FALSE,row.names=FALSE)
    write(c("PIDs",colnames(in_matrix)),sep="\t",
        file=file.path(in_subDir,"MotifMatrix_colnames.txt"))
    write(c("subgroup",subgroup_vector),sep="\t",
        file=file.path(in_subDir,"MotifMatrix_subgroups.txt"))
    write(c("types",in_frame$types),sep="\t",
        file=file.path(in_subDir,"MotifMatrix_types.txt"))
    write(c("subtypes",in_frame$subtypes),sep="\t",
        file=file.path(in_subDir,"MotifMatrix_subtypes.txt"))
}


stratify_vcf_like_df <- function(in_table,in_column_name,in_verbose=0) {
    out_table_list <- list()
    out_name_list <- list()
    stratification_index <- which(names(in_table)==in_column_name)
    name_list <- unique(in_table[,stratification_index])
    index_range <- sort(name_list)
    counter <- 0
    if(in_verbose == 1) {
        cat("\nYAPSA:::stratify_vcf_like_df::Stratify...\n")
    }
    for (i in index_range) {
        counter <- counter + 1
        temp_ind <- which(in_table[,stratification_index]==i)
        temp_table <- in_table[temp_ind,]
        temp_name <- i
        if(in_verbose == 1) {
            cat("YAPSA:::stratify_vcf_like_df:: temp_name =",temp_name,
                "; number of variants:",dim(temp_table)[1],"; number of PIDs:",
                length(unique(temp_table$PID)),"\n")
        }
        out_table_list[[counter]] <- temp_table
        out_name_list[[counter]] <- temp_name
    }    
    return(list(table_list=out_table_list,name_list=out_name_list))
}


save_stratified_vcf_like_df <- function(in_table_list,in_name_list,
                                        in_target_dir,in_verbose) {
    number_of_strata <- length(in_table_list)
    subDir <- file.path(in_target_dir, "vcf_like")
    strataDir <- file.path(in_target_dir, "strata")
    dir.create(in_target_dir)
    dir.create(subDir)
    dir.create(strataDir)
    if(in_verbose == 1) {
        cat("\nYAPSA:::save_stratified_vcf_like_df::Write to file...\n")
    }
    for (i in seq_len(number_of_strata)) {
        temp_table <- in_table_list[[i]]
        temp_name <- in_name_list[[i]]
        if(in_verbose == 1) {
            cat("YAPSA:::save_stratified_vcf_like_df:: 
                temp_name =",temp_name,"; number of variants:",
                dim(temp_table)[1],"; number of PIDs:",
                length(unique(temp_table$PID)),"\n")
        }
        file_name <- paste0(subDir,"/somatic_SNVs_",temp_name,".tsv")
        names(temp_table) <- sub("X.CHROM","\\#CHROM",names(temp_table))
        write.table(temp_table, file=file_name,sep="\t",
                    quote =FALSE, row.names=FALSE)
        temp_dir <- file.path(in_target_dir, "strata", temp_name)
        dir.create(temp_dir)
    }    
    return(1)
}


stratify_and_create_mutational_catalogue <- 
    function(our_table, our_column_name, vcf_target_dir, strata_target_dir,
            our_refGenome_Seqinfo, our_seqnames.field="X.CHROM",
            our_start.field="POS", our_end.field="POS", our_PID.field="PID",
            our_subgroup.field="subgroup", our_refGenome, our_wordLength,
            in_verbose=1, our_rownames=c()) {
    results_lists_list <- list()
    results_list <- stratify_vcf_like_df(our_table,our_column_name,in_verbose)
    table_list <- results_list$table_list
    name_list <- results_list$name_list
    if(!is.null(vcf_target_dir)) {
        status <- save_stratified_vcf_like_df(table_list,name_list,
                                            vcf_target_dir,in_verbose)    
    }
    number_of_strata <- length(table_list)
    if(in_verbose == 1) {
        cat("\nYAPSA:::stratify_and_create_mutational_catalogue::Create ",
            "mutational catalogues...\n")
    }
    for (i in seq_len(number_of_strata)) {
        if(in_verbose == 1) {
            cat("\nYAPSA:::stratify_and_create_mutational_catalogue::
                Processing ","stratum: ",i,"\n")
    }
        temp_table <- table_list[[i]]
        temp_name <- name_list[[i]]
        temp_results_list <- create_mutation_catalogue_from_df(
            temp_table,
            our_refGenome_Seqinfo,
            this_seqnames.field=our_seqnames.field,
            this_start.field=our_start.field,
            this_end.field=our_end.field,
            this_PID.field=our_PID.field,
            this_subgroup.field=our_subgroup.field,
            our_refGenome,our_wordLength,in_verbose,
            our_rownames)
        if(!is.null(strata_target_dir)) {
            temp_matrix <- temp_results_list$matrix
            temp_frame <- temp_results_list$frame
            MotifMatrix_dir <- file.path(strata_target_dir,temp_name)
            save_mutation_catalogue(temp_table,temp_matrix,temp_frame,
                                    2,NULL,MotifMatrix_dir,in_verbose)
        }
        results_lists_list[[i]] <- temp_results_list
    }    
    return(list(table_list=table_list,name_list=name_list,
                results_lists_list=results_lists_list))  
}


#' Wrapper for cut
#'
#' In this wrapper function for the known \code{\link{cut}} function, the
#' \code{breaks} vector need not be supplied directly, instead, for every break,
#' an interval is supplied and the function optimizes the choice of the
#' breakpoint by chosing a local minimum of the distribution.
#'
#' @param in_vector Vector of numerical continuously distributed input
#' @param in_outlier_cutoffs Interval specifyinf the upper and lower bounds of
#'   the range to be considered
#' @param in_cutoff_ranges_list List if intervals in which the cutoffs for
#'   \code{\link{cut}} have to be optimized.
#' @param in_labels Labels assigned to the strata or factors returned
#' @param in_name String specifying the name of the quantity analyzed (and
#'   plotted on the x-axis of the figure to be created).
#' @param output_path Path where the figure produced by the density function
#'   should be stored if non-NULL.
#'
#' @return A list with entries \code{category_vector}, and \code{density_plot}
#'   and \code{cutoffs} \itemize{ \item \code{category_vector}: Factor vector of
#'   the categories or strata, of the same length as \code{in_vector} \item
#'   \code{density_plot}: Density plot produced by the density function and
#'   indication of the chosen cutoffs. \item \code{cutoffs}: Vector of the
#'   computed optimal cutoffs }
#'
#' @examples
#'  data(lymphoma_test)
#'  lymphoma_test_df$random_norm <- rnorm(dim(lymphoma_test_df)[1])
#'  temp_list <- cut_breaks_as_intervals(
#'    lymphoma_test_df$random_norm,
#'    in_outlier_cutoffs=c(-4,4),
#'    in_cutoff_ranges_list=list(c(-2.5,-1.5),c(0.5,1.5)),
#'    in_labels=c("small","intermediate","big"))
#'   temp_list$density_plot
#'
#' @seealso \code{\link{cut}}
#' @seealso \code{\link[stats]{density}}
#'
#' @export
#' 
cut_breaks_as_intervals <- function(in_vector,
                                    in_outlier_cutoffs=c(0,3000),
                                    in_cutoff_ranges_list=list(c(60,69),
                                                                c(25,32)),
                                    in_labels=c("late","intermediate","early"),
                                    in_name="",
                                    output_path=NULL) {
    ## prepare
    .e <- environment()
    finite_ind <- which(is.finite(in_vector))
    finite_vector <- in_vector[finite_ind]
    choice_ind <- which(finite_vector<=in_outlier_cutoffs[2] &
                        finite_vector>=in_outlier_cutoffs[1])
    my_density <- density(finite_vector[choice_ind])
    my_density_df <- data.frame(x=my_density$x,y=my_density$y)
    ## find appropriate cutoffs in the ranges provided (refine!!)
    number_of_cutoffs <- length(in_cutoff_ranges_list)
    my_cutoff_vector <- rep(0,number_of_cutoffs)
    for(j in seq_len(number_of_cutoffs)) {
        my_cutoff_interval <- in_cutoff_ranges_list[[j]]
        my_range_ind <- which(my_density_df$x<my_cutoff_interval[2] & 
                                my_density_df$x>my_cutoff_interval[1])
        my_min_ind <- which.min(my_density_df$y[my_range_ind])
        my_cutoff_vector[j] <- my_density_df$x[my_range_ind[my_min_ind]]
    }
    ## make density plot
    p <- ggplot(my_density_df,aes_string(x="x",y="y"), environment = .e) + 
        geom_line() + labs(x=in_name, y="density") + 
        geom_vline(xintercept=my_cutoff_vector,colour="red")
    if(!is.null(output_path)) {
        png(output_path,width=400,height=400); p; dev.off()    
    }
    my_cutoff_vector <- c(in_outlier_cutoffs[1],my_cutoff_vector,
                            in_outlier_cutoffs[2])
    ## discretize into categories
    cut_vector <- cut(in_vector,my_cutoff_vector,right=FALSE,labels=in_labels)
    cut_vector <- as.character(cut_vector)
    cut_vector[which(is.na(cut_vector))] <- "undetermined"
    cut_vector <- factor(cut_vector)
    return(list(category_vector=cut_vector,density_plot=p,
                cutoffs=my_cutoff_vector))
}


#' Return those PIDs which have an extreme pattern for signature exposure
#'
#' For all signatures found in a project, this function returns the sample
#' identifiers (PIDs) with extremely high or extremely low exposures of the
#' respective signatures.
#'
#' @param in_exposures_df Data frame with the signature exposures
#' @param in_quantile Quantile for the amount of extreme PIDs to be selected.
#'
#' @return A data frame with 4 rows per signature (high PIDs, high exposures,
#'   low PIDs, low exposures); the number of columns depends on the quantile
#'   chosen.
#'
#' @examples
#'  data(lymphoma_cohort_LCD_results)
#'  get_extreme_PIDs(lymphoma_Nature2013_COSMIC_cutoff_exposures_df,0.05)
#'
#' @import ggplot2
#' @export
#' 
get_extreme_PIDs <- function(in_exposures_df,in_quantile=0.03) {
    my_number_of_sigs <- dim(in_exposures_df)[1]
    my_number_of_PIDs <- dim(in_exposures_df)[2]
    my_number_of_extremes <- ceiling(in_quantile*my_number_of_PIDs)
    extreme_PID_df <- repeat_df(0,my_number_of_extremes,my_number_of_sigs*4)
    for(i in seq_len(my_number_of_sigs)) {
        this_sig <- rownames(in_exposures_df)[i]
        my_exposure_vector <- in_exposures_df[i,]
        my_breaks <- as.numeric(quantile(my_exposure_vector,
                        probs=c(in_quantile,(1-in_quantile))))
        extreme_low_ind <- which(my_exposure_vector<=my_breaks[1])
        extreme_high_ind <- which(my_exposure_vector>=my_breaks[2])
        extreme_PID_df[,4*i-3] <- names(
            my_exposure_vector[extreme_low_ind[seq_len
                                                (my_number_of_extremes)]])
        names(extreme_PID_df)[4*i-3] <- paste0(this_sig,"_low_PIDs")
        extreme_PID_df[,4*i-2] <- as.numeric(
            my_exposure_vector[extreme_low_ind[seq_len
                                                (my_number_of_extremes)]])
        names(extreme_PID_df)[4*i-2] <- paste0(this_sig,"_low_exp")
        extreme_PID_df[,4*i-1] <- names(
            my_exposure_vector[extreme_high_ind[seq_len
                                                (my_number_of_extremes)]])
        names(extreme_PID_df)[4*i-1] <- paste0(this_sig,"_high_PIDs")
        extreme_PID_df[,4*i] <- as.numeric(
            my_exposure_vector[extreme_high_ind[seq_len
                                                (my_number_of_extremes)]])
        names(extreme_PID_df)[4*i] <- paste0(this_sig,"_high_exp")
    }
    out_df <- data.frame(t(extreme_PID_df))
    names(out_df) <- paste0("PID_",seq_len(dim(out_df)[2]))
    return(out_df)
}


#' Test if mutated PIDs are enriched in signatures
#'
#' For all signatures found in a project, this function tests whether PIDs
#' having mutations in a specified list of genes of interest have significantly
#' higher exposures.
#'
#' @param in_gene_list List with genes of interest
#' @param in_exposure_df Data frame with the signature exposures
#' @param in_mut_table Data frame or table of mutations (derived from
#'   vcf-format)
#' @param in_gene.field Name of the column in which the gene names are to be
#'   looked up
#' @param in_p_cutoff Significance threshold
#'
#' @return A list with entries \code{pvals}, \code{exposure_df},
#'   \code{number_of_mutated}, \itemize{ \item \code{pvals}: p-values of the
#'   t-tests performed on mutated vs. unmutated PIDs \item \code{exposure_df}:
#'   Transposed input exposures data frame with additional annotations for
#'   mutation status \item \code{number_of_mutated}: Number of PIDs carrying a
#'   mutation }
#'
#' @examples
#'  NULL
#'
#' @export
#' 
test_gene_list_in_exposures <- function(in_gene_list,
                                        in_exposure_df,
                                        in_mut_table,
                                        in_gene.field="GENE_short",
                                        in_p_cutoff=0.05) {
    gene_index <- which(names(in_mut_table)==in_gene.field)
    number_of_signatures <- dim(in_exposure_df)[1]
    choice_ind <- which(in_mut_table[,gene_index] %in% in_gene_list)
    #choice_table <- in_mut_table[choice_ind,c(1:5,16:18,20,24,26,29)]
    choice_table <- in_mut_table[choice_ind,]
    this_PIDs <- unique(choice_table$PID)
    temp_ind <- match(this_PIDs,names(in_exposure_df))
    temp_ind_ind <- which(temp_ind>0)
    cat("test_gene_list_in_exposures: number of PIDs in overlap ",
        "between exposures, gene list and mutation table: ",
        length(temp_ind_ind),"\n")
    this_exposure_df <- as.data.frame(t(in_exposure_df))
    this_exposure_df$this_status <- "none"
    this_exposure_df$this_col <- "green"
    this_exposure_df$this_status[temp_ind[temp_ind_ind]] <- "mut"
    this_exposure_df$this_col[temp_ind[temp_ind_ind]] <- "red"
    ## now check for differences in signature counts
    significance_counter <- 0
    out_t_test_vector <- c()
    for (i in seq_len(number_of_signatures)) {
        temp_test_dat <- t.test(
            this_exposure_df[which(this_exposure_df$this_status=="none"),i],
            this_exposure_df[which(this_exposure_df$this_status=="mut"),i],
            alternative="less")
        out_t_test_vector[i] <- temp_test_dat$p.value
        if(out_t_test_vector[i] < in_p_cutoff & !is.na(out_t_test_vector[i])) {
            significance_counter <- significance_counter + 1
        }
    }
    cat("test_gene_list_in_exposures: Number of significancies: ",
        significance_counter,"\n")
    return(list(pvals=out_t_test_vector,
                exposure_df=this_exposure_df,
                number_of_mutated=length(temp_ind_ind)))
}


#' Compare one mutational catalogue to reference mutational catalogues
#'
#' Compare one mutational catalogue (e.g. of one index patient) to a list of
#' reference mutational catalogues (e.g. from the initial Alexandrov
#' puplication) by cosine similarities
#'
#' @param in_index_df Data frame containing the mutational catalogue of interest
#' @param in_comparison_list List of data frames (ideally named) containing the
#'   reference mutational catalogues
#'
#' @return A similarity dataframe
#'
#' @examples
#'  NULL
#'
#' @export
#' 
compare_to_catalogues <- function(in_index_df,in_comparison_list){
    sim_vec_list <- lapply(in_comparison_list,FUN=function(x){
        temp_list <- compare_sets(in_index_df,x)
        return(c(as.numeric(1-as.matrix(temp_list$distance))))
    })
    sim_df_list <- list()
    for(i in seq_along(sim_vec_list)) {
        my_entity <- names(sim_vec_list)[i]
        temp_df <- data.frame(entity=my_entity,sim=sim_vec_list[[i]])
        sim_df_list[[i]] <- temp_df
    }
    sim_df <- do.call(rbind,sim_df_list)
    return(sim_df)
}


#' Extract statistical measures for entity comparison
#'
#' Compare one mutational catalogue (e.g. of one index patient) to a list of
#' reference mutational catalogues (e.g. from the initial Alexandrov
#' puplication) by cosine similarities
#'
#' @param in_sim_df A similarity data frame as extracted by
#'   \code{\link{compare_to_catalogues}}
#'
#' @return A dataframe containing statistical measures, prepared for bar plot
#'
#' @examples
#'  NULL
#'
#' @export
#' 
compute_comparison_stat_df <- function(in_sim_df){
    mean_df <- aggregate(sim~entity,data=in_sim_df,FUN=mean)
    median_df <- aggregate(sim~entity,data=in_sim_df,FUN=median)
    sd_df <- aggregate(sim~entity,data=in_sim_df,FUN=sd)
    stat_df <- mean_df
    stat_df$median_sim <- median_df$sim
    stat_df$sd_sim <- sd_df$sim
    names(stat_df)[2] <- "mean_sim"
    return(stat_df)
}


#' Aggregate exposures by category
#'
#' If a valid category (i.e. it matches to a category specified in
#' in_sig_ind_df) is supplied, then the exposures are aggregated over this
#' category.
#'
#' @param in_exposures_df Input data frame of exposures.
#' @param in_sig_ind_df Input data frame of meta information on the signatures.
#'   Has to match the signatures in \code{in_exposures_df}
#' @param in_category Category to be aggregated over
#'
#' @return A list with entries: \itemize{ \item \code{exposures}: The exposures
#'   \code{H}, a numeric data frame with \code{l} rows and \code{m} columns,
#'   \code{l} being the number of aggregated signatures and \code{m} being the
#'   number of samples \item \code{norm_exposures}: The normalized exposures
#'   \code{H}, a numeric data frame with \code{l} rows and \code{m} columns,
#'   \code{l} being the number of aggregated signatures and \code{m} being the
#'   number of samples \item \code{out_sig_ind_df}: Data frame of the type
#'   \code{signature_indices_df}, i.e. indicating name, function and
#'   meta-information of the aggregated signatures.. }
#'
#' @seealso \code{\link{LCD_complex_cutoff}}
#'
#' @examples
#'  NULL
#'
#' @export
#' 
aggregate_exposures_by_category <- function(
    in_exposures_df,in_sig_ind_df,in_category){
    if(any(in_category==names(in_sig_ind_df))) {
        ## process exposures
        temp_exposures <- cbind(
            in_exposures_df,in_sig_ind_df[,in_category,drop=FALSE])
        names(temp_exposures) <- gsub(in_category,"cat",names(temp_exposures))
        temp_aggregate_exposures <- aggregate(.~cat,data=temp_exposures,FUN=sum)
        rownames(temp_aggregate_exposures) <- temp_aggregate_exposures$cat
        temp_aggregate_exposures$cat <- NULL
        ## normalize exposures
        norm_aggregate_exposures <- 
            normalize_df_per_dim(temp_aggregate_exposures, 2)
        ## process sig_ind_df
        names(in_sig_ind_df) <- gsub(in_category,"cat",names(in_sig_ind_df))
        out_sig_ind_df <- aggregate(.~cat,data=in_sig_ind_df,FUN=head,1)
        out_sig_ind_df$sig <- out_sig_ind_df$cat
        out_sig_ind_df$cat <- NULL
        out_sig_ind_df$colour <- as.character(out_sig_ind_df$colour)
        return(list(exposures=temp_aggregate_exposures,
                    norm_exposures=norm_aggregate_exposures,
                    out_sig_ind_df=out_sig_ind_df))
    } else {
        return(NULL)
    }
}