R/plots_coverage.R

Defines functions plot_template_cvg_mismatches prepare_template_cvg_mm_data plot_template_cvg_unstratified plot.excluded.hist plot_primer get_plot_primer_data plot_primer_subsets get_cvg_stats_primer get_template_cvg_data prepare_template_cvg_data prepare_mm_plot primer.binding.regions.data cbind.fill plot_penalty_vs_set_size plot_cvg_vs_set_size

Documented in get_cvg_stats_primer plot_cvg_vs_set_size plot_penalty_vs_set_size plot_primer plot_primer_subsets

#' @rdname Plots
#' @name Plots
#' @aliases plot_cvg_vs_set_size
#' @return \code{plot_cvg_vs_set_size} returns a plot of coverage vs set size.
#' @export
#' @examples
#' 
#' # Plot coverage vs primer set size
#' data(Comparison)
#' p <- plot_cvg_vs_set_size(primer.data, template.data)
plot_cvg_vs_set_size <- function(primer.data, template.data, show.labels = TRUE, highlight.set = NULL) {
    if (length(primer.data) == 0 || length(template.data) == 0) {
        return(NULL)
    }
    # check type of primer and template data
    template.classes <- sapply(template.data, function(x) class(x))
    primer.classes <- sapply(primer.data, function(x) class(x))
    if (any(template.classes != "Templates") || any(primer.classes != "Primers")) {
        stop("Check types of primers/templates.")
    }
    cvg <- unlist(lapply(seq_along(primer.data), function(x) get_cvg_ratio(primer.data[[x]], 
                                                      template.data[[x]])))
    set.size <- unlist(lapply(primer.data, function(primer.df) nrow(primer.df)))
    set.names <- get.run.names(primer.data)
    plot.df <- data.frame("Run" = set.names, "Coverage" = cvg, "Set_Size" = set.size)
    title <- "Coverage vs set size"
     if (length(highlight.set) != 0) {
        # check whether highlight set is specified correctly
        m <- match(highlight.set, plot.df$Run)
        na.idx <- which(is.na(m))
        if (length(na.idx) != 0) {
            msg <- paste("Highlight set not available in data:",
                paste(highlight.set[na.idx], collapse = ","))
            warning(msg)
            highlight.set <- highlight.set[!is.na(m)]
        }
    }
    pal <- getOption("openPrimeR.plot_colors")["Run"] # the RColorBrewer palette to use
    colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(plot.df[, "Run"])))
    # determine rate of constraint fulfillment:
    fulfilled.rate <- unlist(lapply(primer.data, function(x) {
        fulfilled.counts <- create_fulfilled_counts(x)
        if (nrow(fulfilled.counts) != 0) {
            fulfilled.counts <- fulfilled.counts[!grepl("Run", colnames(fulfilled.counts))]
            ok.names <- which(!grepl("_failure", colnames(fulfilled.counts)))
            rate <- sum(fulfilled.counts[ok.names]) / (nrow(x) * length(ok.names))
        } else {
            # empty primer set
            rate <- 0
        }
    }))
    #print(fulfilled.rate)
    plot.df$Constraint_Fulfillment_Rate <- fulfilled.rate
    p <- ggplot(plot.df, aes_string(x = "Set_Size", y = "Coverage", size = "Constraint_Fulfillment_Rate"), key = substitute(Run)) + 
    # trans = scales::probability_trans("exp")
        scale_radius(limits = c(0,1), range = c(1,10),
            labels = scales::percent, 
            name = "Constraint Fulfillment") + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
        scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
        xlab("Number of primers") +
        ggtitle(title) +
        # only show integer values
        scale_x_continuous(breaks = function(x) unique(as.integer(pretty(x)))) +
        scale_colour_manual(values = colors)

    # decide whether to add some jitter
    overplotted <- sapply(seq_len(nrow(plot.df)), function(x) any(plot.df$Set_Size[x] - plot.df$Set_Size[-x] == 0 & abs(plot.df$Coverage[x] - plot.df$Coverage[-x]) < 0.2))
    if (any(overplotted)) {
        # there's overplotting of points -> add some jitter
        p <- p + geom_point(aes_string(fill = "Run", color = NULL), alpha = 0.65, pch=21, position = position_jitter(width = 0.35, height = 0.0))
    } else {
        p <- p + geom_point(aes_string(fill = "Run", color = NULL), alpha = 0.65, pch=21)
    }
    if (show.labels) {
        p <- p + geom_text(aes_string(label = "Run",
                  hjust = 0, vjust = 0), check_overlap = TRUE, size = 3) 
    }
    if (length(highlight.set) != 0) {
        p <- p + geom_point(data = plot.df[plot.df$Run %in% highlight.set,], colour="black", shape=1, size=5, stroke = 3, alpha = 0.6)  
   } 
    return(p)
}

#' @rdname Plots
#' @name Plots
#' @aliases plot_penalty_vs_set_size
#' @return \code{plot_penalty_vs_set_size} returns a plot of constraint penalties
#' vs primer set sizes.
#' @export
#' @examples
#' 
#' # Plot penalties vs number of primers
#' data(Comparison)
#' p <- plot_penalty_vs_set_size(primer.data, settings)
plot_penalty_vs_set_size <- function(primer.data, settings, 
        active.constraints = names(constraints(settings)), alpha = 0) {
    if (length(primer.data) == 0) {
        return(NULL)
    }
    # check types
    primer.classes <- sapply(primer.data, function(x) class(x))
    if (any(primer.classes != "Primers")) {
        stop("Ensure that 'primer.data' is a list of 'Primers' objects.")
    }
    set.size <- unlist(lapply(primer.data, function(primer.df) rep(nrow(primer.df), nrow(primer.df))))
    penalty.data <- lapply(primer.data, function(primer.df) score_primers(primer.df, settings, active.constraints = active.constraints, alpha = alpha))
    penalties <- unlist(lapply(penalty.data, function(x) x$Penalty))
    run.names <- get.run.names(primer.data)
    run.names <- unlist(lapply(seq_along(run.names), function(x) rep(run.names[x], nrow(primer.data[[x]]))))
    plot.df <- data.frame("Run" = run.names, "Penalty" = penalties, "Set_Size" = set.size)
    title <- "Penalty vs set size"
    pal <- getOption("openPrimeR.plot_colors")["Run"] # the RColorBrewer palette to use
    colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(plot.df[, "Run"])))
    # determine rate of constraint fulfillment:
    p <- ggplot(plot.df, aes_string(x = "Set_Size", y = "Penalty", fill = "Run", group = "Run")) + 
        geom_boxplot() + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
        xlab("Number of primers") +
        ggtitle(title) +
        # only show integer values
        scale_x_continuous(breaks = function(x) unique(as.integer(pretty(x)))) +
        scale_fill_manual(values = colors)
   return(p)
}

cbind.fill <- function(...){
    nm <- list(...) 
    nm <- lapply(nm, as.matrix)
    n <- max(sapply(nm, nrow)) 
    do.call(cbind, lapply(nm, function (x) 
        rbind(x, matrix(, n-nrow(x), ncol(x))))) 
}

#' Primer Binding Region Data
#'
#' Collects all data concerning primer binding regions.
#'
#' @param primer.df Primer data frame.
#' @param template.df Template data frame.
#' @param direction Primer direction
#' @param group The groups for which binding data shall be retrieved.
#' @param relation Binding region data relative to forward/reverse binding region?
#' @return Data frame with primer binding data.
#' @keywords internal
primer.binding.regions.data <- function(primer.df, template.df, 
    direction = c("both", "fw", "rev"), group = NULL, relation = c("fw", "rev")) {
    if (length(relation) == 0) {
        stop("Please provide the 'relation' arg.")
    }
    relation <- match.arg(relation)
    if (length(direction) == 0) {
        stop("Please provide the 'direction' arg.")
    }
    direction <- match.arg(direction)
    if (length(primer.df) == 0 || nrow(primer.df) == 0) {
        return(NULL)
    }
    if (!"primer_coverage" %in% colnames(primer.df)) {
        stop(primer.df$Run[1], ": Please compute the primer coverage first.")
    }
    if (!is.null(group) && !"all" %in% group) { # select template subset
        idx <- which(template.df$Group %in% group)
        template.df <- template.df[idx,]
    }
    use.location.cols <- NULL
    if (relation == "fw") {
        use.location.cols <- c("Relative_Forward_Binding_Position_Start_fw", "Relative_Forward_Binding_Position_End_fw", 
            "Relative_Forward_Binding_Position_Start_rev", "Relative_Forward_Binding_Position_End_rev")
    } else {
        use.location.cols <- c("Relative_Reverse_Binding_Position_Start_fw", "Relative_Reverse_Binding_Position_End_fw", 
            "Relative_Reverse_Binding_Position_Start_rev", "Relative_Reverse_Binding_Position_End_rev")
    }
    # columns for 'start of binding':
    # 3 prime of primer: start
    # beginning of the target region'
    if (direction == "fw") {
        location.cols <- use.location.cols[c(1,2)]
    } else if (direction == "rev") {
        location.cols <- use.location.cols[c(3,4)]
    } else {
        location.cols <- use.location.cols
    }
    # select primer data belonging to the right group
    idx <- covered.seqs.to.idx(primer.df$Covered_Seqs, template.df)
    dfs <- do.call(rbind, lapply(seq_along(idx), function(x) if (length(idx[[x]]) == 0) 
        NULL else 
            data.frame(ID = primer.df$ID[x], Group = template.df[idx[[x]], "Group"], 
        StringIdx = seq_along(idx[[x]]))))  # id of primer + gene group
    if (any(!location.cols %in% colnames(primer.df))) {
        warning("Primer set doesn't have primers of indicated direction for plotting.")
        return(NULL)
    }
    # remove NA group entries
    if (length(dfs) != 0) {
        dfs <- dfs[!is.na(dfs$Group), ]
    }
    locations <- apply(asS3(primer.df)[, location.cols, drop = FALSE], 2, function(x) strsplit(as.character(x), 
        split = ","))
    all.IDs <- primer.df$ID
    if (length(group) != 0 && !"all" %in% group) {
        all.IDs <- dfs$ID
        m <- match(all.IDs, primer.df$ID)
        ## identify binding positions for the used location cols (fw and rev binding)
        locations <- lapply(seq_along(locations), function(y) lapply(seq_along(m),
                        function(x) {
                            pos <- locations[[y]][[m[x]]]
                            if (length(pos) != 0) {
                                pos[dfs$StringIdx[x]]
                            } else {
                                NULL
                            }
                        }
                    ))
    }
    my.locations <- lapply(locations, function(x) as.numeric(unlist(x)))
    my.IDs <- NULL
    cvg.per.primer <- lapply(locations, function(y) unlist(lapply(y, length)))
    IDs <- unlist(lapply(seq_along(cvg.per.primer), 
        function(x) unlist(lapply(seq_along(cvg.per.primer[[x]]), function(y) rep(all.IDs[y], cvg.per.primer[[x]][y])))))

    if (length(location.cols) == 2) {
        # one direction
        my.IDs <- all.IDs[unlist(lapply(seq_along(locations[[1]]), function(x)
                         rep(x, length(unlist(locations[[1]][x])))))]
        directions <- unlist(lapply(seq_along(cvg.per.primer[[1]]),    
                        function(y) rep(primer.df$Direction[y], cvg.per.primer[[1]][y])))
        location.df <- data.frame(ID = my.IDs, Direction = directions, do.call(cbind, my.locations))
    } else {
        # both directions
        my.IDs.fw <- all.IDs[unlist(lapply(seq_along(locations[[1]]), function(x)
                         rep(x, length(unlist(locations[[1]][x])))))]
         my.IDs.rev <- all.IDs[unlist(lapply(seq_along(locations[[3]]), function(x)
                         rep(x, length(unlist(locations[[3]][x])))))]
        my.IDs <- c(my.IDs.fw, my.IDs.rev)
        directions.fw <- unlist(lapply(seq_along(cvg.per.primer[[1]]),    
                        function(y) rep(primer.df$Direction[y], cvg.per.primer[[1]][y])))
        directions.rev <- unlist(lapply(seq_along(cvg.per.primer[[3]]),    
                        function(y) rep(primer.df$Direction[y], cvg.per.primer[[3]][y])))
        location.df.fw <- data.frame(ID = my.IDs.fw, Direction = directions.fw, cbind(my.locations[[1]], my.locations[[2]]))
        location.df.rev <- data.frame(ID = my.IDs.rev, Direction = directions.rev, cbind(my.locations[[3]], my.locations[[4]]))
        location.df <- rbind(location.df.fw, location.df.rev)
    }
    colnames(location.df) <- c("ID", "Direction", "Start", "End")
    if (relation == "rev") {
        old.cnames <- colnames(location.df)
        colnames(location.df)[old.cnames == "Start"] <- "End"
        colnames(location.df)[old.cnames == "End"] <- "Start"
    }
    if (nrow(location.df) == 0) {
        return(NULL)
    }
    return(location.df)
}

#' Data Preparation for Mismatch Plot.
#' @param primer.df A \code{Primers} object.
#' @param template.df A \code{Templates} object.
#' @param mode Whether to compute for on-target or off-target events.
#' @return A data frame with binding information for every primer.
#' @keywords internal
prepare_mm_plot <- function(primer.df, template.df, 
                    mode = c("on_target", "off_target")) {
    if (length(mode) == 0) {
        mode <- "on_target"
    } else {
        mode <- match.arg(mode)
    }
    all.mm.cols <- c("Mismatch_pos_fw", "Mismatch_pos_rev")
    if (mode == "off_target") {
        all.mm.cols <- paste0("Off_", all.mm.cols)
    }
    all.cvg.cols <- c("Covered_Seqs")
    if (mode == "off_target") {
        all.cvg.cols <- paste0("Off_", all.cvg.cols)
    }
    cvg.criteria <- c("off_annealing_DeltaG", "annealing_DeltaG", "off_primer_efficiency", "primer_efficiency", "coverage_model", "off_coverage_model")
    # get constrained coverage data:
    cvg.defs <- c("constrained", "basic")
    dfs <- vector("list", length(cvg.defs))
    for (i in seq_along(cvg.defs)) {
        # define current columns depending on coverage definition
        cvg.def <- cvg.defs[i]
        if (cvg.def == "basic") { 
            cvg.cols <- paste0("Basic_", all.cvg.cols)
            mm.cols <- paste0("Basic_", all.mm.cols)
        } else {
            cvg.cols <- all.cvg.cols
            mm.cols <- all.mm.cols
        }
        if (any(!mm.cols %in% colnames(primer.df))) {
            # no data for selected coverage definition
            next
        }
        abs.mm.pos.fw <- mismatch.string.to.list(primer.df[, mm.cols[1]])
        abs.mm.pos.rev <- mismatch.string.to.list(primer.df[, mm.cols[2]])
        # determine mismatch positions for forward and reverse primers
        fw.idx <- which(primer.df$Direction == "fw")
        rev.idx <- which(primer.df$Direction == "rev")
        both.idx <- which(primer.df$Direction == "both")
        # for primer pairs: consider for every coverage event the fw/rev primer with the higher nbr of mismatches
        # new idea: for 'both' primers: add both entries
        abs.mm.pos <- rep(NA, nrow(primer.df))
        abs.mm.pos[fw.idx] <- abs.mm.pos.fw[fw.idx]
        abs.mm.pos[rev.idx] <- abs.mm.pos.rev[rev.idx]
        abs.mm.pos[both.idx] <- lapply(seq_along(both.idx), function(x) c(abs.mm.pos.fw[both.idx[x]], abs.mm.pos.rev[both.idx[x]]))
        pos.3prime <- rep(NA, length(abs.mm.pos))
        pos.3prime[fw.idx] <- lapply(seq_along(fw.idx), function(x) lapply(abs.mm.pos[[fw.idx[x]]], function(y) nchar(primer.df$Forward[x]) - y + 1))
        pos.3prime[rev.idx] <- lapply(seq_along(rev.idx), function(x) lapply(abs.mm.pos[[rev.idx[x]]], function(y) nchar(primer.df$Reverse[x]) - y + 1))
        pos.3prime[both.idx] <- lapply(both.idx, function(x) unlist(lapply(seq_along(abs.mm.pos[[x]]), function(y) lapply(seq_along(abs.mm.pos[[x]][[y]]),
            function(z) {
                if (y == 1) { # fw primers
                    nchar(primer.df$Forward[x]) - abs.mm.pos[[x]][[y]][[z]] + 1
                } else { # rev primers
                    nchar(primer.df$Reverse[x]) - abs.mm.pos[[x]][[y]][[z]] + 1
                }
            }
         )), recursive = FALSE))
        pos.worst <- lapply(seq_along(abs.mm.pos), function(x) lapply(pos.3prime[[x]], function(y) rep(min(y), length(y))))
        primer.ids <- lapply(seq_along(pos.3prime), function(x) rep(as.character(primer.df$ID[x]), length(unlist(pos.3prime[[x]]))))
        directions <- lapply(seq_len(nrow(primer.df)), function(x) {
            dir <- primer.df$Direction[x]
            rep(dir, primer.df$primer_coverage[x])

            if (dir == "both") {
                c(rep("fw",  length(unlist(abs.mm.pos[[x]][[1]]))), 
                  rep("rev", length(unlist(abs.mm.pos[[x]][[2]]))))
            } else {
                rep(dir, length(unlist(abs.mm.pos[[x]])))
            }
         })
        cvd.idx <- covered.seqs.to.idx(primer.df[, cvg.cols], template.df)
        if (length(unlist(cvd.idx)) == 0) {
            #warning("Nothing covered")
            next
        }
        template.ids <- lapply(seq_len(nrow(primer.df)), function(x) {
            dir <- primer.df$Direction[x]
            if (dir == "both") {
                # fw and rev
                id1 <- unlist(lapply(seq_along(abs.mm.pos[[x]][[1]]), function(y) rep(template.df$ID[cvd.idx[[x]]][y], length(abs.mm.pos[[x]][[1]][[y]]))))
                id2 <- unlist(lapply(seq_along(abs.mm.pos[[x]][[2]]), function(y) rep(template.df$ID[cvd.idx[[x]]][y], length(abs.mm.pos[[x]][[2]][[y]]))))
                ids <- c(id1, id2)
            } else {
                ids <- unlist(lapply(seq_along(abs.mm.pos[[x]]), function(y) rep(template.df$ID[cvd.idx[[x]]][y], length(abs.mm.pos[[x]][[y]]))))
            }
            return(ids)
         })
        # count number of mismatches per primer-template pair
        nbr.mismatches <- lapply(seq_along(pos.3prime), function(x) lapply(pos.3prime[[x]], function(y) rep(length(which(!is.na(y))), length(y))))
        # annotate with group of templates
        m <- match(unlist(template.ids), template.df$ID)
        template.groups <- template.df$Group[m]
		#print(primer.ids) # character on windows
        df <- data.frame(Primer = as.character(unlist(primer.ids)), 
                        Direction = unlist(directions),
                        Template = unlist(template.ids),
                        Group = template.groups,
                        Position_3prime = unlist(pos.3prime),
                        Position_3terminus = unlist(pos.worst),
                        Number_of_mismatches = unlist(nbr.mismatches),
                        Coverage_Type = as.factor(rep(cvg.def, length(template.groups))))
		#print(df$Template)
        # if coverage criteria are available, also include these
        for (z in seq_along(cvg.criteria)) {
            crit <- cvg.criteria[z]
            if (crit %in% colnames(primer.df) && any(!is.na(primer.df[, crit]))) {
                val <- lapply(strsplit(primer.df[, crit], split = ","), as.numeric)
                val <- lapply(seq_along(pos.3prime), function(x)  {
                    dir <- primer.df$Direction[x]
                    if (dir == "both") {
                        val1 <- unlist(lapply(seq_along(abs.mm.pos[[x]][[1]]), function(y) rep(val[[x]][y], length(abs.mm.pos[[x]][[1]][[y]]))), recursive = FALSE)
                        val2 <- unlist(lapply(seq_along(abs.mm.pos[[x]][[2]]), function(y) rep(val[[x]][y], length(abs.mm.pos[[x]][[2]][[y]]))), recursive = FALSE)
                        c(val1, val2)
                    } else {
                        lapply(seq_along(pos.3prime[[x]]), function(y) rep(val[[x]][y], length(pos.3prime[[x]][[y]])))
                    }
                })
                df[, crit] <- unlist(val)
            }
        }
        ####
        # add coverage model position:
        #######
        df$Position_3terminusLocal <- df$Position_3terminus
        df$Position_3terminusLocal[is.na(df$Position_3terminusLocal) | df$Position_3terminusLocal >= 7]  <- 7
        # ensure that position is increasing monotonically: 0 is no mismatch, 6 is mismatch at last 3' hexamer position
        df$Position_3terminusLocal <- abs(df$Position_3terminusLocal - 7)
        # if we have a subsetted template data frame, some primer matches might not be identified -> remove those that shouldn't be considered.
        df <- df[!is.na(df$Template ),]
        dfs[[i]] <- df
    }
    df <- do.call(rbind, dfs)
    if (length(df) != 0) {
        # ensure that the order of the factors doesn't change
        df$Primer <- factor(df$Primer, levels(primer.df$ID))
    }
    # rename off_columns for later use for cvg model
    if (length(df) != 0 && mode == "off_target") {
        # note: this may 'overwrite' the regular columns ..
        colnames(df) <- gsub("^off_", "", colnames(df))
    }
    return(df)
}
prepare_template_cvg_data <- function(df, mode.directionality) {
    # df: coverage for one coverage type
    # select individual binding mode per primer-template pair instead of the representation per individual mismatch
    #########
    ddf <-  ddply(df, c("Primer", "Direction", "Template", "Group"), summarize,
                            Position = unique(substitute(Position_3terminus)), 
                            Number_of_mismatches = unique(substitute(Number_of_mismatches)))

    ##########################
    # take the minium number of mismatches binding mode per direction 
    dff <- ddply(ddf, c("Template", "Direction"), function(x) arrange(x, 
                                        substitute(Number_of_mismatches))[1, ])
    # if mode.directionality is "both" -> retain only templates where we have coverage from a fw AND a rev primer
    if (mode.directionality == "both") {
        # for individual primers, require coverage by fw & rev primers:
        dir.count <- ddply(dff, "Template", summarize, DirectionCount = length(unique(substitute(Direction))))
        # remove all template coverage events where we don't have two primers covering the template
        rm.template.id <- setdiff(dir.count$Template[dir.count$DirectionCount <= 1], dff$Template[dff$Direction == "both"])
        m <- which(dff$Template %in% rm.template.id)
        if (length(m) != 0) {
            dff <- dff[-m,]
        }
    }
    # take the maximum number of mismatches per template over all directions
    # result is: max(min(fw), min(rev))
    dff <- ddply(dff, c("Template"), function(x) arrange(x, 
                                        -substitute(Number_of_mismatches))[1, ])
    return(dff)
}
#' Retrieval of Template Coverage Data.
#'
#' Determines the coverage of the templates for individual
#' allowed mismatch settings and coverage definitions.
#'
#' @param primer.df A \code{Primers} object.
#' @param template.df A \code{Templates} object.
#' @return Computes a data frame providing the coverage of the 
#' templates for the basic as well as expected (constrained) coverage.
#' @keywords internal
get_template_cvg_data <- function(primer.df, template.df) {
    mode.directionality <- get.analysis.mode(primer.df)
    full.df <- prepare_mm_plot(primer.df, template.df)
    # select only unique primer-template pairs with a certain number of mismatches
    unique.cvg.types <- unique(full.df$Coverage_Type)
    plot.data <- vector("list", length(unique.cvg.types))
    for (z in seq_along(unique.cvg.types)) {
        df <- full.df[full.df$Coverage_Type == unique.cvg.types[z],]    
        # select representative binding events for every template
        ddf <- prepare_template_cvg_data(df, mode.directionality)
         # if there's no mismatches, plot these primers with mismatch position @ position 0
        ddf[is.na(ddf$Position), "Position"] <- 0
        ddf$Status <- unique.cvg.types[z]
        plot.data[[z]] <- ddf
    }
    plot.df <- do.call(rbind, plot.data)
    return(plot.df)
}

#' @rdname AnalysisStats
#' @name AnalysisStats
#' @aliases get_cvg_stats_primer
#' @details
#' For \code{get_cvg_stats_primer}, the cells corresponding
#' to columns with numeric identifiers 
#' indicate the percentage of coverage events occurring
#' with a certain number of mismatches. For example
#' column \code{3} provides the number of coverage events
#' where there are exactly three mismatches between primers and templates.
#' The column \code{Group_Coverage} provides a listing
#' of the percentage of covered templates per group.
#'
#' @return \code{get_cvg_stats_primer} returns a list with the following entries. \code{cvg_per_nbr_mismatches} contains a data frame listing 
#' the number of binding events broken down according to the number
#' of expected mismatches between primers and templates.
#' \code{cvg_per_group} contains a data frame listing the the coverage
#' of individual primers per group of templates.
#'
#' @export
#' @examples
#' 
#' data(Ippolito)
#' # Determine coverage stats per primer
#' primer.cvg.stats <- get_cvg_stats_primer(primer.df, template.df)
get_cvg_stats_primer <- function(primer.df, template.df,
                                cvg.definition = c("constrained", "basic")) {
    if (!is(primer.df, "Primers")) {
        stop("'primer.df' should be a 'Primers' object.")
    }
    if (!is(template.df, "Templates")) {
        stop("'template.df' should be a 'Templates' object.")
    }
    cvg.definition <- match.arg(cvg.definition)
    full.df <- prepare_mm_plot(primer.df, template.df)
    # select only constrained cvg events
    full.df <- full.df[full.df$Coverage_Type == cvg.definition, ]
    df <- ddply(full.df, c("Primer", "Template", "Group"), summarize,
                            Position = unique(substitute(Position_3terminus)), 
                            Number_of_mismatches = unique(substitute(Number_of_mismatches)))
    if (length(df) == 0 || nrow(df) == 0) {
        # no stats available
        return(NULL)
    }
    mm.range <- seq(0, max(df$Number_of_mismatches))
    # create a matrix containing the coverage of every primer for every mismatch setting
    mm.stats <- matrix(rep(0, length(mm.range) * nrow(primer.df)), nrow = nrow(primer.df), ncol = length(mm.range))
    for (i in seq_along(mm.range)) {
        mm <- mm.range[i]
        sub.df <- df[df$Number_of_mismatches == mm,]
        count.df <- ddply(sub.df, c("Primer"), summarize,
                            Coverage = length(unique(substitute(Template))))
        m <- match(count.df$Primer, primer.df$ID)
        mm.stats[m,i] <- count.df$Coverage
    }
    ####
    # annotate coverage counts per mismatch number with percentages
    ####
    mm.stats.x <- mm.stats / rowSums(mm.stats)
    mm.stats.x[is.na(mm.stats.x)] <- 0
    for (i in seq_len(nrow(mm.stats))) {
        mm.stats[i, ] <- paste0(mm.stats[i,], " (", round(mm.stats.x[i,], 3) * 100, "%)")
    }
    mm.stats <- data.frame(mm.stats)
    colnames(mm.stats) <- mm.range
    mm.stats <- cbind("Primer" = primer.df$ID, mm.stats)
    ########
    # determine number of templates covered from each group
    ########
    count.df <- ddply(df, c("Primer", "Group"), summarize,
                            Coverage = length(unique(substitute(Template))),
                            CoverageRatio = length(unique(substitute(Template))))
    count.df$CoverageRatio <- count.df$CoverageRatio / unlist(lapply(count.df$Group, function(x) length(which(template.df$Group == x))))
    # create table entry: value + percentage
    count.df$TabEntry <- paste0(count.df$Coverage, " (", 
                            paste0(round(count.df$CoverageRatio * 100, 1), "%"), ")")
    # add percentages
    # order by factorlevels to ensure group order when using 'reshape'
    #print(count.df)
    cols <- c("Primer", "Group", "TabEntry")
    count.df <- count.df[order(count.df$Group), cols]
    # to wide format
    group.df <- reshape(count.df, idvar = "Primer", timevar = "Group", direction = "wide")
    colnames(group.df) <- gsub("TabEntry.", "", colnames(group.df), fixed = TRUE)
    # replace NA with 0
    group.df[is.na(group.df)] <- "0 (0%)"
    out <- list("cvg_per_nbr_mismatches" = mm.stats, "cvg_per_group" = group.df)
    return(out)
}
#' @rdname Plots
#' @name Plots
#' @aliases plot_primer_subsets
#' @details
#' The \code{primer.subsets} argument for \code{plot_primer_subsets} can be computed using
#' \code{\link{subset_primer_set}}. 
#' The line plot indicates the ratio of covered templates when considering
#' all primers in a primer set of a given size. The bar plots indicate
#' the coverage ratios of individual primers in a set. The target coverage
#' ratio is indicated by a horizontal line. Bars exceeding the target ratio
#' possibly indicate the existence of redundant coverage events.
#' @return \code{plot_primer_subsets} plots the coverages of the primer subsets provided via \code{primer.subsets}.
#' @export
#' @examples
#' 
#' # Plot the coverage of optimal primer subsets
#' data(Ippolito)
#' primer.subsets <- subset_primer_set(primer.df, template.df, k = 3)
#' p <- plot_primer_subsets(primer.subsets, template.df)
plot_primer_subsets <- function(primer.subsets, template.df, required.cvg = 1) {
                            
    if (length(primer.subsets) == 0) {
        return(NULL)
    }
    if (!all(sapply(primer.subsets, function(x) is(x, "Primers")))) {
        stop("All subsets should represent primer sets.")
    }
    if (!is(template.df, "Templates")) {
        stop("Please provide a valid template data frame.")
    }
    # create plot.df: need cvg and size of set
    N <- sapply(primer.subsets, nrow)
    cvg <- sapply(seq_along(primer.subsets), function(x) get_cvg_ratio(primer.subsets[[x]], 
        template.df))
    # retrieve the IDs of the primer(s) that are different in the next subset size
    IDs <- lapply(primer.subsets, function(x) x$ID)
    # collect all set members?
    collect.all <- TRUE
    if (!collect.all) {
        # only consider primers that are different from previous ones
    IDs <- lapply(seq_along(IDs), function(x) {
        if (x == 1) {
            out <- IDs[[x]]
        } else {
            out <- setdiff(IDs[[x]], IDs[[x-1]])
        }
        })
    }
    primer.df <- tail(primer.subsets, n = 1)[[1]] # the complete primer set
    individual.cvg <- lapply(IDs, function(x) sapply(x, function(y) get_cvg_ratio(primer.df[match(y, primer.df$ID),], template.df))) # cvg of added primers for this subset size
    #IDs <- unlist(lapply(IDs, function(x) paste(x, collapse = "\n")))
    subset.df <- data.frame(Set_Size = N, Coverage = cvg, Type = "Overall")
    single.df <- data.frame(Set_Size = unlist(lapply(seq_along(N), function(x) rep(N[x], length(individual.cvg[[x]])))),
                            Single_Coverage = unlist(individual.cvg),
                            ID = unlist(IDs), Type = "Individual")
    set.labels <- seq_along(primer.subsets)
    max.cvg <- sum(single.df[single.df$Set_Size == tail(N, n = 1), "Single_Coverage"])
    # order primers by time of addition to set
    levels <- rev(unique(unlist(lapply(primer.subsets, function(x) x$ID)))) # first primers at the bottom
    single.df$ID <- factor(single.df$ID, levels = levels, labels = abbreviate(levels, getOption("openPrimeR.plot_abbrev")))
    pal <- getOption("openPrimeR.plot_colors")["Primer"] # the RColorBrewer palette to use
    colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(single.df[, "ID"])))
    indi.color <- ifelse(length(colors) >= 1, colors[1], NA)
    p <- ggplot() + 
    # add bars for individual cvg
        geom_bar(data = single.df, stat = 'identity', 
                aes_string(x = "Set_Size", y = "Single_Coverage", 
                           fill = "ID", colour = "Type"), 
                alpha = 0.80) +
        scale_color_manual(values=c("Individual"= NA, "Overall" = "grey30"),
            guide = ggplot2::guide_legend(override.aes = list(linetype = c(0, 1), 
                                fill = c(indi.color, NA), 
                                colour = c(NA, "grey20")),
                                title = "Coverage type")) +
        scale_fill_manual(values = colors) +
        geom_point(data = subset.df, size = 1.5, aes_string(x = "Set_Size", y = "Coverage", colour = "Type"))  +
        geom_line(data = subset.df, size = 1, aes_string(x = "Set_Size", 
                  y = "Coverage", colour = "Type")) +
        xlab("Subset size") + ylab("Coverage of templates") + 
        ggtitle("Primer subset coverage") + 
        scale_y_continuous(labels = scales::percent, 
                            limits = c(0, max(max.cvg, 1))) + 
        scale_x_continuous(breaks = seq_along(primer.subsets),
                           labels = set.labels) +
        geom_hline(yintercept = required.cvg, linetype = 2, colour = "red") + # indicate the desired cvg
        theme(axis.text.x = element_text(angle = 60, hjust = 1))
    if (length(unique(single.df$ID)) > 30) {
        # don't show individual primer legend if nbr of primers exceeds 30
        p <- p + guides(fill = FALSE)
    }
    return(p)
}

#' Data for Primer Plot.
#'
#' Constructs a data frame containing information
#' about primer binding events.
#'
#' @param primer.df An object of class \code{Primers} containing
#' primers with evaluated primer coverage.
#' @param template.df An object of class \code{Templates} with template sequences
#' corresponding to \code{primer.df}.
#' @param identifier Identifiers of primers that are to be considered.
#' If \code{identifier} is set to \code{NULL} (the default), all primers are considered.
#' @param relation Compute binding positions relative to forward (\code{fw}) or reverse (\code{rev}) binding regions.
#' The default is "fw".
#' @return Data frame with primer binding data.
#' @keywords internal
get_plot_primer_data <- function(primer.df, template.df, identifier = NULL, relation = c("fw", "rev")) {

    if (!is(primer.df, "Primers")) {
        stop("Please input a valid primer data frame.")
    }
    if (!is(template.df, "Templates")) {
        stop("Please provide a valid template data frame.")
    }
    if (!"primer_coverage" %in% colnames(primer.df)) {
        stop(primer.df$Run[1], ": Please compute the primer coverage first.")
    }
    if (is.null(identifier)) { # plot all primers
        identifier <- primer.df$Identifier
    }
    if (length(relation) == 0) {
        stop("Please provide the 'relation' arg.")
    }
    relation <- match.arg(relation)
    m <- match(identifier, primer.df$Identifier)  # determine primers to be plotted
    if (any(is.na(m))) {
        stop("Could not find specified identifiers.")
    }
    if (length(m) == 0 || is.na(identifier) || identifier == "") {
        # nothing to plot
        return(NULL)
    }
    primer.info <- primer.df[m, ]  # info about selected primers
    s.idx <- lapply(covered.seqs.to.idx(primer.info$Covered_Seqs, template.df), function(x) if (length(x) != 0) {which(!is.na(x))} else {NULL})  # idx of of covered seqs per primer
    sel.idx <- which(!is.na(unlist(covered.seqs.to.idx(primer.info$Covered_Seqs, 
        template.df))))  # unlist idx of all primers covering something
    # need to find the idx of fw covering and rev covering seqs
    fw.idx <- which(primer.info$Forward != "")
    rev.idx <- which(primer.info$Reverse != "")
    if (any(primer.info$Forward != "")) {
        sel.idx.fw <- which(!is.na(unlist(covered.seqs.to.idx(primer.info$Covered_Seqs[fw.idx], 
            template.df))))
        s.idx.fw <- sapply(covered.seqs.to.idx(primer.info$Covered_Seqs[fw.idx], 
            template.df), function(x) if (length(x) != 0) {which(!is.na(x))} else {NULL})
    } else {
        sel.idx.fw <- NULL
        s.idx.fw <- NULL
    }
    if (any(primer.info$Reverse != "")) {
        sel.idx.rev <- which(!is.na(unlist(covered.seqs.to.idx(primer.info$Covered_Seqs[rev.idx], 
            template.df))))
        s.idx.rev <- sapply(covered.seqs.to.idx(primer.info$Covered_Seqs[rev.idx], 
            template.df), function(x) if (length(x) != 0) {which(!is.na(x))} else {NULL})
    } else {
        sel.idx.rev <- NULL
        s.idx.rev <- NULL
    }
    # retrieve coverage info:
    covered.seqs <- lapply(seq_along(primer.info$Covered_Seqs), function(x) as.numeric(unlist(strsplit(primer.info$Covered_Seqs[x], 
        split = ",")[s.idx[[x]]])))
    # determine relative primer starts and ends depending on whether we want to
    # analyze regarding fw allowed region or rev allowed region
    if (relation == "fw") {
        primer.start <- lapply(seq_len(nrow(primer.info)), function(x) as.numeric(unlist(strsplit(primer.info$Relative_Forward_Binding_Position_Start_fw[x], 
            split = ","))[s.idx[[x]]]))
        primer.end <- lapply(seq_len(nrow(primer.info)), function(x) as.numeric(unlist(strsplit(primer.info$Relative_Forward_Binding_Position_End_fw[x], 
            split = ","))[s.idx[[x]]]))
        primer.start.rev <- lapply(seq_len(nrow(primer.info)), function(x) as.numeric(unlist(strsplit(primer.info$Relative_Forward_Binding_Position_End_rev[x], 
            split = ","))[s.idx[[x]]]))
        primer.end.rev <- lapply(seq_len(nrow(primer.info)), function(x) as.numeric(unlist(strsplit(primer.info$Relative_Forward_Binding_Position_Start_rev[x], 
            split = ","))[s.idx[[x]]]))
    } else {
        primer.start <- lapply(seq_len(nrow(primer.info)), function(x) as.numeric(unlist(strsplit(primer.info$Relative_Reverse_Binding_Position_Start_fw[x], 
            split = ","))[s.idx[[x]]]))
        primer.end <- lapply(seq_len(nrow(primer.info)), function(x) as.numeric(unlist(strsplit(primer.info$Relative_Reverse_Binding_Position_End_fw[x], 
            split = ","))[s.idx[[x]]]))
        #print("primer posis:")
        #print(primer.start)
        #print(primer.end)
        primer.start.rev <- lapply(seq_len(nrow(primer.info)), function(x) as.numeric(unlist(strsplit(primer.info$Relative_Reverse_Binding_Position_End_rev[x], 
            split = ","))[s.idx[[x]]]))
        primer.end.rev <- lapply(seq_len(nrow(primer.info)), function(x) as.numeric(unlist(strsplit(primer.info$Relative_Reverse_Binding_Position_Start_rev[x], 
            split = ","))[s.idx[[x]]]))
        
    }
    covered.seq.identifiers <- as.numeric(unlist(strsplit(primer.info$Covered_Seqs, 
        split = ",")))[sel.idx]
    # determine nbr covered for given template.df
    nbr.covered <- sapply(seq_along(seq_len(nrow(primer.info))), function(x) length(strsplit(primer.info$Covered_Seqs[x], 
        split = ",")[[1]][s.idx[[x]]]))  # nbr of covered seqs per primer
    m <- match(covered.seq.identifiers, template.df$Identifier)
    not.na.mapping <- which(!is.na(m))
    m <- m[not.na.mapping]
    covered.seqs <- template.df[m, ]
    not.m <- setdiff(seq_len(nrow(template.df)), m)
    uncovered.seqs <- template.df[not.m, ]
    repeat.idx <- unlist(sapply(seq_along(nbr.covered), function(x) rep(x, each = nbr.covered[x])))  # assignment of index in primer.info to all covered seqs from each primers
    primer.info.m <- primer.info[repeat.idx, ]
    total.covered <- sum(nbr.covered)
    total.y <- total.covered + nrow(template.df)  # dimension of plot: nbr of primers + nbr of seqs
    plot.df <- data.frame(Type = rep(NA, total.y), y = rep(NA, total.y), 
                          Map = rep(NA, total.y))
    y.idx <- 1
    mode.directionality <- get.analysis.mode(primer.info)
    for (i in seq_len(nrow(template.df))) {
        id <- template.df$Identifier[i]
        # determine nbr of occurrences of this seq in covered.seq.identifiers vector
        count.idx <- which(covered.seq.identifiers == id)  # how often was this sequence covered?
        p.idx <- repeat.idx[count.idx] # index in primer info
        covered <- FALSE
        if (mode.directionality == "both") {
            # require at least 1 fw and 1 rev primer
            fw.count <- length(which(primer.info[p.idx,]$Forward != ""))
            rev.count <- length(which(primer.info[p.idx,]$Reverse != ""))
            count <- min(c(fw.count, rev.count))
        } else {
            # require any primer to cover a sequence
            count <- length(count.idx)
        }
        if (count != 0) { # 
            covered <- TRUE
        }
        seq.idx <- y.idx
        primer.idx <- (seq.idx + 1):(seq.idx + 1 + length(count.idx) - 1)
        plot.df$Type[seq.idx] <- ifelse(covered, "Covered Sequence", "Not-covered Sequence")
        plot.df$y[seq.idx] <- seq.idx
        plot.df$Map[seq.idx] <- i
        if (length(count.idx) != 0) {
            # determine fw and rev primers
            plot.df$Type[primer.idx] <- "Primer"
            plot.df$y[primer.idx] <- primer.idx
            plot.df$Map[primer.idx] <- p.idx  # gives the index of primer in primer.info
        } 
        y.idx <- y.idx + 1 + length(count.idx)
    }
    # enrich with info
    primer.data.idx <- plot.df$Map[which(plot.df$Type == "Primer")]
    pos.idx <- get.unlist.idx(primer.start, primer.data.idx)
    covered.seq.identifiers <- sapply(seq_along(primer.info$Covered_Seqs), function(x) as.numeric(strsplit(primer.info$Covered_Seqs[x], 
        split = ",")[[1]][s.idx[[x]]]))
    primer.cov.idx <- sapply(covered.seq.identifiers, function(x) match(x, template.df$Identifier))
    primer.gene.groups <- sapply(primer.cov.idx, function(x) template.df[x, "Group"])
    fw.primer.name <- "fw primer"
    rev.primer.name <- "rev primer"
    primer.names <- c(fw.primer.name, rev.primer.name)
    primer.data <- data.frame(Identifier = as.character(primer.info$Identifier[primer.data.idx]), 
                    ID = as.character(primer.info$ID[primer.data.idx]),
                    stringsAsFactors = FALSE) 
    primer.data <- cbind(primer.data, x_start = unlist(primer.start)[pos.idx][not.na.mapping], 
        x_end = unlist(primer.end)[pos.idx][not.na.mapping], x_start_rev = unlist(primer.start.rev)[pos.idx][not.na.mapping], 
        x_end_rev = unlist(primer.end.rev)[pos.idx][not.na.mapping], Group = unlist(primer.gene.groups)[pos.idx][not.na.mapping])
    seq.data.idx <- plot.df$Map[which(plot.df$Type != "Primer")]
    #print("primer data:")
    #print(primer.data)
    # n.b.: not using initial here anymore -> plot relates to the current binding region
    seq.data <- data.frame(Identifier = template.df$Identifier[seq.data.idx], 
        ID = template.df$ID[seq.data.idx], 
        x_start = -template.df$Allowed_End_fw[seq.data.idx], 
        x_end = nchar(template.df$Sequence)[seq.data.idx] - 
                template.df$Allowed_End_fw[seq.data.idx] - 1, 
        x_start_rev = NA,  # only relevant for primers
        x_end_rev = NA, 
        Group = template.df$Group[seq.data.idx],
        stringsAsFactors = FALSE)
    if (relation == "fw") {
        seq.data$x_start <- -template.df$Allowed_End_fw[seq.data.idx]
        seq.data$x_end <- nchar(template.df$Sequence)[seq.data.idx] - 
                            template.df$Allowed_End_fw[seq.data.idx] - 1 
    } else {
        seq.data$x_start <- template.df$Allowed_Start_rev[seq.data.idx] - nchar(template.df$Sequence)[seq.data.idx]
        seq.data$x_end <- nchar(template.df$Sequence[seq.data.idx]) - seq.data$x_start - 1
    }
    add.df <- rbind(primer.data, seq.data)[FALSE, ]  # primer data + seq data should make up the plot.df
    add.df[which(plot.df$Type == "Primer"), ] <- primer.data
    add.df[which(plot.df$Type != "Primer"), ] <- seq.data
    rownames(add.df) <- NULL
    rownames(plot.df) <- NULL
    plot.data <- cbind(plot.df, add.df)
    plot.data$Type <- factor(plot.data$Type, levels = c("Primer", "Covered Sequence", 
        "Not-covered Sequence"))
    ########## determine limits of plot
    #seq.idx <- which(plot.data$Type != "Primer")
    #s.min <- min(plot.data[seq.idx, "x_start"], na.rm = TRUE)
    #seq.max <- max(plot.data[seq.idx, "x_end"], na.rm = TRUE)
    #primer.idx <- which(plot.data$Type == "Primer")
    #x.lim <- c(s.min, seq.max)  # max was p.max before
    #print("before plot lim adjustment")
    #print(plot.data)
    #plot.data$x_start <- sapply(plot.data$x_start, function(x) max(x, x.lim[1]))
    #plot.data$x_end <- sapply(plot.data$x_end, function(x) min(x, x.lim[2]))
    #plot.data$x_start_rev <- sapply(plot.data$x_start_rev, function(x) max(x, x.lim[1]))
    #plot.data$x_end_rev <- sapply(plot.data$x_end_rev, function(x) min(x, x.lim[2]))
    ########## 
    cur.plot.df <- plot.data
    cur.plot.df$ID <- as.character(cur.plot.df$ID)
    # plot only the templates
    rel.cols <- c("ID", "y", "Type", "x_start", "x_end", "x_start_rev", "x_end_rev")
    m <- melt(cur.plot.df[, rel.cols], id = c("y", "Type", "ID"))
    # remove NA entries
    na.idx <- which(is.na(m[, "value"]))
    if (length(na.idx) != 0) {
        m <- m[-na.idx, ]
    }
    # update types
    types <- ifelse(m$variable == "x_start_rev" | m$variable == "x_end_rev", "Reverse Primer", 
        ifelse(m$Type == "Primer", "Forward Primer", as.character(m$Type)))
    m$Type <- types
    m$variable <- ifelse(grepl("start", m$variable), "x_start", "x_end")
    d <- dcast(m, ID + y + Type ~ variable)
    return(d)
}
#' @rdname Plots
#' @name Plots
#' @aliases plot_primer
#' @return \code{plot_primer} plots the primer binding sites in the templates.
#' @export
#' @examples
#' 
#' # Plot of individual primer binding positions
#' data(Ippolito)
#' p <- plot_primer(primer.df[1,], template.df[1:30,])
plot_primer <- function(primer.df, template.df, identifier = NULL, 
                        relation = c("fw", "rev"), 
                        region.names = c("Binding region", "Amplification region")) {

    if (length(region.names) != 2) {
        stop("Need 2 region names.")
    }
    relation <- match.arg(relation)
    d <- get_plot_primer_data(primer.df, template.df, identifier = identifier, relation = relation)
    # region annotation:
    y.val <- 0.05 * nrow(d)
    y.ext <- min(10, y.val)
    ymin <- min(-y.ext, -3)
    ymax <- min(-1, -0.01 * nrow(d))
    region.df <- create_region_boxes(list(primer.df), list(template.df), relation, region.names, ymin, ymax, max(d$x_end, na.rm = TRUE))

    if (length(d) == 0 || nrow(d) == 0) {
        # nothing to plot
        return(NULL)
    }
    col <- c("Forward Primer" = brewer.pal(8, "Accent")[5], 
            "Reverse Primer" = "#0B4D46", 
            "Covered Sequence" = "black", 
            "Not-covered Sequence" = "grey70")
    m <- match(unique(d$Type), names(col))
    col <- col[m[which(!is.na(m))]]
    label.idx <- which(d$Type %in% c("Covered Sequence", "Not-covered Sequence"))
    label.idx.primer <- which(d$Type %in% c("Forward Primer", "Reverse Primer") & 
        !duplicated(d$y))
    label.idx <- c(label.idx, label.idx.primer)
    label.order <- order(d$y[label.idx])
    labels <- d$ID[label.idx][label.order]
    template.data <- d[d$Type %in% c("Covered Sequence", "Not-covered Sequence"), ]
    primer.data <- d[d$Type %in% c("Forward Primer", "Reverse Primer"), ]
    if (length(primer.data) == 0 || nrow(primer.data) == 0) {
        # no primers to plot
        return(NULL)
    }
    break.step.size <- 10
    lim.min <- min(d$x_start, na.rm = TRUE)
    lim.max <- max(d$x_end, na.rm = TRUE)
    ticks <- c(seq(lim.min, lim.max, break.step.size))
    # determine coverage count
    N.total <- length(which(!d$Type %in% c("Forward Primer", "Reverse Primer")))
    N.covered <- length(which(d$Type == "Covered Sequence"))
    N.p <- N.covered/N.total
    stat.text <- paste(N.covered, " of ", N.total, " (", round(N.p, 3) * 100, "%) seqs covered", 
        sep = "")
    title <- stat.text
    segment.size <- 1.5
    arrow.size <- 2
    # arrow angle needs to be a function of the number of coverage events -> the more events, the larger the angle: from 5 to 30
    # number of primers
    min.arrow.angle <- 30 # was 5 before, not necessary anymore
    max.arrow.angle <- 30
    arrow.angle <- max(min(0.5 * (nrow(primer.data) + nrow(template.data)), max.arrow.angle), min.arrow.angle)
    # arrow length should be scaled according to the x-extent of the plot
    x.extent <- max(template.data$x_end) - min(template.data$x_start)
    p.extent <- max(primer.data$x_end) - min(primer.data$x_start)
    arrow.length <- 80 * (p.extent / x.extent^2)
    x.ticks <- c(seq(floor(lim.min/10) * 10, floor(lim.max/10) * 10, break.step.size))
    x.labels <- x.ticks
    x.labels[x.labels > 0] <- paste0("+", x.labels[x.labels > 0])
    r.colors <- c("#e5f4ff", "#ffefe5")
    ggplot() + 
        # need show.legend = FALSE for the segment, otherwise arrow shows
        geom_segment(data = primer.data, show.legend = FALSE,
            lineend = "butt", size = arrow.size,  
            aes_string(x = "x_start", xend = "x_end", y = "y", 
                       yend = "y", colour = "Type"),
            arrow = arrow(angle = arrow.angle, ends = "last", type = "closed")) + 
             geom_segment(data = template.data, 
                     aes_string(x = "x_start", xend = "x_end", 
                                y = "y", yend = "y", colour = "Type"), 
                    lineend = "square", size = segment.size) + 

        # x-axis rectangles to annotate binding/amplification region:
        geom_rect(data = region.df, 
            mapping = aes_string(xmin="xmin", xmax="xmax", 
                        ymin="ymin", ymax="ymax"), 
            fill = r.colors, alpha = 0.5,
            colour = "#3d3835", 
            size = segment.size) +
        # text for region annotation
        geom_text(data=region.df, 
                  aes_string(x = "xmin+(xmax-xmin)/2", 
                             y = "ymin+(ymax-ymin)/2", 
                             label = "Region"), 
                  size = 4) +
        ggtitle(title) + 
        xlab("Relative primer position") + 
        ylab("Sequence") + 
        scale_x_continuous(limits = c(lim.min, lim.max), 
                            breaks = x.ticks,
                            labels = x.labels) + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
        geom_vline(data = region.df, aes_string(xintercept = "RelStartPosition"), colour = "red") + # start of binding region
        geom_vline(xintercept = -1, colour = "red") + 
        #scale_y_continuous(limits = c(1, max(d$y, na.rm = TRUE)), 
        scale_y_continuous(
                breaks = seq_along(labels), labels = abbreviate(labels, getOption("openPrimeR.plot_abbrev"))) + 
        scale_colour_manual(values = col) +
        theme(legend.title = element_text(face = "bold"),
              legend.position = "top")
        # arrow arg doesn't work for linetype ...
        #guides(colour = guide_legend(override.aes = 
                #list(size = 1, linetype = "solid", arrow = arrow(length = unit(1, "cm")))))
       
}

#' Plot of Excluded Primers
#' 
#' Plots histogram of excluded primers.
#'
#' @param excluded.df Data frame with excluded primers.
#' @param filtered.stats Data frame with statistics of the filtering procedure.
#' @param template.df Template data frame.
#' @return A plot of excluded primers.
#' @keywords internal
plot.excluded.hist <- function(excluded.df, filtered.stats, template.df) {
    # histogram of excluded sequences
    if (length(excluded.df) == 0 || nrow(excluded.df) == 0) {
        # nothing to plot
        return(NULL)
    }
    if (!"Coverage_Ratio" %in% excluded.df || all(is.na(excluded.df$Coverage_Ratio))) {
        # no cvg to plot
        return(NULL)
    }
    ylab <- "Coverage"
    xlab <- "Constraint"
    tit <- "Excluded primer coverage"
    plot.df <- excluded.df
    plot.df$Filter_Reason <- factor(plot.df$Exclusion_Reason, levels = unique(filtered.stats$Constraint))
    ggplot(plot.df, aes_string(x = "Filter_Reason", y = "Coverage_Ratio", colour = "Coverage_Ratio")) + 
        ylab(ylab) + xlab(xlab) + ggtitle(tit) + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
        scale_colour_gradient(name = "Coverage") + 
        geom_boxplot(outlier.colour = NA) + 
        geom_point(position = position_jitter(width = 0.5, height = 0)) +
        scale_y_continuous(labels = scales::percent) +
        facet_wrap(~Direction)
}

#' @rdname Plots
#' @name Plots
#' @aliases plot_template_cvg
#' @return \code{plot_template_cvg} creates a plot showing the number of covered template sequences.
#' @family templates
#' @export
#' @include primers.R templates.R
#' @examples
#' 
#' # Visualize the template coverage of a single primer set
#' data(Ippolito)
#' p.cvg <- plot_template_cvg(primer.df, template.df)
#' # Stratify by allowed mismatches:
#' p.mm.cvg <- plot_template_cvg(primer.df, template.df, per.mismatch = TRUE)
#' # Compare the coverage of multiple primer sets
#' data(Comparison)
#' p.cmp.cvg <- plot_template_cvg(primer.data[1:2], template.data[1:2])
#' # Stratify by allowed mismatches:
#' p.cmp.cvg.mm <- plot_template_cvg(primer.data[1:2], template.data[1:2], 
#'                                  per.mismatch = TRUE)
setGeneric("plot_template_cvg", 
    function(primers, templates, per.mismatch = FALSE, ...) {
        standardGeneric("plot_template_cvg")
})

#' Bar Plot of Template Coverage.
#'
#' Creates a bar plot showing the coverage for every group of template sequences.
#'
#' @param primers A \code{Primers} object with evaluated primer coverage.
#' @param templates A \code{Templates} object containing the template sequences.
#' @param per.mismatch Whether to stratify by mismatches.
#' @param groups Identifiers of template groups for which plot should be created. By default, \code{groups} is set to \code{NULL} such that all 
#' templates are considered.
#' according to the number of mismatches between primer-template pairs.
#' @return A plot showing the number of covered template sequences.
#' @keywords internal
setMethod("plot_template_cvg", 
    signature(primers = "Primers", templates = "Templates"),
    function(primers, templates, per.mismatch, groups = NULL) {
    if (length(primers) == 0 || length(templates) == 0) {
        return(NULL)
    } 
    if (!is(primers, "Primers")) {
        stop("Please input a valid primer data frame.")
    }
    if (!is(templates, "Templates")) {
        stop("Please input a valid template data frame.")
    }
    if (per.mismatch) {
        # stratify by mismatches
        p <- plot_template_cvg_mismatches(primers, templates, groups)
    } else {
        p <- plot_template_cvg_unstratified(primers, templates, groups)
    }
    return(p)
})

#' Bar Plot of Template Coverage.
#'
#' Creates a bar plot showing the coverage for every group of template sequences.
#'
#' @param primers A \code{Primers} object with evaluated primer coverage.
#' @param templates A \code{Templates} object containing the template sequences.
#' @param groups Identifiers of template groups for which plot should be created. By default, \code{groups} is set to \code{NULL} such that all 
#' templates are considered.
#' according to the number of mismatches between primer-template pairs.
#' @return A plot showing the number of covered template sequences.
#' @keywords internal
plot_template_cvg_unstratified <- function(primers, templates, groups = NULL) {
    # get statistics on expected coverage
    stats <- get_cvg_stats(primers, templates)
    if (length(stats) == 0) {
        warning("No coverage statistics available for the input primers.")
        return(NULL)
    }
    cvg.con <- paste0(round(stats[stats$Group == "Total", "Coverage_Ratio"] * 100, 0), "%")
    # compute coverage stats according to text identity:
    stats.txt <- get_cvg_stats(primers, templates, allowed.mismatches = 0, cvg.definition = "basic")
    cvg.txt <- paste0(round(stats.txt[stats.txt$Group == "Total", "Coverage_Ratio"] * 100, 0), "%")
    # need to melt
    # change names of columns
    vars <- c("Group", "primer_coverage", "N")
    p.df.con <- melt(stats[, vars], c("Group"), variable.name = "Status", value.name = "Count")
    levels(p.df.con$Status) <- c("Expected Coverage (E)", "Available Templates")
    vars <- c("Group", "primer_coverage")
    p.df.txt <- melt(stats.txt[, vars], c("Group"), variable.name = "Status", value.name = "Count")
    p.df.txt$Status <- "Identity Coverage (I)"
    # integrate both coverage results
    plot.df <- rbind(p.df.txt, p.df.con)
    plot.df$Status <- factor(plot.df$Status, levels = unique(plot.df$Status))
    # remove total cvg column
    plot.df <- plot.df[plot.df$Group != "Total",]
    # select groups
    if (length(groups) != 0 && !"all" %in% groups) {
        plot.df <- plot.df[plot.df$Group %in% groups, ]
    }
    colnames(plot.df)[colnames(plot.df) == "N"] <- "Available Templates"
    xlab <- "Group"
    ylab <- "Number of templates"
    title <- paste0("Covered templates (I = ", cvg.txt, ", E = ", cvg.con, ")")
    cols.1 <- brewer.pal(8, "Accent")[c(5, 8)]
    cols.2 <- brewer.pal(8, "Blues")[3]
    colors <- c(cols.2, cols.1)
    names(colors) <- levels(plot.df$Status)
    p <- ggplot(plot.df) + geom_bar(aes_string(x = "Group", y = "Count", fill = "Status"), stat = "identity", 
        position = "dodge") + xlab(xlab) + ggtitle(title) + 
        ylab(ylab) + 
        theme(axis.text.x = element_text(
                angle = 90, 
                hjust = 1, vjust = 0.5)) +
        scale_fill_manual(values = colors)
    return(p)
}
#' Preparation of Data for Plotting Mismatch Template Coverage.
#'
#' Creates a data frame for plotting a bar plot for the covered templates per allowed mismatches.
#'
#' @param primer.df A \code{Primers} object.
#' @param template.df A \code{Templates} object.
#' @param allowed.mismatches An optional numeric specifying the number of mismatches to be considered at most for plotting.
#' If not provided, the maximal number of mismatches found for the input primer 
#' set is used.
#' @return A data frame for creating a plot.
#' @keywords internal
prepare_template_cvg_mm_data <- function(primer.df, template.df, allowed.mismatches = NULL) {
    plot.df <- get_template_cvg_data(primer.df, template.df)
    #########################
    # get data for available templates for every mismatch category:
    t.df <- data.frame("Template" = unique(template.df$ID), 
                       "Group" = NA, "Position" = NA, "Number_of_mismatches" = NA)
    m <- match(t.df$Template, template.df$ID)
    t.df$Group <- template.df$Group[m]
    if (length(allowed.mismatches) == 0) {
        allowed.mismatches <- max(plot.df$Number_of_mismatches)
    }
    mm.settings <- seq(0, allowed.mismatches)
    mm.settings <- mm.settings[order(mm.settings)]
    # identify available number of template sequences
    available.data <- vector("list", length(mm.settings))
    for (i in seq_along(mm.settings)) {
        df.a <- t.df
        df.a$Number_of_mismatches <- mm.settings[i]
        available.data[[i]] <- df.a
    }
    available.df <- do.call(rbind, available.data)
    available.df$Status <- as.factor("Available")
    p.df <- rbind(plot.df[, !colnames(plot.df) %in% c("Primer", "Direction")], available.df)
    cvg.groups <- c("Coverage", "Basic Coverage")
    all.groups <- c(cvg.groups, "Available Templates")
    levels(p.df$Status) <- all.groups
    # do not consider the exact nbr of mismatches, but the cumulative number (cvg with >= x mismatches rather than cvg with == x mismatches)
    cum.data <- p.df[p.df$Status == "Available Templates", ] # start from available data
    cum.data$Maximal_mismatches <- cum.data$Number_of_mismatches
    for (i in seq_along(mm.settings)) {
        mm <- mm.settings[i]
        for (j in seq_along(cvg.groups)) {
            cvg.status <- cvg.groups[j]
            event.idx <- which(p.df$Status == cvg.status & p.df$Number_of_mismatches <= mm)
            if (length(event.idx) != 0) {
                cur.df <- p.df[event.idx,]
                cur.df$Maximal_mismatches <- mm
                cum.data <- rbind(cum.data, cur.df) 
            }
        }
    }
    o <- order(unique(as.character(cum.data$Group)))
    cum.data$Group <- factor(cum.data$Group, levels = unique(as.character(cum.data$Group))[o])
    # need to plot using identity as stat in order to ensure that bars have all the same width
    plot.df <- ddply(cum.data, c("Group", "Maximal_mismatches", "Status"), summarize, Count = length(substitute(Group)))
    additional.df <- expand.grid(Maximal_mismatches = mm.settings, Group = unique(template.df$Group), Status = unique(plot.df$Status), Count = 0)
    plot.df <- merge(plot.df, additional.df, all = TRUE)
    # for the added, duplicate events, select the 'real events'
    plot.df <- ddply(plot.df, c("Group", "Maximal_mismatches", "Status"), summarize,
                           Count = max(substitute(Count)))
    total.percentages <- TRUE
    if (!total.percentages) {
        idx <- which(plot.df$Status == "Available Templates")
        idx <- idx[!duplicated(plot.df$Group[idx])]
        available.per.group <- plot.df$Count[idx]
        names(available.per.group) <- plot.df$Group[idx]
        m <- match(plot.df$Group, names(available.per.group))
        plot.df$Coverage_Ratio <- plot.df$Count / available.per.group[m]
    } else {
        plot.df$Coverage_Ratio <- plot.df$Count / nrow(template.df)
    }
    return(plot.df)
}
#' Bar Plot of Template Coverage for Mismatches.
#'
#' Creates a bar plot showing the coverage for every group of template sequences.
#'
#' @param primer.df A \code{Primers} object.
#' @param template.df A \code{Templates} object.
#' @param groups Identifiers of template groups for which plot should be created. By default, \code{groups} is set to \code{NULL} such that all 
#' templates are considered.
#' @param nfacets The number of facets columns to plot. By default,
#' \code{nfacets} is set to 2.
#' @return A plot showing the number of covered template sequences.
#' @keywords internal
plot_template_cvg_mismatches <- function(primer.df, template.df, groups = NULL,
                                        nfacets = 2) {
    if (!is.null(groups) && !"all" %in% groups) {
        # select a template subset
        idx <- which(template.df$Group %in% groups)
        template.df <- template.df[idx,]
    }
    plot.df <- prepare_template_cvg_mm_data(primer.df, template.df)
    if (nrow(plot.df) == 0) {
        return(NULL)
    }
    # compute cvg ratio per mismatch setting to show in facet labels:
    cvg.per.mm <- ddply(plot.df, c("Maximal_mismatches", "Status"), here(summarize),
                           Coverage_Ratio = sum(substitute(Count)) / nrow(template.df)) 
    cvg.per.mm <- cvg.per.mm[cvg.per.mm$Status == "Coverage",]
    cvg.info <- paste0("(", round(cvg.per.mm$Coverage_Ratio * 100, 0), "% coverage)")
    plot.df$Maximal_mismatches <- paste0(plot.df$Maximal_mismatches, " ", cvg.info[match(plot.df$Maximal_mismatches, cvg.per.mm$Maximal_mismatches)])
    xlab <- "Group"
    ylab <- "Number of templates"
    title <- "Covered templates: mismatches"
    cols.1 <- brewer.pal(8, "Accent")[c(5, 8)]
    cols.2 <- brewer.pal(8, "Blues")[3]
    colors <- c(cols.1[1], cols.2, cols.1[2])
    names(colors) <- levels(plot.df$Status)
    p <- ggplot(plot.df) + 
        geom_bar(aes_string(x = "Group", y = "Count", fill = "Status"),
                 position = "dodge", stat = "identity") + 
        xlab(xlab) + ggtitle(title) + 
        ylab(ylab) + 
        theme(axis.text.x = element_text(
                angle = 90, 
                hjust = 1, vjust = 0.5)) +
        scale_fill_manual(values = colors) + 
        facet_wrap(~Maximal_mismatches, ncol = nfacets,
        labeller = label_bquote("Mismatches"<=.(substitute(Maximal_mismatches))))
    return(p)
}
#' Templates Coverage for Multiple Primer Sets.
#'
#' Plots the coverage of multiple primer sets.
#'
#' @param primers List with primer data frames.
#' @param templates List with template data frames.
#' @param colors Color for every primer set.
#' @param highlight.set Primer sets to be highlighted.
#' @return A plot for comparing primer coverage.
#' @keywords internal
setMethod("plot_template_cvg", 
    signature(primers = "list", templates = "list"),
    function(primers, templates, per.mismatch, highlight.set = NULL) {
    if (length(primers) == 0 || length(templates) == 0) {
        return(NULL)
    }
    if (per.mismatch) {
        p <- plot_template_cvg_comparison_mismatch(primers, templates, highlight.set)
    } else {
        p <- plot_template_cvg_comparison_unstratified(primers, templates, highlight.set)
    }
    return(p)
})
#' Getter for Run Names.
#'
#' Retrieves the run names of the input data.
#'
#' @param primer.data A list with \code{Primers} or \code{Templates}.
#' @return A vector with identifiers for every set.
#' @keywords internal
get.run.names <- function(primer.data) {
    # if primer.data is a named list, use the names of the list
    if (length(primer.data) == 0) {
        return(NULL)
    }
    if (length(names(primer.data) != 0)) {
        run.names <- names(primer.data)
    } else {
        # primer.data is not a named list -> use the 'Run' identifier instead
        #run.names <- unname(unlist(lapply(primer.data, function(x) unique(x$Run))))
        run.names <- lapply(primer.data, function(x) unique(x$Run))
        empty.idx <- which(unlist(lapply(primer.data, function(x) nrow(x) == 0)))
        run.names[empty.idx] <- "Unknown"
        run.names <- unname(unlist(run.names))
    }
    return(run.names)
}
#' Templates Coverage for Multiple Primer Sets.
#'
#' Plots the coverage of multiple primer sets.
#'
#' @param primers List with primer data frames.
#' @param templates List with template data frames.
#' @param highlight.set Primer sets to be highlighted.
#' @return A plot for comparing primer coverage.
#' @keywords internal
plot_template_cvg_comparison_unstratified <- function(primers, templates, highlight.set = NULL) {

    if (length(primers) == 0 || length(templates) == 0) {
        return(NULL)
    }
    run.names <- get.run.names(primers)
    cvg <- lapply(seq_along(primers), function(x) {
        stats <-  openPrimeR::get_cvg_stats(primers[[x]], 
                    templates[[x]],
                    total.percentages = TRUE)
        if (length(stats) != 0) {
            stats <- cbind("Run" = run.names[x], stats)
        }
    })
    plot.df <- do.call(rbind.fill, cvg)
    if (length(plot.df) == 0 || nrow(plot.df) == 0) {
        return(NULL)
    }
    # remove the total group
    plot.df <- plot.df[plot.df$Group != "Total", ]
    plot.df$Group <- factor(plot.df$Group, levels = unique(plot.df$Group)[order(unique(plot.df$Group))])
    title <- "Coverage of the template sequences"
    plot.df$Run <- abbreviate(plot.df$Run, getOption("openPrimeR.plot_abbrev"))
    plot.df$Run <- factor(plot.df$Run, levels = unique(plot.df$Run)[order(as.character(unique(plot.df$Run)))])
    if (length(highlight.set) != 0) {
        # check whether highlight set is specified correctly
        m <- match(highlight.set, plot.df$Run)
        na.idx <- which(is.na(m))
        if (length(na.idx) != 0) {
            msg <- paste("Highlight set not available in data:",
                paste(highlight.set[na.idx], collapse = ","))
            warning(msg)
            highlight.set <- highlight.set[!is.na(m)]
        }
    }
    pal <- getOption("openPrimeR.plot_colors")["Group"] # the RColorBrewer palette to use
    colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(plot.df$Group)))
    # add label for top of the bars (the first group) 
    labels <- rep(NA, nrow(plot.df))
    for (i in seq_along(levels(plot.df$Run))) {
        run <- levels(plot.df$Run)[i]
        idx <- which(plot.df$Run == run)
        cur.data <- plot.df[idx,]
        res <- sum(cur.data$Coverage_Ratio) # the coverage to show for this bar
        out.idx <- which(plot.df$Run == run & plot.df$Group == unique(cur.data$Group)[1]) # row index to show the coverage
        labels[out.idx] <- res
    }
    plot.df$Label <- ifelse(is.na(labels), "", paste0(round(labels * 100, 1), "%"))

    p <- ggplot(plot.df, aes_string(x = "Run", y = "Coverage_Ratio", fill = "Group")) + 
        geom_bar(stat = "identity") + 
        theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
        scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
        ggtitle(title) +
        ylab("Coverage") +
        scale_fill_manual(values = colors) + 
        # add coverage above bars:
        geom_text(aes(label = plot.df$Label),
        #geom_text(aes(label = substitute(Label)),
                    size = 5, 
                    position = position_stack(vjust = 0.5),
                    check_overlap = FALSE)
    if (length(highlight.set) != 0) {
        # highlight selected sets
        sel <- levels(plot.df$Run) %in% highlight.set
        p <- p + theme(
                axis.text.x = element_text(
                    face=ifelse(sel, "bold","plain"),
                    colour = ifelse(sel, 
                            "grey20", "grey30")))
    }
    return(p)
}
#' Templates Coverage for Multiple Primer Sets.
#'
#' Plots the coverage of multiple primer sets.
#'
#' @param primers List with primer data frames.
#' @param templates List with template data frames.
#' @param highlight.set Primer sets to be highlighted.
#' @return A plot for comparing primer coverage.
#' @keywords internal
plot_template_cvg_comparison_mismatch <- function(primers, templates, 
            highlight.set = NULL) {

    if (length(primers) == 0 || length(templates) == 0) {
        return(NULL)
    }
    run.names <- get.run.names(primers)
    max.mm <- max(unlist(lapply(primers, function(x) as.numeric(strsplit(c(x$Nbr_of_mismatches_fw, x$Nbr_of_mismatches_rev), split = ",")[[1]]))))
    cvg <- parallel::mclapply(seq_along(primers), function(x) {
        # supply allowed mismatch arg to show the same number of mismatches for all sets in the plot's facets
        plot.df <- prepare_template_cvg_mm_data(primers[[x]], templates[[x]],
                    allowed.mismatches = max.mm)
        if (length(plot.df) != 0) {
            # only select the expected coverage events
            plot.df <- plot.df[plot.df$Status == "Coverage",]
            # annotate with Run
            plot.df <- cbind("Run" = run.names[x], plot.df)
        }
    })
    plot.df <- do.call(rbind.fill, cvg)
    if (length(plot.df) == 0 || nrow(plot.df) == 0) {
        return(NULL)
    }
    title <- "Coverage of the template sequences"
    plot.df$Run <- abbreviate(plot.df$Run, getOption("openPrimeR.plot_abbrev"))
    plot.df$Run <- factor(plot.df$Run, levels = unique(plot.df$Run)[order(as.character(unique(plot.df$Run)))])
    if (length(highlight.set) != 0) {
        # check whether highlight set is specified correctly
        m <- match(highlight.set, plot.df$Run)
        na.idx <- which(is.na(m))
        if (length(na.idx) != 0) {
            msg <- paste("Highlight set not available in data:",
                paste(highlight.set[na.idx], collapse = ","))
            warning(msg)
            highlight.set <- highlight.set[!is.na(m)]
        }
    }
    pal <- getOption("openPrimeR.plot_colors")["Group"] # the RColorBrewer palette to use
    colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(plot.df$Group)))
    p <- ggplot(plot.df, aes_string(x = "Run", y = "Coverage_Ratio", fill = "Group", group = "Group")) + 
        geom_bar(stat = "identity") + 
        facet_wrap(~Maximal_mismatches,
            labeller = label_bquote("Mismatches"<=.(substitute(Maximal_mismatches)))
        ) + 
        theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
        scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
        ggtitle(title) +
        ylab("Coverage") +
        scale_fill_manual(values = colors)
    if (length(highlight.set) != 0) {
        # highlight selected sets
        sel <- levels(plot.df$Run) %in% highlight.set
        p <- p + theme(
                axis.text.x = element_text(
                    face=ifelse(sel, "bold","plain"),
                    colour = ifelse(sel, "grey20", "grey30")))
    }
    return(p)
}

#' @rdname AnalysisStats
#' @name AnalysisStats
#' @aliases get_cvg_stats
#' @return \code{get_cvg_stats} returns a data frame whose entries provide 
#' the coverage of templates per group of templates.
#' @export
#' @include primers.R templates.R
#' @examples
#' 
#' # Coverage statistics for a single primer set
#' data(Ippolito)
#' cvg.stats <- get_cvg_stats(primer.df, template.df)
#' # Coverage statistics for multiple primer sets
#' data(Comparison)
#' cvg.stats.comp <- get_cvg_stats(primer.data[1:2], template.data[1:2])
setGeneric("get_cvg_stats", 
    function(primers, templates, for.viewing = FALSE, total.percentages = FALSE, 
            allowed.mismatches = Inf, cvg.definition = c("constrained", "basic")) {
        standardGeneric("get_cvg_stats")
})
#' Coverage Statistics of a Primer Set.
#'
#' Retrieve statistics on the templates that are covered by a primer set.
#'
#' @param primer.df An object of class \code{Primers} containing
#' primers with evaluated coverage.
#' @param template.df An object of class \code{Templates} containing
#' templates with evaluated coverage.
#' @param for.viewing Whether the table should be formatted
#' for viewing rather than processing.
#' @param total.percentages Whether group coverage percentages
#' should relate to all template sequences or just those templates
#' belonging to a specific group.
#' @param allowed.mismatches The maximal allowed number of mismatches.
#' By default, the number of mismatches is not restricted. 
#' @param cvg.definition If \code{cvg.definition} is set to
#' "constrained", the statistics for the expected
#' coverage (after applying the coverage constraints) are retrieved.
#' If \code{cvg.definition} is set to "basic", the coverage is determined 
#' solely by string matching (i.e. without applying the coverage constraints).
#' By default, \code{cvg.definition} is set to "constrained".
#' @return Data frame with coverage statistics.
#' @keywords internal
setMethod("get_cvg_stats", signature(primers = "Primers"), 
    function(primers, templates, for.viewing = FALSE, total.percentages = FALSE, 
             allowed.mismatches = Inf, cvg.definition = c("constrained", "basic")) {

    # compute some stats on the coverage table structure: group | nbr templates |
    # [nbr covered | covered percent |]
    if (!is(primers, "Primers")) {
        stop("Please input a valid primer data frame.")
    }
    if (!is(templates, "Templates")) {
        stop("Please input a valid template data frame.")
    }
    if (!"Covered_Seqs" %in% colnames(primers) || !"primer_coverage" %in% colnames(primers)) {
        warning("get_cvg_stats: Primer/template coverage not available.")
        return(NULL)  # not possible
    }
    cvg.definition <- match.arg(cvg.definition)
    # update primer & template coverage according to the input conditions:
    primers <- update_primer_cvg(primers, templates, allowed.mismatches = allowed.mismatches, cvg.definition = cvg.definition)
    templates <- update_template_cvg(templates, primers)
    if (total.percentages) {
        # normalize coverage by total number of templates
        df <- ddply(templates, "Group", here(summarize), N = nrow(templates),
            N_primer = length(unique(strsplit(as.character(substitute(Covered_By_Primers)), split = ",")[[1]])), 
            primer_coverage = length(which(substitute(primer_coverage) > 0)), 
            Coverage_Ratio = sum(substitute(primer_coverage))/nrow(templates),
            N_primer_fw = length(unique(strsplit(as.character(substitute(Covered_By_Primers_fw)), split = ",")[[1]])), 
            primer_coverage_fw = length(which(substitute(primer_coverage_fw) > 0)), 
            Coverage_Ratio_fw = sum(substitute(primer_coverage_fw))/nrow(templates),
            N_primer_rev = length(unique(strsplit(as.character(substitute(Covered_By_Primers_rev)), split = ",")[[1]])), 
            primer_coverage_rev = length(which(substitute(primer_coverage_rev) > 0)), 
            Coverage_Ratio_rev = sum(substitute(primer_coverage_rev))/nrow(templates))
    } else {
        # normalize coverage by number of templates per group
        df <- ddply(templates, "Group", here(summarize), N = length(substitute(Group)), N_primer = length(unique(strsplit(as.character(substitute(Covered_By_Primers)), 
            split = ",")[[1]])), primer_coverage = length(which(substitute(primer_coverage) > 0)), 
            Coverage_Ratio = sum(substitute(primer_coverage))/length(substitute(Group)), N_primer_fw = length(unique(strsplit(as.character(substitute(Covered_By_Primers_fw)), 
                split = ",")[[1]])), primer_coverage_fw = length(which(substitute(primer_coverage_fw) > 
                0)), Coverage_Ratio_fw = sum(substitute(primer_coverage_fw))/length(substitute(Group)), 
            N_primer_rev = length(unique(strsplit(as.character(substitute(Covered_By_Primers_rev)), 
                split = ",")[[1]])), primer_coverage_rev = length(which(substitute(primer_coverage_rev) > 
                0)), Coverage_Ratio_rev = sum(substitute(primer_coverage_rev))/length(substitute(Group)))
    }
    # order by groups
    o <- order(df$Group)
    df <- df[o, ]
    # summary of all groups in 'Total' row
    df <- rbind(data.frame(Group = "Total", N = nrow(templates), N_primer = nrow(primers), 
        primer_coverage = sum(df$primer_coverage), Coverage_Ratio = sum(df$primer_coverage)/nrow(templates), 
        N_primer_fw = length(which(primers$Forward != "")), primer_coverage_fw = sum(df$primer_coverage_fw), 
        Coverage_Ratio_fw = sum(df$primer_coverage_fw)/nrow(templates), N_primer_rev = length(which(primers$Reverse != 
            "")), primer_coverage_rev = sum(df$primer_coverage_rev), Coverage_Ratio_rev = sum(df$primer_coverage_rev)/nrow(templates), 
        stringsAsFactors = FALSE), data.frame(df, stringsAsFactors = FALSE))
    df$Coverage <- paste(df$primer_coverage, " of ", df$N, " (", round(df$Coverage_Ratio * 
        100, 2), "%)", sep = "")
    df$Coverage_fw <- paste(df$primer_coverage_fw, " of ", df$N, " (", round(df$Coverage_Ratio_fw * 
        100, 2), "%)", sep = "")
    df$Coverage_rev <- paste(df$primer_coverage_rev, " of ", df$N, " (", round(df$Coverage_Ratio_rev * 
        100, 2), "%)", sep = "")
    out.df <- df
    if (for.viewing) {
        cols.both <- c("Group", "Coverage", 
                "Coverage_fw", "Coverage_rev")
        out.names <-  c("Group", "Coverage", "Coverage (fw)", "Coverage (rev)")
        mode <- "both"
        if (sum(out.df$N_primer_rev) == 0 || sum(out.df$N_primer_fw) == 0) {
            mode <- "single"
        }
        if (mode == "single") {
            out.df <- out.df[, cols.both[seq_len(2)]]
            colnames(out.df) <- out.names[seq_len(2)]
        } else {
            out.df <- out.df[, cols.both]
            colnames(out.df) <- out.names
        }
    }
    return(out.df)
})
#' Coverage Statistics for Multiple Primer Sets.
#'
#' Retrieve statistics on covered templates for multiple primer sets.
#'
#' @param primers A list with objects of class \code{Primers} containing
#' primers with evaluated coverage.
#' @param templates A list with objects of class \code{Templates} containing
#' templates with evaluated coverage.
#' @param for.viewing Whether the table should be formatted
#' for viewing rather than processing.
#' @param total.percentages Whether group coverage percentages
#' should relate to all template sequences or just those templates
#' belonging to a specific group.
#' @param allowed.mismatches The maximal allowed number of mismatches.
#' By default, the number of mismatches is not restricted. 
#' @param cvg.definition If \code{cvg.definition} is set to
#' "constrained", the statistics for the expected
#' coverage (after applying the coverage constraints) are retrieved.
#' If \code{cvg.definition} is set to "basic", the coverage is determined 
#' solely by string matching (i.e. without applying the coverage constraints).
#' By default, \code{cvg.definition} is set to "constrained".
#' @return Data frame with coverage statistics.
#' @keywords internal
setMethod("get_cvg_stats", signature(primers = "list"),
    function(primers, templates, for.viewing = FALSE, total.percentages = FALSE,
             allowed.mismatches = Inf, cvg.definition = c("constrained", "basic")) {


    if (length(primers) == 0 || length(templates) == 0) {
        return(NULL)
    }
    template.classes <- sapply(templates, function(x) class(x))
    primer.classes <- sapply(primers, function(x) class(x))
    if (any(template.classes != "Templates") || any(primer.classes != "Primers")) {
        stop("Check types of primers/templates.")
    }
    runs <- get.run.names(primers)
    stats <- lapply(seq_along(primers), function(x)  {
                df <- get_cvg_stats(primers[[x]], templates[[x]], FALSE, total.percentages) 
                groups <- as.character(df$Group)
                if (for.viewing) {
                    # only select the coverage entry
                    cvg <- df[, which(colnames(df) == "Coverage_Ratio") ]
                    # format cvg
                    cvg <- paste0(round(cvg * 100, 1), "%")
                    df <- data.frame(Run = runs[x], Coverage = t(cvg), stringsAsFactors = FALSE)
                } else {
                    df <- data.frame(cbind(Run = runs[x], t(df)), stringsAsFactors = FALSE)
                }
                colnames(df) <- c("Run", groups)
                return(df)
            })
    stat.df <- do.call(rbind.fill, stats)
    if (ncol(stat.df) >= 3) {
        # don't change Run & Total columns
        idx <- seq(3, ncol(stat.df))
        group.order <- colnames(stat.df)[idx]
        group.order <- group.order[order(group.order)]
        colnames(stat.df)[idx] <- group.order
    }
    # order rows by 'Run':
    stat.df <- stat.df[order(stat.df$Run), ]
    return(stat.df)
})
#' @rdname Plots
#' @name Plots
#' @aliases plot_primer_cvg
#' @return \code{plot_primer_cvg} creates a plot showing the coverage of individual primers.
#' @export
#' @include primers.R templates.R
#' @examples
#' 
#' # Plot expected coverage per primer
#' data(Ippolito)
#' p.cvg <- plot_primer_cvg(primer.df, template.df)
#' # Plot coverage stratified by allowed mismatches:
#' p.cvg.mm <- plot_primer_cvg(primer.df, template.df, per.mismatch = TRUE)
#' # Plot coverage of multiple primer sets
#' data(Comparison)
#' p.cvg.cmp <- plot_primer_cvg(primer.data[1:3], template.data[1:3])
setGeneric("plot_primer_cvg", 
    function(primers, templates, per.mismatch = FALSE, ...) {
        standardGeneric("plot_primer_cvg")
})
#' Plot Individual Primer Coverage.
#'
#' Shows which templates are covered by individual primers.
#'
#' @param p.df Primer data frame.
#' @param template.df Template data frame.
#' @param per.mismatch Whether to stratify by allowed mismatches.
#' @param excluded.seqs Identifiers of templates that should not be considered.
#' @param per.mismatch Whether the coverage should
#' be broken down for individual settings of allowed mismatches.
#' @return A bar plot showing the coverage of individual primers.
#' @keywords internal
setMethod("plot_primer_cvg", 
    signature(primers = "Primers", templates = "Templates"),
    function(primers, templates, per.mismatch = FALSE, groups = NULL) {
    
    if (length(primers) == 0 || nrow(primers) == 0 || length(templates) == 0) {
        return(NULL)
    }
    if (!is(primers, "Primers")) {
        stop("Please input a valid primer data frame.")
    }
    if (!is(templates, "Templates")) {
        stop("Please input a valid template data frame.")
    }
    if (per.mismatch) {
        p <- plot_primer_cvg_mismatches(primers, templates)
    } else {
        p <- plot_primer_cvg_unstratified(primers, templates, groups = groups)
    }
    return(p)
})
#' Plot Multiple Primer Coverages.
#'
#' Plots the coverage of individual primers for multiple sets.
#'
#' @param primers List with \code{Primers} objects.
#' @param templates List with \code{Templates} objects.
#' @param per.mismatch Whether the coverage should
#' be broken down for individual settings of allowed mismatches.
#' @return A bar plot showing the coverage of individual primers.
#' @keywords internal
setMethod("plot_primer_cvg", 
    signature(primers = "list", templates = "list"),
    function(primers, templates, per.mismatch = FALSE) {

    if (length(primers) == 0 || length(templates) == 0) {
        return(NULL)
    }
    if (per.mismatch) {
        warning("Not supported yet.")
        return(NULL)
    } else {
        # alias for existing exported function
        p <- plot_constraint(primers, NULL, "primer_coverage")
    }
    return(p)
})
#' Plot Individual Primer Coverage.
#'
#' Plots the coverage of individual primers.
#'
#' @param p.df Primer data frame.
#' @param template.df Template data frame.
#' @param groups Optional identifiers of template groups to be considered.
#' If not provided, all template groups are considered.
#' @return A bar plot showing the coverage of individual primers.
#' @keywords internal
plot_primer_cvg_unstratified <- function(p.df, template.df, groups = NULL) {
    # Select only the selected groups of templates:
     if (!is.null(groups) && !"all" %in% groups) { # select subset
        idx <- which(template.df$Group %in% groups)
        # set excluded seqs for updating the coverage
        excluded.seqs <- setdiff(template.df$Identifier[seq_len(nrow(template.df))], template.df$Identifier[idx])
        # select relevant templates
        template.df <- template.df[idx,]
        # re-evalaute coverage with the new templates
        p.df <- evaluate.diff.primer.cvg(p.df, excluded.seqs, template.df)
    }
    title <- "Template coverage per primer"
    xlab <- "Primer Identifier"
    title <- "Template coverage per primer and group"
    # annotate p.df with strat column
    m <- covered.seqs.to.idx(p.df$Covered_Seqs, template.df)
    plot.df <- NULL
    strat <- "Group"
    # TODO: maybe modify the code inline with prepare_mm_plot?
    for (i in seq_along(m)) {
        missing.groups <- unique(template.df$Group)
        new.data <- NULL
        if (length(m[[i]]) != 0) {
            hit.groups <- unique(template.df[m[[i]], strat])
            new.data <- data.frame(ID = rep(p.df[i, "ID"], length(hit.groups)))
            new.data[, strat] <- hit.groups
            cvd.idx <- m[[i]]
            cvg.idx <- lapply(seq_along(hit.groups), function(x) {
              s <- hit.groups[x]
              sel <- which(template.df[cvd.idx, strat] == s)
              cvd.idx[sel]
            })
            cvg.counts <- sapply(cvg.idx, length)
            cvd.seqs <- sapply(cvg.idx, function(x) paste(template.df$Identifier[x], 
              collapse = ","))
            available.seqs.per.group <- sapply(hit.groups, function(x) length(which(template.df$Group == x)))
            new.data$primer_coverage <- cvg.counts/available.seqs.per.group # percentage of group templates covered
            new.data$Covered_Seqs <- cvd.seqs
            missing.groups <- setdiff(unique(template.df$Group), new.data$Group)
        }
        # also add all combinations for missing groups -> ensures bar width stays constant throughout the plot observations
        missing.df <- data.frame(ID = rep(p.df$ID[i], length(missing.groups)), Group = missing.groups, 
                                primer_coverage = rep(0, length(missing.groups)), Covered_Seqs = rep("", length(missing.groups)))
        new.data <- rbind(new.data, missing.df)
        plot.df <- rbind(plot.df, new.data)
    }
    ###
    ylab <- "Covered Templates"
    pal <- getOption("openPrimeR.plot_colors")[strat] # the RColorBrewer palette to use
    group.colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(plot.df[, strat])))
    plot.df$ID <- factor(plot.df$ID)
    unique.cvd.idx <- compute.unique.covered.idx(plot.df, template.df)
    unique.cvg <- sapply(unique.cvd.idx, length)
    unique.cvg.ratio <- unique.cvg/nrow(template.df)
    p.df.unique <- plot.df
    p.df.unique$Coverage_Type <- "Unique Coverage"
    plot.df$Coverage_Type <- "Overall Coverage"
    p.df.unique$primer_coverage <- unique.cvg.ratio
    plot.df <- rbind(plot.df, p.df.unique)
    levels(plot.df$ID) <- abbreviate(levels(plot.df$ID), getOption("openPrimeR.plot_abbrev"))
    plot.df <- plot.df[plot.df$Coverage_Type == "Overall Coverage",] # only plot overall cvg
    bar.width <- 0.75
    dist.buffer <- (1- 0.75) / 2
    # add shading for every primer region in the plot to differentiate them better
    base.pos <- seq_along(unique(plot.df$ID))
    starts <- base.pos - 0.5 + dist.buffer
    ends <- base.pos + 0.5 - dist.buffer
    rects <- data.frame(xstart = starts, xend = ends,
                        ymin = 0, ymax = 1)
    rects$col <- factor(rep(c(0,1), nrow(rects))[seq_len(nrow(rects))])
    # only retain rectangles alternately
    rects <- rects[rects$col == 0, ]
    p <- ggplot() +
        geom_bar(data = plot.df, stat = "identity", position = "dodge", 
                    aes_string(x = "ID", y = "primer_coverage",
                    fill = strat), width = bar.width, show.legend = TRUE) + 
        geom_rect(data = rects, aes_string(xmin = "xstart", xmax = "xend", ymin = "ymin", ymax = "ymax"), 
                  fill = "grey60", alpha = 0.15, inherit.aes = FALSE, show.legend = FALSE) +
        xlab(xlab) + ggtitle(title) + ylab(ylab) + 
        theme(axis.text.x = element_text(angle = 60, 
                                hjust = 1)) +
        scale_fill_manual(values = group.colors) + 
        scale_y_continuous(limits = c(0, 1), labels = scales::percent)
    return(p)
}

#' Data for Mismatch Primer Coverage Plot.
#'
#' Ensures that there's an entry for every possible mismatch setting.
#'
#' @param primer.df A \code{Primers} object.
#' @param template.df A \code{Templates} object.
#' @return A data frame for plotting mismatch primer coverage.
#' @keywords internal
get_primer_cvg_mm_plot_df <- function(primer.df, template.df) {
    full.df <- prepare_mm_plot(primer.df, template.df)
    full.df <- full.df[full.df$Coverage_Type == "constrained",]
    # go from individual mismatches to individual coverage events
    df <- ddply(full.df, c("Primer", "Template", "Group"), summarize,
                            Position = unique(substitute(Position_3terminus)), 
                            Number_of_mismatches = unique(substitute(Number_of_mismatches)))
    # for every primer and group, ensure that there's an entry for every number of mismatches:
    count.df <- ddply(df, c("Primer", "Group", "Number_of_mismatches"), summarize,
                            Coverage = length(unique(substitute(Template))))
    if (nrow(df) == 0) {
        # just plot a single facet to show nothing is covered
        max.mm <- 0 
    } else {
        max.mm <- max(df$Number_of_mismatches)
    }
    result.data <- vector("list", max.mm + 1)
    for (i in seq(0, max.mm)) {
        additional.df <- data.frame(Primer = unlist(lapply(seq_len(nrow(primer.df)), function(x) rep(primer.df$ID[x], length(unique(template.df$Group))))),
                                    Group = unlist(lapply(seq_len(nrow(primer.df)), function(x) unique(template.df$Group))),
                                    Number_of_mismatches = i, Coverage = 0)
        count.df <- merge(count.df, additional.df, all = TRUE)
    }
    # ensure that for duplicate entries (0 coverage entries), the one with the maximal coverage is chosen:
    count.df <- ddply(count.df, c("Primer", "Group", "Number_of_mismatches"), summarize,
                            Coverage = max(substitute(Coverage)))
    # make the counts cumulative
    count.df$Cumulative_Coverage <- ave(count.df$Coverage, count.df$Primer, count.df$Group, FUN = cumsum)
    # determine ratio of covered seqs per group
    group.counts <- table(template.df$Group) # match to count.df$Group for division
    m <- match(count.df$Group, names(group.counts))
    count.df$Coverage_Ratio <- count.df$Cumulative_Coverage / as.vector(group.counts[m])
    o <- order(unique(as.character(count.df$Group)))
    count.df$Group <- factor(count.df$Group, levels = unique(as.character(count.df$Group))[o])
    colnames(count.df)[colnames(count.df) == "Number_of_mismatches"] <- "Maximal_mismatches" 
    count.df$Primer <- factor(abbreviate(count.df$Primer, getOption("openPrimeR.plot_abbrev")), levels = abbreviate(levels(count.df$Primer), getOption("openPrimeR.plot_abbrev"))) # shorter identifiers
    return(count.df)
}
#' Plot of Individual Primer Coverage and Mismatches.
#'
#' Plots the coverage of individual primers for different mismatch settings.
#'
#' @param primer.df A \code{Primers} object.
#' @param template.df A \code{Templates} object.
#' @param groups Optional identifiers of template groups to be considered.
#' If not provided, all template groups are considered.
#' @param nfacets A numeric providing the number of facet columns to use. 
#' By default, \code{nfacets} is set to \code{NULL} such that a
#' suitable number of columns is chosen automatically.
#' @return A bar plot showing the coverage of individual primers for different mismatch settings.
#' @keywords internal
plot_primer_cvg_mismatches <- function(primer.df, template.df, groups = NULL,
                                       nfacets = NULL) {
    # select template group subset
    if (!is.null(groups) && !"all" %in% groups) { # select subset
        idx <- which(template.df$Group %in% groups)
        template.df <- template.df[idx,]
    }
    # retrieve cvg stats of each primer:
    count.df <- get_primer_cvg_mm_plot_df(primer.df, template.df)
    pal <- getOption("openPrimeR.plot_colors")["Group"] # the RColorBrewer palette to use
    group.colors <- colorRampPalette(brewer.pal(8, pal))(length(unique(count.df[, "Group"])))
    p <- ggplot(count.df) + 
        geom_bar(width = 0.75, position = "stack", stat = "identity",
            aes_string("Primer", 
                "Cumulative_Coverage",
                fill = "Group")) + 
        xlab("Primer") + ggtitle("Mismatch primer coverage") + 
        ylab("Number of covered templates") + 
        theme(axis.text.x = element_text(
                angle = 90,  # angle @ 90 to prevent overplotting when facetting
                hjust = 1)) +
        scale_fill_manual(values = group.colors) + 
        facet_wrap(~Maximal_mismatches, ncol = nfacets,
            labeller = label_bquote("Mismatches"<=.(substitute(Maximal_mismatches))))
    return(p)
}

Try the openPrimeR package in your browser

Any scripts or data that you put into this service are public.

openPrimeR documentation built on Nov. 16, 2020, 2 a.m.