R/optimization_greedy.R

Defines functions select.best.opti.result select.primers.by.cvg optimize.primer.cvg check_correspondence filter_primers.by.Tm.delta select.best.primer.idx reorder.primer.table

###############
# Greedy optimization of primer sets
#################

#' Reorder Primers 
#'
#' Reorders a primer set according to the IDs of primers.
#'
#' @param filtered.primers Primer data frame.
#' @param primer.ID.order new ordering of IDs in the data frame.
#' @return Reordered primer data frame.
#' @keywords internal
reorder.primer.table <- function(filtered.primers, primer.ID.order) {
    # reorders the input primers table according to the given ID order
    # filtered.primers: input primer df primer.ID.order: new order of ID column in
    # primer.df
    if (length(primer.ID.order) == 0) {
        return(filtered.primers)
    }
    filtered.primers$ID <- factor(filtered.primers$ID, levels = primer.ID.order)
    o <- order(filtered.primers$ID)
    filtered.primers <- filtered.primers[o, ]
    return(filtered.primers)
}
#' Greedy Choice
#'
#' Selects the currently best primer for Greedy primer selection.
#'
#' @param result Data frame of current optimized primer data set that is to be augmented.
#' @param primers Data frame of candidate primers for addition to \code{result}.
#' @param deltaG.cutoff Free energy cutoff for cross-dimerization.
#' @param target.temp Target annealing temperature in Celsius.
#' @param primer_conc Primer concentration.
#' @param na_salt_conc Sodium ion concentration.
#' @param mg_salt_conc Magensium ion concentration.
#' @param k_salt_conc Potassium ion concentration.
#' @param tris_salt_conc Tris ion concentration.
#' @return The index of a suitable primer according to Greedy selection.
#' @keywords internal
select.best.primer.idx <- function(result, primers, deltaG.cutoff, target.temp,
    primer_conc, na_salt_conc, mg_salt_conc, k_salt_conc, tris_salt_conc)  {

    if (length(primers) == 0 || nrow(primers) == 0) {
        return(NULL)
    }
    candidates <- NULL  # idx of all selection candidates
    bad.primer.df <- data.frame(Identifier = numeric(0), Index = numeric(0), melting_temp = numeric(0), 
        deltaG_dimer = numeric(0))
    for (idx in seq_len(nrow(primers))) {
        # check whether constraints are fulfilled by selection candidates best primers
        # already found, check if cur primer cvg is the same as the best primer cvg of
        # the current primer is not as high as the selected ones -> don't consider for
        # inclusion
        if (length(candidates) != 0) {
            if (primers[idx, "primer_coverage"] < primers[candidates[1], "primer_coverage"]) {
                break
            }
        }
        if (!is.na(deltaG.cutoff) && length(result) != 0 && nrow(result) != 0) {
            # check for cross-dimerization between current primers and candidate primer
            tmp.result <- suppressWarnings(my_rbind(result, primers[idx, ]))
            comp <- compute.all.cross.dimers(tmp.result, primer_conc, na_salt_conc, 
                    mg_salt_conc, k_salt_conc, tris_salt_conc, 
                    target.temp, check.idx = nrow(tmp.result), 
                    no.structures = TRUE)
            if (any(comp$DeltaG < deltaG.cutoff)) {
                # any deltaG exceeds the allowed cutoff -> don't select this primer
                min.deltaG <- min(comp$DeltaG)
                #message(paste("bad primer id: ", primers[idx, "Identifier"], ", dimerization deltaG: ", 
                  #min.deltaG))
                bad.primer <- data.frame(Identifier = primers[idx, "Identifier"], 
                  Index = idx, melting_temp = NA, deltaG_dimer = min.deltaG)
                bad.primer.df <- rbind(bad.primer.df, bad.primer)
            } else {
                candidates <- c(candidates, idx)
            }
        } else {
            # if there's no results yet, the first primer is automatically a candidate ..
            candidates <- c(candidates, idx)
        }
    }
    if (length(candidates) != 0) {
        # select the best primer from the candidates (all have the same cvg) by
        # resolution through primer efficiency (if available) select primer by highest
        # primer efficiency
        if ("primer_efficiency" %in% colnames(primers)) {
            o <- order(primers[candidates, "mean_primer_efficiency"], decreasing = TRUE)
            idx <- candidates[o][1]
        } else {
            idx <- candidates[1]
        }
    }
    res <- list(sel_idx = idx, filtered_info = bad.primer.df)
    return(res)
}
#' Filter by Melting Temperature Difference
#'
#' Filters primers by melting temperature differences.
#'
#' @param target.temp Target melting temperature in Celsius.
#' @param selected.primers Current candidate primer data frame.
#' @param max.Tm.delta Maximum allowed difference of primer melting temperatures to target temperature.
#' @return Filtered primer data frame.
#' @keywords internal
filter_primers.by.Tm.delta <- function(target.temp, selected.primers, max.Tm.delta) {
    # target.temp: target melting temperature of primers in the optimization set
    # selected.primers: current candidates for selection max.Tm.delta: maximal
    # allowed difference of primer melting temperature to target temperature
    r <- selected.primers
    sel.idx <- NULL
    filtered.df <- data.frame(Identifier = numeric(0), Index = numeric(0), melting_temp = numeric(0), 
        deltaG_dimer = numeric(0))  # excluded primers
    if (!is.na(max.Tm.delta)) {
        Tm.diff <- abs(selected.primers$melting_temp - target.temp)
        sel.idx <- which(Tm.diff > max.Tm.delta)
        if (length(sel.idx) != 0) {
            f.df <- data.frame(Identifier = selected.primers$Identifier[sel.idx], 
                Index = sel.idx, melting_temp = Tm.diff[sel.idx], deltaG_dimer = NA)
            filtered.df <- rbind(filtered.df, f.df)
            r <- selected.primers[-sel.idx, ]
        }
    }
    out <- list(data = r, filtered = Primers(filtered.df))  # data: selected primers, filtered: excluded primers. melting temp entry of filtered: refers to DeltaTm!
    return(out)
}
#' Check of Primer and Template Correspondence.
#'
#' Checks whether the primers relate to the correct templates.
#'
#' @param primer.df An object of class \code{Primers}.
#' @param template.df An object of class \code{Templates}.
#' @return \code{TRUE} if the primers and templates seem to correspond,
#' \code{FALSE} otherwise.
#' @keywords internal
check_correspondence <- function(primer.df, template.df) {
    # check whether primers and templates correspond to one another
    cvd.seqs <- unlist(sapply(strsplit(primer.df$Covered_Seqs, split = ","), function(x) as.numeric(x)))
    if (length(cvd.seqs) == 0) {
        # no covered seqs -> always corresponds
        return(TRUE)
    }
    cvd.seqs <- cvd.seqs[!is.na(cvd.seqs)] # exclude primers for which cvg is not available.
    m <- match(cvd.seqs, template.df$Identifier)
    if (any(is.na(m))) {
        return(FALSE)
        msg <- paste("Could not match all coverage events to the input template data frame.",
            "Please check whether the primers and templates correspond.")
        stop(msg)
    } else {
        return(TRUE)
    }
}
#' Greedy Optimization
#'
#' Greedy approach for solving the primer set coverage problem.
#'
#' @param primers Primer data frame to be optimized.
#' @param template.df Template data frame.
#' @param mode.directionality Primer direction.
#' @param cur.opti.constraints List with optimization constraint settings.
#' @param target.temp Target annealing temperature of the optimized primer set in Celsius.
#' @param allowed.mismatches The number of mismatches primers are allowed to have with the templates.
#' @param opti.limits List with optimization limits.
#' @param primer_conc Primer concentration.
#' @param template_conc Template concentration.
#' @param na_salt_conc Sodium ion concentration.
#' @param mg_salt_conc Magensium ion concentration.
#' @param k_salt_conc Potassium ion concentration.
#' @param tris_salt_conc Tris ion concentration.
#' @param updateProgress Shiny progress callback function.
#' @return List with optimization data.
#' @keywords internal
optimize.primer.cvg <- function(primers, template.df, mode.directionality, 
    cur.opti.constraints, target.temp, allowed.mismatches, opti.limits, primer_conc, 
    na_salt_conc, mg_salt_conc, k_salt_conc, tris_salt_conc, updateProgress = NULL) {

    #print("optimizing primer cvg")
    deltaG.cutoff <- NA
    if ("cross_dimerization" %in% names(cur.opti.constraints)) {
        deltaG.cutoff <- cur.opti.constraints[["cross_dimerization"]]["min"]
    }
    l.df <- template.df
    max.cvg <- get_cvg_ratio(primers, template.df)
    cur.cvg <- 0
    it <- 0
    result <- NULL # not initialized here due to Primers class.
    all.bad.primers <- data.frame(Identifier = numeric(0), Index = numeric(0), 
                        melting_temp = numeric(0), deltaG_dimer = numeric(0))
    # order primers by cvg for greedy selection
    selected.primers <- primers[order(primers$primer_coverage, decreasing = TRUE), ]  
    while ((cur.cvg != max.cvg) && nrow(selected.primers) >= 1) {
        it <- it + 1
        #message(it)
        best.primer.data <- select.best.primer.idx(result, selected.primers, 
            deltaG.cutoff, target.temp, primer_conc, 
            na_salt_conc, mg_salt_conc, k_salt_conc, tris_salt_conc)
        if (length(best.primer.data) == 0 || is.na(best.primer.data$sel_idx)) {
            # no primer could be selected..
            selected.primers <- primers[FALSE, ]
            break
        } else {
            best.primer.idx <- best.primer.data$sel_idx
            bad.primer.df <- best.primer.data$filtered_info  # e.g. cross-dimerizing primers
            all.bad.primers <- rbind(all.bad.primers, bad.primer.df)
            best.covered <- as.numeric(unlist(strsplit(selected.primers[best.primer.idx, "Covered_Seqs"], split = ",")))
            if (length(best.covered) == 0) {
                # cannot improve anymore: best (highest cvg primer) didn't cover anything
                break
            }
            # selected.primers gives only the coverage count for sequences that haven't been covered yet :-)
            cumulative.coverage <- sum(result$Marginal_Coverage_Gain) + selected.primers[best.primer.idx, "primer_coverage"]  
            # compute current coverage
            cur.cvg <- cumulative.coverage/nrow(template.df)
            # use primer entry from original table to have the 'true' covered seqs in the
            # result, not only the marginal gain (primer_coverage is not the marginal gain!!)
            primer.entry <- primers[match(selected.primers[best.primer.idx, "Identifier"], primers$Identifier), ]
            entry <- data.frame(Iteration = it, 
                                Marginal_Coverage_Gain = selected.primers[best.primer.idx, "primer_coverage"], 
                                Cumulative_Coverage = cumulative.coverage, Cumulative_Coverage_Ratio = cur.cvg, 
                                primer.entry, stringsAsFactors = FALSE)
            result <- rbind(result, entry)
            rm.idx <- c(bad.primer.df$Index, best.primer.idx)
            selected.primers <- selected.primers[-rm.idx, ]  # remove the selected primer and selection candidates that were filtered this round
            # re-compute coverage by removing templates that are covered by newly added primer (best.covered)
            selected.primers <- evaluate.diff.primer.cvg(selected.primers, best.covered, template.df)  
        }
    }
    # reset cvg to consistent values from before (not marginal gain anymore):
    if (length(result) != 0) {
        m <- match(result$Identifier, primers$Identifier)
        result[, colnames(primers)] <- primers[m, ]
        # remove redundant primers
        cvg.matrix <- get.coverage.matrix(primers, template.df)
        primer.idx <- remove.redundant.cols(m, cvg.matrix)  # updated primer.idx with unnecessary primers removed
        result <- result[match(primers$Identifier[primer.idx], result$Identifier), , drop = FALSE]
    } else {
        result <- primers # could not optimize anything
    }
    out <- list(data = Primers(result), bad_primers = all.bad.primers)
    return(out)
}
#' Greedy Optimization.
#'
#' Greedy approach for solving the primer set coverage problem.
#'
#' @param primers Primer data frame to be optimized.
#' @param settings A \code{DesignSettings} object.
#' @param template.df Template data frame.
#' @param mode.directionality Primer direction.
#' @param required.cvg Target coverage ratio.
#' @param allowed.mismatches The number of mismatches primers are allowed to have with the templates.
#' @param primer_conc Primer concentration.
#' @param na_salt_conc Sodium ion concentration.
#' @param mg_salt_conc Magensium ion concentration.
#' @param k_salt_conc Potassium ion concentration.
#' @param tris_salt_conc Tris ion concentration.
#' @param template_conc Template data frame.
#' @param allowed.other.binding.ratio Ratio of primers allowed to bind to non-target regions.
#' @param allowed.stop.codons  Consider mismatch binding events that induce stop codons.
#' @param allowed.region.definition Definition of the target binding sites used for evaluating the coverage.
#' If \code{allowed.region.definition} is \code{within}, primers have to lie within the allowed binding region.
#' If \code{allowed.region.definition} is \code{any}, primers have to overlap with the allowed binding region.
#' The default is that primers have to bind within the target binding region.
#' @param disallowed.mismatch.pos The number of positions from the primer 3' end where mismatches should not be allowed.
#' All primers binding templates with mismatches within \code{disallowed.mismatch.pos} from the 3' end are disregarded.
#' @param target.temps Target melting temperatures for optimized sets in Celsius.
#' @param fw.primers List with already optimized primer data frames corresponding to \code{target.temps}.
#' @param updateProgress Shiny progress callback function.
#' @return List with optimization data.
#' @keywords internal
select.primers.by.cvg <- function(primers, settings, template.df, mode.directionality = c("fw", "rev"), 
    required.cvg = 1, allowed.mismatches, primer_conc, na_salt_conc, mg_salt_conc, 
    k_salt_conc, tris_salt_conc, template_conc, allowed.other.binding.ratio,
    allowed.stop.codons, allowed.region.definition = c("within", "any"), 
    disallowed.mismatch.pos, target.temps = NULL, fw.primers = NULL, updateProgress = NULL) {

    if (length(mode.directionality) == 0) {
        stop("Please supply the 'mode.directionality' arg.")
    }
    mode.directionality <- match.arg(mode.directionality)
    if (length(allowed.region.definition) == 0) {
        stop("Please supply the 'allowed.region.definition' arg.")
    }
    allowed.region.definition <- match.arg(allowed.region.definition)
    opti.constraints <- opti(settings)
    opti.limits <- optiLimits(settings)
    if (length(primers) == 0) {
        return(NULL)
    } else if (nrow(primers) == 0) {
        # nothing to optimize
        return(list(opti = primers, all_results = list(primers), all_used_constraints = list(settings), 
            used_constraints = settings))
    }
    # add primer cvg if not present yet: needed for optimization
    if (!"primer_coverage" %in% colnames(primers)) {
        stop("Optimization requires pre-computed primer coverage")
    }
    # determine computation constraints:
    Tm.brackets <- create.Tm.brackets(primers, template.df, settings, target.temps)
    primers <- Tm.brackets$primers
    Tm.groups <- Tm.brackets$group_idx
    Tm.group.df <- Tm.brackets$df
    opti.results <- vector("list", nrow(Tm.group.df))
    # 1. precompute constraints for all temperatures
    diagnostic.location <- NULL
    Tm.data <- compute.Tm.sets(primers, template.df, Tm.brackets, settings, mode.directionality, 
        primer_conc, template_conc, na_salt_conc, mg_salt_conc, k_salt_conc, tris_salt_conc, 
        allowed.mismatches, allowed.other.binding.ratio, 
        allowed.stop.codons, allowed.region.definition, disallowed.mismatch.pos, 
        TRUE, required.cvg,
        fw.primers, diagnostic.location, updateProgress)
    Tm.sets <- Tm.data$sets
    Tm.settings <- Tm.data$settings
    #############
    # 2. optimize all temperature primer sets
    Tm.group.idx <- NULL
    max.nbr.relaxations <- 3
    opti.results.data <- vector("list", length(Tm.sets)) # for debug
    for (Tm.group.idx in seq_along(Tm.sets)) { # for debug only
    #opti.results.data <- foreach(Tm.group.idx = seq_along(Tm.sets), .combine = c) %dopar% {
        Tm.set <- Tm.sets[[Tm.group.idx]]
        cur.settings <- Tm.settings[[Tm.group.idx]]
        if (length(Tm.set) == 0 || nrow(Tm.set) == 0) {
            # no Tm set -> nothing to optimize. this shouldn't really happen except if there's no data.
            Tm.out <- list(list(data = Tm.set, used_constraints = opti.constraints))
        } else {
            #message(paste("Nbr of primers: ", dim(Tm.set)[1], sep = ""))
            target.temp <- Tm.group.df$target_Tm[Tm.group.idx]
            annealing.temp <- Tm.group.df$annealing_temp[Tm.group.idx]
            if (is.function(updateProgress)) {
                detail <- paste("Tm group: ", Tm.group.idx, sep = "")
                updateProgress(1/length(opti.results), detail, "inc")
            }
            #message(paste("Optimizing for Tm-group: ", Tm.group.idx, "/", nrow(Tm.group.df), 
            #    sep = ""))
            target.cvg <- min(get_cvg_ratio(Tm.set, template.df), required.cvg)
            result.data <- optimize.primer.cvg(Tm.set, template.df, mode.directionality, 
                opti(cur.settings), annealing.temp, allowed.mismatches, 
                opti.limits, primer_conc, na_salt_conc, mg_salt_conc, k_salt_conc, 
                tris_salt_conc, updateProgress)
            result <- result.data$data
            result.cvg <- get_cvg_ratio(result, template.df)
            # relaxation
            while (result.cvg < target.cvg && "cross_dimerization" %in% names(opti(cur.settings))) {
                # don't lose any coverage here in the optimization ...
                # adjust DeltaG
                message("Greedy deltaG relaxation. Current coverage: ", result.cvg, 
                            "Target coverage: ", target.cvg, 
                            ". Old deltaG: ", constraints(cur.settings)$cross_dimerization["min"])
                constraints(cur.settings)$cross_dimerization <- relax.opti.constraints(opti(cur.settings), optiLimits(Tm.settings[[Tm.group.idx]]), 
                                                                                      opti(Tm.settings[[Tm.group.idx]]))$cross_dimerization
                constraintLimits(cur.settings)$cross_dimerization <- set.new.limits(optiLimits(cur.settings), optiLimits(Tm.settings[[Tm.group.idx]]), 
                                                                                      opti(Tm.settings[[Tm.group.idx]]))$cross_dimerization
                result.data <- optimize.primer.cvg(Tm.set, template.df, mode.directionality, 
                    opti(cur.settings), annealing.temp, allowed.mismatches, 
                    opti.limits, primer_conc, na_salt_conc, mg_salt_conc, k_salt_conc, 
                    tris_salt_conc,  updateProgress)
               result <- result.data$data
               result.cvg <- get_cvg_ratio(result, template.df)
               message("-> New coverage: ", result.cvg, 
                            ". New deltaG: ", constraints(cur.settings)$cross_dimerization["min"])
            }
            Tm.out <- list(list(data = result.data$data, used_constraints = cur.settings))
            opti.results.data[[Tm.group.idx]] <- Tm.out[[1]] # for debug only
        }
    }
    opti.results <- lapply(opti.results.data, function(x) x$data)  # extract optimized sets
    names(opti.results) <- Tm.group.df$target_Tm # uses the target Tm's before relaxation
    # remove empty results
    opti.results <- opti.results[lengths(opti.results) != 0]

    best.idx <- select.best.opti.result(opti.results, template.df)
    if (length(best.idx) == 0) {
        # could not determine a solution, arbitrarily return the first opti.result
        best.idx <- 1
    }
    opti.result <- opti.results[[best.idx]]
    all.used.constraints <- lapply(opti.results.data, function(x) x$used_constraints)  # all used settings
    result <- list(opti = opti.result, 
        used_constraints = opti.results.data[[best.idx]]$used_constraints, 
        all_used_constraints = all.used.constraints, 
        all_results = opti.results)
    #message("optimization w/ greedy finished")
    return(result)
}
#' Selection of Best Greedy Result
#'
#' Selects best greedy primer data set.
#'
#' @param opti.results List with primer data frames for different target melting temperatures.
#' @param template.df Template data frame.
#' @return The index of the best primer data set.
#' @keywords internal
select.best.opti.result <- function(opti.results, template.df) {
    # could have a single function for ILP and greedy result selection?
    cvg.per.group <- sapply(opti.results, function(x) if (length(x) != 0 && nrow(x) != 
        0) {
        max(x$Cumulative_Coverage)
    } else {
        0
    })
    if (all(cvg.per.group == 0)) {
        message("Could not find a solution with primers covering any of the templates.")
        if (length(opti.results) > 0) {
            return(1)
        } else {
            return(NULL)
        }
    }
    cvg.ratios <- sapply(cvg.per.group, function(x) x/nrow(template.df))
    #message("Cumulative coverage per group: ")
    #message(cvg.per.group)
    #message("Coverage ratios: ")
    #message(cvg.ratios)
    # TIE BREAKING: at the moment: just use primer_efficiency: later: score primer
    # set using all available parameters
    eps <- 0.01  # if the cvg difference is just 1%, choose the set with better efficiency
    m.cvg <- max(cvg.per.group, na.rm = TRUE)
    max.idx <- which(cvg.per.group == m.cvg)[1]
    sel.idx <- which((cvg.ratios[max.idx] - cvg.ratios) < eps)
    idx <- NULL
    if ("primer_efficiency" %in% colnames(opti.results[[1]])) {
        eff <- unlist(lapply(opti.results[sel.idx], function(x) mean(x$mean_primer_efficiency, 
            na.rm = TRUE)))
        # message(opti.results[[1]]$primer_efficiency)
        #message(paste("Primer set mean efficiencies:", eff, sep = ""))
        idx <- sel.idx[which.max(eff)]
    } else {
        # no primer efficiency available -> just select the max
        idx <- max.idx[1]
    }
    #message(paste("Selected primer set: ", idx, sep = ""))
    return(idx)
}

Try the openPrimeR package in your browser

Any scripts or data that you put into this service are public.

openPrimeR documentation built on Nov. 16, 2020, 2 a.m.