R/hap2seqs.R
In geneHapR: Gene Haplotype Statistics, Phenotype Association and Visualization

Documented in seqs2hap trimSeqs

#' @name seqs2hap
#' @title Generate Hap Results from Seqs
#' @description generate hapResults from aligned and trimed sequences
#' @examples
#' \donttest{
#' data("geneHapR_test")
#' seqs
#' seqs <- trimSeqs(seqs,
#'          minFlankFraction = 0.1)
#' seqs
#' hapResult <- seqs2hap(seqs,
#'                       Ref = names(seqs)[1],
#'                       hetero_remove = TRUE, na_drop = TRUE,
#'                       maxGapsPerSeq = 0.25,
#'                       hapPrefix = "H")
#' }
#' @importFrom Biostrings letterFrequency width
#' @importFrom methods as
#' @param seqs object of DNAStringSet or DNAMultipleAlignment class
#' @param Ref the name of reference sequences.
#' Default as the name of the first sequence
#' @param hapPrefix prefix of hap names. Default as "H"
#' @param maxGapsPerSeq value in `[0, 1]` that indicates the maximum
#' fraction of gaps allowed in each seq after alignment (default as 0.25).
#' Seqs with gap percent exceed that will be dropped
#' @param hetero_remove whether remove accessions contains hybrid site or not.
#' Default as `TRUE`
#' @param na_drop whether drop sequeces contain "N"
#' Default as `TRUE`.
#' @param pad The number length in haplotype names should be extend to.
#' @param chrName the Name should be used for haplotype
#' @param ... Parameters not used.
#' @inherit hap_summary examples
#' @return
#' object of hapResult class
#' @export
seqs2hap <- function(seqs,
                     Ref = names(seqs)[1],
                     hetero_remove = TRUE, na_drop = TRUE,
                     maxGapsPerSeq = 0.25,
                     hapPrefix = "H", pad = 3,
                     chrName = "Chr0",
                     ...) {
    seqs <- as(seqs, "DNAStringSet")
    options <- c(hapPrefix = hapPrefix)
    RefSeq <- seqs[Ref]
    accAll <- names(seqs)
    if (names(seqs)[1] != Ref)
        seqs <- seqs[c(Ref, setdiff(names(seqs), Ref))]
    allS_new <- allS


    # remove seqs contains too much gaps
    rowGaps <- Biostrings::letterFrequency(seqs, "-")[, 1]
    nc <- Biostrings::width(seqs)[1]
    probe <- rowGaps / nc < maxGapsPerSeq
    if (FALSE %in% probe) {
        message(table(probe)["FALSE"],
                " seq(s) will be removed due to too much gaps")
        message("  ", paste(names(seqs)[!probe], collapse = ", "))
        seqs <- seqs[probe]
    }

    # seq2hapData
    hapData <-
        seq2hap_data(seqs, allS_new = allS_new, RefSeq = RefSeq)
    allS_new <- hapData$allS_new
    hap <- hapData$hap

    # Drop hyb or N
    if (hetero_remove) {
        hap[!hap %in% allS_new$homo] <- NA
        hap <- na.omit(hap)
        options <- c(options, hetero_remove = "YES")
    } else
        options <- c(options, hetero_remove = "NO")
    if (na_drop) {
        hap[hap == "N"] <- NA
        hap <- na.omit(hap)
        options <- c(options, NA_remove = "YES")
    } else
        options <- c(options, NA_remove = "NO")

    # set meta info
    POS <- colnames(hap)
    ALLELE <-
        apply(hap, 2, function(x)
            paste(unique(x), collapse = ","))
    ALLELE <- stringr::str_replace(ALLELE, ",", "/")
    INFO <- rep(NA, ncol(hap))
    CHR <- rep(chrName, ncol(hap))

    # assign hapID
    hap <- assign_hapID(hap, hapPrefix, pad)
    hap <- rbind(
        CHR = c("CHR", CHR, ""),
        POS = c("POS", POS, ""),
        INFO = c("INFO", INFO, ""),
        ALLELE = c("ALLELE", ALLELE, ""),
        hap
    )

    hap <- remove_redundancy_col(hap)

    # set attributes
    class(hap) <- unique(c('hapResult', "data.frame"))
    attr(hap, "AccAll") <- accAll
    accRemain <- hap$Accession[hap$Accession != ""]
    attr(hap, "AccRemain") <- accRemain
    attr(hap, "AccRemoved") <- accAll[!accAll %in% accRemain]
    attr(hap, "options") <- options
    hap2acc <- hap$Accession[-c(1:4)]
    names(hap2acc) <- hap$Hap[-c(1:4)]
    attr(hap, "hap2acc") <- hap2acc
    attr(hap, "freq") <- table(hap$Hap[-c(1:4)])
    return(hap)
}


#' #' @name seqs2hap
#' #' @description allign imported sequences
#' #' @usage allignSeqs(seqs, ...)
#' #' @param ... parameters will pass to `muscle::muscle()`
#' #' @seealso
#' #' \code{\link[muscle:muscle]{muscle::muscle()}}
#' #' @export
#' allignSeqs <- function(seqs, ...) {
#'     muscle::muscle(seqs, ...)
#' }



#' @name seqs2hap
#' @usage
#' trimSeqs(seqs,
#'          minFlankFraction = 0.1)
#' @importFrom Biostrings as.matrix
#' @param minFlankFraction A value in `[0, 1]` that indicates the minimum
#' fraction needed to call a gap in the consensus string (default as 0.1).
#' @export
trimSeqs <- function(seqs,
                     minFlankFraction = 0.1) {
    mseqs <- Biostrings::as.matrix(seqs)
    nr <- nrow(mseqs)
    gaps <- apply(mseqs, 2, function(x)
        table(x)["-"])
    gaps <- gaps / nr

    trimLeft <- 0
    for (i in seq_len(length(gaps))) {
        if (gaps[i] <= minFlankFraction) {
            trimLeft <- i
            break
        }
    }

    trimRight <- 0
    for (j in rev(seq_len(length(gaps)))) {
        if (gaps[j] <= minFlankFraction) {
            trimRight <- j
            break
        }
    }

    if (trimLeft >= trimRight)
        stop(
            "Too many gaps found in flank of aligned seqs.
Please consider elevate 'minFlankFraction' OR align and trim seqs manually"
        )
    seqs <- as(seqs, "DNAStringSet")
    seqs <- Biostrings::subseq(seqs, trimLeft, trimRight)
    return(seqs)
}


# return data.frame,
seq2hap_data <- function(seqs,
                         allS_new = allS_new,
                         RefSeq = RefSeq) {
    # get refSeq trimed length
    Ref <- names(RefSeq)
    # convert DNAset into matrix
    RefSeq <- as.matrix(RefSeq)
    mseqs <- as.matrix(seqs)
    mRef <- mseqs[Ref, ]

    # trim addtional oligonucleotide exceed than Ref
    trimLength <- 0
    for(i in mRef){
        if(i == "-") trimLength <- trimLength + 1 else break
    }
    if(trimLength > 0) {
        probe <- seq_len(trimLength)
        mseqs <- mseqs[, -probe]
        mRef <- mRef[-probe]
    }
    trimLength <- 0
    for(i in rev(mRef)){
        if(i == "-") trimLength <- trimLength + 1 else break
    }
    if(trimLength > 0) {
        probe <- ncol(mseqs) - seq_len(trimLength) + 1
        mseqs <- mseqs[, -probe]
        mRef <- mRef[-probe]
    }

    # deal with indels
    newmseqs <-
        matrix(
            nrow = nrow(mseqs),
            ncol = 0,
            dimnames = list(rownames(mseqs))
        )
    POS <- c()
    n <- 1
    for (i in seq_len(ncol(mseqs))) {
        if (!"-" %in% mseqs[, i]) {
            newmseqs <- cbind(newmseqs, mseqs[, i])
            POS <- c(POS, n)
        } else {
            newmseqs[, ncol(newmseqs)] = paste0(newmseqs[, ncol(newmseqs)], mseqs[, i])
        }
        if (mRef[i] != "-")
            n <- n + 1
    }
    colnames(newmseqs) <- POS

    # remove redudant cols
    probe <- apply(newmseqs, 2, function(x)
        length(unique(x)))
    newmseqs <- newmseqs[, probe != 1]

    # remove "-"
    newmseqs[, ] <- stringr::str_remove_all(newmseqs, "-")

    # update allSite information
    ALLELE <-
        apply(newmseqs, 2, function(x)
            paste(unique(x), collapse = ","))
    ALLELE <- stringr::str_replace(ALLELE, ",", "/")
    for (i in unique(ALLELE)) {
        ALi <- unlist(strsplit(i, "/"))
        allS_new <- update_allS(allS_new = allS_new,
                                REF = ALi[1],
                                ALT = ALi[2])
    }

    newmseqs <- t(newmseqs) %>%
        toupper() %>%
        t()
    return(list(hap = newmseqs, allS_new = allS_new))
}