R/hap2vcf.R

Defines functions remove_redundancy_col assign_hapID vcf2hap_data vcf2hap

Documented in vcf2hap

#' @name vcf2hap
#' @title Generat Haps from VCF
#' @description
#' Generate hapResult from vcfR object
#' A simple filter by position was provided in this function,
#' however it's prefer to filter VCF (vcfR object) through
#' \code{\link[geneHapR:filter_vcf]{filter_vcf()}}.
#' @author Zhangrenl
#' @examples
#' data("geneHapR_test")
#' hapResult <- vcf2hap(vcf)
#' @param vcf vcfR object imported by `import_vcf()`
#' @param filter_Chr not used
#' @param filter_POS not used
#' @param hapPrefix prefix of hap names, default as "H"
#' @param hetero_remove whether remove accessions contains hybrid site or not.
#' Default as `TRUE`
#' @param pad The number length in haplotype names should be extend to.
#' @param na_drop whether remove accessions contains unknown allele site or not
#' Default as `TRUE`.
#' @param ... Parameters not used
#' @seealso
#' extract genotype from vcf:
#' \code{\link[vcfR:extract_gt_tidy]{vcfR::extract_gt_tidy()}},
#' import vcf files:
#' \code{\link[geneHapR:import_vcf]{import_vcf()}} (preferred) and
#' \code{\link[vcfR:read.vcfR]{vcfR::read.vcfR()}},
#' filter vcf according **position** and **annotations**:
#' \code{\link[geneHapR:filter_vcf]{filter_vcf()}}
#' @return
#' object of hapResult class
#' @importFrom vcfR getPOS getCHROM getREF getALT getINFO extract_gt_tidy is.indel is.biallelic read.vcfR
#' @importFrom rlang .data
#' @importFrom stats na.omit
#' @export
vcf2hap <- function(vcf,
                    hapPrefix = "H",
                    filter_Chr = FALSE,
                    # Chr = Chr,
                    filter_POS = FALSE,
                    # startPOS = startPOS,
                    # endPOS = endPOS,
                    pad = 3,
                    hetero_remove = TRUE,
                    na_drop = TRUE, ...) {
    requireNamespace('tidyr')
    allS_new <- allS
    options <- c(hapPrefix = hapPrefix)
    if (filter_Chr) {
        message("Filter VCF by chromosome has been detached from vcf2hap")
    }
    # else {
    #         if (length(Chr) > 1)
    #             stop("
    # only one CHROM should be in vcf, consider set 'filter_Chr' as 'TRUE'
    #                  ")
    # }

    # filter Postion according given range
    if (filter_POS) {
        message("Filter VCF by position has been detached from vcf2hap")
    }

    # extract information from vcf
    CHR <- vcfR::getCHROM(vcf)
    POS <- vcfR::getPOS(vcf)
    options <- c(options, CHROM = unique(CHR))
    options <- c(options, POS = paste0(min(POS), "-", max(POS)))
    REF <- vcfR::getREF(vcf)
    ALT <- vcfR::getALT(vcf)
    ALLELE <- paste0(REF, "/", ALT)
    INFO <- vcfR::getINFO(vcf)

    # generate hap data from vcf
    hapData <- vcf2hap_data(
        vcf,
        allS_new = allS_new,
        REF = REF,
        ALT = ALT,
        ALLELE = ALLELE,
        POS = POS
    )
    allS_new <- hapData$allS_new
    hap <- hapData$hap
    # Drop hyb or N
    if (hetero_remove) {
        hap[grepl("|", hap, fixed = T)] <- NA
        hap[grepl("/", hap, fixed = T)] <- NA
        hap <- na.omit(hap)
        options <- c(options, hetero_remove = "YES")
    } else
        options <- c(options, hetero_remove = "NO")

    if (na_drop) {
        hap[hap == "N"] <- NA
        hap <- na.omit(hap)
        options <- c(options, NA_remove = "YES")
    } else
        options <- c(options, NA_remove = "NO")

    hap <- assign_hapID(hap, hapPrefix, pad)

    # add infos
    meta <- rbind(c("CHR", CHR, ""),
                  c("POS", POS, ""),
                  c("INFO", INFO, ""),
                  c("ALLELE", ALLELE, ""))
    colnames(meta) <- colnames(hap)
    hap <- rbind(meta, hap)
    rownames(hap) <- seq_len(nrow(hap))

    # set attributes
    hap <- remove_redundancy_col(hap)
    class(hap) <- unique(c('hapResult', "data.frame"))
    accAll <- colnames(vcf@gt)[-1]
    attr(hap, "AccAll") <- accAll
    accRemain <- hap$Accession[hap$Accession != ""]
    attr(hap, "AccRemain") <- accRemain
    attr(hap, "AccRemoved") <- accAll[!accAll %in% accRemain]
    attr(hap, "options") <- options
    hap2acc <- hap$Accession[-c(1:4)]
    names(hap2acc) <- hap$Hap[-c(1:4)]
    attr(hap, "hap2acc") <- hap2acc
    attr(hap, "freq") <- table(hap$Hap[-c(1:4)])

    return(hap)
}



# return: data.frame, individuals in rows and positions in cols
# with additional attrs
# eg. # $hap
#     #    41  136
#     # a  A   A|T
#     # b  G   G
#     # $allS_new
vcf2hap_data <- function(vcf,
                         allS_new = allS_new,
                         REF = REF,
                         ALT = ALT,
                         ALLELE = ALLELE,
                         POS = POS) {
    # vcf2data.frame for analysis
    gt <- vcfR::extract_gt_tidy(vcf, format_fields = "GT")
    hap <- tidyr::pivot_wider(
        data = gt,
        id_cols = .data$Key,
        names_from = .data$Indiv,
        values_from = .data$gt_GT_alleles
    )
    hap <- hap[, colnames(hap) != "Key"]
    hap <- as.matrix(hap)
    rownames(hap) <- POS

    # convert "." into "N/N"
    hap[hap == "."] <- "N/N"
    hap[is.na(hap)] <- "N/N"

    # convert Indel(biallelic site) into +/-
    nr = nrow(hap)
    indel_probe <- vcfR::is.indel(vcf)
    biallelic_probe <- vcfR::is.biallelic(vcf)
    for (l in seq_len(nr)) {
        if (biallelic_probe[l]) {
            if (indel_probe[l])
                allS_new <-
                    update_allS(allS_new, REF = REF[l], ALT = ALT[l])
        } else
            allS_new <-
                update_allS(allS_new, REF = REF[l], ALT = ALT[l])
    }

    hap <- gsub(pattern = "|", replacement = "/", x = hap, fixed = TRUE) %>%
        t() %>%
        toupper()

    # reform the genotypes
    # homo site convert into single
    # ++ -> +; -- -> -
    # A/A -> A; T/T ->T; C/C -> C; G/G ->G
    # N/N -> N
    # hetero convert "/" to "|"
    for (i in seq_len(ncol(hap))) {
        hap[, i] <- allS_new$all[hap[, i]]
    }
    return(list(hap = hap, allS_new = allS_new))
}



assign_hapID <- function(hap, hapPrefix, pad) {
    # name haps
    hap <- data.frame(hap, check.rows = FALSE, check.names = FALSE)

    HapID <- tidyr::unite(hap,
                          tidyr::matches("[0-9]{1, }"),
                          col = "IDs",
                          sep = "")
    HapID <- HapID$IDs
    hap <- cbind(Hap = HapID, hap)
    if (!"Accession" %in% colnames(hap))
        hap$Accession <- row.names(hap)
    haps <- table(hap$Hap)
    haps <- haps[order(haps, decreasing = TRUE)]
    n_hs <- length(haps)
    hapnms <- stringr::str_pad(seq_len(n_hs), pad, "left", "0")
    hapnms <- paste0(hapPrefix, hapnms)
    names(hapnms) <- names(haps)
    hap$Hap <- hapnms[hap$Hap]
    hap <- hap[order(hap$Hap),]
    return(hap)
}


remove_redundancy_col <- function(hap) {
    # removed Redundancy cols
    removecols <- c()
    nc_hap <- ncol(hap)
    if (hap[1, 1] == "CHR")
        gth <- hap[-c(1, 2, 3, 4),]
    else
        gth <- hap
    for (c in seq_len(nc_hap)) {
        namec <- colnames(hap)[c]
        if (!(namec %in% c("Hap", "Accession", "freq"))) {
            gtc <- unique(gth[, c])
            if (length(gtc) == 1) {
                removecols <- c(removecols, c)
            }
        }
    }
    if (!is.null(removecols))
        hap <- hap[,-removecols]
    return(hap)
}

Try the geneHapR package in your browser

Any scripts or data that you put into this service are public.

geneHapR documentation built on May 29, 2024, 11:59 a.m.